In [27]:
import pandas as pd
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [2]:
data = pd.read_csv("FlightDelays.csv")
data.head()

Unnamed: 0,DEP_TIME,CARRIER,DEST,ORIGIN,Weather,DAY_WEEK,Flight Status
0,600,MQ,JFK,DCA,0,4,0
1,600,MQ,JFK,DCA,0,5,0
2,600,MQ,JFK,DCA,0,6,0
3,600,MQ,JFK,DCA,0,7,0
4,600,MQ,JFK,DCA,0,1,0


In [3]:
data.dtypes

DEP_TIME          int64
CARRIER          object
DEST             object
ORIGIN           object
Weather           int64
DAY_WEEK          int64
Flight Status     int64
dtype: object

In [4]:
data.nunique()

DEP_TIME         59
CARRIER           8
DEST              3
ORIGIN            3
Weather           2
DAY_WEEK          7
Flight Status     2
dtype: int64

In [5]:
data.describe()

Unnamed: 0,DEP_TIME,Weather,DAY_WEEK,Flight Status
count,2201.0,2201.0,2201.0,2201.0
mean,1371.938664,0.014539,3.905498,0.194457
std,432.697149,0.119725,1.903149,0.395872
min,600.0,0.0,1.0,0.0
25%,1000.0,0.0,2.0,0.0
50%,1455.0,0.0,4.0,0.0
75%,1710.0,0.0,5.0,0.0
max,2130.0,1.0,7.0,1.0


In [6]:
bins = [600,1000,1500,1750,2000]
levels = ['level1','level2','level3','level4','level5']
d = dict(enumerate(levels,1))
data['Dep_tim_levels'] = np.vectorize(d.get)(np.digitize(data.DEP_TIME,bins))

In [7]:
np.vectorize(d.get)

<numpy.vectorize at 0x20422593f48>

In [8]:
X = data.drop(["DEP_TIME","Flight Status"],axis=1).astype("category")
y = data["Flight Status"].copy().astype("category")

In [9]:
X.isna().sum()

CARRIER           0
DEST              0
ORIGIN            0
Weather           0
DAY_WEEK          0
Dep_tim_levels    0
dtype: int64

In [28]:
lab_enc=LabelEncoder()

In [32]:
lab_enc.fit_transform(X_train['Dep_tim_levels'])

array([3, 4, 0, ..., 2, 0, 1])

In [35]:
lab_enc.inverse_transform(lab_enc.fit_transform(X_train['Dep_tim_levels']))

array(['level4', 'level5', 'level1', ..., 'level3', 'level1', 'level2'],
      dtype=object)

In [39]:
for i in range(len(X.columns)):
    X.iloc[:,i] = lab_enc.fit_transform(X.iloc[:,i])

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,stratify=y)

In [42]:
y.value_counts()

0    1773
1     428
Name: Flight Status, dtype: int64

In [43]:
list(y_train.value_counts())[0]/sum(list(y_train.value_counts()))

0.8056818181818182

In [44]:
list(y_test.value_counts())[0]/sum(list(y_test.value_counts()))

0.8049886621315193

In [45]:
nb_cl = BernoulliNB()
mn_nb = MultinomialNB()

In [46]:
nb_cl.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [47]:
nb_cl.score(X_train,y_train)

0.821590909090909

In [48]:
nb_cl.score(X_test,y_test)

0.8140589569160998