In [1]:
# Import libraries 
import numpy as np 
import pandas as pd 
# Import label encoder 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report

# Import dataset 
df = pd.read_csv('Processed_data15.csv') 

In [2]:
df.head()

Unnamed: 0,year,month,day,carrier,origin,dest,delayed
0,2013,1,1,UA,EWR,IAH,0
1,2013,1,1,UA,LGA,IAH,1
2,2013,1,1,AA,JFK,MIA,1
3,2013,1,1,B6,JFK,BQN,0
4,2013,1,1,DL,LGA,ATL,0


In [3]:
columns= ['carrier','dest', 'origin']
le=LabelEncoder()
for i in columns:
     df[i]=le.fit_transform(df[i])

In [4]:
df['carrier'].unique()

array([11,  1,  3,  4,  5,  9, 12, 14, 13,  7,  2,  0,  6,  8, 15, 10])

In [5]:
df['origin'].unique()

array([0, 2, 1])

In [6]:
df['dest'].unique()

array([ 43,  57,  12,   4,  68,  35,  42,  53,  70,  99,  49,  89,  30,
        11,  48,  60,  32,  82,  91,  73,  16,  23,  14,  29,  94,  61,
        92, 103,  58,  88,  81,  98,  95,  79,  24,  47,  21,  55,  74,
        83,  28,  22,  96,  64,  46,  54,  40,  10,   5,  13,  72,  97,
        33,   6,  78,  45,  85,  19,  41,  51,  27,   2,   7,  56,  59,
        37,  26,  15,  80,  38,  36,  52,  69,  84,  87,  71,  90,  67,
        25,  65,  93, 102,  77,  31,  75, 100,   9,  66,  18,  39,  17,
        62,  34,  76,   1,   8,   0,  44,  63,  86,  50,  20, 101,   3])

In [7]:
X = df.iloc[:, 0:6].values # from column(years) to column(distance)
X[0:5]

array([[2013,    1,    1,   11,    0,   43],
       [2013,    1,    1,   11,    2,   43],
       [2013,    1,    1,    1,    1,   57],
       [2013,    1,    1,    3,    1,   12],
       [2013,    1,    1,    4,    2,    4]], dtype=int64)

In [8]:
y = df['delayed']
y.head().to_frame()

Unnamed: 0,delayed
0,0
1,1
2,1
3,0
4,0


In [9]:
for i in range(0, 20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    
    #creating Logistic Regression classifier
    clf = LogisticRegression(random_state=i)
    clf.fit(X_train, y_train)
    
    #determining the score
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print("Test: {}, Train: {} and Random State: {}".format(test_score, train_score, i))

Test: 0.7190163433633725, Train: 0.7207533336388214 and Random State: 0
Test: 0.7213991140980602, Train: 0.7201576318562984 and Random State: 1
Test: 0.7213227432411792, Train: 0.7201767248621485 and Random State: 2
Test: 0.7206506797006262, Train: 0.7203447433136293 and Random State: 3
Test: 0.7202230029020925, Train: 0.72045166414639 and Random State: 4
Test: 0.7167557659996945, Train: 0.7213184866119843 and Random State: 5
Test: 0.7202993737589736, Train: 0.7204325711405398 and Random State: 6
Test: 0.7207881472430121, Train: 0.7203103759030992 and Random State: 7
Test: 0.7204521154727356, Train: 0.7203943851288396 and Random State: 8
Test: 0.719199633419887, Train: 0.7207075104247812 and Random State: 9
Test: 0.7200549870169544, Train: 0.7204936687592601 and Random State: 10
Test: 0.7185275698793341, Train: 0.720875528876262 and Random State: 11
Test: 0.720482663815488, Train: 0.7203867479264996 and Random State: 12
Test: 0.7207423247288834, Train: 0.7203218317066092 and Random Sta

In [10]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (261876, 6) (261876,)
Test set: (65470, 6) (65470,)


In [11]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
LR

In [12]:
#Now we can predict using our test set:

In [13]:
yhat = LR.predict(X_test)
yhat

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
yhat_prob = LR.predict_proba(X_test)
yhat_prob

array([[0.7141788 , 0.2858212 ],
       [0.72200126, 0.27799874],
       [0.73900085, 0.26099915],
       ...,
       [0.71691979, 0.28308021],
       [0.70767286, 0.29232714],
       [0.72115322, 0.27884678]])

In [15]:
accuracy_score(y_test, yhat)

0.7202230029020925

In [16]:
#confusion matrix :

In [17]:
confusion_matrix(yhat,y_test)

array([[47153, 18317],
       [    0,     0]], dtype=int64)

In [18]:
print (classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.72      1.00      0.84     47153
           1       0.00      0.00      0.00     18317

    accuracy                           0.72     65470
   macro avg       0.36      0.50      0.42     65470
weighted avg       0.52      0.72      0.60     65470



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
