# REGRESSION MODELS

In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [131]:
dataset = pd.read_csv('Flight-Delay.csv')

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Year,Quarter,Month,DayofMonth,FlightDate,OriginAirportID,Origin,DestAirportID,Dest,...,pressure,cloudcover,DewPointF,WindGustKmph,tempF,WindChillF,humidity,date,time_y,airports
0,0,2016,1,1,1,2016-01-01,14747,SEA,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
1,1,2016,1,1,1,2016-01-01,13303,MIA,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
2,2,2016,1,1,1,2016-01-01,13204,MCO,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
3,3,2016,1,1,1,2016-01-01,12892,LAX,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
4,4,2016,1,1,1,2016-01-01,13930,ORD,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK


In [4]:
dataset.columns

Index(['Unnamed: 0', 'Year', 'Quarter', 'Month', 'DayofMonth', 'FlightDate',
       'OriginAirportID', 'Origin', 'DestAirportID', 'Dest', 'CRSDepTime',
       'DepTime', 'DepDelayMinutes', 'DepDel15', 'CRSArrTime', 'ArrTime',
       'ArrDelayMinutes', 'ArrDel15', 'time_x', 'primary', 'windspeedKmph',
       'winddirDegree', 'weatherCode', 'precipMM', 'visibility', 'pressure',
       'cloudcover', 'DewPointF', 'WindGustKmph', 'tempF', 'WindChillF',
       'humidity', 'date', 'time_y', 'airports'],
      dtype='object')

In [5]:
col = ['Unnamed: 0','FlightDate','time_y','date','primary','Origin','Dest','time_x','airports','ArrTime','CRSArrTime']

In [6]:
dataset.drop(col,axis=1,inplace=True)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851433 entries, 0 to 1851432
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   OriginAirportID  int64  
 5   DestAirportID    int64  
 6   CRSDepTime       int64  
 7   DepTime          float64
 8   DepDelayMinutes  float64
 9   DepDel15         float64
 10  ArrDelayMinutes  float64
 11  ArrDel15         float64
 12  windspeedKmph    int64  
 13  winddirDegree    int64  
 14  weatherCode      int64  
 15  precipMM         float64
 16  visibility       int64  
 17  pressure         int64  
 18  cloudcover       int64  
 19  DewPointF        int64  
 20  WindGustKmph     int64  
 21  tempF            int64  
 22  WindChillF       int64  
 23  humidity         int64  
dtypes: float64(6), int64(18)
memory usage: 339.0 MB


# Training and Test Split

In [8]:
dataset = dataset[dataset['ArrDel15'] == 1]

In [9]:
dataset.drop('ArrDel15',axis=1,inplace=True)

In [10]:
X = dataset.drop('ArrDelayMinutes',axis=1).values
Y = dataset['ArrDelayMinutes'].values

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Linear Regression

In [13]:
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

print('RMSE: ',np.sqrt(mean_squared_error(Y_test,Y_pred)))
print('R Square: ',r2_score(Y_test,Y_pred))
print('MAE: ',mean_absolute_error(Y_test, Y_pred))

RMSE:  17.530573949116594
R Square:  0.9401869294592831
MAE:  12.169075724729154


# Decision Tree Regressor

In [17]:
reg = DecisionTreeRegressor()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)

print('RMSE: ',np.sqrt(mean_squared_error(Y_test,Y_pred)))
print('R Square: ',r2_score(Y_test,Y_pred))
print('MAE: ',mean_absolute_error(Y_test, Y_pred))

RMSE:  24.158812563976852
R Square:  0.8864061036764228
MAE:  16.6497577694171


# Extra Tress Regressor

In [18]:
reg = ExtraTreesRegressor()
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)

In [17]:
print('RMSE: ',np.sqrt(mean_squared_error(Y_test,Y_pred)))
print('R Square: ',r2_score(Y_test,Y_pred))
print('MAE: ',mean_absolute_error(Y_test, Y_pred))

RMSE:  16.88511057786315
R Square:  0.9445103934048745
MAE:  11.854980286553626


# Gradient Boost Regressor

In [19]:
model = GradientBoostingRegressor()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

print('RMSE: ',np.sqrt(mean_squared_error(Y_test,Y_pred)))
print('R Square: ',r2_score(Y_test,Y_pred))
print('MAE: ',mean_absolute_error(Y_test, Y_pred))

RMSE:  16.864084876386293
R Square:  0.9446485010581376
MAE:  11.65592999318567


# Pipeline 

In [96]:
df = pd.read_csv('Flight-Delay.csv')

In [97]:
col = ['Unnamed: 0','FlightDate','time_y','date','ArrDelayMinutes','primary','Origin','Dest','time_x','airports','ArrTime','CRSArrTime']

In [98]:
df.drop(col,axis=1,inplace=True)

In [99]:
pipe_train = df[df['Year'] == 2016]
pipe_test = df[df['Year'] == 2017]

x_train = pipe_train.drop('ArrDel15',axis=1)
y_train = pipe_train['ArrDel15']

x_test = pipe_test.drop('ArrDel15',axis=1)
y_test = pipe_test['ArrDel15']

### Classifier

In [100]:
clf = GradientBoostingClassifier()

In [101]:
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(x_train, y_train)
clf.fit(x_sm, y_sm)
y_pred_class = clf.predict(x_test)

In [103]:
report = classification_report(y_test, y_pred_class,output_dict=True)
df_rep = pd.DataFrame(report).transpose()
df_rep

Unnamed: 0,precision,recall,f1-score,support
0.0,0.930785,0.95489,0.942683,718933.0
1.0,0.816694,0.738931,0.775869,195542.0
accuracy,0.908712,0.908712,0.908712,0.908712
macro avg,0.873739,0.84691,0.859276,914475.0
weighted avg,0.906389,0.908712,0.907013,914475.0


### Regressor

In [123]:
train = df[df['Year'] == 2017]
train.drop('ArrDel15',axis=1,inplace=True)

In [124]:
train['ArrDel15'] = y_pred_class

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [133]:
train['ArrDelayMinutes'] = dataset[dataset['Year'] == 2017]['ArrDelayMinutes']
train = train[pipe_test['ArrDel15'] == 1]

In [134]:
train.drop('ArrDel15',axis=1,inplace=True)
 
X = train.drop('ArrDelayMinutes',axis=1).values
Y = train['ArrDelayMinutes'].values 

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [135]:
model = GradientBoostingRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('RMSE: ',np.sqrt(mean_squared_error(y_test,y_pred)))
print('R Square: ',r2_score(y_test,y_pred))
print('MAE: ',mean_absolute_error(y_test, y_pred))

RMSE:  17.613856337324105
R Square:  0.948524643745758
MAE:  12.532500904060852
