In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datetime import datetime

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data Analysis & Data processing

## Data Analysis

In [None]:
# Column -> dtype mapping handed to pd.read_csv for the feature files.
X_dtype = dict(
    ID=int,
    YEAR=int,
    MONTH=int,
    DAY=int,
    DAY_OF_WEEK=int,
    AIRLINE=str,
    FLIGHT_NUMBER=str,
    TAIL_NUMBER=str,
    ORIGIN_AIRPORT=str,
    DESTINATION_AIRPORT=str,
    SCHEDULED_DEPARTURE=str,
    DEPARTURE_TIME=str,
    DEPARTURE_DELAY=float,
    TAXI_OUT=str,
    WHEELS_OFF=str,
    SCHEDULED_TIME=float,
    AIR_TIME=float,
    DISTANCE=int,
    SCHEDULED_ARRIVAL=str,
    DIVERTED=int,
    CANCELLED=int,
    CANCELLATION_REASON=str,
)

# Column -> dtype mapping handed to pd.read_csv for the target file.
y_dtype = dict(ID=int, ARRIVAL_DELAY=float)

# Read the raw challenge training features and targets with explicit column dtypes.
X_train_df = pd.read_csv(
    "/kaggle/input/eurecom-aml-2021-challenge-1/data/train_features.csv",
    dtype=X_dtype,
)
y_train_df = pd.read_csv(
    "/kaggle/input/eurecom-aml-2021-challenge-1/data/train_targets.csv",
    dtype=y_dtype,
)

In [None]:
# Join features and targets on the flight ID so both can be explored together.
df = X_train_df.merge(y_train_df, on='ID')

In [None]:
# Per-column number of missing values and the percentage of rows that are filled in,
# shown from the least to the most complete column.
missing_df = (
    df.isnull()
    .sum(axis=0)
    .rename_axis('variable')
    .reset_index(name='missing values')
)
missing_df['filling factor (%)'] = (df.shape[0] - missing_df['missing values']) / df.shape[0] * 100
missing_df.sort_values('filling factor (%)').reset_index(drop=True)

In [None]:
# Preview the first five rows of the merged frame.
df.head(n=5)

In [None]:
# Show the airport columns of the first ten flights to inspect their format.
df.loc[:, ['ID', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']].head(10)

--> Some airports (origin and destination) are not given as IATA codes: they appear as 5-character numeric airport IDs and have to be converted.

## Creation of corrected dataset

In [None]:
# Build a lookup table indexed by airport ID (as a string) whose 'Code_y' column holds
# the matching code from L_AIRPORT.csv; the two tables are joined on "Description".
number_name = pd.read_csv("/kaggle/input/bts/L_AIRPORT_ID.csv")
name_IATA = pd.read_csv("/kaggle/input/bts/L_AIRPORT.csv")
number_IATA = (
    pd.merge(number_name, name_IATA, on="Description")
    .astype({'Code_x': str})
    .set_index('Code_x')
)

def convert_airports(df, lookup=None):
    """Replace 5-character airport IDs by IATA codes in the two airport columns.

    Parameters
    ----------
    df: pandas DataFrame.
        Frame with string columns 'ORIGIN_AIRPORT' and 'DESTINATION_AIRPORT'.
        It is modified in place and also returned.
    lookup: pandas DataFrame, optional.
        Table indexed by airport ID with a 'Code_y' column holding the IATA code.
        Defaults to the module-level ``number_IATA`` table.

    Returns
    -------
    df: pandas DataFrame.
        The same frame, with every 5-character value that is found in the lookup
        replaced by its IATA code. Missing values and IDs absent from the lookup
        are left untouched.
    """
    table = number_IATA if lookup is None else lookup

    # One ID can map to several codes (e.g. 16218 -> NYL/YUM). Keep the first match so
    # every lookup yields a single scalar code rather than a multi-row Series.
    id_to_iata = pd.Series(table['Code_y'].to_numpy(), index=table.index.astype(str))
    id_to_iata = id_to_iata[~id_to_iata.index.duplicated(keep='first')]

    for column in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
        values = df[column]
        # .str.len() yields NaN (hence False) for missing values instead of raising.
        is_airport_id = values.str.len() == 5
        converted = values[is_airport_id].map(id_to_iata)
        # Fall back to the original value when the ID is not in the lookup table.
        df.loc[is_airport_id, column] = converted.fillna(values[is_airport_id])
    return df

In [None]:
X_train_df = convert_airports(X_train_df)

X_test_df = pd.read_csv("/kaggle/input/eurecom-aml-2021-challenge-1/data/test_features.csv", dtype=X_dtype)
X_test_df = convert_airports(X_test_df)

# Fixing some issues with the conversion: the keys below are the text of a two-row
# lookup result (one airport ID listed with two codes); each is replaced by the
# first code it lists.
_AMBIGUOUS_AIRPORT_FIXES = {
    'Code_x\n16218    NYL\n16218    YUM\nName: Code_y, dtype: object': 'NYL',
    'Code_x\n10423    AUS\n10423    BSM\nName: Code_y, dtype: object': 'AUS',
}


def _fix_ambiguous_airports(frame):
    """Replace, in place, the garbled lookup text in both airport columns of `frame`."""
    for column in ('ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'):
        for garbled, iata in _AMBIGUOUS_AIRPORT_FIXES.items():
            frame.loc[frame[column] == garbled, column] = iata


_fix_ambiguous_airports(X_train_df)
# Save the corrected *training* frame (the test frame used to be written to this file).
X_train_df.to_csv("/kaggle/working/X_train_df.csv")

_fix_ambiguous_airports(X_test_df)
X_test_df.to_csv("/kaggle/working/X_test_df.csv")


In [None]:
# Work from the corrected dataset from here on
# (presumably created from the files saved by the previous cell -- TODO confirm).
X_train_df = pd.read_csv(
    "/kaggle/input/corrected-dataset-airports/X_train_df.csv",
    dtype=X_dtype,
)
y_train_df = pd.read_csv(
    "/kaggle/input/eurecom-aml-2021-challenge-1/data/train_targets.csv",
    dtype=y_dtype,
)

X_test_df = pd.read_csv(
    "/kaggle/input/corrected-dataset-airports/X_test_df.csv",
    dtype=X_dtype,
)

## Data Analysis : study of parameters

In [None]:
# Convert date-time features from str to datetime format
def parse_hhmm(x):
    """Parse an 'HHMM' clock string into a datetime.

    Parameters
    ----------
    x: str.
        Time of day written as hours and minutes, e.g. '0930'.

    Returns
    -------
    datetime.datetime or pd.NaT.
        The parsed time (strptime fills in the default date 1900-01-01), or pd.NaT
        when `x` is not a string or does not match the format (e.g. NaN or '2400').
    """
    # `pd.datetime` was removed in pandas 2.0; combined with a bare `except` that
    # silently turned every value into NaT. Use the stdlib class and catch only the
    # errors that mean "unparseable input".
    try:
        return datetime.strptime(x, '%H%M')
    except (TypeError, ValueError):
        return pd.NaT

# Turn the HHMM strings of the training features into datetimes (NaT when unparseable).
X_train_df['DEPARTURE_TIME'] = X_train_df['DEPARTURE_TIME'].apply(parse_hhmm)
X_train_df['SCHEDULED_ARRIVAL'] = X_train_df['SCHEDULED_ARRIVAL'].apply(parse_hhmm)

# Join features and targets on the flight ID for data exploration.
df = X_train_df.merge(y_train_df, on='ID')

# Flag flights that arrived late and build a calendar date from the date columns.
df['DELAYED'] = df['ARRIVAL_DELAY'].gt(0)
df['DATE'] = pd.to_datetime(df.loc[:, ['YEAR', 'MONTH', 'DAY']])

In [None]:
# Show the airport columns of the first ten flights after the correction.
df.loc[:, ['ID', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']].head(10)

In [None]:
def get_stats_delay(group):
    """Summarise a Series of delays.

    Returns a dict with 'mean_delay' (the mean of `group`) and 'ratio_delay'
    (number of values strictly greater than 0 divided by the length of `group`).
    """
    n_delayed = len(group[group > 0])
    return dict(mean_delay=group.mean(), ratio_delay=n_delayed / len(group))

### Departure time

In [None]:
# Mean departure delay and share of delayed departures for each departure hour.
delayed = df['DEPARTURE_DELAY'].groupby(df['DEPARTURE_TIME'].dt.hour).apply(get_stats_delay).unstack()

# (exclusive upper bound on the mean delay in minutes, bar colour, legend label).
# The first band whose bound exceeds the mean delay wins.
delay_bands = [
    (0, 'lightgreen', 'Depart earlier'),
    (2, 'green', 'departure delay < 2 minutes'),
    (4, 'yellow', 'departure delay < 4 minutes'),
    (8, 'orange', 'departure delay < 8 minutes'),
    (float('inf'), 'red', 'departure delay >= 8 minutes'),
]

plt.xlabel("Hours")
plt.ylabel("Ratio of departure delay")
plt.title('The ratio of departure delay over hours in day')
plt.grid(True, which="both", ls="-")
# Plot against the hours actually present in the data instead of a hard-coded
# range(0, 24), which fails as soon as one hour has no flight.
bars = plt.bar(delayed.index, delayed['ratio_delay'], align='center', edgecolor="black")

for bar, mean_delay in zip(bars, delayed['mean_delay']):
    # A NaN mean matches no band and falls back to red.
    color = next((band_color for upper, band_color, _ in delay_bands if mean_delay < upper), 'red')
    bar.set_color(color)

plt.legend(
    handles=[mpatches.Patch(color=band_color, label=label) for _, band_color, label in delay_bands],
    bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

plt.margins(0.05, 0)
plt.show()

### Airlines

In [None]:
# Mean arrival delay and share of delayed arrivals for each airline.
delayed_airlines = df['ARRIVAL_DELAY'].groupby(df['AIRLINE']).apply(get_stats_delay).unstack()

# (exclusive upper bound on the mean delay in minutes, bar colour, legend label).
# The first band whose bound exceeds the mean delay wins.
delay_bands = [
    (0, 'lightgreen', 'Arrival earlier'),
    (2, 'green', 'arrival delay < 2 minutes'),
    (4, 'yellow', 'arrival delay < 4 minutes'),
    (8, 'orange', 'arrival delay < 8 minutes'),
    (float('inf'), 'red', 'arrival delay >= 8 minutes'),
]

plt.xlabel("Airlines")
plt.ylabel("Ratio of arrival delay")
plt.title('The ratio of arrival delay depending on airlines')
plt.grid(True, which="both", ls="-")
bars = plt.bar(delayed_airlines.index, delayed_airlines['ratio_delay'], align='center', edgecolor="black")

# Iterate over the values directly: the index holds airline codes, so `series[i]`
# with an integer relied on the deprecated positional fallback.
for bar, mean_delay in zip(bars, delayed_airlines['mean_delay']):
    # A NaN mean matches no band and falls back to red.
    color = next((band_color for upper, band_color, _ in delay_bands if mean_delay < upper), 'red')
    bar.set_color(color)

plt.legend(
    handles=[mpatches.Patch(color=band_color, label=label) for _, band_color, label in delay_bands],
    bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

plt.margins(0.05, 0)
plt.show()

### Airports

In [None]:
def get_stats(group):
    """Return the min, max, non-null count and mean of a Series as a dict."""
    return {stat: getattr(group, stat)() for stat in ('min', 'max', 'count', 'mean')}
#ORIGIN AIRPORT
inbound = df['ARRIVAL_DELAY'].groupby(df['ORIGIN_AIRPORT']).apply(get_stats).unstack()
inbound = inbound.sort_values('count')

#DESTINATION AIRPORT
outbound = df['ARRIVAL_DELAY'].groupby(df['DESTINATION_AIRPORT']).apply(get_stats).unstack()
outbound = outbound.sort_values('count')

# Number of flights with a known arrival delay per origin / destination airport.
_df1 = df.groupby('ORIGIN_AIRPORT').agg({'ARRIVAL_DELAY':'count'}).rename(columns={'ARRIVAL_DELAY': 'COUNT'}).rename_axis('AIRPORT')
_df1 = _df1.sort_values('COUNT')
_df2 = df.groupby('DESTINATION_AIRPORT').agg({'ARRIVAL_DELAY':'count'}).rename(columns={'ARRIVAL_DELAY': 'COUNT'}).rename_axis('AIRPORT')
_df2 = _df2.sort_values('COUNT')

# Rank airports by total traffic (departures + arrivals); computed once, not twice.
n = 15
airport_volume = _df1.join(_df2, rsuffix='_ORIGIN', lsuffix='_DEST').sum(axis=1)
top_airports = airport_volume.sort_values(ascending=False).index[:n]
bottom_airports = airport_volume.sort_values(ascending=True).index[:n]

# The n busiest airports first, then the n least busy ones, ending with the quietest.
_airports = list(top_airports) + list(bottom_airports[::-1])

# One pass per quantity instead of filtering the whole frame for every airport.
# value_counts() counts rows, which matches the former `.count()[1]` as long as the
# frame's second column has no missing value.
origin_counts = df['ORIGIN_AIRPORT'].value_counts()
dest_counts = df['DESTINATION_AIRPORT'].value_counts()
delayed_origin_counts = df.loc[df['ARRIVAL_DELAY'] > 0, 'ORIGIN_AIRPORT'].value_counts()

delayed_flights_a = [int(delayed_origin_counts.get(airport, 0)) for airport in _airports]
total_flights_dest_a = [int(dest_counts.get(airport, 0)) for airport in _airports]
total_flights_src_a = [int(origin_counts.get(airport, 0)) for airport in _airports]
total_flights = [n_dest + n_src for n_dest, n_src in zip(total_flights_dest_a, total_flights_src_a)]

# Share of departures from each airport that arrived late.
percent_delay_a = [n_delayed / n_src for n_delayed, n_src in zip(delayed_flights_a, total_flights_src_a)]

plt.figure(figsize=(18,6))
bars = plt.bar(_airports, percent_delay_a)
for bar in bars[:n]:
    bar.set_facecolor('purple')
for bar in bars[n:2*n]:
    bar.set_facecolor('olive')
plt.title('Delay probability for each of the 15 busiest airports and 15 freest airports')
plt.show()

plt.figure(figsize=(18,6))
# Plot the traffic volume here; this figure used to repeat the delay ratio.
plt.plot(_airports, total_flights, 'r--')
plt.title('Flight volume for each of the 15 busiest airports and 15 freest airports')
plt.show()

### Flight number

In [None]:
def get_stats2(group):
    """Return the min, max, non-null count, mean and variance of a Series as a dict."""
    summary = {stat: getattr(group, stat)() for stat in ('min', 'max', 'count', 'mean')}
    summary['variance'] = group.var()
    return summary
# Arrival-delay statistics per flight number, ordered from lowest to highest mean delay.
fl_nb = (
    df['ARRIVAL_DELAY']
    .groupby(df['FLIGHT_NUMBER'])
    .apply(get_stats2)
    .unstack()
    .sort_values('mean')
)

# Preparation of the data

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# Note: this rebinds X_train_df / y_train_df, so the cell must be run only once.
X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(
    X_train_df,
    y_train_df,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Convert the arrival time and departure time into minutes since the midnight
def minutes_since_midnight(dt):
    """Return the time of day of `dt` as minutes elapsed since 00:00.

    `dt` only needs `hour` and `minute` attributes.
    """
    hours, minutes = dt.hour, dt.minute
    return hours * 60 + minutes

# Replace the parsed clock times by minutes since midnight, in both splits.
X_train_df['DEPARTURE_TIME'] = X_train_df['DEPARTURE_TIME'].apply(minutes_since_midnight)
X_train_df['SCHEDULED_ARRIVAL'] = X_train_df['SCHEDULED_ARRIVAL'].apply(minutes_since_midnight)

X_val_df['DEPARTURE_TIME'] = X_val_df['DEPARTURE_TIME'].apply(minutes_since_midnight)
X_val_df['SCHEDULED_ARRIVAL'] = X_val_df['SCHEDULED_ARRIVAL'].apply(minutes_since_midnight)

In [None]:
def preprocess_data(df, feature_names, imputer, scaler):
    """Preprocess data.

    Selects the requested features, converts the two 'HHMM' time columns to
    minutes since midnight, then imputes and scales with already-fitted
    transformers. The input frame is left untouched.

    Parameters
    ----------
    df: pandas DataFrame.
        The input data. 'DEPARTURE_TIME' and 'SCHEDULED_ARRIVAL' must still be
        'HHMM' strings.
    feature_names: list of strings.
        The names of selected features; must include 'DEPARTURE_TIME' and
        'SCHEDULED_ARRIVAL'.
    imputer: sklearn.impute.SimpleImputer
        The fitted imputation transformer for completing missing values.
    scaler: sklearn.preprocessing.StandardScaler.
        The fitted scaler used to normalize the features.

    Returns
    -------
    X: numpy array.
        The preprocessed data.
    """
    # Work on an explicit copy: assigning into a column selection of `df` raises
    # SettingWithCopyWarning.
    X_df = df[feature_names].copy()

    # Pre-process datetime features: 'HHMM' string -> datetime -> minutes since midnight
    # (unparseable values end up as missing and are handled by the imputer).
    for column in ('DEPARTURE_TIME', 'SCHEDULED_ARRIVAL'):
        X_df[column] = X_df[column].apply(parse_hhmm).apply(minutes_since_midnight)

    # Impute missing values
    X = imputer.transform(X_df)

    # Normalize features
    X = scaler.transform(X)

    return X

## Comparing encodings

In [None]:
#### Numeric encoding
from sklearn import preprocessing
# 'AIR_TIME' used to be listed twice, which fed the models two identical (perfectly
# collinear) columns; each feature now appears once.
feature_names = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'DISTANCE', 'AIR_TIME', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE']
df_numeric_train = X_train_df.copy()

df_numeric_val = X_val_df.copy()

# Map each airline code to an integer label learned on the training split.
le = preprocessing.LabelEncoder()
trained_le = le.fit(df_numeric_train['AIRLINE'])
df_numeric_train['AIRLINE'] = trained_le.transform(df_numeric_train['AIRLINE'])
df_numeric_val['AIRLINE'] = trained_le.transform(df_numeric_val['AIRLINE'])

In [None]:
#### Delay mean
# Mean arrival delay per airline, estimated on the training split only so that no
# validation target leaks into the encoding. The two frames share their index, so the
# grouping key aligns row by row.
airline_mean_delay = y_train_df['ARRIVAL_DELAY'].groupby(X_train_df['AIRLINE']).mean()
# Same layout as before: one row per airline, a single 'mean' column.
global_stats_airline = airline_mean_delay.to_frame('mean')

df_numeric_train = X_train_df.copy()
df_numeric_train.AIRLINE = df_numeric_train.AIRLINE.map(global_stats_airline['mean'])

df_numeric_val = X_val_df.copy()
# Encode the validation airlines themselves (this used to re-map the training column,
# which had already been converted to floats).
df_numeric_val.AIRLINE = df_numeric_val.AIRLINE.map(global_stats_airline['mean'])

In [None]:
#### Ordinal encoding(delay mean)
from sklearn.preprocessing import OrdinalEncoder

# Mean arrival delay per airline, estimated on the training split only so that no
# validation target leaks into the encoding.
airline_mean_delay = y_train_df['ARRIVAL_DELAY'].groupby(X_train_df['AIRLINE']).mean()
global_stats_airline = airline_mean_delay.to_frame('mean')

df_numeric_train = X_train_df.copy()
df_numeric_train.AIRLINE = df_numeric_train.AIRLINE.map(global_stats_airline['mean'])

df_numeric_val = X_val_df.copy()
df_numeric_val.AIRLINE = df_numeric_val.AIRLINE.map(global_stats_airline['mean'])

# Replace each mean delay by its rank among the airlines.
enc = OrdinalEncoder()
df_numeric_train['AIRLINE'] = enc.fit_transform(df_numeric_train[['AIRLINE']])
# Reuse the categories learned on the training split; re-fitting on the validation
# split could assign different codes to the same airline.
df_numeric_val['AIRLINE'] = enc.transform(df_numeric_val[['AIRLINE']])


In [None]:
#### One-hot encoding
df_numeric_train = X_train_df.copy()
df_numeric_val = X_val_df.copy()

df_hot_train = pd.get_dummies(df_numeric_train.AIRLINE)
# Give the validation split exactly the training columns, in the same order, even if
# an airline happens to be missing from one of the two splits.
df_hot_val = pd.get_dummies(df_numeric_val.AIRLINE).reindex(columns=df_hot_train.columns, fill_value=0)

df_numeric_train = df_numeric_train.merge(df_hot_train, left_index=True, right_index=True)
df_numeric_val = df_numeric_val.merge(df_hot_val, left_index=True, right_index=True)

# Feed the indicator columns to the models instead of the raw string 'AIRLINE' column,
# which a mean imputer cannot handle. dict.fromkeys removes duplicates while keeping
# order, so re-running this cell leaves the list unchanged.
feature_names = list(dict.fromkeys(
    [name for name in feature_names if name != 'AIRLINE'] + list(df_hot_train.columns)
))

-> We choose one-hot encoding for the airline and numeric (label) encoding for the airports.

In [None]:
def encoding(df):
    """One-hot encode the airline and return the matching feature list.

    Parameters
    ----------
    df: pandas DataFrame.
        Frame with an 'AIRLINE' column. It is not modified.

    Returns
    -------
    df: pandas DataFrame.
        A new frame: the input plus one indicator column per airline code.
    feature_names: list of strings.
        The base features followed by the airline indicator columns. The raw
        'AIRLINE' column is not listed (it is a string), and 'AIR_TIME' appears once.
    """
    df_hot_air = pd.get_dummies(df.AIRLINE)
    df = df.merge(df_hot_air, left_index=True, right_index=True)
    base_features = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'DISTANCE', 'AIR_TIME', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
    feature_names = base_features + list(df_hot_air.columns)
    return df, feature_names

In [None]:
from sklearn import preprocessing

# df_numeric_train / df_numeric_val come from the encoding cell that was run last.
airport_columns = ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']

# Fit one encoder on every airport seen in either column of either split. Fitting on
# the training origins only made `transform` fail for any airport that appears solely
# as a destination or solely in the validation split.
known_airports = pd.concat(
    [frame[column] for frame in (df_numeric_train, df_numeric_val) for column in airport_columns]
)
le = preprocessing.LabelEncoder()
trained_le = le.fit(known_airports)

df_numeric_train['ORIGIN_AIRPORT'] = trained_le.transform(df_numeric_train['ORIGIN_AIRPORT'])
df_numeric_val['ORIGIN_AIRPORT'] = trained_le.transform(df_numeric_val['ORIGIN_AIRPORT'])
df_numeric_train['DESTINATION_AIRPORT'] = trained_le.transform(df_numeric_train['DESTINATION_AIRPORT'])
df_numeric_val['DESTINATION_AIRPORT'] = trained_le.transform(df_numeric_val['DESTINATION_AIRPORT'])

In [None]:
# Select the model inputs and the target column for both splits.
target_name = ['ARRIVAL_DELAY']
X_train_df_num = df_numeric_train[feature_names]
X_val_df_num = df_numeric_val[feature_names]
y_train_df_num = y_train_df[target_name]
y_val_df_num = y_val_df[target_name]

# Fill missing values with the column means, estimated on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_num = imputer.fit_transform(X_train_df_num)
X_val_num = imputer.transform(X_val_df_num)

# Standardize the features (zero mean, unit variance), again with statistics
# computed on the training split only.
X_scaler = StandardScaler()
X_train_num = X_scaler.fit_transform(X_train_num)
X_val_num = X_scaler.transform(X_val_num)

# Standardize the targets the same way.
y_scaler = StandardScaler()
y_train_num = y_scaler.fit_transform(y_train_df_num)
y_val_num = y_scaler.transform(y_val_df_num)

# Model

In [None]:
def make_prediction(X, model, scaler):
    """Makes predictions given a preprocessed dataset.

    Parameters
    ----------
    X: numpy array.
        The input data, which already is pre-processed.
    model: an hypopt or sklearn model.
        The trained model used for making predictions.
    scaler: sklearn.preprocessing.StandardScaler.
        The scaler used to normalize the targets.

    Returns
    -------
    y_pred: numpy array.
        The unnormalized predictions, as a 2-D array with one row per sample.
    """
    y_scaled = np.asarray(model.predict(X))
    # Some estimators return a flat (n_samples,) vector while the scaler was fitted on
    # a column and expects 2-D input: turn flat predictions into a column first.
    if y_scaled.ndim == 1:
        y_scaled = y_scaled.reshape(-1, 1)
    y_pred = scaler.inverse_transform(y_scaled)
    return y_pred

## Linear regression

### Hand made

In [None]:
def prepare_data(array):
    """Prepend a constant column of ones (the bias term) to a 2-D feature matrix.

    Gives the same result as a degree-1 polynomial expansion, without relying on
    `PolynomialFeatures`, which this notebook never imports.

    Parameters
    ----------
    array: array-like of shape (n_samples, n_features).
        The features. The input is not modified.

    Returns
    -------
    numpy array of shape (n_samples, n_features + 1).
        Column 0 is all ones, followed by the original features as floats.
    """
    features = np.asarray(array, dtype=float)
    bias = np.ones((features.shape[0], 1), dtype=float)
    return np.hstack([bias, features])

# Use the imputed and standardized arrays built in the data-preparation section
# (X_train / X_val / y_train / y_val are never defined in this notebook).
input_train = prepare_data(X_train_num)
input_val = prepare_data(X_val_num)
output_train = np.array(y_train_num)
output_val = np.array(y_val_num)

In [None]:
class my_linear_regression:
    """Ordinary least-squares linear regression solved with numpy.

    No intercept is added: include a column of ones in X to get one.
    """

    def __init__(self):
        # Training data and fitted weights; filled in by fit().
        self.X_train = []
        self.y_train = []
        self.weights = []

    def fit(self, X, y):
        """Estimate the weights minimising ||X w - y||^2.

        Parameters
        ----------
        X: numpy array of shape (n_samples, n_features).
        y: numpy array of shape (n_samples,) or (n_samples, 1).

        After fitting, `self.weights` has shape (n_features,) for a flat `y` and
        (1, n_features) for a column `y`.
        """
        self.X_train = X
        self.y_train = y
        # lstsq gives the same solution as solving the normal equations when X has
        # full column rank, and still returns a (minimum-norm) solution when columns
        # are collinear, where solve(X.T @ X, ...) fails on a singular matrix.
        solution, *_ = np.linalg.lstsq(X, y, rcond=None)
        self.weights = np.transpose(solution)

    def predict(self, x_test, y_test=None):
        """Return the predictions for `x_test` as a flat array of length n_samples.

        `y_test` is not used; it is accepted only so existing calls keep working.
        """
        self.y_hat = np.sum(x_test * self.weights, axis=1)
        return self.y_hat
# Fit the hand-made regression on the training matrix and predict the validation set.
model_1 = my_linear_regression()
model_1.fit(input_train, output_train)
model_1.predict(input_val, output_val)

# weights has shape (1, n_features + 1): entry 0 is the bias, the rest the features.
fitted_weights = model_1.weights[0]
print("weight[0] : {}, weight[1:] : {}".format(fitted_weights[0], fitted_weights[1:]))

### With Library

In [None]:
from sklearn.linear_model import LinearRegression
# `normalize` is no longer accepted by scikit-learn's linear models, and it is not
# needed here because the features are already standardized.
# The model is trained on the standardized arrays from the data-preparation section.
regr = LinearRegression(
   fit_intercept = True, copy_X = True, n_jobs = 2
).fit(X_train_num, y_train_num)
# R^2 on the training split.
print(regr.score(X_train_num, y_train_num))

In [None]:
# One row per feature weight, with the intercept last under the label 'w[0]'.
coeff_df = pd.DataFrame(
    data=np.concatenate((regr.coef_[0], regr.intercept_)),
    index=feature_names + ['w[0]'],
    columns=['Weights'],
)

In [None]:
# RMSE in minutes: predictions are mapped back to the original scale and compared
# with the unscaled target column only (y_train_df / y_val_df also carry the ID).
y_train_pred = make_prediction(X_train_num, regr, y_scaler)
train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train_df_num.values))
print("Training RMSE: {:.5f}".format(float(train_rmse)))

y_val_pred = make_prediction(X_val_num, regr, y_scaler)
val_rmse = np.sqrt(mean_squared_error(y_val_pred, y_val_df_num.values))
print("Validation RMSE: {:.5f}".format(float(val_rmse)))

## Elastic Net

In [None]:
#Grid Search
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
clf = ElasticNet()
# The duplicated 0.001 in l1_ratio only repeated the same fits; it is listed once.
grid_values = {'alpha': [0.001, 0.01, 0.1, 0., 1.0, 10.0, 100.], 'l1_ratio': [0.001, 0.01, 0.1, 0.3, 0.7, 1]}
# 'recall' is a classification metric and cannot score continuous targets; rank the
# candidates by (negated) mean squared error instead.
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values, scoring = 'neg_mean_squared_error')
grid_clf_acc.fit(X_train_num, y_train_num)

In [None]:
# Print the best estimator found by the grid search; its repr shows the selected hyper-parameters
print(grid_clf_acc.best_estimator_)

In [None]:
from sklearn.linear_model import ElasticNet
# `normalize` is no longer accepted by scikit-learn's linear models, and it is not
# needed here because the features are already standardized.
# NOTE(review): alpha=0 disables the penalty altogether -- confirm this is intended.
ela = ElasticNet(alpha=0, l1_ratio=0, fit_intercept = True, copy_X = True, random_state=0).fit(X_train_num, y_train_num)
# R^2 on the training split.
print(ela.score(X_train_num, y_train_num))
print("Weights:")
print(ela.coef_)
print(ela.intercept_)

In [None]:
# One row per feature weight, with the intercept last under the label 'w[0]'.
coeff_df_ela = pd.DataFrame(
    data=np.concatenate((ela.coef_, ela.intercept_)),
    index=feature_names + ['w[0]'],
    columns=['Weights'],
)
coeff_df_ela

In [None]:
# RMSE in minutes: predictions are mapped back to the original scale and compared
# with the unscaled target column only (y_train_df / y_val_df also carry the ID).
y_train_pred_ela = make_prediction(X_train_num, ela, y_scaler)
train_rmse_ela = np.sqrt(mean_squared_error(y_train_pred_ela, y_train_df_num.values))
print("Training RMSE: {:.5f}".format(float(train_rmse_ela)))

y_val_pred_ela = make_prediction(X_val_num, ela, y_scaler)
val_rmse_ela = np.sqrt(mean_squared_error(y_val_pred_ela, y_val_df_num.values))
print("Validation RMSE: {:.5f}".format(float(val_rmse_ela)))

### Scatter Matrix

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the validation features together with the targets.
to_plot = pd.concat([X_val_df, y_val_df], axis='columns')
scatter_matrix(to_plot, figsize=(30, 18))

From the scatter matrix, only distance and air time appear to be correlated, which is expected: the longer the distance, the longer the flight.



## Trees

In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn import tree
# Work on copies of the standardized training arrays from the data-preparation section.
X_train_copy = X_train_num.copy()
y_train_copy = y_train_num.copy()
clf = tree.DecisionTreeRegressor()
grid_values = {'max_depth': [15,30,50,64],'min_samples_leaf':[1,10,50,100]}
# 'recall' is a classification metric and cannot score continuous targets; rank the
# candidates by (negated) mean squared error instead.
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values, scoring = 'neg_mean_squared_error')
grid_clf_acc.fit(X_train_copy, y_train_copy)
# Print the optimal hyper-parameters
print(grid_clf_acc.best_estimator_)

In [None]:
from sklearn import tree
# Work on copies of the standardized arrays from the data-preparation section.
X_train_copy = X_train_num.copy()
y_train_copy = y_train_num.copy()
X_val_copy = X_val_num.copy()
clf = tree.DecisionTreeRegressor(max_depth=15)
clf = clf.fit(X_train_copy, y_train_copy)
clf.predict(X_val_copy)
# R^2 on the training split, then the depth actually reached by the fitted tree.
print(clf.score(X_train_copy, y_train_copy))
print(clf.tree_.max_depth) #64 with default parameters

In [None]:
# Feature importances of the fitted decision tree, one row per feature.
imp_df_clf = pd.DataFrame(
    data=clf.feature_importances_,
    index=feature_names,
    columns=['Importance'],
)
imp_df_clf

Most important features: distance (0.16), air time (0.089), scheduled arrival (0.39) and departure time (0.29). Other features are <0.03. Remembering the correlation (scatter matrix) between the distance and the air time, we may only keep the distance (and not the air time): that is confirmed by the importance when the max depth is 15 (distance: 0.11 and air time: 0.04).

In [None]:
# RMSE in minutes: predictions are mapped back to the original scale and compared
# with the unscaled target column only (y_train_df / y_val_df also carry the ID).
y_train_pred_clf = make_prediction(X_train_num, clf, y_scaler)
train_rmse_clf = np.sqrt(mean_squared_error(y_train_pred_clf, y_train_df_num.values))
print("Training RMSE: {:.5f}".format(float(train_rmse_clf)))

y_val_pred_clf = make_prediction(X_val_num, clf, y_scaler)
val_rmse_clf = np.sqrt(mean_squared_error(y_val_pred_clf, y_val_df_num.values))
print("Validation RMSE: {:.5f}".format(float(val_rmse_clf)))

With same parameters as baseline

Validation RMSE = 24.33, better than the previous result (38.507).

Validation RMSE=23.57 for a tree of max depth=15

Advantage of tree: transparency

## Random Forest (Extra-Trees)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
X_train_copy2 = X_train_num.copy()
y_train_copy2 = y_train_num.copy()
X_val_copy2 = X_val_num.copy()
# n_estimators = number of trees in the forest. 10 trees ran out of memory;
# runs with 5 and 7 trees were fine.
# random_state is fixed so that the forest, and the scores below, are reproducible.
extra = ExtraTreesRegressor(n_estimators=5, random_state=0)
# The targets are stored as a single column; pass them as a flat vector, which is the
# shape expected for a single-output regression.
extra = extra.fit(X_train_copy2, y_train_copy2.ravel())
extra.predict(X_val_copy2)
# R^2 on the training split.
print(extra.score(X_train_copy2, y_train_copy2))

In [None]:
# Display the list of individual fitted trees that make up the ensemble.
extra.estimators_

In [None]:
# Feature importances of the fitted tree ensemble, one row per feature.
imp_df_extra = pd.DataFrame(
    data=extra.feature_importances_,
    index=feature_names,
    columns=['Importance'],
)
imp_df_extra

In [None]:
# RMSE in minutes on the training split: predictions are mapped back to the original
# scale before being compared with the unscaled targets.
y_train_pred_extra = make_prediction(X_train_copy2, extra, y_scaler)
train_mse_extra = mean_squared_error(y_train_pred_extra, y_train_df_num.values)
train_rmse_extra = np.sqrt(train_mse_extra)
print("Training RMSE: {:.5f}".format(float(train_rmse_extra)))

# Same measure on the validation split.
y_val_pred_extra = make_prediction(X_val_copy2, extra, y_scaler)
val_mse_extra = mean_squared_error(y_val_pred_extra, y_val_df_num.values)
val_rmse_extra = np.sqrt(val_mse_extra)
print("Validation RMSE: {:.5f}".format(float(val_rmse_extra)))

Slightly better validation RMSE (with the same parameters as the baseline):

(with 5 trees in the forest): 19.87

(with 7 trees in the forest): 19.31

(with 5 or 10 trees in the forest + max depth=15): 23.27. A lot quicker!

# Make prediction

In [None]:
# Load the test data (corrected airport codes) and add the one-hot airline columns.
# The feature list returned by encoding() is discarded: the global `feature_names` is used below.
X_test_df = pd.read_csv("/kaggle/input/corrected-dataset-airports/X_test_df.csv", dtype=X_dtype)
X_test_df, _ = encoding(X_test_df)
#X_test_df.AIRLINE = trained_le.transform(X_test_df['AIRLINE'])
# Encode the airports with the label encoder fitted earlier in the notebook.
# NOTE(review): transform() raises on an airport the encoder has not seen -- confirm the test set has none.
X_test_df['ORIGIN_AIRPORT'] = trained_le.transform(X_test_df['ORIGIN_AIRPORT'])
X_test_df['DESTINATION_AIRPORT'] = trained_le.transform(X_test_df['DESTINATION_AIRPORT'])
# Preprocessing data: parse the time columns, then impute and scale with the fitted transformers.
# NOTE(review): every column in `feature_names` must be numeric at this point -- verify that the
# raw string AIRLINE column is not among them.

X_test = preprocess_data(X_test_df, feature_names, imputer, X_scaler)

# Make predictions with the tree ensemble `extra`, mapped back to the original target scale
y_test_pred = make_prediction(X_test, extra, y_scaler)

In [None]:
# Pair every test flight ID with its predicted arrival delay.
submission_df = pd.DataFrame(
    {
        'ID': X_test_df['ID'].to_numpy(),
        'ARRIVAL_DELAY': y_test_pred.squeeze(),
    }
)

# Write the predictions under `/kaggle/working` so the file can be downloaded later.
submission_df.to_csv(
    "/kaggle/working/submission_airline_with_dep_delay_air_time.csv",
    index=False,
)