In [None]:
# Import the libraries needed for the analysis
import sys
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

import warnings

# Seed constant for the notebook; passed as random_state to train_test_split.
RSEED = 10

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
import os
import sys

# Add the parent of the current working directory to the import path
# (only once) so that the helper module `project_fun` can be imported from it.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

import project_fun as prj

# Alternative way of doing the same: sys.path.insert(0, '..') followed by the
# same `import project_fun as prj`.

In [None]:
# Load the cleaned training data; the file is semicolon-separated.
df = pd.read_csv('../data/Train_cleaned.csv', sep=';')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(5)

In [None]:
# Integer-encode the three categorical columns, one LabelEncoder per column.
# The fitted encoders stay available as le1/le2/le3 (e.g. for inverse_transform):
#   le1 -> departure country, le2 -> arrival country, le3 -> flight id
le1, le2, le3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

for column, encoder in (('country_dep', le1), ('country_arr', le2), ('FLTID', le3)):
    # fit_transform learns the classes and replaces the column with their integer codes
    df[column] = encoder.fit_transform(df[column])

In [None]:
# List the column names currently in df (after the label encoding above).
df.columns

In [None]:
# Remove columns that are not part of the feature set selected for modelling.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises a KeyError for
# columns that were already dropped.
df = df.drop(
    columns=['DATOP', 'STA', 'STD', 'season', 'outcome',
             'lat_arr', 'lon_arr', 'lat_dep', 'lon_dep'],
    errors='ignore',
)

In [None]:
# Show five random rows; seeded with RSEED so the displayed sample is the same on every run.
df.sample(5, random_state=RSEED)

In [None]:
# Target to predict
y = df['target']

# Columns used as model inputs
feature_columns = [
    'FLTID', 'hour_STA', 'hour_STD', 'month', 'year',
    'scheduled_time_duration', 'country_dep', 'season_num',
    'elevation_dep', 'day_of_week',
]
X = df[feature_columns]

# 80/20 train/test split, seeded with RSEED for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8,
    random_state=RSEED,
)

## XGBoost regression

In [None]:
# Baseline: XGBoost regressor with default hyper-parameters on the unscaled
# features. fit() returns the estimator, so creation and fitting are chained.
xgb = XGBRegressor().fit(X_train, y_train)

# Predictions for both splits, used by the metric cells below
y_train_pred_xgb, y_test_pred_xgb = xgb.predict(X_train), xgb.predict(X_test)

In [None]:
# R^2 and RMSE of the baseline XGBoost model on the test and training splits.
# RMSE is computed as sqrt(MSE) so the cell does not depend on the `squared=`
# keyword of mean_squared_error, and the built-in round() is used because it
# works for both plain Python floats and NumPy scalars.
print('Test_evaluation_metrics')
print("R-squared_xgb:", round(r2_score(y_test, y_test_pred_xgb), 3))
print("RMSE_xgb:", round(np.sqrt(mean_squared_error(y_test, y_test_pred_xgb)), 3))
print('-----------------')
print('Train_evaluation_metrics')
print("R-squared_xgb:", round(r2_score(y_train, y_train_pred_xgb), 3))
print("RMSE_xgb:", round(np.sqrt(mean_squared_error(y_train, y_train_pred_xgb)), 3))

## XGBoost with scaling

### standard scaler

In [None]:
# Work on copies so the unscaled splits stay untouched for the other experiments.
X_train_sc, X_test_sc = X_train.copy(), X_test.copy()

In [None]:
# Columns that get standardised
numerical = ['FLTID', 'year', 'scheduled_time_duration', 'elevation_dep']

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train_sc[numerical])
X_train_sc[numerical] = scaler.transform(X_train_sc[numerical])
X_test_sc[numerical] = scaler.transform(X_test_sc[numerical])

In [None]:
# Inspect the standardised training features; only the first rows are shown
# instead of dumping the whole frame.
X_train_sc.head()

In [None]:
# XGBoost regressor trained on the StandardScaler-transformed features.
xgb_sc = XGBRegressor()

# Fit on the scaled training split (X_train_sc), not the raw X_train;
# otherwise this experiment just repeats the unscaled baseline.
xgb_sc.fit(X_train_sc, y_train)

# Predict on the scaled splits so the inputs match what the model was trained on.
y_train_pred_xgb_sc = xgb_sc.predict(X_train_sc)
y_test_pred_xgb_sc = xgb_sc.predict(X_test_sc)

In [None]:
# R^2 and RMSE of the XGBoost model associated with the StandardScaler experiment.
# RMSE is computed as sqrt(MSE) so the cell does not depend on the `squared=`
# keyword of mean_squared_error, and the built-in round() is used because it
# works for both plain Python floats and NumPy scalars.
print('Test_evaluation_metrics_standardscaler')
print("R-squared_xgb_sc:", round(r2_score(y_test, y_test_pred_xgb_sc), 3))
print("RMSE_xgb_sc:", round(np.sqrt(mean_squared_error(y_test, y_test_pred_xgb_sc)), 3))
print('-----------------')
print('Train_evaluation_metrics')
print("R-squared_xgb_sc:", round(r2_score(y_train, y_train_pred_xgb_sc), 3))
print("RMSE_xgb_sc:", round(np.sqrt(mean_squared_error(y_train, y_train_pred_xgb_sc)), 3))

### MinMax scaler

In [None]:
# Independent copies of the splits for the min-max scaling experiment.
X_train_mm, X_test_mm = X_train.copy(), X_test.copy()

In [None]:
# Columns that get rescaled
numerical = ['FLTID', 'year', 'scheduled_time_duration', 'elevation_dep']

# Learn the per-column min/max on the training split only, then apply the
# same transformation to both splits.
scaler_mm = MinMaxScaler().fit(X_train_mm[numerical])
X_train_mm[numerical] = scaler_mm.transform(X_train_mm[numerical])
X_test_mm[numerical] = scaler_mm.transform(X_test_mm[numerical])

In [None]:
# XGBoost regressor trained on the MinMaxScaler-transformed features.
xgb_mm = XGBRegressor()

# Fit on the min-max scaled training split (X_train_mm), not the raw X_train.
xgb_mm.fit(X_train_mm, y_train)

# Predict with xgb_mm itself (not the standard-scaler model xgb_sc) on the
# min-max scaled splits, so the metrics below describe this experiment.
y_train_pred_xgb_mm = xgb_mm.predict(X_train_mm)
y_test_pred_xgb_mm = xgb_mm.predict(X_test_mm)

In [None]:
# R^2 and RMSE of the XGBoost model associated with the MinMaxScaler experiment.
# Labels carry the `_mm` suffix so the output is not confused with the
# StandardScaler results. RMSE is computed as sqrt(MSE) so the cell does not
# depend on the `squared=` keyword of mean_squared_error, and the built-in
# round() works for both plain Python floats and NumPy scalars.
print('Test_evaluation_metrics_MinMaxscaler')
print("R-squared_xgb_mm:", round(r2_score(y_test, y_test_pred_xgb_mm), 3))
print("RMSE_xgb_mm:", round(np.sqrt(mean_squared_error(y_test, y_test_pred_xgb_mm)), 3))
print('-----------------')
print('Train_evaluation_metrics')
print("R-squared_xgb_mm:", round(r2_score(y_train, y_train_pred_xgb_mm), 3))
print("RMSE_xgb_mm:", round(np.sqrt(mean_squared_error(y_train, y_train_pred_xgb_mm)), 3))

In [None]:
# Pass the train/test targets and the predictions of the unscaled XGBoost
# model (xgb) to the project helper.
# NOTE(review): calculate_metrics lives in project_fun; presumably it reports
# train and test metrics — confirm its output there.
prj.calculate_metrics(y_train, y_train_pred_xgb, y_test, y_test_pred_xgb)

In [None]:
# Run the project's error analysis on the test-set predictions of the
# unscaled XGBoost model.
# NOTE(review): error_analysis is defined in project_fun; presumably it
# produces residual diagnostics — confirm there.
prj.error_analysis(y_test,y_test_pred_xgb)

# Random Forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Shallow random forest (max_depth=2) on the unscaled features.
# Seeded with the notebook-wide RSEED rather than a separate magic number so
# all stochastic steps share one reproducibility setting.
rf_reg = RandomForestRegressor(max_depth=2, random_state=RSEED)
rf_reg.fit(X_train, y_train)

# Predictions for both splits, consumed by the metrics cell below
y_train_pred_rf = rf_reg.predict(X_train)
y_test_pred_rf = rf_reg.predict(X_test)

In [None]:
# Pass the train/test targets and the random-forest predictions to the project helper.
# NOTE(review): calculate_metrics lives in project_fun; presumably it reports
# train and test metrics — confirm its output there.
prj.calculate_metrics(y_train, y_train_pred_rf, y_test, y_test_pred_rf)