In [None]:
import pandas as pd
import numpy as np
import category_encoders as ec

from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error,r2_score

from sklearn.preprocessing import RobustScaler, OneHotEncoder, TargetEncoder, StandardScaler, MinMaxScaler,PolynomialFeatures
from sklearn.compose import ColumnTransformer
#from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict
#from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor


from sklearn.model_selection import  StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load data/Train.csv; a later cell copies its STATUS column into the modelling frame.
train_df = pd.read_csv(filepath_or_buffer='data/Train.csv')

In [None]:
# Load Train_modified.csv, parsing DATOP, STA and STD as datetimes while reading.
data_train = pd.read_csv(
    'Train_modified.csv',
    parse_dates=['DATOP', 'STA', 'STD'],
)
data_train.head()

In [None]:
# Work on an independent deep copy so `data_train` stays exactly as loaded.
df = data_train.copy(deep=True)

In [None]:
# New `fl_op` column: the first two characters of the flight id
# (presumably the operator prefix — confirm against the data).
df['fl_op'] = df['FLTID'].str.slice(stop=2)

In [None]:
# Bring STATUS over from the Train.csv frame. The assignment aligns on the index,
# so this assumes both files list the same rows in the same order — TODO confirm.
df['STATUS'] = train_df['STATUS']

In [None]:
# Remove the identifier, raw timestamp and helper columns before modelling.
# Rebinding `df` instead of using inplace=True avoids mutating the frame in place
# and keeps the step chainable.
df = df.drop(
    columns=[
        'ID', 'DATOP', 'STD', 'STA',
        'flight_duration_bin', 'flight_length_cat',
        'ATA', 'SCH', 'FLTID', 'AC',
    ]
)

In [None]:
# Preview the first rows of the frame to check the columns that remain.
df.head()

In [None]:
# Cast every label-like column to the pandas categorical dtype in a single pass;
# the feature lists built in later cells are derived from this dtype.
df = df.astype(
    dict.fromkeys(
        [
            'STATUS', 'month_DATOP', 'year_DATOP', 'weekdays_DATOP', 'fl_op',
            'ac_type', 'flight_duration_category', 'time_of_day_cat',
            'ARRSTN', 'DEPSTN',
        ],
        'category',
    )
)

In [None]:
# Names of all columns with the categorical dtype, as a plain Python list.
cat_features = list(df.select_dtypes(include='category').columns)
cat_features

In [None]:
# Everything that is neither categorical nor datetime, minus the regression target,
# is treated as a numeric input feature.
num_feature = (
    df.select_dtypes(exclude=['category', 'datetime64'])
    .drop(columns='target')
    .columns
    .to_list()
)
num_feature

In [None]:
# Report how many distinct values each categorical column holds.
for name, n_unique in df[cat_features].nunique().items():
    print(f'{name} has unique values ----- {n_unique}')

In [None]:
# Separate the feature matrix from the regression target.
X = df.drop(columns=['target'])
y = df['target']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
train_X, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Sanity-check the shapes of both partitions (features first, then target).
print('Training Set :', train_X.shape, train_y.shape)
# The second line reports the hold-out partition, so it is labelled as the test set.
print('Test Set :', test_x.shape, test_y.shape)

In [None]:
# Numeric branch: standardise each column, then expand into polynomial terms up to degree 5.
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=5)),
])

# Categorical branch: leave-one-out target encoding.
# Alternative kept for reference: OneHotEncoder(drop="first", handle_unknown="ignore").
cat_pipeline = Pipeline(steps=[
    ('target_encode', ec.LeaveOneOutEncoder()),
])

In [None]:
# Route the numeric and categorical column lists through their own pipelines.
# Columns in neither list are dropped (ColumnTransformer's default `remainder`).
# This one instance is handed to every model pipeline defined later in the notebook.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_feature),
    ('cat', cat_pipeline, cat_features),
])

In [None]:
# Baseline model: shared preprocessing followed by ordinary least squares.
pipe_linreg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linreg', LinearRegression()),
])

In [None]:
# Decision-tree model on top of the shared preprocessing.
# The tree is seeded (same value as the train/test split) so that repeated runs,
# and the cloned fits inside any later hyper-parameter search, give identical results.
pipe_dt = Pipeline([
    ('preprocessor', preprocessor),
    ('dt_model', DecisionTreeRegressor(random_state=42)),
])

In [None]:
# Fit the preprocessing and the linear model on the training partition.
pipe_linreg.fit(X=train_X, y=train_y)

In [None]:
# Hold-out RMSE of the linear baseline, rounded to two decimals.
y_predict = pipe_linreg.predict(X=test_x)
print('RMSE : ', round(root_mean_squared_error(y_true=test_y, y_pred=y_predict), 2))

In [None]:
# Fit the preprocessing and the decision tree on the training partition.
pipe_dt.fit(X=train_X, y=train_y)

In [None]:
# Hold-out RMSE of the untuned decision tree, rounded to two decimals.
y_predict_dt = pipe_dt.predict(X=test_x)
print('RMSE : ', round(root_mean_squared_error(y_true=test_y, y_pred=y_predict_dt), 2))

In [None]:
# Random-forest model on top of the shared preprocessing.
# Bootstrapping and feature sub-sampling are random, so the forest is seeded
# (same value as the train/test split) to make the reported scores reproducible.
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('rand_forest', RandomForestRegressor(random_state=42)),
])

In [None]:
# Fit the preprocessing and the random forest on the training partition.
pipe_rf.fit(X=train_X, y=train_y)

In [None]:
# Hold-out RMSE of the untuned random forest, rounded to two decimals.
y_predict_rf = pipe_rf.predict(X=test_x)
print('RMSE : ', round(root_mean_squared_error(y_true=test_y, y_pred=y_predict_rf), 2))

In [None]:
# Out-of-fold predictions for the training rows (5 folds), scored against the true targets.
y_train_predict = cross_val_predict(estimator=pipe_rf, X=train_X, y=train_y, cv=5)
print('RMSE : ', round(root_mean_squared_error(y_true=train_y, y_pred=y_train_predict), 2))

In [None]:
# Search space for the decision-tree step of `pipe_dt`.
# NOTE(review): a tree with at most 8 leaves has at most 7 splits, so its depth
# cannot exceed 7 — the max_depth candidates below (all >= 10) never take effect
# while max_leaf_nodes is limited to 4-8. Consider widening max_leaf_nodes or
# dropping one of the two constraints.
# NOTE(review): integer max_features are absolute column counts measured after
# preprocessing — confirm they suit the transformed feature count.
grid_param_dt = {
    'dt_model__criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
    'dt_model__max_depth': [10, 20, 30, 40, 50],
    'dt_model__max_features': [11, 20, 30, 40],
    'dt_model__max_leaf_nodes': [4, 6, 8],
}

# 20 random candidates, 5-fold CV, scored by negative RMSE (higher is better).
# random_state fixes which candidates are sampled so the search is reproducible.
rand_dt = RandomizedSearchCV(
    pipe_dt,
    grid_param_dt,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    cv=5,
    n_iter=20,
    random_state=42,
)
rand_dt.fit(train_X, train_y)
rand_dt.best_params_ , rand_dt.best_score_

In [None]:
# Hold-out RMSE of the best decision-tree pipeline found by the search.
# Note: this rebinds `y_predict`, which previously held the linear model's predictions.
y_predict = rand_dt.predict(X=test_x)
root_mean_squared_error(y_true=test_y, y_pred=y_predict)

In [None]:
# Search space for the random-forest step of `pipe_rf`.
# NOTE(review): the 'poisson' criterion requires non-negative targets — confirm
# that `target` never goes below zero.
grid_param_rf = {
    'rand_forest__n_estimators': [50, 100, 150, 200],
    'rand_forest__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'rand_forest__max_depth': [10, 20, 30, 40, 50],
    'rand_forest__max_features': ['sqrt', 'log2', None],
}

# Default number of random candidates (n_iter=10), 5-fold CV, scored by negative RMSE.
# random_state fixes which candidates are sampled so the search is reproducible.
rand_fr = RandomizedSearchCV(
    pipe_rf,
    grid_param_rf,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    cv=5,
    random_state=42,
)
rand_fr.fit(train_X, train_y)
rand_fr.best_params_

In [None]:
# Mean cross-validated score of the best candidate; the search was scored with
# 'neg_root_mean_squared_error', so this is a negative RMSE (closer to 0 is better).
rand_fr.best_score_

In [None]:
# Hold-out RMSE of the best random-forest pipeline found by the search.
y_predict_fr = rand_fr.predict(X=test_x)
root_mean_squared_error(y_true=test_y, y_pred=y_predict_fr)