#### Model Training


#### 1.1 Import data and Required Packages
##### Importing pandas, NumPy, Matplotlib, Seaborn, and the modelling libraries (scikit-learn, CatBoost, XGBoost)

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling Import

from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [100]:
# Read the training CSV from the project's data directory into a DataFrame.
df = pd.read_csv("data/train.csv")

In [101]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [102]:
# Remove the free-text / identifier-like columns that are not used as features.
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [103]:
# FamilySize = siblings/spouses + parents/children + 1 (the extra 1 is added
# on top of the two counts).
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

In [104]:
# Create a new column IsAlone (0 = not alone, 1 = alone): a passenger is alone
# exactly when FamilySize equals 1. A vectorised comparison replaces the
# per-row Python lambda; the result is still an int64 column of 0/1 values.
df['IsAlone'] = df['FamilySize'].eq(1).astype('int64')

In [105]:
# Preview the frame again now that FamilySize and IsAlone have been added.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,1,0,3,male,22.0,1,0,7.25,S,2,0
1,2,1,1,female,38.0,1,0,71.2833,C,2,0
2,3,1,3,female,26.0,0,0,7.925,S,1,1
3,4,1,1,female,35.0,1,0,53.1,S,2,0
4,5,0,3,male,35.0,0,0,8.05,S,1,1


In [106]:
# Feature matrix: every column of df except the target column 'Survived'.
X_train = df.drop('Survived', axis=1)

In [107]:
# Target vector: the 'Survived' column of df.
y_train = df.loc[:, 'Survived']

In [108]:
# Display the feature frame and the target series together as a tuple.
(X_train, y_train)

(     PassengerId  Pclass     Sex   Age  ...     Fare  Embarked  FamilySize IsAlone
 0              1       3    male  22.0  ...   7.2500         S           2       0
 1              2       1  female  38.0  ...  71.2833         C           2       0
 2              3       3  female  26.0  ...   7.9250         S           1       1
 3              4       1  female  35.0  ...  53.1000         S           2       0
 4              5       3    male  35.0  ...   8.0500         S           1       1
 ..           ...     ...     ...   ...  ...      ...       ...         ...     ...
 886          887       2    male  27.0  ...  13.0000         S           1       1
 887          888       1  female  19.0  ...  30.0000         S           1       1
 888          889       3  female   NaN  ...  23.4500         S           4       0
 889          890       1    male  26.0  ...  30.0000         C           1       1
 890          891       3    male  32.0  ...   7.7500         Q           1 

In [109]:
# Print the distinct values of each low-cardinality column. The second tuple
# element is the `end` separator used after the label, kept per column so the
# emitted text is unchanged.
for column_name, label_end in (
    ("Pclass", " "),
    ("Sex", " "),
    ("Embarked", " "),
    ("IsAlone", ""),
):
    print(f"Categories in '{column_name}' column: ", end=label_end)
    print(df[column_name].unique())

Categories in 'Pclass' column:  [3 1 2]
Categories in 'Sex' column:  ['male' 'female']
Categories in 'Embarked' column:  ['S' 'C' 'Q' nan]
Categories in 'IsAlone' column: [0 1]


In [110]:
# Split the feature columns by dtype: object columns are handled as
# categorical, all remaining columns as numeric.
num_features = X_train.select_dtypes(exclude="object").columns
cat_features = X_train.select_dtypes(include="object").columns

# Fill missing values in place: column mean for the numeric group, most
# frequent value for the categorical group. Each imputer is fitted on X_train.
for feature_group, fill_strategy in (
    (num_features, 'mean'),
    (cat_features, 'most_frequent'),
):
    X_train[feature_group] = SimpleImputer(strategy=fill_strategy).fit_transform(
        X_train[feature_group]
    )


In [112]:
# Count the remaining missing values per column after imputation.
X_train.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
FamilySize     0
IsAlone        0
dtype: int64

In [113]:

# One-hot encode the categorical columns and standardise the numeric columns.
categorical_transformer = OneHotEncoder()
numerical_transformer = StandardScaler()

# The encoded categorical columns come first in the output, followed by the
# scaled numeric columns (order of the transformer list).
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", categorical_transformer, cat_features),
        ("StandardScaler", numerical_transformer, num_features),
    ]
)

In [114]:
# Fit the preprocessor on the training features and replace X_train with the
# transformed output (X_train is no longer a DataFrame after this cell).
X_train = preprocessor.fit_transform(X=X_train)

In [115]:
# Show (rows, columns) of the transformed training matrix.
np.shape(X_train)

(891, 13)

In [116]:
# Read the test CSV from the project's data directory into a DataFrame.
test_df = pd.read_csv("data/test.csv")

In [117]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [118]:
# Remove the same three columns that were dropped from the training frame.
test_df = test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [119]:
# NOTE(review): X_test / y_test are built from `df` (the frame read from
# data/train.csv), not from `test_df`. The "test" evaluation below therefore
# scores the models on the same rows they were fitted on, which is why the
# recorded test metrics are identical to the training metrics.
# The test.csv preview shows no 'Survived' column, and FamilySize / IsAlone are
# never added to `test_df`, so swapping in `test_df` here is not sufficient.
# Suggested fix: hold out a labelled validation split from train.csv before
# any imputer / scaler / model is fitted, and evaluate on that split.
X_test = df.drop(columns=['Survived'])
y_test = df['Survived']

In [120]:
# Recompute the numeric / categorical column groups from X_test (this rebinds
# the num_features and cat_features names defined for the training frame).
num_features = X_test.select_dtypes(exclude="object").columns
cat_features = X_test.select_dtypes(include="object").columns

# Fill missing values in place: column mean for numeric columns, most frequent
# value for categorical columns.
# NOTE(review): these imputers are fitted on X_test itself rather than reusing
# statistics learned from the training features.
for feature_group, fill_strategy in (
    (num_features, 'mean'),
    (cat_features, 'most_frequent'),
):
    X_test[feature_group] = SimpleImputer(strategy=fill_strategy).fit_transform(
        X_test[feature_group]
    )

In [122]:


# Build a second, unfitted preprocessor with the same layout as the training
# one; this rebinds `preprocessor` and both transformer names.
categorical_transformer = OneHotEncoder()
numerical_transformer = StandardScaler()
# NOTE(review): std_scaler is not referenced by any cell visible here.
std_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", categorical_transformer, cat_features),
        ("StandardScaler", numerical_transformer, num_features),
    ]
)

In [123]:
# NOTE(review): `preprocessor` was re-created (unfitted) in the previous cell,
# so this call fits the encoder and scaler on X_test instead of applying the
# ones fitted on X_train. Evaluation data should normally go through
# `.transform()` of the training-fitted preprocessor so both matrices share
# the same category layout and scaling statistics.
X_test = preprocessor.fit_transform(X_test)

##### Create an evaluation function to compute all metrics after model training

In [129]:
def evaluate_model(true, predicted):
    """Compute error metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error and
        R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive RMSE from it; the original evaluated
    # mean_squared_error twice and never used the first result.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_scr = r2_score(true, predicted)

    return mae, rmse, r2_scr

In [131]:
# Candidate estimators, keyed by the display name printed in the report.
# NOTE(review): all of these are regressors, while the 'Survived' values shown
# in the data preview are 0/1. Confirm that regression metrics are intended.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "XGBRegressor": XGBRegressor(),
    "K-Neighbours": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "CatBoostRegressor": CatBoostRegressor(verbose=False),
    "RandomForestRegressor": RandomForestRegressor(),
}

# Parallel lists: model_list[k] is the name whose test R2 is r2_list[k].
model_list = []
r2_list = []

# Iterate name/estimator pairs directly instead of indexing into freshly built
# key and value lists on every access.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions on both matrices so train and test metrics can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Performance on Training Data')
    print("- Root mean squared error : {:.4f}".format(model_train_rmse))
    print("- Mean absolute error : {:.4f}".format(model_train_mae))
    print("- R2 Score : {:.4f}".format(model_train_r2))

    print('-----------------------------')

    print("Model Performance on Test Data")
    print("- Root mean squared error: {:.4f}".format(model_test_rmse))
    print("- Mean absolute error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    # Record the test R2 next to the model name; r2_list was previously
    # initialised but never filled, leaving it empty after the loop.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')





LinearRegression
Model Performance on Training Data
- Root mean squared error : 0.3759
- Mean absolute error : 0.2926
- R2 Score : 0.4026
-----------------------------
Model Performance on Test Data
- Root mean squared error: 0.3759
- Mean absolute error: 0.2926
- R2 Score: 0.4026


Lasso
Model Performance on Training Data
- Root mean squared error : 0.4863
- Mean absolute error : 0.4730
- R2 Score : 0.0000
-----------------------------
Model Performance on Test Data
- Root mean squared error: 0.4863
- Mean absolute error: 0.4730
- R2 Score: 0.0000


Ridge
Model Performance on Training Data
- Root mean squared error : 0.3758
- Mean absolute error : 0.2909
- R2 Score : 0.4027
-----------------------------
Model Performance on Test Data
- Root mean squared error: 0.3758
- Mean absolute error: 0.2909
- R2 Score: 0.4027


AdaBoostRegressor
Model Performance on Training Data
- Root mean squared error : 0.3539
- Mean absolute error : 0.2866
- R2 Score : 0.4705
-----------------------------
M