# Building Machine Learning models for classifiying survival on the titanic 

<h3> Activities </h3>
<ol type="1">
    <li>Data preparation</li>
    <li>Exploratory Data Analysis(Data exploration)</li>
    <li>Data preprocessing 
        <ul>
            <li> Handling outliers </li> 
            <li> Label encoding </li> 
            <li> Data scaling </li> 
        </ul>
    </li>
    <li> Building ML models
        <ul>
            <li> Linear regression </li> 
            <li> Logistic regression </li>
            <li> Decision Tree </li> 
            <li> KNN </li> 
            <li> Random forest </li>    
        </ul>
    </li>
    <li> Model evaluation </li>
    <li> Feature selection and engineering</li>
    <li> Hyper parameter tuning</li>
    <li> Communicating the insights obtained</li>
</ol>

In [1]:
# for data loading, assembly, manipulation, and exploration.
import pandas as pd
import numpy as np

# for statistical models
import statsmodels.api as sm

# for preprocessing 
from sklearn.preprocessing import LabelEncoder

#for model selection and fitting
from sklearn.linear_model import LinearRegression  
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier     #Decision tree     
from sklearn.ensemble import RandomForestClassifier      # random forest model
from sklearn.neighbors import KNeighborsClassifier  # KNN classification model
from sklearn.model_selection import GridSearchCV

from pathlib import Path  # saving files

# ignoring warnings
import warnings
warnings.filterwarnings("ignore")

### Loading and analyzing the dataset

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# transforming sex and embarked from categorical to numerical using laber encoder
# train set
labenc = LabelEncoder()
train_data.Sex = labenc.fit_transform(train_data.Sex)
train_data.Embarked = labenc.fit_transform(train_data.Embarked)

# test set
labenc = LabelEncoder()
test_data.Sex = labenc.fit_transform(test_data.Sex)
test_data.Embarked = labenc.fit_transform(test_data.Embarked)

train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,2
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,2
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,0


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int32  
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    int32  
dtypes: float64(2), int32(2), int64(5), object(3)
memory usage: 76.7+ KB


In [6]:
# fill nan values with average
train_data = round(train_data.fillna(train_data.mean()))
test_data = round(test_data.fillna(test_data.mean()))
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.0,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.0,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,8.0,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.0,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0,,2
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0,B42,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,30.0,1,2,W./C. 6607,23.0,,2
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0,C148,0


In [7]:
# defining independent and dependent variables
x_train = train_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
y_train = train_data['Survived'] 
x_test = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.0,2
1,1,0,38.0,1,0,71.0,0
2,3,0,26.0,0,0,8.0,2
3,1,0,35.0,1,0,53.0,2
4,3,1,35.0,0,0,8.0,2
...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0,2
887,1,0,19.0,0,0,30.0,2
888,3,0,30.0,1,2,23.0,2
889,1,1,26.0,0,0,30.0,0


In [8]:
#forward regression method from https://github.com/AakkashVijayakumar/stepwise-regression/blob/master/stepwise_regression/step_reg.py
# defining dependent and independent variables 
x = x_train    # all the variables minus the last(class) as dependent variables
y = y_train       # the last variable as independent variable

selected_features=[]

# forward selection
def forward_regression(X, y, verbose=False):
                       
    top_variable = []               # keeps the most significant variables
    
    while True:
        changed=False
        least_significant = list(set(X.columns)-set(top_variable))      # the least significant variables
        new_pvalue = pd.Series(index=least_significant)
        for values in least_significant:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[top_variable+[values]]))).fit() 
            new_pvalue[values] = model.pvalues[values]
        best_pvalue = new_pvalue.min()
        if best_pvalue < 0.05:                 # the best pvalue is the pvalue with less than 0.05 threshold
            best_variable = new_pvalue.idxmin()
            top_variable.append(best_variable)
            changed=True
            if True:
                print('Add {:20} with p-value {:.6}'.format(best_variable, best_pvalue))
                selected_features.append(best_variable) #storing the added feature

        if not changed:
            break
    print(model.summary())

forward_regression(x,y,verbose=True)


Add Sex                  with p-value 1.40607e-69
Add Pclass               with p-value 1.00123e-22
Add Age                  with p-value 2.40906e-06
Add SibSp                with p-value 0.000156179
Add Embarked             with p-value 0.022147
                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.393
Method:                 Least Squares   F-statistic:                     97.10
Date:                Tue, 17 Jan 2023   Prob (F-statistic):           1.03e-93
Time:                        17:15:44   Log-Likelihood:                -396.42
No. Observations:                 891   AIC:                             806.8
Df Residuals:                     884   BIC:                             840.4
Df Model:                           6                                         
Covariance Type:            nonrobust     

In [9]:
# adjusting the x_train and x_test to be the selected feaatures
x_train=train_data[selected_features]
y_train= train_data['Survived']

x_test = test_data[selected_features]
x_test

Unnamed: 0,Sex,Pclass,Age,SibSp,Embarked
0,1,3,34.0,0,1
1,0,3,47.0,1,2
2,1,2,62.0,0,1
3,1,3,27.0,0,2
4,0,3,22.0,1,2
...,...,...,...,...,...
413,1,3,30.0,0,2
414,0,1,39.0,0,0
415,1,3,38.0,0,2
416,1,3,30.0,0,2


### Linear regression

In [10]:
# building linear regression model 
# fiting the model
model1 = LinearRegression()        
model1.fit(x_train,y_train)

# making predictions
y_predicted = model1.predict(x_test).round()

# dump results in a dataframe
df1 = pd.DataFrame(y_predicted)


In [11]:
# submission on kaggle for linear regression 
df = test_data[['PassengerId']]
lrresult = pd.concat([df, df1], axis=1)
lrresult.columns =['PassengerId', 'Survived']

# copy results to the folder and save it as a csv
from pathlib import Path  
filepath = Path('lrresult.csv')   
filepath.parent.mkdir(parents=True, exist_ok=True)  
lrresult.to_csv(filepath) 

### Logistic regression

In [12]:
# building logistic regression model
logregression = LogisticRegression()
logregression.fit(x_train,y_train)

# predicting 
y_pred = logregression.predict(x_test)
df_predicted= pd.DataFrame(y_pred)


In [13]:
# submission on kaggle
df = test_data[['PassengerId']]
# predictions using logistic regression second results
logresult = pd.concat([df, df_predicted], axis=1)
logresult.columns =['PassengerId', 'Survived']

# copy results to the folder and save it as a csv
filepath = Path('logresult.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
logresult.to_csv(filepath)

### Decision tree

In [14]:
# Feature Selection
features = ['Pclass','Sex', 'Age','SibSp','Embarked']

# # Building Decision Tree Model

# Create Decision Tree classifer object
dtmodel = DecisionTreeClassifier(criterion = "gini", max_depth=3, random_state=42)

# fit the Tree Classifier
dtmodel = dtmodel.fit(x_train,y_train)

# predict
y_predict = dtmodel.predict(x_test)
df_predict = pd.DataFrame(y_predict)

In [15]:
# submission on kaggle
df = test_data[['PassengerId']]
# predictions using logistic regression second results
dtresult = pd.concat([df, df_predict], axis=1)
dtresult.columns =['PassengerId', 'Survived']

# save results in a csv file
filepath = Path('dtresult.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
dtresult.to_csv(filepath)

### KNN

In [16]:
# building KNN classifier model 
knn = KNeighborsClassifier(n_neighbors=3)
# fit the  model
knn.fit(x_train, y_train)

# prediction
ypredicted = knn.predict(x_test)
df_predicted = pd.DataFrame(ypredicted)


In [17]:
# submission on kaggle
df = test_data[['PassengerId']]
# predictions using logistic regression second results
knnresult = pd.concat([df, df_predicted], axis=1)
knnresult.columns =['PassengerId', 'Survived']

# save results in a csv file  
filepath = Path('knnresult.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
knnresult.to_csv(filepath)

### Random forest

In [18]:
## building the random forest classifier model
rf = RandomForestClassifier(n_estimators=5,criterion='gini',random_state=1)
# train the model 
rf.fit(x_train,y_train) 

# predictions
ypred = rf.predict(x_test)
df_pred = pd.DataFrame(ypred)

In [19]:
# submission on kaggle
df = test_data[['PassengerId']]
# predictions using logistic regression second results
rfresult = pd.concat([df, df_pred], axis=1)
rfresult.columns =['PassengerId', 'Survived']

# save results in a csv file  
filepath = Path('rfresult.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
rfresult.to_csv(filepath)

### Increasing the accuracy using hyper parameter tuning on RF


In [20]:
# defining the search area
search_area={'n_estimators':[10,40,70,90,120,150,200,250],'max_depth':[2,3,5,10,20,30,50]}

# GridSearch 
grid_search=GridSearchCV(estimator=RandomForestClassifier(), param_grid=search_area,cv=10, n_jobs=1,verbose=2,scoring='accuracy')
grid_search=grid_search.fit(x_train,y_train)             

Fitting 10 folds for each of 56 candidates, totalling 560 fits
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=40; total time=   0.0s
[CV] END .......................max_depth=2, n

[CV] END .......................max_depth=3, n_estimators=70; total time=   0.1s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.1s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.0s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.1s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.1s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.2s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.1s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.2s
[CV] END .......................max_depth=3, n_estimators=90; total time=   0.2s
[CV] END .......................max_depth=3, n_estimators=90; total time=   0.3s
[CV] END .......................max_depth=3, n_estimators=90; total time=   0.3s
[CV] END .......................max_depth=3, n_estimators=90; total time=   0.3s
[CV] END ...................

[CV] END ......................max_depth=5, n_estimators=120; total time=   0.2s
[CV] END ......................max_depth=5, n_estimators=120; total time=   0.4s
[CV] END ......................max_depth=5, n_estimators=120; total time=   0.4s
[CV] END ......................max_depth=5, n_estimators=120; total time=   0.3s
[CV] END ......................max_depth=5, n_estimators=120; total time=   0.4s
[CV] END ......................max_depth=5, n_estimators=120; total time=   0.5s
[CV] END ......................max_depth=5, n_estimators=150; total time=   0.4s
[CV] END ......................max_depth=5, n_estimators=150; total time=   0.6s
[CV] END ......................max_depth=5, n_estimators=150; total time=   0.7s
[CV] END ......................max_depth=5, n_estimators=150; total time=   0.5s
[CV] END ......................max_depth=5, n_estimators=150; total time=   0.2s
[CV] END ......................max_depth=5, n_estimators=150; total time=   0.3s
[CV] END ...................

[CV] END .....................max_depth=10, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=10, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=10, n_estimators=200; total time=   0.4s
[CV] END .....................max_depth=10, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=10, n_estimators=250; total time=   0.6s
[CV] END .....................max_depth=10, n_estimators=250; total time=   0.6s
[CV] END .....................max_depth=10, n_estimators=250; total time=   0.6s
[CV] END .....................max_depth=10, n_estimators=250; total time=   0.5s
[CV] END .....................max_depth=10, n_estimators=250; total time=   0.8s
[CV] END .....................max_depth=10, n_estimators=250; total time=   1.1s
[CV] END .....................max_depth=10, n_estimators=250; total time=   1.0s
[CV] END .....................max_depth=10, n_estimators=250; total time=   0.9s
[CV] END ...................

[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.1s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=30, n_estimators=70; total time=   0.1s
[CV] END ......................max_depth=30, n_estimators=70; total time=   0.1s
[CV] END ...................

[CV] END ......................max_depth=50, n_estimators=90; total time=   0.2s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.3s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.3s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.2s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.3s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.3s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.3s
[CV] END ......................max_depth=50, n_estimators=90; total time=   0.3s
[CV] END .....................max_depth=50, n_estimators=120; total time=   0.4s
[CV] END .....................max_depth=50, n_estimators=120; total time=   0.2s
[CV] END .....................max_depth=50, n_estimators=120; total time=   0.5s
[CV] END .....................max_depth=50, n_estimators=120; total time=   0.4s
[CV] END ...................

In [21]:
# the best estimator obtained by gridsearch
best_estimator= grid_search.best_estimator_
best_estimator

RandomForestClassifier(max_depth=5, n_estimators=70)

In [22]:
# Fit the best retrieved parameters into a random forest model
randomf = RandomForestClassifier(n_estimators=70, max_depth=5,random_state=42)
randomf.fit(x_train,y_train)   # fiting the model using training set

# predictions
y_pred = randomf.predict(x_test) 
df_prediction = pd.DataFrame(y_pred)

In [23]:
# submission on kaggle
df = test_data[['PassengerId']]
# predictions using logistic regression second results
rfresult2 = pd.concat([df, df_prediction], axis=1)
rfresult2.columns =['PassengerId', 'Survived']

# save results in a csv file 
filepath = Path('rfresult2.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
rfresult2.to_csv(filepath)