In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [99]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

# Checking for Null Values

In [100]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [101]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Encoding categorical variables

In [102]:
train_df['Sex']=train_df['Sex'].replace({'male':0,'female':1})
train_df['Embarked']=train_df['Embarked'].replace({'C':0,'S':1,'Q':2})

In [103]:
test_df['Sex']=test_df['Sex'].replace({'male':0,'female':1})
test_df['Embarked']=test_df['Embarked'].replace({'C':0,'S':1,'Q':2})

# String Slicing 

In [104]:
train_df['Cabin']=train_df['Cabin'].str.get(0)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1.0


In [105]:
test_df['Cabin']=test_df['Cabin'].str.get(0)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1


In [106]:
train_df['Cabin'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

In [107]:
test_df['Cabin'].value_counts()

C    35
B    18
D    13
E     9
F     8
A     7
G     1
Name: Cabin, dtype: int64

# Encoding Cabin Feature

In [108]:
train_df['Cabin']=train_df['Cabin'].replace({'A':0,'B':1,'C':2,'D':3,'E':4,'F':5,'G':6,'T':7})
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,2.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,2.0,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,,1.0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,1.0,1.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.4500,,1.0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,2.0,0.0


In [109]:
test_df['Cabin']=test_df['Cabin'].replace({'A':0,'B':1,'C':2,'D':3,'E':4,'F':5,'G':6})
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1


# Using KNN Imputer to Impute missing values

In [110]:
x=train_df.drop(['Name','Ticket'],axis=1)

In [111]:
from sklearn.impute import KNNImputer

In [112]:
knn_impute= KNNImputer(n_neighbors=8)

In [113]:
knn_array=knn_impute.fit_transform(x)

In [114]:
imputed_df=pd.DataFrame(knn_array,columns=['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked'])
imputed_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1.0,0.0,3.0,0.0,22.00,1.0,0.0,7.2500,2.625,1.0
1,2.0,1.0,1.0,1.0,38.00,1.0,0.0,71.2833,2.000,0.0
2,3.0,1.0,3.0,1.0,26.00,0.0,0.0,7.9250,3.000,1.0
3,4.0,1.0,1.0,1.0,35.00,1.0,0.0,53.1000,2.000,1.0
4,5.0,0.0,3.0,0.0,35.00,0.0,0.0,8.0500,3.000,1.0
...,...,...,...,...,...,...,...,...,...,...
886,887.0,0.0,2.0,0.0,27.00,0.0,0.0,13.0000,2.125,1.0
887,888.0,1.0,1.0,1.0,19.00,0.0,0.0,30.0000,1.000,1.0
888,889.0,0.0,3.0,1.0,26.75,1.0,2.0,23.4500,2.125,1.0
889,890.0,1.0,1.0,0.0,26.00,0.0,0.0,30.0000,2.000,0.0


In [115]:
imputed_df['Age']=imputed_df['Age'].round(0)
imputed_df['Cabin']=imputed_df['Cabin'].round(0)
imputed_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1.0,0.0,3.0,0.0,22.0,1.0,0.0,7.2500,3.0,1.0
1,2.0,1.0,1.0,1.0,38.0,1.0,0.0,71.2833,2.0,0.0
2,3.0,1.0,3.0,1.0,26.0,0.0,0.0,7.9250,3.0,1.0
3,4.0,1.0,1.0,1.0,35.0,1.0,0.0,53.1000,2.0,1.0
4,5.0,0.0,3.0,0.0,35.0,0.0,0.0,8.0500,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,887.0,0.0,2.0,0.0,27.0,0.0,0.0,13.0000,2.0,1.0
887,888.0,1.0,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,1.0
888,889.0,0.0,3.0,1.0,27.0,1.0,2.0,23.4500,2.0,1.0
889,890.0,1.0,1.0,0.0,26.0,0.0,0.0,30.0000,2.0,0.0


# imputing test dataset

In [116]:
x_t=test_df.drop(['Name','Ticket'],axis=1)
knn_impute_t= KNNImputer(n_neighbors=8)
knn_array_t=knn_impute_t.fit_transform(x_t)
imputed_test_df=pd.DataFrame(knn_array_t,columns=['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked'])
imputed_test_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892.0,3.0,0.0,34.5000,0.0,0.0,7.8292,2.250,2.0
1,893.0,3.0,1.0,47.0000,1.0,0.0,7.0000,2.250,1.0
2,894.0,2.0,0.0,62.0000,0.0,0.0,9.6875,2.250,2.0
3,895.0,3.0,0.0,27.0000,0.0,0.0,8.6625,2.250,1.0
4,896.0,3.0,1.0,22.0000,1.0,1.0,12.2875,2.250,1.0
...,...,...,...,...,...,...,...,...,...
413,1305.0,3.0,0.0,23.6875,0.0,0.0,8.0500,2.750,1.0
414,1306.0,1.0,1.0,39.0000,0.0,0.0,108.9000,2.000,0.0
415,1307.0,3.0,0.0,38.5000,0.0,0.0,7.2500,2.750,1.0
416,1308.0,3.0,0.0,25.4375,0.0,0.0,8.0500,2.750,1.0


In [117]:
imputed_test_df['Age']=imputed_test_df['Age'].round(0)
imputed_test_df['Cabin']=imputed_test_df['Cabin'].round(0)
imputed_test_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892.0,3.0,0.0,34.0,0.0,0.0,7.8292,2.0,2.0
1,893.0,3.0,1.0,47.0,1.0,0.0,7.0000,2.0,1.0
2,894.0,2.0,0.0,62.0,0.0,0.0,9.6875,2.0,2.0
3,895.0,3.0,0.0,27.0,0.0,0.0,8.6625,2.0,1.0
4,896.0,3.0,1.0,22.0,1.0,1.0,12.2875,2.0,1.0
...,...,...,...,...,...,...,...,...,...
413,1305.0,3.0,0.0,24.0,0.0,0.0,8.0500,3.0,1.0
414,1306.0,1.0,1.0,39.0,0.0,0.0,108.9000,2.0,0.0
415,1307.0,3.0,0.0,38.0,0.0,0.0,7.2500,3.0,1.0
416,1308.0,3.0,0.0,25.0,0.0,0.0,8.0500,3.0,1.0


# Preprocessing

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
X=imputed_df.drop('Survived',axis=1)
y=imputed_df['Survived']

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [121]:
from sklearn.preprocessing import StandardScaler

In [122]:
scaler=StandardScaler()

In [123]:
scaled_X_train= scaler.fit_transform(X_train)

In [124]:
scaled_X_test= scaler.transform(X_test)

# Model Building

In [125]:
from sklearn.ensemble import RandomForestClassifier

In [126]:
RFC=RandomForestClassifier()

In [127]:
RFC.fit(scaled_X_train,y_train)

RandomForestClassifier()

In [128]:
from sklearn.metrics import accuracy_score
y_pred=RFC.predict(scaled_X_test)
accuracy_score(y_test,y_pred)

0.8475336322869955

In [129]:
#Scaling test_df 

In [130]:
test_X=imputed_test_df

In [131]:
scaled_X_test= scaler.transform(test_X)

In [132]:
scaled_X_test

array([[ 1.72409448,  0.80934914, -0.72224656, ..., -0.47221994,
        -0.74068117,  2.15998237],
       [ 1.72797278,  0.80934914,  1.38456873, ..., -0.4883005 ,
        -0.74068117,  0.17733967],
       [ 1.73185107, -0.40558395, -0.72224656, ..., -0.43618219,
        -0.74068117,  2.15998237],
       ...,
       [ 3.33358761,  0.80934914, -0.72224656, ..., -0.48345228,
         0.25284297,  0.17733967],
       [ 3.33746591,  0.80934914, -0.72224656, ..., -0.46793799,
         0.25284297,  0.17733967],
       [ 3.34134421,  0.80934914, -0.72224656, ..., -0.19045915,
        -0.74068117, -1.80530303]])

In [133]:
y_pred_final=RFC.predict(scaled_X_test)
y_pred_final

array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [134]:
my_predictions_final=pd.DataFrame(y_pred_final,columns=['Survived'])
final_pred=my_predictions_final.join(test_df['PassengerId'])
final_pred=final_pred[['PassengerId','Survived']]
final_pred['Survived']=final_pred['Survived'].astype('int64')
final_pred

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


# Final Prediction for your Submission

In [135]:
final_pred.to_csv('My_Titanic_Prediction.csv',index=False)

In [136]:
#Got a Kaggle Score of 0.79186 for my Final Prediction.