In [49]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Preparing training data

In [50]:
df_train = pd.read_csv(r'G:\Coding ninjas\Kaggle projects & submissions\Titanic\train.csv')
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [51]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [52]:
df_train['Sex'].replace({'female':0, 'male':1}, inplace = True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [53]:
df_train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [54]:
df_train['Embarked'].replace((np.nan, df_train['Embarked'].mode().values[0]), inplace = True)
df_train['Embarked'].isnull().sum()

0

In [55]:
df_train_em = pd.get_dummies(df_train['Embarked'])
df_train.drop('Embarked', axis = 1, inplace = True)
df_train2 = pd.concat([df_train_em, df_train], axis = 1)
df_train2.head()

Unnamed: 0,C,Q,S,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
0,0,0,1,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,
1,1,0,0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85
2,1,0,0,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,
3,1,0,0,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123
4,1,0,0,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,


In [56]:
df_train2['Age'].replace((np.nan, df_train2['Age'].median()), inplace = True)
df_train2['Age'].isnull().sum()

0

In [57]:
df_train2.loc[df_train2['Cabin'].isnull(), 'Cabin'] = 0
df_train2.loc[df_train2['Cabin'] != 0, 'Cabin'] = 1
df_train2['Cabin'].isnull().sum()

0

In [58]:
df_train2.isnull().sum()

C              0
Q              0
S              0
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
dtype: int64

In [59]:
df_train2.drop(['PassengerId', 'Name', 'Ticket'], axis =1, inplace=True)
df_train2.head()

Unnamed: 0,C,Q,S,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,0,0,1,0,3,1,22.0,1,0,7.25,0
1,1,0,0,1,1,0,38.0,1,0,71.2833,1
2,1,0,0,1,3,0,26.0,0,0,7.925,0
3,1,0,0,1,1,0,35.0,1,0,53.1,1
4,1,0,0,0,3,1,35.0,0,0,8.05,0


In [60]:
Xtrain = df_train2.drop('Survived', axis = 1)
Xtrain.shape

(891, 10)

In [61]:
Ytrain = df_train2['Survived']

# Prepare the testing data

In [96]:
df_test = pd.read_csv(r'G:\Coding ninjas\Kaggle projects & submissions\Titanic\test.csv')
df_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [97]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [98]:
pass_id = df_test['PassengerId']

In [99]:
df_test['Sex'].replace({'female':0, 'male':1}, inplace = True)

In [100]:
df_test_em = pd.get_dummies(df_test['Embarked'])
df_test_em

Unnamed: 0,C,Q,S
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1
...,...,...,...
413,0,0,1
414,1,0,0
415,0,0,1
416,0,0,1


In [101]:
df_test.drop('Embarked', axis = 1, inplace = True)
df_test2 = pd.concat([df_test_em, df_test], axis = 1)
df_test2.head()

Unnamed: 0,C,Q,S,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
0,0,1,0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,
1,0,0,1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,
2,0,1,0,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,
3,0,0,1,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,
4,0,0,1,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,


In [102]:
df_test2['Age'].replace((np.nan, df_test2['Age'].median()), inplace = True)

df_test2.loc[df_test2['Cabin'].isnull(), 'Cabin'] = 0
df_test2.loc[df_test2['Cabin'] != 0, 'Cabin'] = 1

df_test2.isnull().sum()

C              0
Q              0
S              0
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin          0
dtype: int64

In [103]:
df_test2['Fare'].replace((np.nan, df_test2['Fare'].mean()), inplace = True)
df_test2['Fare'].isnull().sum()

0

In [104]:
df_test2.drop(['PassengerId', 'Name', 'Ticket'], axis =1, inplace=True)
df_test2.head()

Unnamed: 0,C,Q,S,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,0,1,0,3,1,34.5,0,0,7.8292,0
1,0,0,1,3,0,47.0,1,0,7.0,0
2,0,1,0,2,1,62.0,0,0,9.6875,0
3,0,0,1,3,1,62.0,0,0,8.6625,0
4,0,0,1,3,0,22.0,1,1,12.2875,0


In [105]:
Xtest = df_test2

# Building the model

In [72]:
logReg = LogisticRegression(penalty = 'l2', max_iter=10000)
grid1 = {'solver':['lbfgs', 'sag', 'newton-cg']}

bestLogReg = GridSearchCV(logReg, grid1, cv=5)
bestLogReg.fit(Xtrain, Ytrain)
bestLogReg.best_estimator_

In [73]:
bestLogReg.score(Xtrain, Ytrain)

0.7934904601571269

In [75]:
ranfor = RandomForestClassifier(criterion = 'gini')
grid2 = {'n_estimators':[10, 50, 100], 'max_depth':[5, 6, 7]}

bestRF = GridSearchCV(ranfor, grid2, cv=5)
bestRF.fit(Xtrain, Ytrain)
bestRF.best_estimator_

In [76]:
bestRF.score(Xtrain, Ytrain)

0.898989898989899

# Predictions

In [106]:
Ypred = bestRF.predict(Xtest)
Ypred

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [107]:
df_preds = pd.DataFrame({'PassengerId':pass_id, 'Survived':Ypred})
df_preds

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [108]:
df_preds.to_csv('Titanic_preds.csv', index = False)