### **Kaggle Competition: Titanic Survival Prediction**

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

In [30]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [31]:
# Overview of the training frame: column names, dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [32]:
# Number of missing values in each training column.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [33]:
# Number of missing values in each test column.
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [34]:
# Work on independent copies so the frames loaded from disk stay untouched.
train2, test2 = train.copy(), test.copy()

In [35]:
# Handle missing values in the training copy.
# Embarked: fill the gaps with the most frequent value. The result is assigned
# back to the column instead of calling fillna(inplace=True) on the column
# selection: the in-place form operates on a temporary object under pandas
# Copy-on-Write and would leave train2 unchanged.
train2['Embarked'] = train2['Embarked'].fillna(train['Embarked'].mode()[0])
# Cabin: keep a 0/1 flag telling whether a cabin value was recorded at all.
train2['CabinBool'] = train['Cabin'].notnull().astype('int')
# Age: replace missing values with the mean of the observed training ages.
train2['Age'] = train2['Age'].fillna(train['Age'].mean())

In [36]:
# Handle missing values in the test copy.
# Cabin: keep a 0/1 flag telling whether a cabin value was recorded at all.
test2['CabinBool'] = test['Cabin'].notnull().astype('int')
# Fare: fill the gap with the median test fare. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the column selection:
# the in-place form operates on a temporary object under pandas Copy-on-Write
# and would leave test2 unchanged.
test2['Fare'] = test2['Fare'].fillna(test['Fare'].median())
# Age: use the *training* mean so both splits are imputed with the same value.
test2['Age'] = test2['Age'].fillna(train['Age'].mean())

In [37]:
# Combine the Parch and SibSp columns into a single Family_Size feature for
# both splits; the working assumption is that families stuck together.
for frame in (train2, test2):
    frame['Family_Size'] = frame['Parch'] + frame['SibSp']

In [38]:
# If two tickets have numbers that are close to each other, the passengers were
# likely located near each other on the ship, so keep the trailing run of
# digits of each ticket as an integer (0 when the ticket does not end in digits).
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: return a Series rather than a one-column DataFrame
# - fillna('0'): keep the column homogeneous (all strings) before casting
TICKET_NUMBER_RE = r'(\d+$)'
train2['ticket_st'] = (
    train2['Ticket'].str.extract(TICKET_NUMBER_RE, expand=False).fillna('0').astype(int)
)
test2['ticket_st'] = (
    test2['Ticket'].str.extract(TICKET_NUMBER_RE, expand=False).fillna('0').astype(int)
)

In [39]:
# Cabin letter: first character of the cabin code, with 'n' standing for
# "no cabin recorded". The missing marker is filled in explicitly; the previous
# astype(str) form only produced 'n' because NaN was stringified to 'nan'.
# NOTE(review): newer pandas string dtypes may keep missing values through
# astype(str), which would silently drop the 'n' category - confirm if upgrading.
train2['Cabin_letter'] = train2['Cabin'].fillna('n').str[0]
test2['Cabin_letter'] = test2['Cabin'].fillna('n').str[0]

In [40]:
# Encode the categorical training columns.
# Sex becomes an integer code; the fitted encoder stays available as `label`.
label = LabelEncoder()
train2['Sex'] = label.fit_transform(train2['Sex'])

# One indicator column per category for Embarked, Pclass and the cabin letter.
embarked_dummy_tr = pd.get_dummies(train2['Embarked'], prefix='Embarked')
Pclass_dummy_tr = pd.get_dummies(train2['Pclass'], prefix='Pclass')
cabin_dummy_tr = pd.get_dummies(train2['Cabin_letter'], prefix='Cabin')

# Append the indicator columns to the right of the existing ones.
train_md = pd.concat(
    [train2, embarked_dummy_tr, Pclass_dummy_tr, cabin_dummy_tr],
    axis='columns',
)

# Target vector, then the feature matrix: the target, the free-text columns and
# the already-encoded source columns are removed. PassengerId and the raw
# Pclass column are intentionally left in, exactly as before.
Y_train = train['Survived']
X_train = train_md.drop(
    columns=['Survived', 'Embarked', 'Name', 'Ticket', 'Cabin', 'ticket_st',
             'Cabin_letter'],
)


# Encode the test columns the same way as the training columns.
# Reuse the encoder fitted on the training data (transform, not fit_transform)
# so a given string always maps to the same integer code in both splits.
test2['Sex'] = label.transform(test2['Sex'])
embarked_dummy_ts = pd.get_dummies(test2['Embarked'], prefix='Embarked')
Pclass_dummy_ts = pd.get_dummies(test2['Pclass'], prefix='Pclass')
cabin_dummy_ts = pd.get_dummies(test2['Cabin_letter'], prefix='Cabin')

test_md = pd.concat([test2, embarked_dummy_ts, Pclass_dummy_ts, cabin_dummy_ts], axis=1)
X_test = test_md.drop(['Name', 'Embarked', 'Ticket', 'Cabin', 'ticket_st',
                       'Cabin_letter'], axis=1)

# Align the test features with the training features: same columns, same order.
# Categories that only occur in the training data (cabin letter 'T') become
# all-zero columns. Appending `Cabin_T` by hand put it *after* `Cabin_n`, while
# X_train has it *before* `Cabin_n`, so the two matrices disagreed on column
# order and the model would either reject X_test or read the wrong columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [41]:
# Distinct cabin letters present in the training copy.
pd.unique(train2['Cabin_letter'])

array(['n', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [42]:
# Distinct cabin letters present in the test copy.
pd.unique(test2['Cabin_letter'])

array(['n', 'B', 'E', 'A', 'C', 'D', 'F', 'G'], dtype=object)

In [43]:
# Preview the first rows of the feature matrix instead of displaying the whole frame.
X_train.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,CabinBool,Family_Size,Embarked_C,...,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_n
0,1,3,1,22.000000,1,0,7.2500,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1,2,1,0,38.000000,1,0,71.2833,1,1,1,...,0,0,0,1,0,0,0,0,0,0
2,3,3,0,26.000000,0,0,7.9250,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,4,1,0,35.000000,1,0,53.1000,1,1,0,...,0,0,0,1,0,0,0,0,0,0
4,5,3,1,35.000000,0,0,8.0500,0,0,0,...,1,0,0,0,0,0,0,0,0,1
5,6,3,1,29.699118,0,0,8.4583,0,0,0,...,1,0,0,0,0,0,0,0,0,1
6,7,1,1,54.000000,0,0,51.8625,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,3,1,2.000000,3,1,21.0750,0,4,0,...,1,0,0,0,0,0,0,0,0,1
8,9,3,0,27.000000,0,2,11.1333,0,2,0,...,1,0,0,0,0,0,0,0,0,1
9,10,2,0,14.000000,1,0,30.0708,0,1,1,...,0,0,0,0,0,0,0,0,0,1


### **Modelling**

In [44]:
# LogisticRegression, scored with 10-fold cross-validation on the training data.
lgr = LogisticRegression(random_state=42, solver='liblinear')
cv = KFold(n_splits=10)
lgr_fold_scores = []
for train_index, test_index in cv.split(X_train):
    # KFold yields positional indices, so rows are selected with .iloc on both
    # the features and the target (label-based Y_train[...] only works while
    # the index happens to be 0..n-1).
    lgr.fit(X_train.iloc[train_index, :], Y_train.iloc[train_index])
    lgrpred = lgr.predict(X_train.iloc[test_index, :])
    acc_lgr = accuracy_score(Y_train.iloc[test_index], lgrpred) * 100
    lgr_fold_scores.append(acc_lgr)
    print(acc_lgr)
# Single summary figure so models can be compared without reading ten numbers.
print('mean accuracy:', np.mean(lgr_fold_scores))

80.0
80.89887640449437
78.65168539325843
82.02247191011236
79.7752808988764
77.52808988764045
76.40449438202246
80.89887640449437
87.64044943820225
82.02247191011236


In [46]:
# RandomForestClassifier, scored with 10-fold cross-validation on the training data.
# random_state is fixed (same seed as the other models) so reruns give the same scores.
rfc = RandomForestClassifier(random_state=42)
cv = KFold(n_splits=10)
rfc_fold_scores = []
for train_index, test_index in cv.split(X_train):
    # KFold yields positional indices, so rows are selected with .iloc on both
    # the features and the target.
    rfc.fit(X_train.iloc[train_index, :], Y_train.iloc[train_index])
    rfcpred = rfc.predict(X_train.iloc[test_index, :])
    acc_rfc = accuracy_score(Y_train.iloc[test_index], rfcpred) * 100
    rfc_fold_scores.append(acc_rfc)
    print(acc_rfc)
# Single summary figure so models can be compared without reading ten numbers.
print('mean accuracy:', np.mean(rfc_fold_scores))

# The loop leaves `rfc` fitted on the last fold's training split only. Refit on
# every training row so that later uses of `rfc` (the submission predictions)
# come from a model that has seen all of the labelled data.
rfc.fit(X_train, Y_train)

75.55555555555556
79.7752808988764
77.52808988764045
75.28089887640449
83.14606741573034
79.7752808988764
83.14606741573034
77.52808988764045
84.26966292134831
83.14606741573034


In [47]:
#MLPClassifier
# One hidden layer of 150 units. `(150,)` is written with the trailing comma so
# it really is a tuple; `(150)` is just the integer 150 in parentheses.
model = MLPClassifier(hidden_layer_sizes=(150,), max_iter=100,
                      solver='lbfgs', verbose=1, activation='logistic', random_state=42)

model.fit(X_train, Y_train)
# NOTE: this is accuracy on the same rows the model was fitted on, so it is a
# training score and not comparable with the cross-validated scores of the other models.
accuracy_score(Y_train, model.predict(X_train))

0.7048260381593715

### **Submission**

In [48]:
# Predict labels for the test feature matrix with the random forest `rfc`.
Y_pred_test = rfc.predict(X_test)

In [49]:
# Build the submission table: indexed by the test PassengerId values, with the
# random forest's predicted label in the Survived column.
submission = pd.DataFrame(
    {'Survived': rfc.predict(X_test)},
    index=X_test['PassengerId'],
)
# reset_index turns PassengerId back into a regular column before writing the CSV.
(submission
 .reset_index()
 .to_csv('submission.csv', index=False))