### Imports

In [41]:
import pandas as pd
import numpy as np
import sklearn
import pickle

### Loading Dataset

In [42]:
# Read the three input CSVs (paths are relative to the working directory).
data_train, data_test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [43]:
# Peek at ten random training rows (no seed is given, so the rows differ per run).
data_train.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
452,453,0,1,"Foreman, Mr. Benjamin Laventall",male,30.0,0,0,113051,27.75,C111,C
434,435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,E44,S
436,437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21.0,2,2,W./C. 6608,34.375,,S
322,323,1,2,"Slayter, Miss. Hilda Mary",female,30.0,0,0,234818,12.35,,Q
537,538,1,1,"LeRoy, Miss. Bertha",female,30.0,0,0,PC 17761,106.425,,C
291,292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19.0,1,0,11967,91.0792,B49,C
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
112,113,0,3,"Barton, Mr. David John",male,22.0,0,0,324669,8.05,,S
604,605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35.0,0,0,111426,26.55,,C
601,602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S


In [44]:
# Report how many missing values each training column contains.
column_names = data_train.columns
null_counts = data_train.isnull().sum()
for column in column_names:
    print(f"{column} - {null_counts[column]}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 177
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 2


### Pre-processing Data

In [45]:
# Impute missing values.
# Results are assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment
# through an inplace method, which pandas deprecates and which does not update
# the parent frame once Copy-on-Write is active.
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
# Missing ports take the next known value; the trailing ffill() covers a
# missing value in the last row(s), which bfill() alone would leave as NaN.
data_train['Embarked'] = data_train['Embarked'].bfill().ffill()

# The test frame is imputed from its own column statistics.
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())
data_test['Embarked'] = data_test['Embarked'].bfill().ffill()

In [46]:
# Re-check the per-column missing-value counts of the training frame.
column_names = data_train.columns
for column, n_missing in data_train[column_names].isnull().sum().items():
    print(f"{column} - {n_missing}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 0
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 0


In [47]:
# Encode the categorical columns as integer codes, using the same codes for
# the training and test frames.
# Gotcha: Series.map yields NaN for any value absent from the dict, so running
# this cell a second time on already-encoded frames blanks both columns.
sex_codes = {'male':0, 'female':1}
embarked_codes = {'C':0, 'Q':1, 'S':2}
for frame in (data_train, data_test):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [48]:
# Columns that are not used as model features.
non_feature_cols = ['Name', 'Ticket', 'Cabin']

# Test passenger ids as a one-column frame.
passenger_test = pd.DataFrame(data_test['PassengerId'])

# Targets: training labels come from train.csv; the reference labels for the
# test rows are the Survived column of gender_submission.csv.
# NOTE(review): assumes gender_submission.csv rows are in the same order as
# test.csv — confirm.
y_train = data_train['Survived']
y_test = sub['Survived']

# Feature frames; PassengerId is still present here.
x_train = data_train.drop(columns=['Survived'] + non_feature_cols)
x_test = data_test.drop(columns=non_feature_cols)
x_train.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
336,337,1,0,29.0,1,0,66.6,2
289,290,3,1,22.0,0,0,7.75,1
228,229,2,0,18.0,0,0,13.0,2
356,357,1,1,22.0,0,1,55.0,2
838,839,3,0,32.0,0,0,56.4958,2
127,128,3,0,24.0,0,0,7.1417,2
747,748,2,1,30.0,0,0,13.0,2
692,693,3,0,29.699118,0,0,56.4958,2
531,532,3,0,29.699118,0,0,7.2292,0
826,827,3,0,29.699118,0,0,56.4958,2


In [49]:
# Peek at ten random rows of the test feature frame (unseeded).
x_test.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
136,1028,3,0,26.5,0,0,7.225,0
84,976,2,0,30.27259,0,0,10.7083,1
151,1043,3,0,30.27259,0,0,7.8958,0
211,1103,3,0,30.27259,0,0,7.05,2
226,1118,3,0,23.0,0,0,7.7958,2
11,903,1,0,46.0,0,0,26.0,2
89,981,2,0,2.0,1,1,23.0,2
293,1185,1,0,53.0,1,1,81.8583,2
110,1002,2,0,41.0,0,0,15.0458,0
103,995,3,0,26.0,0,0,7.775,2


In [50]:
def remove_pk(data, pk_column='PassengerId'):
    """Return ``data`` without its key column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to strip; it is not modified.
    pk_column : str, default 'PassengerId'
        Label of the key column to leave out.

    Returns
    -------
    pandas.DataFrame
        Every column of ``data`` whose label differs from ``pk_column``, in
        the original order. If no column has that label, all columns are
        returned.
    """
    keep_mask = data.columns != pk_column
    return data.loc[:, keep_mask]

### Creating the Decision Tree Model

In [51]:
from sklearn.tree import DecisionTreeClassifier

# random_state pins the estimator's internal randomness so that the fitted
# tree, and every metric computed from it, is the same on each run.
# Outputs recorded from an earlier unseeded run may differ slightly.
decision_tree = DecisionTreeClassifier(max_depth = 10, min_samples_split = 20, random_state = 42)
# PassengerId is stripped before fitting and predicting so it is not a feature.
decision_tree.fit(remove_pk(x_train), y_train)
y_pred = decision_tree.predict(remove_pk(x_test))

In [52]:
from sklearn.metrics import confusion_matrix, mean_squared_error

# Root of the mean squared difference between the reference labels and the
# predicted labels.
mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(mse)
print('RMSE : ',RMSE)

# Confusion matrix: rows follow the reference labels, columns the predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

RMSE :  0.39735970711951313
[[249  17]
 [ 49 103]]


In [53]:
# Stack the training and test feature frames row-wise. Each frame keeps its
# own row labels, so index values repeat in the combined frame.
feature_frames = [x_train, x_test]
query_data = pd.concat(feature_frames)
query_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.00000,1,0,7.2500,2
1,2,1,1,38.00000,1,0,71.2833,0
2,3,3,1,26.00000,0,0,7.9250,2
3,4,1,1,35.00000,1,0,53.1000,2
4,5,3,0,35.00000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
413,1305,3,0,30.27259,0,0,8.0500,2
414,1306,1,1,39.00000,0,0,108.9000,0
415,1307,3,0,38.50000,0,0,7.2500,2
416,1308,3,0,30.27259,0,0,8.0500,2


### Saving the model and the pre-processed feature data (train + test)

In [54]:
# Write the combined feature table and the test passenger ids as CSV
# (the index is written as the first column).
query_data.to_csv('testing_data.csv')
passenger_test.to_csv('passenger.csv')

# Serialize the fitted tree. The context manager closes the file even if
# pickle.dump raises, and the file is only opened (and truncated) once the
# CSV writes above have succeeded.
with open("model.pickle", "wb") as pickle_out:
    pickle.dump(decision_tree, pickle_out)

### Reloading the saved model to check validity

In [55]:
# Read the model back from disk; the context manager closes the file even if
# unpickling fails.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by a trusted source.
with open("model.pickle", "rb") as pickle_in:
    dt2 = pickle.load(pickle_in)

# Reload the saved feature table, using its first column as the index.
testing_data = pd.read_csv('testing_data.csv', index_col=[0])

In [57]:
# Predict with the reloaded model on the same test features as before.
y_pred2 = dt2.predict(remove_pk(x_test))

# An aggregate score can coincide for different predictions, so compare the
# predictions themselves with those of the in-memory model.
assert (y_pred2 == y_pred).all(), "reloaded model predictions differ from the in-memory model"

RMSE = np.sqrt(mean_squared_error(y_test, y_pred2))
print('RMSE : ',RMSE)

RMSE :  0.39735970711951313


In [58]:
# Spot check: find passenger 896 in the saved id frame.
passenger_test.loc[passenger_test['PassengerId'] == 896]


Unnamed: 0,PassengerId
4,896


In [59]:
# Spot check: show the pre-processed test row for the same passenger (896).
data_test[data_test['PassengerId'].eq(896)]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,2
