### Imports

In [22]:
import pandas as pd
import numpy as np
import sklearn
import pickle

### Loading Dataset

In [23]:
# Read the three CSV inputs from the working directory: the training
# split, the test split, and the submission file.
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")
sub = pd.read_csv("gender_submission.csv")

In [24]:
# Show ten randomly chosen training rows as a quick look at the raw data.
data_train.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
663,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
384,385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
726,727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30.0,3,0,31027,21.0,,S
178,179,0,2,"Hale, Mr. Reginald",male,30.0,0,0,250653,13.0,,S
349,350,0,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,,S
219,220,0,2,"Harris, Mr. Walter",male,30.0,0,0,W/C 14208,10.5,,S
473,474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23.0,0,0,SC/AH Basle 541,13.7917,D,C
312,313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S


In [25]:
# Print "<column> - <number of missing values>" for every training column.
column_names = data_train.columns
missing_per_column = data_train.isnull().sum()
for column in column_names:
    print(f"{column} - {missing_per_column[column]}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 177
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 2


### Pre-processing Data

In [26]:
# Impute missing values by assigning the filled column back to its frame.
# `frame[col].fillna(..., inplace=True)` is chained assignment: it is
# deprecated in pandas 2.x and, with Copy-on-Write enabled, no longer
# updates the parent frame at all.

# Training frame: Age -> column mean; Embarked -> next valid value below.
# The trailing ffill() only matters when the missing value sits in the last
# row(s), which bfill() alone would leave as NaN.
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
data_train['Embarked'] = data_train['Embarked'].bfill().ffill()

# Test frame: same strategy, plus Fare -> column mean.
# NOTE(review): the test frame is imputed with its own means rather than the
# training means; kept as-is to preserve the existing results.
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())
data_test['Embarked'] = data_test['Embarked'].bfill().ffill()

In [27]:
# Re-count missing values per training column after imputation, printed as
# "<column> - <count>".
column_names = data_train.columns
remaining_missing = data_train.isnull().sum()
for column in column_names:
    print(f"{column} - {remaining_missing[column]}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 0
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 0


In [28]:
# Encode the categorical columns as integer codes, using the same codes for
# the training and test frames.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'C': 0, 'Q': 1, 'S': 2},
}
for frame in (data_train, data_test):
    for column, codes in category_codes.items():
        # Guard so the cell is safe to re-run: mapping an already-encoded
        # (numeric) column again would match none of the string keys and
        # turn every value into NaN.
        if not pd.api.types.is_numeric_dtype(frame[column]):
            frame[column] = frame[column].map(codes)

In [29]:
# Keep the test passenger ids as a one-column frame (saved to CSV later).
passenger_test = data_test['PassengerId'].to_frame()

# Targets.
y_train = data_train['Survived']
# NOTE(review): y_test comes from gender_submission.csv; if that file is a
# sample submission rather than true labels, the scores computed against it
# are not real test metrics. Confirm. Row order is also assumed to match
# data_test.
y_test = sub['Survived']

# Features: drop the target (training frame only) and the text columns.
text_columns = ['Name', 'Ticket', 'Cabin']
x_train = data_train.drop(columns=['Survived'] + text_columns)
x_test = data_test.drop(columns=text_columns)

x_train.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
251,252,3,1,29.0,1,1,10.4625,2
379,380,3,0,19.0,0,0,7.775,2
744,745,3,0,31.0,0,0,7.925,2
466,467,2,0,29.699118,0,0,0.0,2
616,617,3,0,34.0,1,1,14.4,2
610,611,3,1,39.0,1,5,31.275,2
264,265,3,1,29.699118,0,0,7.75,1
248,249,1,0,37.0,1,1,52.5542,2
847,848,3,0,35.0,0,0,7.8958,0
390,391,1,0,36.0,1,2,120.0,2


In [30]:
# Show ten randomly chosen rows of the test feature frame.
x_test.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
93,985,3,0,30.27259,0,0,8.05,2
176,1068,2,1,20.0,0,0,36.75,2
125,1017,3,1,17.0,0,1,16.1,2
148,1040,1,0,30.27259,0,0,26.55,2
390,1282,1,0,23.0,0,0,93.5,2
252,1144,1,0,27.0,1,0,136.7792,0
234,1126,1,0,39.0,1,0,71.2833,0
380,1272,3,0,30.27259,0,0,7.75,1
61,953,2,0,32.0,0,0,13.5,2
56,948,3,0,35.0,0,0,7.8958,2


In [31]:
def remove_pk(data: pd.DataFrame, pk_column: str = 'PassengerId') -> pd.DataFrame:
    """Return ``data`` without its identifier column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to select columns from. It is not modified.
    pk_column : str, default 'PassengerId'
        Name of the column to leave out. If no column has this name, all
        columns are returned.

    Returns
    -------
    pd.DataFrame
        All rows of ``data`` with every column whose name differs from
        ``pk_column``, in the original column order.
    """
    # Boolean mask over the column labels: True for every column to keep.
    keep_mask = data.columns != pk_column
    return data.loc[:, keep_mask]

### Creating the Decision Tree Model

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited tree on every feature except PassengerId, then predict
# the test rows.
# random_state is fixed so the fit is reproducible: scikit-learn permutes the
# features at each split, so ties between equally good splits can otherwise
# be resolved differently from run to run.
decision_tree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=20,
    random_state=42,
)
decision_tree.fit(remove_pk(x_train), y_train)
y_pred = decision_tree.predict(remove_pk(x_test))

In [33]:
from sklearn.metrics import confusion_matrix, mean_squared_error

# Root of the mean squared difference between y_test and the predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print('RMSE : ', RMSE)

# Confusion matrix of y_test against the predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

RMSE :  0.39735970711951313
[[249  17]
 [ 49 103]]


In [34]:
# Stack the training and test feature frames row-wise; each frame keeps its
# own row labels because ignore_index is not set.
query_data = pd.concat(objs=[x_train, x_test], axis=0)
query_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.00000,1,0,7.2500,2
1,2,1,1,38.00000,1,0,71.2833,0
2,3,3,1,26.00000,0,0,7.9250,2
3,4,1,1,35.00000,1,0,53.1000,2
4,5,3,0,35.00000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
413,1305,3,0,30.27259,0,0,8.0500,2
414,1306,1,1,39.00000,0,0,108.9000,0
415,1307,3,0,38.50000,0,0,7.2500,2
416,1308,3,0,30.27259,0,0,8.0500,2


### Saving the model and the pre-processed data (train + test features, and the test passenger IDs)

In [35]:
# Write the combined feature frame (with its row labels) and the test
# passenger ids to CSV.
query_data.to_csv('testing_data.csv')
passenger_test.to_csv('passenger.csv', index=False)

# Serialize the fitted tree. The `with` block closes the file even if
# pickling raises, and opening it only here means a failed CSV write can no
# longer leave behind a truncated model.pickle.
with open("model.pickle", "wb") as pickle_out:
    pickle.dump(decision_tree, pickle_out)

### Reloading the saved model to check validity

In [36]:
# Reload the pickled model. The `with` block closes the file even if
# unpickling raises.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# from a trusted source (here: the file written by this notebook).
with open("model.pickle", "rb") as pickle_in:
    dt2 = pickle.load(pickle_in)

# Read the saved feature CSV back, using its first column as the row labels.
testing_data = pd.read_csv('testing_data.csv', index_col=[0])

In [37]:
# Predict the test rows with the reloaded model and print the same RMSE
# figure as for the original model.
y_pred2 = dt2.predict(remove_pk(x_test))
RMSE = np.sqrt(mean_squared_error(y_test, y_pred2))
print('RMSE : ', RMSE)

# Make the round-trip check explicit: an equal RMSE only shows an equal
# number of errors, so compare the predictions themselves.
assert np.array_equal(y_pred, y_pred2), \
    "reloaded model predictions differ from the original model's predictions"

RMSE :  0.39735970711951313


In [38]:
# Select the passenger-id row(s) whose PassengerId equals 896.
passenger_test.loc[passenger_test['PassengerId'] == 896]


Unnamed: 0,PassengerId
4,896


In [39]:
# Select the pre-processed test row(s) whose PassengerId equals 896.
data_test.loc[data_test['PassengerId'] == 896]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,2
