### Imports

In [1]:
import pandas as pd
import numpy as np
import sklearn
import pickle

### Loading Dataset

In [2]:
# Read the three CSV inputs (training rows, test rows, and the submission
# file whose 'Survived' column is later used as y_test) from the working directory.
data_train, data_test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [3]:
# Show ten randomly chosen training rows (unseeded, so the rows differ per run).
data_train.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
736,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48.0,1,3,W./C. 6608,34.375,,S
77,78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S
355,356,0,3,"Vanden Steen, Mr. Leo Peter",male,28.0,0,0,345783,9.5,,S
706,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
423,424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria ...",female,28.0,1,1,347080,14.4,,S
442,443,0,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,S
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C
452,453,0,1,"Foreman, Mr. Benjamin Laventall",male,30.0,0,0,113051,27.75,C111,C
459,460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q
217,218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,243847,27.0,,S


In [4]:
# Report the number of missing values per column of the training frame.
column_names = data_train.columns
missing_counts = data_train.isnull().sum()  # one pass over the whole frame
for column in column_names:
    print(column + ' - ' + str(missing_counts[column]))

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 177
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 2


### Pre-processing Data

In [5]:
# Impute missing values by assigning the filled Series back to its column.
# Calling `df[col].fillna(..., inplace=True)` mutates a temporary selection:
# recent pandas warns about it and, with Copy-on-Write enabled, the parent
# frame is left unchanged, so the NaNs would silently survive.
#
# Age / Fare: replaced by the mean of the same frame.
# Embarked:   back-filled from the next known value; the trailing `.ffill()`
#             only matters if the last row(s) are missing, where a plain
#             back-fill has nothing to copy from.
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
data_train['Embarked'] = data_train['Embarked'].bfill().ffill()

data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())
data_test['Embarked'] = data_test['Embarked'].bfill().ffill()

In [6]:
# Re-check the per-column missing counts of the training frame after imputation.
column_names = data_train.columns
for column, n_missing in data_train.isnull().sum().items():
    print(column + ' - ' + str(n_missing))

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 0
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 0


In [7]:
# Replace the string categories with integer codes, using the same tables for
# the training and the test frame. `Series.map` with a dict turns any value
# not present in the dict into NaN, so this cell must run exactly once per
# freshly loaded frame (re-running it on already-encoded columns yields NaN).
sex_codes = {'male':0, 'female':1}
embarked_codes = {'C':0, 'Q':1, 'S':2}
for frame in (data_train, data_test):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [8]:
# Free-text / mostly-empty columns that are not fed to the model.
unused_columns = ['Name', 'Ticket', 'Cabin']

# Keep the test passenger ids in their own one-column frame.
passenger_test = pd.DataFrame(data_test['PassengerId'])

# Targets: training labels, and the 'Survived' column of the submission file.
y_train = data_train['Survived']
y_test = sub['Survived']

# Feature frames (PassengerId is still present here; remove_pk strips it later).
x_train = data_train.drop(columns=['Survived'] + unused_columns)
x_test = data_test.drop(columns=unused_columns)
x_train.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
791,792,2,0,16.0,0,0,26.0,2
35,36,1,0,42.0,1,0,52.0,2
761,762,3,0,41.0,0,0,7.125,2
401,402,3,0,26.0,0,0,8.05,2
391,392,3,0,21.0,0,0,7.7958,2
460,461,1,0,48.0,0,0,26.55,2
361,362,2,0,29.0,1,0,27.7208,0
570,571,2,0,62.0,0,0,10.5,2
700,701,1,1,18.0,1,0,227.525,0
433,434,3,0,17.0,0,0,7.125,2


In [9]:
# Show ten randomly chosen rows of the test feature frame (unseeded).
x_test.sample(n=10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
132,1024,3,1,30.27259,0,4,25.4667,2
362,1254,2,1,31.0,0,0,21.0,2
325,1217,3,0,23.0,0,0,7.05,2
97,989,3,0,29.0,0,0,7.925,2
130,1022,3,0,32.0,0,0,8.05,2
351,1243,2,0,25.0,0,0,10.5,2
246,1138,2,1,22.0,0,0,21.0,2
277,1169,2,0,40.0,1,0,26.0,2
326,1218,2,1,12.0,2,1,39.0,2
399,1291,3,0,31.0,0,0,7.7333,1


In [10]:
def remove_pk(data):
    """Return a copy of ``data`` without the 'PassengerId' column.

    The input frame is not modified. If ``data`` has no 'PassengerId'
    column, all columns are returned unchanged.
    """
    return data.drop(columns=['PassengerId'], errors='ignore')

### Creating Decision Tree Model

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited tree on the training features (PassengerId removed) and
# predict the test rows.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently on each run, so the fitted tree (and the scores below) would not
# be reproducible under Restart & Run All.
decision_tree = DecisionTreeClassifier(max_depth = 10, min_samples_split = 20, random_state = 42)
decision_tree.fit(remove_pk(x_train), y_train)
y_pred = decision_tree.predict(remove_pk(x_test))

In [12]:
from sklearn.metrics import confusion_matrix, mean_squared_error

# NOTE(review): y_test is the 'Survived' column of gender_submission.csv. If
# that file is a sample submission rather than ground truth, the numbers below
# measure agreement with that file, not real accuracy -- confirm.

# Root of the mean squared difference between the 0/1 labels and predictions.
mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(mse)
print('RMSE : ',RMSE)

# Rows are the y_test classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

RMSE :  0.39735970711951313
[[249  17]
 [ 49 103]]


In [13]:
# Stack the training and test feature frames row-wise. Index labels are kept
# as-is, so the labels of x_test repeat labels already used by x_train.
query_data = pd.concat([x_train, x_test], axis=0)
query_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.00000,1,0,7.2500,2
1,2,1,1,38.00000,1,0,71.2833,0
2,3,3,1,26.00000,0,0,7.9250,2
3,4,1,1,35.00000,1,0,53.1000,2
4,5,3,0,35.00000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
413,1305,3,0,30.27259,0,0,8.0500,2
414,1306,1,1,39.00000,0,0,108.9000,0
415,1307,3,0,38.50000,0,0,7.2500,2
416,1308,3,0,30.27259,0,0,8.0500,2


### Saving the model and the pre-processed (train + test) data set

In [15]:
# Persist the artefacts: the combined feature frame, the test passenger ids,
# and the fitted tree.
# The CSVs are written first and the pickle file is opened in a `with` block,
# so the handle is always closed and model.pickle is not truncated when one of
# the CSV writes fails.
query_data.to_csv('testing_data.csv')
passenger_test.to_csv('passenger.csv',index=False)
with open("model.pickle","wb") as pickle_out:
    pickle.dump(decision_tree, pickle_out)

### Loading it to check validity

In [55]:
# Reload the saved model and feature CSV to confirm they round-trip.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that this notebook (or another trusted source) produced.
# The `with` block closes the handle even if unpickling raises.
with open("model.pickle","rb") as pickle_in:
    dt2 = pickle.load(pickle_in)
# The first CSV column holds the index labels written by to_csv.
testing_data=pd.read_csv('testing_data.csv',index_col=[0])

In [57]:
# Score the reloaded model on the in-memory test features; the value should
# equal the RMSE printed for the original model.
features_test = remove_pk(x_test)
y_pred2 = dt2.predict(features_test)
mse_reloaded = mean_squared_error(y_test, y_pred2)
RMSE = np.sqrt(mse_reloaded)
print('RMSE : ',RMSE)

RMSE :  0.39735970711951313


In [58]:
# Look up the row of the id frame whose PassengerId is 896.
passenger_test.loc[passenger_test['PassengerId'] == 896]


Unnamed: 0,PassengerId
4,896


In [59]:
# Show the full (encoded) test row for the same passenger, PassengerId 896.
data_test.loc[data_test['PassengerId'] == 896]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,2
