### Imports

In [36]:
import pandas as pd
import numpy as np
import sklearn
import pickle

### Loading Dataset

In [37]:
# Read the training set, the test set and the submission file in one pass.
data_train, data_test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [38]:
# Show ten randomly chosen training rows as a quick sanity check.
data_train.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
310,311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C
402,403,0,3,"Jussila, Miss. Mari Aina",female,21.0,1,0,4137,9.825,,S
372,373,0,3,"Beavan, Mr. William Thomas",male,19.0,0,0,323951,8.05,,S
168,169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S
652,653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8475,8.4333,,S
349,350,0,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,,S
141,142,1,3,"Nysten, Miss. Anna Sofia",female,22.0,0,0,347081,7.75,,S
253,254,0,3,"Lobb, Mr. William Arthur",male,30.0,1,0,A/5. 3336,16.1,,S
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,,S
383,384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline To...",female,35.0,1,0,113789,52.0,,S


In [39]:
# Print the number of missing values in every training column.
column_names = data_train.columns
missing_counts = data_train.isnull().sum()
for column in column_names:
    print(f"{column} - {missing_counts[column]}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 177
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 2


### Pre-processing Data

In [40]:
# Impute missing values by assigning the filled column back to the frame.
# The earlier `df[col].fillna(..., inplace=True)` form is chained inplace
# assignment: under pandas copy-on-write it modifies a temporary Series and
# leaves the DataFrame untouched (pandas 2.2 warns about exactly this).
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].mean())
# Back-fill the port from the next row that has one; the trailing ffill covers
# a missing value in the last row(s), which bfill alone cannot fill.
data_train['Embarked'] = data_train['Embarked'].bfill().ffill()

# The test set is imputed with its own column means, as before.
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())
data_test['Embarked'] = data_test['Embarked'].bfill().ffill()

In [41]:
# Re-count missing values per training column after imputation.
column_names = data_train.columns
null_totals = data_train.isnull().sum()
for column in column_names:
    print(' - '.join((column, str(null_totals[column]))))

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 0
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 0


In [42]:
# Integer codes used for the two categorical features.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

# Apply the same encoding to both frames. Values missing from a mapping become
# NaN, so running this cell a second time on already-encoded columns would
# blank them out.
for frame in (data_train, data_test):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [43]:
# Columns that are dropped from both frames before modelling.
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Keep the test passenger ids in their own frame; they are saved later.
passenger_test = pd.DataFrame(data_test['PassengerId'])

y_train = data_train['Survived']
# NOTE(review): y_test comes from gender_submission.csv. If that file is a
# sample submission rather than true labels, the scores computed against it
# measure agreement with a baseline, not accuracy -- confirm. This also
# presumes its rows are in the same order as test.csv.
y_test = sub['Survived']

x_train = data_train.drop(columns=['Survived'] + non_feature_columns)
x_test = data_test.drop(columns=non_feature_columns)
x_train.sample(n=10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
879,1,1,56.0,0,1,83.1583,0
294,3,0,24.0,0,0,7.8958,2
560,3,0,29.699118,0,0,7.75,1
530,2,1,2.0,1,1,26.0,2
758,3,0,34.0,0,0,8.05,2
501,3,1,21.0,0,0,7.75,1
50,3,0,7.0,4,1,39.6875,2
573,3,1,29.699118,0,0,7.75,1
272,2,1,41.0,0,1,19.5,2
201,3,0,29.699118,8,2,69.55,2


In [29]:
# Show ten randomly chosen rows of the test feature matrix.
x_test.sample(n=10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
287,1,0,24.0,1,0,82.2667,2
337,3,0,39.0,0,2,7.2292,0
361,2,1,24.0,1,1,37.0042,0
309,3,1,45.0,1,0,14.1083,2
167,3,0,18.0,2,2,34.375,2
320,3,0,26.0,0,0,7.775,2
223,3,0,21.0,0,0,7.7958,2
217,1,0,57.0,1,1,164.8667,2
321,3,0,25.0,0,0,7.2292,0
367,3,1,22.0,0,0,39.6875,2


### Creating the Decision Tree Model

In [30]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree on the training features and predict the
# test set. random_state is fixed because scikit-learn permutes the features at
# every split, so equally good splits could otherwise be resolved differently
# from run to run and the scores below would not be reproducible.
decision_tree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=20,
    random_state=42,
)
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)

In [31]:
from sklearn.metrics import confusion_matrix, mean_squared_error

# Root-mean-squared error between the reference labels and the predictions.
mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(mse)
print('RMSE : ', RMSE)

# Confusion matrix: rows are reference labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

RMSE :  0.39735970711951313
[[249  17]
 [ 49 103]]


### Saving the model and pre-processed test data set

In [48]:
# Write the preprocessed test features and the passenger ids to CSV first, so
# model.pickle is not truncated if either write fails.
x_test.to_csv('testing_data.csv')
passenger_test.to_csv('passenger.csv')

# The context manager closes the file even if pickling raises.
with open("model.pickle", "wb") as pickle_out:
    pickle.dump(decision_tree, pickle_out)

### Reloading the saved model and test data to check validity

In [76]:
# Reload the pickled model. pickle.load can execute arbitrary code, so only
# use it on files you trust; model.pickle is the file this notebook wrote.
# The context manager closes the file even if unpickling raises.
with open("model.pickle", "rb") as pickle_in:
    dt2 = pickle.load(pickle_in)

# The first CSV column is the index written by to_csv, not a feature.
testing_data = pd.read_csv('testing_data.csv', index_col=[0])

In [34]:
# Score the reloaded model on the features read back from testing_data.csv.
# Predicting on the in-memory x_test would leave the saved CSV unchecked, so
# the round-tripped frame is used; the RMSE should match the earlier value.
y_pred2 = dt2.predict(testing_data)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred2))
print('RMSE : ', RMSE)

RMSE :  0.39735970711951313


In [58]:
# Find the row of the saved passenger-id frame that holds passenger 896.
is_passenger_896 = passenger_test['PassengerId'] == 896
passenger_test[is_passenger_896]


Unnamed: 0,PassengerId
4,896


In [59]:
# Show the full preprocessed test record for passenger 896.
is_passenger_896 = data_test['PassengerId'] == 896
data_test[is_passenger_896]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,2
