In [1]:
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
import numpy as np

In [4]:
from sklearn import tree

In [5]:
from sklearn import preprocessing

In [6]:
titanic_train_data = pd.read_csv('train.csv')

In [7]:
titanic_test_data = pd.read_csv('test.csv')

In [8]:
titanic_train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [9]:
titanic_train_data["Age"].mean()

32.69985376827896

In [10]:
new_age = np.where(titanic_train_data["Age"].isnull(),32,titanic_train_data["Age"])

In [11]:
titanic_train_data["Age"] = new_age

In [12]:
titanic_train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [13]:
# Drop cabin column
titanic_train_data.drop('Cabin', axis=1, inplace=True)

In [14]:
titanic_train_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [15]:
titanic_train_data = titanic_train_data.dropna()

In [16]:
titanic_train_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [17]:
label_encoder = preprocessing.LabelEncoder()

In [18]:
label_encoder.fit(titanic_train_data['Sex'])
titanic_train_data['Sex'] = label_encoder.transform(titanic_train_data['Sex'])

In [19]:
label_encoder.fit(titanic_train_data['Embarked'])
titanic_train_data['Embarked'] = label_encoder.fit_transform(titanic_train_data['Embarked'])
#titanic_test_data['Embarked'] = le.fit_transform(titanic_test_data['Embarked'])

In [20]:
titanic_test_data["Age"].mean()

30.272590361445783

In [21]:
new_age1 = np.where(titanic_test_data["Age"].isnull(),30,titanic_test_data["Age"])

In [22]:
titanic_test_data["Age"] = new_age1

In [23]:
titanic_test_data.drop('Cabin', axis=1, inplace=True)

In [24]:
titanic_test_data = titanic_test_data.dropna()

In [25]:
titanic_test_data.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## Random Forest

In [26]:
rf = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [27]:
titanic_train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,2
...,...,...,...,...,...,...,...,...,...,...,...
884,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,2
885,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,2
886,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,45.0,1,2,W./C. 6607,23.4500,2
887,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,0


In [28]:
features = ['Sex', 'Pclass', 'SibSp', 'Embarked', 'Age', 'Fare', 'Parch']

In [29]:
rf.fit(X = titanic_train_data[features], y = titanic_train_data['Survived'])

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [30]:
print("OOB Accuracy Score")
print(rf.oob_score_);

OOB Accuracy Score
0.8053993250843644


In [31]:
for feature, imp in zip(features,rf.feature_importances_):
    print(feature, imp)

Sex 0.26040991447469775
Pclass 0.08833184737019481
SibSp 0.05064287529165377
Embarked 0.03447535254846027
Age 0.25770306929165154
Fare 0.26959068693011423
Parch 0.03884625409322762


Now we can say that Sex, Age and Fare are the most important variable

## Decision Tree

In [32]:
tree_model = tree.DecisionTreeClassifier(max_depth=8)

In [33]:
feature_new = ['Sex', 'Age', 'Fare']

In [34]:
tree_model.fit(X=titanic_train_data[feature_new], y=titanic_train_data['Survived'])

DecisionTreeClassifier(max_depth=8)

In [35]:
with open('DTree.dot', 'w') as file:
    file = tree.export_graphviz(tree_model, feature_names=feature_new, out_file=file);

In [36]:
tree_model.score(X=titanic_train_data[feature_new], y=titanic_train_data['Survived'])

0.8706411698537683

In [37]:
label_encoder.fit(titanic_test_data['Sex'])
titanic_test_data['Sex'] = label_encoder.transform(titanic_test_data['Sex'])

In [38]:
label_encoder.fit(titanic_test_data['Embarked'])
titanic_test_data['Embarked'] = label_encoder.fit_transform(titanic_test_data['Embarked'])


In [39]:
test_pred = tree_model.predict(X=titanic_test_data[feature_new])

In [40]:
test_pred

array([0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,