## Classifying And Predicting the survivability of the passangers given in the dataset.

### Classification using the decision tree

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing as pp

trainData = pd.read_csv("Dataset/train.csv")

trainData.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

#### So no null cells in the required variable columns 

In [2]:
encoded_gender = pp.LabelEncoder().fit_transform(trainData['Sex'])

In [3]:
classify = pd.DataFrame([trainData['Age'],encoded_gender,trainData['Fare']]).T

tree_model = tree.DecisionTreeClassifier(max_depth=6)

tree_model.fit(X=classify,y=trainData['Survived'])

DecisionTreeClassifier(max_depth=6)

In [4]:
with open("Dtree.dot","w") as f:
    f=tree.export_graphviz(tree_model,feature_names=["Age","Sex","Fare"],out_file=f)

### "Tree Image" 

In [5]:
#Rules Updating

### After the Classification has done, the tree model is generated

#### Below we have the accuracy of the trained model

In [6]:
tree_model.score(X=classify,y=trainData["Survived"])

0.829021372328459

### The accuracy of the trained model is 82.56% 

## =========================================================================
## Now creating the predictive model using the test dataset

In [7]:
testData =  pd.read_csv("Dataset/test.csv")

testData['Sex'] = pp.LabelEncoder().fit_transform(testData['Sex'])

test_features = pd.DataFrame([testData['Age'],testData['Sex'],testData['Fare']]).T

test_predict = tree_model.predict(X=test_features)

predicted_data = pd.DataFrame({"PassengerId":testData['PassengerId'],"Survived":test_predict})

#Output file created
predicted_data.to_csv("Predicted_output.csv",index=False)

predicted_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## =========================================================================
## Now applying Random Forest classifier

In [9]:
trainData['Sex']      = pp.LabelEncoder().fit_transform(trainData['Sex'])
trainData['Embarked'] = pp.LabelEncoder().fit_transform(trainData['Embarked'])

trainData.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [28]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)
features=["Pclass","Sex","Age","SibSp","Fare","Embarked"]
rf_model.fit(X=trainData[features],y=trainData['Survived'])

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [30]:
rf_model.oob_score_

0.8065241844769404

#### The oob accuracy is 80.87% 

In [31]:
for feature, imp in zip(features, rf_model.feature_importances_):
    print(feature,"\t: ", imp)

Pclass 	:  0.0908766964500156
Sex 	:  0.26843661015634557
Age 	:  0.2707505197091005
SibSp 	:  0.0520565909038855
Fare 	:  0.28332908373234006
Embarked 	:  0.03455049904831279


### Only "Age", "Sex", "Fare" has the higher value compare to others so only these variables are the important ones.

## Now only taking the important variables

In [32]:
features_new=["Sex","Age","Fare"]
rf_model.fit(X=trainData[features_new],y=trainData['Survived'])

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [33]:
rf_model.oob_score_

0.7907761529808774

In [34]:
#### The oob accuracy is 79.07% 