In [68]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
import pydotplus
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing,model_selection,tree,metrics
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from IPython.display import Image, display
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

In [77]:
Url='https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(Url)

In [78]:
#titanic.columns =['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
titanic.count()
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [79]:
# Factorising sex column
titanic['Sex_fact'], _ = pd.factorize(titanic['Sex'])

In [80]:
# select features
y = titanic['Survived']
X = titanic[['Pclass', 'Sex_fact', 'Age', 'SibSp', 'Parch', 'Fare']]

In [81]:
# Checking if any feature column having null value or not.
X.isnull().any()

Pclass      False
Sex_fact    False
Age          True
SibSp       False
Parch       False
Fare        False
dtype: bool

In [82]:
# We observed that Age column is having null value with 177 records. So we will fill those with 0 value
print(X['Age'].isnull().sum())
print(y.isnull().sum())

177
0


In [83]:
## Cross valition 

###### Data manipulation ########
X.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [84]:
# So now all the data for feature columns are not NaN.
X.isnull().sum()

Pclass      0
Sex_fact    0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [85]:
# split data into 80% training and 20% test data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=100)

In [86]:
# train the decision tree
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
dtree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [87]:
# use the model to make predictions with the test data
y_pred = dtree.predict(X_test)

In [88]:
dot_data = tree.export_graphviz(dtree, out_file=None, filled=True, rounded=True,
                                feature_names=['Pclass', 'Sex_fact', 'Age', 'SibSp', 'Parch', 'Fare'],
                                class_names=['setosa', 'versicolor', 'virginica','a','b','c']
                                )
graph = pydotplus.graph_from_dot_data(dot_data)

In [89]:
#display(Image(graph.create_png()))

display(Image(data=graph.create_png()))

InvocationException: GraphViz's executables not found

In [91]:
# Evaluate the Model performance 
count_misclassified = (y_test != y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))


accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 36
Accuracy: 0.80


In [92]:
## Cross Validation

cv = KFold(n=len(X),  # Number of elements
           n_folds=10,            # Desired number of cv folds
           random_state=12) 

In [93]:
fold_accuracy = []

# titanic_train["Sex"] = encoded_sex

for train_fold, valid_fold in cv:
    train = X.loc[train_fold] # Extract train data with cv indices
    valid = X.loc[valid_fold] # Extract valid data with cv indices
    
    train_y = y.loc[train_fold]
    valid_y = y.loc[valid_fold]
    
    model = dtree.fit(X = train, 
                           y = train_y)
    valid_acc = model.score(X = valid, 
                            y = valid_y)
    fold_accuracy.append(valid_acc)

In [94]:
print("Accuracy per fold: ", fold_accuracy, "\n")
print("Average accuracy: ", sum(fold_accuracy)/len(fold_accuracy))

Accuracy per fold:  [0.8111111111111111, 0.8426966292134831, 0.7640449438202247, 0.8314606741573034, 0.7865168539325843, 0.8089887640449438, 0.7752808988764045, 0.7752808988764045, 0.8651685393258427, 0.797752808988764] 

Average accuracy:  0.8058302122347065


In [95]:
scores = cross_val_score(estimator= dtree,     # Model to test
                X= X,  
                y = y,      # Target variable
                scoring = "accuracy",               # Scoring metric    
                cv=10)                              # Cross validation folds

print("Accuracy per fold: ")
print(scores)
print("Average accuracy: ", scores.mean())

Accuracy per fold: 
[0.81111111 0.81111111 0.78651685 0.83146067 0.82022472 0.78651685
 0.79775281 0.78651685 0.83146067 0.79545455]
Average accuracy:  0.8058126205879015


# So model is predicting with 80% accuracy 