In [1]:
#Predicting Survival in the Titanic Data Set
#We will be using a decision tree to make predictions about the Titanic data set from
#Kaggle. This data set provides information on the Titanic passengers and can be used to
#predict whether a passenger survived or not.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [3]:
url= 'https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(url)
titanic.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']

In [4]:
data = pd.DataFrame(titanic, columns = titanic.columns)

In [5]:
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
features = ['Pclass','Sex','Age','SibSp','Parch','Fare']

In [7]:
target = ['Survived']

In [8]:
df_columns = features + target
df = data[df_columns]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Survived    891 non-null int64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.8+ KB


In [10]:
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [11]:
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [12]:
catenc = LabelEncoder()

In [13]:
df_obj_array = catenc.fit_transform((df['Sex'].astype(str)))

In [14]:
df['Sex'] =  df_obj_array

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null int32
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Survived    891 non-null int64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.3 KB


In [16]:
imputer  = Imputer(strategy = 'mean')



In [17]:
df_encoded_array = imputer.fit_transform(df)

In [18]:
df_encoded = pd.DataFrame(df_encoded_array, columns = df_columns)

In [19]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null float64
Sex         891 non-null float64
Age         891 non-null float64
SibSp       891 non-null float64
Parch       891 non-null float64
Fare        891 non-null float64
Survived    891 non-null float64
dtypes: float64(7)
memory usage: 48.8 KB


In [20]:
df_encoded.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3.0,1.0,22.0,1.0,0.0,7.25,0.0
1,1.0,0.0,38.0,1.0,0.0,71.2833,1.0
2,3.0,0.0,26.0,0.0,0.0,7.925,1.0
3,1.0,0.0,35.0,1.0,0.0,53.1,1.0
4,3.0,1.0,35.0,0.0,0.0,8.05,0.0


In [21]:
X = df_encoded[features]
y = df_encoded[target]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 5)

In [23]:
clf = LogisticRegression()

In [24]:
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
clf.score(X_train, y_train)

0.7865168539325843

In [26]:
clf.score(X_test, y_test)

0.8207282913165266

In [28]:
y_clf_pred = clf.predict(X_test)

In [29]:
y_clf_prob = clf.predict_proba(X_test)

In [30]:
metrics.accuracy_score(y_test, y_clf_pred)

0.8207282913165266

In [31]:
metrics.roc_auc_score(y_test, y_clf_prob[:, 1])

0.8652490681125043

In [32]:
metrics.confusion_matrix(y_test, y_clf_pred)

array([[204,  23],
       [ 41,  89]], dtype=int64)

In [33]:
metrics.classification_report(y_test, y_clf_pred)

'              precision    recall  f1-score   support\n\n         0.0       0.83      0.90      0.86       227\n         1.0       0.79      0.68      0.74       130\n\n   micro avg       0.82      0.82      0.82       357\n   macro avg       0.81      0.79      0.80       357\nweighted avg       0.82      0.82      0.82       357\n'

In [34]:
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.8       , 0.78888889, 0.7752809 , 0.83146067, 0.79775281,
       0.76404494, 0.79775281, 0.75280899, 0.84269663, 0.80681818])

In [35]:
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10).mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.795750482351606

In [36]:
tree = DecisionTreeClassifier()

In [37]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [38]:
tree.score(X_train, y_train)

0.9868913857677902

In [39]:
tree.score(X_test, y_test)

0.7675070028011205

In [40]:
y_tree_pred = tree.predict(X_test)

In [41]:
y_tree_prob = tree.predict_proba(X_test)

In [42]:
metrics.accuracy_score(y_test, y_tree_pred)

0.7675070028011205

In [43]:
metrics.roc_auc_score(y_test, y_tree_prob[:, 1])

0.730260928498814

In [44]:
metrics.confusion_matrix(y_test, y_tree_pred)

array([[193,  34],
       [ 49,  81]], dtype=int64)

In [45]:
metrics.classification_report(y_test, y_tree_pred)

'              precision    recall  f1-score   support\n\n         0.0       0.80      0.85      0.82       227\n         1.0       0.70      0.62      0.66       130\n\n   micro avg       0.77      0.77      0.77       357\n   macro avg       0.75      0.74      0.74       357\nweighted avg       0.76      0.77      0.76       357\n'

In [46]:
cross_val_score(DecisionTreeClassifier(), X, y, scoring='accuracy', cv=10)

array([0.74444444, 0.7       , 0.71910112, 0.75280899, 0.79775281,
       0.75280899, 0.82022472, 0.78651685, 0.80898876, 0.84090909])

In [47]:
cross_val_score(DecisionTreeClassifier(), X, y, scoring='accuracy', cv=10).mean()

0.7689845080013619

In [48]:
#Just to test how the two different algorthms perform let us predict the probaility of survival
clf.predict_proba(np.array([[1,1,24,2,1,5]]))

array([[0.73568459, 0.26431541]])

In [50]:
tree.predict_proba(np.array([[1,1,24,2,1,5]]))

array([[1., 0.]])

In [None]:
#As seen above the Decision tree predicts the probabilty of survival of a 24 year old male is 0%
#but Logistic regression predicts the probability as 26.4%
#Based on the accuracy, confusion matrix, cross validation score and other metrics it can be concluided that for this data set
#Logistic regression performs better than Decision Tree