In [1]:
import pandas as pd

## Reading the dataset

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest
0,1,1,Allen Miss. Elisabeth Walton,female,29.0,0,0,24160,211.3375,B5,S,2.0,,St Louis MO
1,1,1,Allison Master. Hudson Trevor,male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,Montreal PQ / Chesterville ON
2,1,0,Allison Miss. Helen Loraine,female,2.0,1,2,113781,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON
3,1,0,Allison Mr. Hudson Joshua Creighton,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,Montreal PQ / Chesterville ON
4,1,0,Allison Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,1,2,113781,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON


## Dropping Unwanted Columns

In [4]:
colsToDrop = ['name','cabin','boat','body','home_dest','ticket']

In [5]:
df.drop(colsToDrop, inplace=True,axis=1)

df = df.drop(colsToDrop, axis=1)

In [6]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


## Changing the datatypes

In [7]:
df.dtypes

pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object

In [8]:
df['pclass'] = df['pclass'].astype('category')
#df.pclass = df.pclass.astype('category')

In [9]:
df['sex'] = df['sex'].astype('category')

In [10]:
df['embarked'] = df['embarked'].astype('category')

In [11]:
df['survived'] = df['survived'].astype('category')

In [12]:
df.dtypes

pclass      category
survived    category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

df.parch.value_counts()

df.sibsp.value_counts()

df['parch'] = df['parch'].map(lambda x: str(int(x)) if x <= 2 else '>2')

df['sibsp'] = df['sibsp'].map(lambda x: str(int(x)) if x <= 4 else '>4')

In [24]:
df.parch.value_counts()

0     1002
1      170
2      113
>2      24
Name: parch, dtype: int64

In [25]:
df.sibsp.value_counts()

0     891
1     319
2      42
4      22
3      20
>4     15
Name: sibsp, dtype: int64

In [26]:
df.dtypes

pclass      category
survived    category
sex         category
age          float64
sibsp         object
parch         object
fare         float64
embarked    category
dtype: object

## Handling Missing Information

In [13]:
df.isnull().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [28]:
df.sibsp.isnull().sum()

0

In [14]:
for col in df:
    if df[col].isnull().sum() > 0:
        if str(df[col].dtype) == 'category':
            df[col] = df[col].fillna(value = df[col].mode()[0])
        else:
            print(col)
            df[col] = df[col].fillna(value = df[col].mean())

age
fare


In [16]:
df.isnull().sum().sum()

0

## Seperating Target Column

In [17]:
y = df['survived']

In [25]:
X = df.drop(['survived'],axis = 1)

In [26]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1,female,29.0,0,0,211.3375,S
1,1,male,0.9167,1,2,151.55,S
2,1,female,2.0,1,2,151.55,S
3,1,male,30.0,1,2,151.55,S
4,1,female,25.0,1,2,151.55,S


In [34]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, int64): [0, 1]

In [27]:
X.dtypes

pclass      category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

## Converting Categorical to Numeric

In [28]:
X.dtypes

pclass      category
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

In [29]:
X = pd.get_dummies(X,drop_first=True)

In [30]:
X.head()

Unnamed: 0,age,sibsp,parch,fare,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S
0,29.0,0,0,211.3375,0,0,0,0,1
1,0.9167,1,2,151.55,0,0,1,0,1
2,2.0,1,2,151.55,0,0,0,0,1
3,30.0,1,2,151.55,0,0,1,0,1
4,25.0,1,2,151.55,0,0,0,0,1


## Train Test Split

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test  = train_test_split(X,y,test_size=0.3, random_state=123)

In [35]:
X.shape

(1309, 9)

In [36]:
X_train.shape

(916, 9)

In [40]:
X_test.shape

(393, 15)

## Model Building

from sklearn.neural_network import MLPClassifier

In [37]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.ensemble import AdaBoostClassifier

clf = MLPClassifier(hidden_layer_sizes=(15,7),activation='relu',learning_rate='adaptive', alpha=0.5,max_iter=1000)

In [50]:
clf= DecisionTreeClassifier(criterion='gini',max_depth=4)

In [51]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [52]:
preds = clf.predict(X_test)

In [53]:
preds

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0,

preds = pd.DataFrame(preds)

preds.columns = ['Pred_0','Pred_1']

preds.head()

In [48]:
from sklearn.metrics import classification_report,confusion_matrix

In [49]:
confusion_matrix(y_test,preds)

array([[201,  43],
       [ 39, 110]], dtype=int64)

In [54]:
print(classification_report(y_test,preds))

             precision    recall  f1-score   support

          0       0.83      0.89      0.86       244
          1       0.80      0.70      0.75       149

avg / total       0.82      0.82      0.82       393



In [37]:
clf = RandomForestClassifier()#n_estimators=10,max_depth=

In [57]:
clf = AdaBoostClassifier(n_estimators=50)

In [72]:
clf = GradientBoostingClassifier(n_estimators=1000,verbose=True,max_depth=4,min_samples_leaf=5)

In [58]:
clf.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

## Visualize the tree

In [55]:
with open("decisiontree.dot", 'w') as f:
    f = export_graphviz(clf, out_file=f,feature_names=X.columns.values,filled=True, rounded=True,special_characters=True,class_names=['0','1'], proportion=True)

type the following code in the command line

dot -Tpng decisiontree.dot -o outfile.png

## Measuring Model Performance

In [74]:
from sklearn.metrics import confusion_matrix,recall_score, precision_score

In [75]:
preds = clf.predict(X_test)

In [76]:
confusion_matrix(y_test,preds)

array([[203,  28],
       [ 55, 107]])

In [42]:
recall_score(y_test,preds,pos_label=1)

0.65432098765432101

In [43]:
clf.feature_importances_

array([ 0.24720297,  0.30288213,  0.01457821,  0.0645454 ,  0.24465364,
        0.02105507,  0.00767393,  0.007664  ,  0.00527658,  0.00492541,
        0.0228685 ,  0.0122918 ,  0.00964291,  0.01197513,  0.02276431])

In [44]:
dict(zip(X.columns.values,clf.feature_importances_))

{'age': 0.24720296873525655,
 'embarked_Q': 0.011975134406695235,
 'embarked_S': 0.02276430742300144,
 'fare': 0.30288213408199416,
 'parch_1': 0.02286850162938666,
 'parch_2': 0.012291797766073878,
 'parch_>2': 0.0096429145703076907,
 'pclass_2.0': 0.014578205986743786,
 'pclass_3.0': 0.06454540268792347,
 'sex_male': 0.24465363727830258,
 'sibsp_1': 0.021055072251611281,
 'sibsp_2': 0.0076739297009471089,
 'sibsp_3': 0.0076640025448835008,
 'sibsp_4': 0.0052765848600943143,
 'sibsp_>4': 0.0049254060767784285}

In [77]:
from sklearn.model_selection import cross_val_score

In [89]:
clf = RandomForestClassifier(n_estimators=10,min_samples_leaf=5,max_depth=4,class_weight='balanced')#n_estimators=10,max_depth=

In [90]:
cross_val_score(clf,X,y,cv=4,scoring='recall')

array([ 0.936,  0.792,  0.704,  0.248])