In [3]:
dataFile = "titanic.csv"

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv(dataFile)

In [6]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
data = data[["Name","Age","Pclass","Survived"]]

In [9]:
data.head()

Unnamed: 0,Name,Age,Pclass,Survived
0,"Braund, Mr. Owen Harris",22.0,3,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,1
2,"Heikkinen, Miss. Laina",26.0,3,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,1
4,"Allen, Mr. William Henry",35.0,3,0


In [10]:
def checkAdult(age):
    if age>=18:
        return "Adult"
    return "Child"

data["Adult/Child"] = data["Age"].apply(checkAdult)
data.head()


Unnamed: 0,Name,Age,Pclass,Survived,Adult/Child
0,"Braund, Mr. Owen Harris",22.0,3,0,Adult
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,1,Adult
2,"Heikkinen, Miss. Laina",26.0,3,1,Adult
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,1,Adult
4,"Allen, Mr. William Henry",35.0,3,0,Adult


In [11]:
def checkGender(name):
    firstName = name[name.index(",")+2:]
    salutaion = firstName.split(" ")[0]
    if salutaion in ["Mr.", "Master"]:
        return "Male"
    return "Female"

data["Gender"] = data["Name"].apply(checkGender)
data.head()

Unnamed: 0,Name,Age,Pclass,Survived,Adult/Child,Gender
0,"Braund, Mr. Owen Harris",22.0,3,0,Adult,Male
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,1,Adult,Female
2,"Heikkinen, Miss. Laina",26.0,3,1,Adult,Female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,1,Adult,Female
4,"Allen, Mr. William Henry",35.0,3,0,Adult,Male


In [12]:
def compare(group, data):
    return data.groupby([group])["Survived"].sum()*100/data.groupby([group])["Survived"].count()

In [13]:
compare("Pclass", data) # important factor as the difference is high between the classes

Pclass
1    62.962963
2    47.282609
3    24.236253
Name: Survived, dtype: float64

In [14]:
compare("Adult/Child", data) # not so important

Adult/Child
Adult    38.103161
Child    38.965517
Name: Survived, dtype: float64

In [15]:
compare("Gender", data) # important

Gender
Female    69.786096
Male      15.667311
Name: Survived, dtype: float64

In [16]:
# Gender > Pclass > Gender

In [17]:
trainingData = data[["Age","Pclass", "Adult/Child","Gender", "Survived"]]
trainingData.head()

Unnamed: 0,Age,Pclass,Adult/Child,Gender,Survived
0,22.0,3,Adult,Male,0
1,38.0,1,Adult,Female,1
2,26.0,3,Adult,Female,1
3,35.0,1,Adult,Female,1
4,35.0,3,Adult,Male,0


In [18]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

In [19]:
catData = trainingData[["Pclass","Adult/Child","Gender"]].apply(catToNum)
trainingData[["Pclass","Adult/Child","Gender"]] = catData
trainingData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Age,Pclass,Adult/Child,Gender,Survived
0,22.0,2,0,1,0
1,38.0,0,0,0,1
2,26.0,2,0,0,1
3,35.0,0,0,0,1
4,35.0,2,0,1,0


In [20]:
len(trainingData)

891

In [21]:
trainingData = trainingData.dropna()
len(trainingData)

714

In [22]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(trainingData, test_size=0.2)

In [23]:
train.head()

Unnamed: 0,Age,Pclass,Adult/Child,Gender,Survived
546,19.0,1,0,0,1
172,1.0,2,1,0,1
786,18.0,2,0,0,1
642,2.0,2,1,0,0
244,30.0,2,0,1,0


In [24]:
test.head()

Unnamed: 0,Age,Pclass,Adult/Child,Gender,Survived
129,45.0,2,0,1,0
545,64.0,0,0,1,0
309,30.0,0,0,0,1
369,24.0,0,0,0,1
784,25.0,2,0,1,0


In [25]:
from sklearn.tree import DecisionTreeClassifier

# class_weights 
# criterion = "gini"/ information gain
# max_depth
# max_features -> it will pick the top most important ones
# max_leaf_nodes -> will simplify the tree if we limit the max number of leaves
# min_impurity_split -> min % impurity in the subset for further splitting
# min_sample_leaf 

clf = DecisionTreeClassifier(max_leaf_nodes=15).fit(train[["Age","Pclass","Gender","Adult/Child"]], train["Survived"])

In [26]:
# Visualizing the Tree
clf.feature_importances_

array([0.13418597, 0.24589991, 0.60724384, 0.01267029])

In [27]:
from sklearn import tree
with open("titanic.dot", "w") as f:
    f = tree.export_graphviz(clf, feature_names=["Age","Pclass","Adult/Child","Gender"], out_file=f)

In [28]:
test_predict = clf.predict(test[["Age","Pclass","Gender","Adult/Child"]])

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(test["Survived"], test_predict)

0.7902097902097902

In [30]:
# Default: 0.74125 -> Overfitting
# max_leaf_nodes =20 : 0.7552447552447552
# ............... = 5: 0.7552447552447552 -> underfitting
# 15: 0.7692307692307693

# Random Forests

In [42]:

from sklearn.ensemble import RandomForestClassifier
# n_estimators =10 -> number of decision trees that will be created
clf = RandomForestClassifier()

In [43]:
def checkAccuracy(clf):
    clf = clf.fit(train[["Age","Pclass","Gender","Adult/Child"]], train["Survived"])
    predictions = clf.predict(test[["Age","Pclass","Gender","Adult/Child"]])
    return accuracy_score(test["Survived"], predictions)

In [44]:
checkAccuracy(clf)

0.7692307692307693

In [None]:
# Default params: 0.7902097902097902
# Every problem, the params will show a different behaviour
# Tweak and test on multiple test sets to avoid overfitting
# Use the one that performs well on all

# Gradient Boosted Trees

In [45]:
from xgboost.sklearn import XGBClassifier

In [49]:
clf = XGBClassifier()
# n_estimators = 100 -> increase if the dataset is large and decrease otherwise
# learning_rate = 0.1 -> factor to reduce overfitting-> if we keep it low we will prevent the imporovement sparking from one tree to another hence reducing overfitting
# subsample = 1-> fraction of training set to be used for each tree.
                # -> lower the value[0.5-1], the more distinct the fraction will be and prevent overfitting - > too low may lead to underfitting 
# colsample_btree =1 -> fraction of features used for each tree
# gamme: controls the complexity, min reduction in error to split the tree further


In [50]:
checkAccuracy(clf)



0.7832167832167832

In [None]:
# default: 0.7832167832167832

# Hyperparameter Tuning

In [51]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [52]:
# let's create a space to search in
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000,1), # Ranges
    'learning_rate': hp.quniform('learning_rate', 0.025, 0.5,0.025),
    'max_depth': hp.quniform('max_depth', 1, 13,1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6,1),
    'subsample': hp.quniform('subsample', 0.5, 1,0.05),
    'gamma': hp.quniform('gamma',  0.5, 1,0.05),
    'colsample_bytree': hp.quniform('colsample_bytree',  0.5, 1,0.05),
    'nthread': 6, # introducing parallelization into ensemble learning process to speed up the process to run a large number of trials
    'silent':1 
}

In [57]:
def scores(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    clf = XGBClassifier(**params)
    return {'loss': 1-checkAccuracy(clf), 'status':STATUS_OK}

In [58]:
trials = Trials()

In [59]:
best = fmin(scores, space, algo=tpe.suggest, trials = trials, max_evals=250)


Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed d

In [60]:
print(best)

{'colsample_bytree': 0.75, 'gamma': 0.9500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 9.0, 'min_child_weight': 1.0, 'n_estimators': 192.0, 'subsample': 0.8}


In [61]:
print(1-scores(best)['loss'])

0.8251748251748252
