# Titanic Problem using a Gradient Boosting Classifier

Link to Problem: https://www.kaggle.com/c/titanic/overview


# Importing and Preprocessing Data

In [39]:
# For Data Cleaning, Preprocessing
import numpy as np
import pandas as pd

# For Training
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [58]:
# Read the Kaggle Titanic training and test splits from the local data folder.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [50]:
# Peek at the first five training rows to see the available columns.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Summary statistics for every training column (numeric and non-numeric alike);
# the `count` row reveals which columns contain missing values.
train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Doharr, Mr. Tannous",male,,,,CA. 2343,,G6,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [18]:
# Same summary for the test data, to compare missing-value counts with train.
test.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Duquemin, Mr. Joseph",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


# Visualization of Data

In [34]:
# Histogram of ticket fares in the training data (5 coarse bins).
# An explicit figure/axes pair is used so the plot can carry its own title and
# axis labels and still reads correctly when the notebook is only skimmed.
fig, ax = plt.subplots()
ax.hist(train['Fare'], bins=5)
ax.set(
    title='Fare distribution (training data)',
    xlabel='Fare',
    ylabel='Number of passengers',
);

<IPython.core.display.Javascript object>

In [33]:
# Histogram of passenger ages in the training data.
# Rows with a missing Age are excluded before plotting.
fig, ax = plt.subplots()
ax.hist(train['Age'].dropna())
ax.set(
    title='Age distribution (training data, missing ages excluded)',
    xlabel='Age',
    ylabel='Number of passengers',
);

<IPython.core.display.Javascript object>

## Preprocessing Basis

* The Field to be predicted is `Survived`.
* In the training data, a few values of `Embarked` are missing. While training, we drop those rows; while testing, any missing `Embarked` is imputed with the most frequent port of embarkation, if required.
* In both the train and test data, some values of `Age` are missing. We impute them with the `mean` Age of the training data.
* In the test data, one value of `Fare` is missing; we impute it with the `mode` (the most frequently occurring value) of `Fare` in the training data.

# Training

In [53]:
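# Hyperparameter search: GridSearchCV over GradientBoostingClassifier settings.
# NOTE(review): this cell is commented out, so the report shown below comes from
# an earlier run and is not reproduced by Restart & Run All.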

# meanAge = train['Age'].mean()
# train['Age'].fillna(meanAge, inplace = True) # Filling NaN values with mean Age
# train = train[train['Embarked'].notnull()] # Removing rows with no Embarked.

# COLUMNS = ['Pclass', 'Sex', 'Age', 'Parch', 'SibSp', 'Fare', 'Embarked']

# le = LabelEncoder()
# train['Sex'] = le.fit_transform(train['Sex'].values)
# train['Embarked'] = le.fit_transform(train['Embarked'].values)

# y = train['Survived']
# train = train[COLUMNS]

# X_train, X_test, y_train, y_test = train_test_split(train, y)
# grid_values = {'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'n_estimators': [1, 10, 50, 100, 1000], 'max_features' : ['auto', 'sqrt', 'log2']}

# clf = GridSearchCV(GradientBoostingClassifier(), param_grid = grid_values, cv = 5)
# clf.fit(X_train, y_train)
# report = classification_report(y_test, clf.predict(X_test))
# print(report)

             precision    recall  f1-score   support

          0       0.84      0.94      0.89       143
          1       0.87      0.69      0.77        80

avg / total       0.85      0.85      0.85       223



In [54]:
# clf.best_params_

{'learning_rate': 0.01, 'max_features': 'sqrt', 'n_estimators': 100}

In [59]:
# Train a GradientBoostingClassifier on the cleaned training data and report
# its precision/recall on a held-out split.

COLUMNS = ['Pclass', 'Sex', 'Age', 'Parch', 'SibSp', 'Fare', 'Embarked']
RANDOM_STATE = 46  # one seed for both the split and the model => reproducible report

# Imputation values, all computed from the raw training data.
# `mostOccuringFare` and `meanAge` are reused when the test data is preprocessed.
mostOccuringFare = train['Fare'].mode().iloc[0]
meanAge = train['Age'].mean()
maxEmbarked = train['Embarked'].value_counts().idxmax()

# Build a cleaned *copy* rather than mutating `train`:
#  * the cell can be re-run without reloading the CSV,
#  * column assignment on an explicit copy avoids SettingWithCopyWarning,
#  * `Series.fillna(..., inplace=True)` on `train['Age']` is chained assignment
#    and does not update the frame under pandas copy-on-write.
train_clean = train[train['Embarked'].notnull()].copy()  # drop rows with no Embarked
train_clean['Age'] = train_clean['Age'].fillna(meanAge)  # fill NaN ages with the mean age

# One encoder per column, so the fitted Sex mapping is not overwritten when
# Embarked is encoded.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
train_clean['Sex'] = sex_encoder.fit_transform(train_clean['Sex'].values)
train_clean['Embarked'] = embarked_encoder.fit_transform(train_clean['Embarked'].values)
# Backward compatibility: `le` previously ended this cell fitted on Embarked,
# and the testing cell expects the name to exist.
le = embarked_encoder

X = train_clean[COLUMNS]
y = train_clean['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# max_features='sqrt' is what 'auto' meant for the classifier; 'auto' itself was
# deprecated in scikit-learn 1.1 and removed in 1.3.
# random_state is needed because feature sub-sampling makes fitting stochastic.
# NOTE(review): the grid-search output above reported learning_rate=0.01 as
# best, while 0.1 is used here -- confirm which value is intended.
clf_ans = GradientBoostingClassifier(
    learning_rate=0.1,
    max_features='sqrt',
    n_estimators=100,
    random_state=RANDOM_STATE,
)
clf_ans.fit(X_train, y_train)

report = classification_report(y_test, clf_ans.predict(X_test))
print(report)

             precision    recall  f1-score   support

          0       0.76      0.91      0.83       123
          1       0.86      0.65      0.74       100

avg / total       0.80      0.79      0.79       223



# Testing

In [60]:
# Preprocess the test data with the values learned from the training data,
# then predict survival for every test passenger.
pIds = test['PassengerId']  # kept aside: needed for the submission file

# Explicit copy of the model's feature columns, so the assignments below modify
# a real frame rather than a slice (no SettingWithCopyWarning, and no reliance
# on chained `fillna(..., inplace=True)`, which is a no-op under copy-on-write).
# `test` is rebound on purpose: the following cells predict on `test` directly.
test = test[COLUMNS].copy()

# Impute missing values with statistics taken from the training data.
test['Fare'] = test['Fare'].fillna(mostOccuringFare)
test['Age'] = test['Age'].fillna(meanAge)
test['Embarked'] = test['Embarked'].fillna(maxEmbarked)

# Encode categoricals with the *training-time* codes instead of refitting an
# encoder on the test data (a refit only matches by luck, when both sets
# contain exactly the same labels).
# LabelEncoder numbers labels in sorted order, so training used female -> 0, male -> 1.
SEX_CODES = {'female': 0, 'male': 1}
sex_encoded = test['Sex'].map(SEX_CODES)
if sex_encoded.isnull().any():
    raise ValueError("Unexpected label in test['Sex']; expected 'female' or 'male'")
test['Sex'] = sex_encoded.astype(int)

# `le` is still fitted on the training `Embarked` column; transform() raises on
# labels that were never seen during training instead of silently re-coding them.
test['Embarked'] = le.transform(test['Embarked'].values)

clf_ans.predict(test)

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [61]:
# Assemble the submission table: one predicted `Survived` label per test PassengerId.
predicted_survival = clf_ans.predict(test)
res = pd.DataFrame({'PassengerId': pIds, 'Survived': predicted_survival})

In [106]:
# Preview the submission table (pandas abbreviates long frames when rendering).
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [62]:
# Write the submission file; the DataFrame index is left out so the CSV holds
# only the PassengerId and Survived columns.
res.to_csv(r'titanic_submission.csv', index=False)