## Titanic with feature engineering. 

In [1]:
# Imports
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV

Get titanic & test csv files as a DataFrame

In [2]:
# Read the training and test CSV files from the current working directory.
X_train, X_test_orig = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

Build the training set

In [3]:
# Detach the label column. pop() also removes it from X_train in place, so
# this cell cannot be re-run without first reloading the CSV files.
label_column = "Survived"
y_train = X_train.pop(label_column)

Combine train and test for feature engineering

In [4]:
# Stack the training rows and the test rows (training rows first) into a single
# frame so every feature-engineering step is applied to both sets identically.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same result, renumbering the rows 0..n-1.
data = pd.concat([X_train, X_test_orig], ignore_index=True)

In [5]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


To Do: How many feature columns are there? Hint: use info()

In [6]:
# Summarise the combined frame: number of rows, per-column non-null counts
# (which reveal the columns with missing values) and column dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


To Do: How many missing values are there in each column? Hint: use isnull() and sum()

Find more useful information to learn: get the titles of each passenger 

In [None]:
def _title_from_name(full_name):
    """Return the honorific found between the first comma and the next period."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0].strip()


# Names look like "Surname, Title. Given names", so the title is the text
# between the comma and the first period that follows it.
data['Title'] = data['Name'].apply(_title_from_name)
data.head()

To Do: Count the different values of a feature. Hint: use value_counts()

Create bins for the title values

In [None]:
# Raw titles grouped by the coarse bin they collapse into.
_titles_by_bin = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
    "Mrs": ["Mme", "Ms", "Mrs"],
    "Miss": ["Mlle", "Miss"],
    "Mr": ["Mr"],
    "Master": ["Master"],
}

# Invert the grouping into the raw-title -> bin lookup used for mapping.
bin_titles = {
    raw_title: bin_label
    for bin_label, raw_titles in _titles_by_bin.items()
    for raw_title in raw_titles
}

map the titles to title bins

In [None]:
# Collapse every raw title into its coarse bin; a title that is not a key of
# bin_titles becomes NaN.
data['Title'] = data['Title'].map(bin_titles)
data['Title'].value_counts()

Filling Missing Values for Age feature

In [None]:
# Bucket passengers by sex, ticket class and binned title so that age
# statistics can be computed per bucket.
group_keys = ['Sex', 'Pclass', 'Title']
grouped = data.groupby(group_keys)

view the median Age by the grouped features

In [None]:
# Median age within each (Sex, Pclass, Title) bucket.
grouped['Age'].median()

In [None]:
# Fill each missing Age with the median Age of the passengers that share the
# same Sex, Pclass and Title.
# transform('median') returns a Series aligned with data's own index. The
# previous GroupBy.apply(lambda ...) form prepends the group keys to the index
# on pandas >= 2.0, so its result can no longer be assigned back to data.Age.
age_group_median = grouped['Age'].transform('median')
data['Age'] = data['Age'].fillna(age_group_median)

Fill Embarked with the most frequent value

In [None]:
# value_counts() sorts by descending frequency, so the first index label is
# the most common embarkation port.
most_embarked = data['Embarked'].value_counts().index[0]

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the selection data["Embarked"] is chained assignment: pandas 2.x warns about
# it and, with copy-on-write enabled, the parent frame is left unchanged.
data['Embarked'] = data['Embarked'].fillna(most_embarked)

Fill missing Fare values with the mean of all fares

In [None]:
# Replace the missing Fare with the mean fare of all passengers.
# The result is assigned back to the frame rather than using
# fillna(inplace=True) on the selection data["Fare"], which is chained
# assignment: pandas 2.x warns about it and, with copy-on-write enabled, the
# parent frame is left unchanged.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())


To Do: Check null value again (and the Cabin feature will be dropped)

Apply 1-hot encoding for categorical feature Sex

In [None]:
# Replace the Sex column with one indicator column per category, each named
# with the "Sex" prefix.
# NOTE(review): Title and Embarked are still string-valued after this cell;
# they presumably need encoding before the model-fitting cells will accept them.
data = pd.get_dummies(
    data,
    prefix=['Sex'],
    columns=['Sex'],
)

To Do: apply 1-hot encoding for the other categorical features

In [None]:
# Preview the first rows after encoding instead of rendering the whole frame,
# matching the other preview cells in this notebook.
data.head()

Add new synthetic feature of Family Size

In [None]:
# Family size = the passenger (1) + siblings/spouses + parents/children aboard.
data['Family_Size'] = 1 + data['SibSp'] + data['Parch']

Add new has_cabin feature (yes or no) 

In [None]:
# Boolean flag: True where a cabin value is recorded, False where it is missing.
data['Has_Cabin'] = data['Cabin'].notnull()

Binning numerical columns

In [None]:
# Bin Age into quartiles; labels=False stores the integer bin code (0-3)
# rather than the interval itself.
data['CatAge'] = pd.qcut(data['Age'], 4, labels=False)

Drop un-used features

In [None]:
# Remove the raw columns that have been replaced by engineered features
# (Parch/SibSp -> Family_Size, Cabin -> Has_Cabin, Age -> CatAge) along with
# the identifier-like columns that are not used as features.
unused_columns = ['Parch', 'SibSp', 'PassengerId', 'Name', 'Ticket', 'Cabin', 'Age']
data = data.drop(columns=unused_columns)

In [None]:
# Preview the first rows of the final feature frame instead of rendering the
# whole frame, matching the other preview cells in this notebook.
data.head()

Rebuild the training set and test set

In [None]:
# The training rows were placed first when the two sets were combined, and
# y_train holds one label per training row, so the first len(y_train) rows of
# data are the training set and the remainder is the test set. Deriving the
# boundary from y_train avoids the hard-coded row count 891.
n_train = len(y_train)
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]

## Build Classification Model 

### Use a simple Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

split out our own test dataset from the training set.

In [None]:
# Hold out the tail of the labelled rows as a local test set: rows before
# holdout_start are used for fitting, the remaining labelled rows for
# evaluation. The upper bound comes from len(y_train) instead of the
# hard-coded 891, so the features and labels always cover the same rows.
holdout_start = 800
n_labelled = len(y_train)

X_train_small = data.iloc[:holdout_start]
X_test_small = data.iloc[holdout_start:n_labelled]
y_train_small = y_train.iloc[:holdout_start]
y_test_small = y_train.iloc[holdout_start:n_labelled]

In [None]:
# Baseline decision tree with default hyper-parameters and a fixed seed,
# fitted on the reduced training split.
dt1 = DecisionTreeClassifier(random_state=42)
dt1.fit(X=X_train_small, y=y_train_small)

In [None]:
# 3-fold cross-validated accuracy of the baseline tree on the reduced training split.
cross_val_score(
    estimator=dt1,
    X=X_train_small,
    y=y_train_small,
    scoring="accuracy",
    cv=3,
)

In [None]:
from sklearn.metrics import roc_curve, auc

# Hard class labels for the hold-out rows (kept for inspection).
y_pred_small = dt1.predict(X_test_small)

# roc_curve needs a continuous score to sweep thresholds over. Feeding it the
# hard 0/1 labels collapses the curve to a single operating point, so use the
# predicted probability of the positive class (column 1) instead.
y_score_small = dt1.predict_proba(X_test_small)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_small, y_score_small)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

Fine tune hyper-parameters with GridSearchCV

In [None]:
# Search space for the decision tree: candidate depths double from 1 to 64,
# candidate minimum leaf sizes run from 1 to 6.
depth_candidates = [2 ** exponent for exponent in range(7)]
leaf_size_candidates = list(range(1, 7))

dt_params = [
    {'max_depth': depth_candidates, 'min_samples_leaf': leaf_size_candidates},
]

In [None]:
# Exhaustively evaluate every combination in dt_params with 4-fold
# cross-validation on the reduced training split.
dt_cv = GridSearchCV(dt1, dt_params, cv=4)
dt_cv.fit(X=X_train_small, y=y_train_small)

In [None]:
# 3-fold cross-validated accuracy of the tuned tree on the reduced training split.
best_tree = dt_cv.best_estimator_
cross_val_score(best_tree, X_train_small, y_train_small, scoring="accuracy", cv=3)

In [None]:
# Report the estimator that the grid search selected.
best_estimator_report = "Optimal params: {}".format(dt_cv.best_estimator_)
print(best_estimator_report)

In [None]:
# roc_curve and auc were already imported by the baseline AUC cell, so the
# duplicate import is dropped here.

# Hard class labels from the tuned tree for the hold-out rows (kept for inspection).
y_pred_small = dt_cv.best_estimator_.predict(X_test_small)

# roc_curve needs a continuous score to sweep thresholds over. Feeding it the
# hard 0/1 labels collapses the curve to a single operating point, so use the
# predicted probability of the positive class (column 1) instead.
y_score_small = dt_cv.best_estimator_.predict_proba(X_test_small)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_small, y_score_small)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

To Do: Experiment with scaled data. Scale the test data and use decision tree classifier to see if the results in terms of AUC improve. 

To Do: Continue with the last exercise and use GridSearchCV to find the best parameters for the decision tree classifier

### Use a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, fitted on the reduced training split.
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_clf.fit(X=X_train_small, y=y_train_small)

In [None]:
# Hard class labels from the random forest for the hold-out rows (kept for inspection).
y_pred_small = rnd_clf.predict(X_test_small)

# roc_curve needs a continuous score to sweep thresholds over. Feeding it the
# hard 0/1 labels collapses the curve to a single operating point, so use the
# predicted probability of the positive class (column 1) instead.
y_score_small = rnd_clf.predict_proba(X_test_small)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_small, y_score_small)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

Prepare prediction results for submission to Kaggle

In [None]:
# Predict survival for the test passengers with the fitted random forest.
y_pred = rnd_clf.predict(X_test)

# Pair each prediction with the passenger id from the untouched test file and
# write the two-column result without the row index.
submission_columns = {
    "PassengerId": X_test_orig["PassengerId"],
    "Survived": y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('titanic.csv', index=False)