#  Class Competition

# Who survived the sinking of the Titanic?

The goal of this HW is to predict who survived the Titanic sinking in 1912.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Titanic_0.csv")

In [3]:
df.shape

(713, 12)

In [4]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            141
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          549
Embarked         1
dtype: int64

In [5]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Data set description

<ul>
<li><b>Survived</b>: binary attribute that indicates whether the passenger survived. This is the dependent variable that we will attempt to explain
<li><b>Pclass</b>: Ticket class (1 = 1st class, 2 = 2nd class, 3 = 3rd class)
<li><b>Age</b>: Passenger age
<li><b>SibSp</b>: The amout of the passenger's siblings/spouses aboard the Titanic
<li><b>Parch</b>: The amout of the passenger's parents/children aboard the Titanic
<li><b>Fare</b>: The ticket fare
<li><b>Male</b>: binary attibute that indicates the gender (1=Male, 0=Female)
<li><b>Embarked_C</b>: binary attibute that indicates whether the passenger embarked in Cherbourg
<li><b>Embarked_Q</b>: binary attibute that indicates whether the passenger embarked in Queenstown
<li><b>Embarked_S</b>: binary attibute that indicates whether the passenger embarked in Southampton
</ul>

## Instruction

Cleaning the data set if necessary. 

Use everything you know to find a machine learning model to achieve the highest possible AUC score. Two testing sets have been reserved: TestA.csv and TestB.csv. Your homework will be evaluated using these two sets. 75% of the grade will be based on the AUC score on TestA.csv. 25% of the grade will be based on the ranking of the AUC score on TestB.csv among the six groups. To be specific, your grade on TestA.csv will be equal to the final AUC score multiplied by 75, and your grade on TestB.csv will be equal to 5 * (6 - your ranking). You must submit the same model for both sets with clear explanation of your codes. You must include the codes to evaluate your model on TestA.csv and TestB.csv. Failure to do so will result in 20% loss of grades (10% for each test). 

TestB.csv is private, which means you will never see it. The ranking will be revealed only after next Wed deadline. TestA.csv is semi-private. This means that you have at most one chance everyday for me to check your model performance on TestA.csv using your code, and I will let you know the AUC score and post your score on the discussion board. I will save your notebook file in the same folder with the data files. If your code does not work on my computer, you lose the opportunity on the same day. 

In [6]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            141
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          549
Embarked         1
dtype: int64

In [7]:
'Mr.' in df.Name[0]

True

In [8]:
df['Title'] = df.Name.apply(lambda x: 'Mr' if 'Mr.' in x\
                            else 'Mrs' if 'Mrs.' in x\
                            else 'Miss' if 'Miss.' in x\
                            else 'Master' if 'Master.' in x\
                            else 'Miss' if 'Ms.' in x\
                            else 'Miss' if 'Mlle.' in x\
                            else 'Master' if 'Master.' in x\
                            else 'Mrs' if 'Mme' in x\
                            else 'Others'
                           )

In [9]:
df.groupby('Title').Title.count()

Title
Master     31
Miss      146
Mr        418
Mrs       101
Others     17
Name: Title, dtype: int64

In [10]:
df['Embarked'].describe()

count     712
unique      3
top         S
freq      517
Name: Embarked, dtype: object

In [11]:
df.Embarked.fillna('S', inplace = True) 

In [12]:
df = pd.get_dummies(df,columns=['Embarked'])
df = pd.get_dummies(df,columns=['Pclass'])
df = pd.get_dummies(df,columns=['Title'])

In [13]:
df['Male'] = df.Sex.apply(lambda x: 1 if x == 'male' else 0)

In [14]:
df['Relative'] = df.SibSp+df.Parch
df.groupby('Relative').Survived.mean().plot()
df['Relative_dummy'] = df.Relative.apply(lambda x: 1 if (x==1) | (x==2) | (x==3) else 0)

In [15]:
age_mean = df.Age.mean()
df.Age.fillna(age_mean, inplace = True)

In [16]:
df.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Others,Male,Relative,Relative_dummy
count,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0,713.0
mean,357.0,0.382889,29.811486,0.513324,0.395512,31.98927,0.182328,0.091164,0.726508,0.244039,0.215989,0.539972,0.043478,0.204769,0.586255,0.141655,0.023843,0.650771,0.908836,0.326788
std,205.969658,0.486433,13.01029,1.075861,0.843403,48.878417,0.386386,0.288044,0.446064,0.429818,0.411795,0.49875,0.204074,0.403816,0.49285,0.348941,0.152667,0.477061,1.610654,0.469368
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179.0,0.0,22.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,357.0,0.0,29.811486,0.0,0.0,14.4583,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,535.0,1.0,35.0,1.0,0.0,31.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
max,713.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,1.0


In [17]:
df['Fare'] = df.Fare/(1+df.SibSp+df.Parch)
df['Fare_Class'] = 0
df.loc[ df.Fare<=7.91, 'Fare_Class'] = 0
df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare_Class'] = 1
df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare_Class'] = 2
df.loc[(df['Fare'] > 31) & (df['Fare'] <= 99), 'Fare_Class'] = 3
df.loc[(df['Fare'] > 99) & (df['Fare'] <= 250), 'Fare_Class'] = 4
df.loc[df['Fare'] > 250, 'Fare_Class'] = 5

In [18]:
df['Age_Class'] = 0
df.loc[ df['Age'] <= 11, 'Age_Class'] = 0
df.loc[(df['Age'] > 11) & (df['Age'] <= 18), 'Age_Class'] = 1
df.loc[(df['Age'] > 18) & (df['Age'] <= 22), 'Age_Class'] = 2
df.loc[(df['Age'] > 22) & (df['Age'] <= 27), 'Age_Class'] = 3
df.loc[(df['Age'] > 27) & (df['Age'] <= 33), 'Age_Class'] = 4
df.loc[(df['Age'] > 33) & (df['Age'] <= 40), 'Age_Class'] = 5
df.loc[(df['Age'] > 40) & (df['Age'] <= 66), 'Age_Class'] = 6
df.loc[df['Age'] > 66, 'Age_Class'] = 7

In [19]:
df[df.Fare_Class == 4]

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Others,Male,Relative,Relative_dummy,Fare_Class,Age_Class
95,96,0,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,123.7604,B58 B60,...,0,0,1,0,0,1,1,1,4,3
156,157,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,...,0,1,0,0,0,0,0,0,4,6
245,246,1,"Fleming, Miss. Margaret",female,29.811486,0,0,17421,110.8833,,...,0,1,0,0,0,0,0,0,4,4
260,261,1,"Young, Miss. Marie Grice",female,36.0,0,0,PC 17760,135.6333,C32,...,0,1,0,0,0,0,0,0,4,5
270,271,1,"Burns, Miss. Elizabeth Margaret",female,41.0,0,0,16966,134.5,E40,...,0,1,0,0,0,0,0,0,4,6
299,300,0,"Ringhini, Mr. Sante",male,22.0,0,0,PC 17760,135.6333,,...,0,0,1,0,0,1,0,0,4,2
304,305,1,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,...,0,1,0,0,0,0,0,0,4,6
422,423,0,"Farthing, Mr. John",male,29.811486,0,0,PC 17483,221.7792,C95,...,0,0,1,0,0,1,0,0,4,4
430,431,1,"LeRoy, Miss. Bertha",female,30.0,0,0,PC 17761,106.425,,...,0,1,0,0,0,0,0,0,4,4
446,447,0,"Robbins, Mr. Victor",male,29.811486,0,0,PC 17757,227.525,,...,0,0,1,0,0,1,0,0,4,4


In [20]:
X= df[['Age_Class','SibSp','Parch','Fare_Class','Male','Relative_dummy','Embarked_C','Embarked_Q','Embarked_S',\
       'Pclass_1','Pclass_2','Pclass_3','Title_Mr','Title_Mrs','Title_Master','Title_Miss','Title_Mr','Title_Others']]
y= df[['Survived']]

In [21]:
%matplotlib inline
import matplotlib.pyplot as plt

In [22]:
X = np.matrix(X.values)
y = np.array(y.values).flatten()

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler 
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier


#pipe = make_pipeline(PolynomialFeatures(degree = 1), StandardScaler(),\
#                         GradientBoostingClassifier())


pipe = make_pipeline(RobustScaler(),\
                       GradientBoostingClassifier())

param_grid = {'gradientboostingclassifier__loss': ['deviance'], #'exponential'],
             'gradientboostingclassifier__learning_rate' : [0.0008, 0.001, 0.0012],
             'gradientboostingclassifier__n_estimators' : [1600, 1800, 1900]}

gridGB = GridSearchCV(pipe, param_grid = param_grid, cv = 6, n_jobs = -1, return_train_score = True, scoring='roc_auc')
gridGB.fit(X, y)

GridSearchCV(cv=6, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('robustscaler',
                                        RobustScaler(copy=True,
                                                     quantile_range=(25.0,
                                                                     75.0),
                                                     with_centering=True,
                                                     with_scaling=True)),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier(ccp_alpha=0.0,
                                                                   criterion='friedman_mse',
                                                                   init=None,
                                                                   learning_rate=0.1,
                                                                   loss='deviance',
       

In [24]:
df_grid = pd.DataFrame(gridGB.cv_results_)
df_grid.sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gradientboostingclassifier__learning_rate,param_gradientboostingclassifier__loss,param_gradientboostingclassifier__n_estimators,params,split0_test_score,split1_test_score,...,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,mean_train_score,std_train_score
4,2.484683,0.076864,0.007979,0.001411,0.001,deviance,1800,{'gradientboostingclassifier__learning_rate': ...,0.82027,0.828829,...,0.038526,1,0.910052,0.909974,0.898714,0.902339,0.891686,0.894665,0.901239,0.007024
5,3.007826,0.194924,0.010392,0.005454,0.001,deviance,1900,{'gradientboostingclassifier__learning_rate': ...,0.821772,0.827928,...,0.038314,2,0.912598,0.910717,0.898636,0.903402,0.892941,0.894868,0.902194,0.007464
6,2.992971,0.159618,0.008976,0.000576,0.0012,deviance,1600,{'gradientboostingclassifier__learning_rate': ...,0.820571,0.827928,...,0.0386,3,0.912922,0.910885,0.898588,0.903978,0.892941,0.894904,0.90237,0.007587
2,2.913942,0.157591,0.007148,0.00134,0.0008,deviance,1900,{'gradientboostingclassifier__learning_rate': ...,0.818769,0.828529,...,0.039258,4,0.909291,0.908464,0.897472,0.901265,0.891662,0.8939,0.900342,0.006733
3,2.463294,0.162595,0.006483,0.000499,0.001,deviance,1600,{'gradientboostingclassifier__learning_rate': ...,0.819069,0.828829,...,0.039053,5,0.909387,0.909069,0.89839,0.901337,0.891878,0.893936,0.900666,0.006768


In [27]:
df1 = pd.read_csv("TestA.csv")
df1.Embarked.fillna('S', inplace = True) 
df1 = pd.get_dummies(df1,columns=['Embarked'])
df1 = pd.get_dummies(df1,columns=['Pclass'])
df1['Male'] = df1.Sex.apply(lambda x: 1 if x == 'male' else 0)
df1['Relative'] = df1.SibSp+df.Parch
df1['Relative_dummy'] = df1.Relative.apply(lambda x: 1 if (x==1) | (x==2) | (x==3) else 0)
age_mean = df1.Age.mean()
df1.Age.fillna(age_mean, inplace = True)

df1['Fare'] = df1.Fare/(1+df1.SibSp+df1.Parch)
df1['Fare_Class'] = 0
df1.loc[ df1.Fare<=7.91, 'Fare_Class'] = 0
df1.loc[(df1['Fare'] > 7.91) & (df1['Fare'] <= 14.454), 'Fare_Class'] = 1
df1.loc[(df1['Fare'] > 14.454) & (df1['Fare'] <= 31), 'Fare_Class'] = 2
df1.loc[(df1['Fare'] > 31) & (df1['Fare'] <= 99), 'Fare_Class'] = 3
df1.loc[(df1['Fare'] > 99) & (df1['Fare'] <= 250), 'Fare_Class'] = 4
df1.loc[df1['Fare'] > 250, 'Fare_Class'] = 5

df1['Age_Class'] = 0
df1.loc[ df1['Age'] <= 11, 'Age_Class'] = 0
df1.loc[(df1['Age'] > 11) & (df1['Age'] <= 18), 'Age_Class'] = 1
df1.loc[(df1['Age'] > 18) & (df1['Age'] <= 22), 'Age_Class'] = 2
df1.loc[(df1['Age'] > 22) & (df1['Age'] <= 27), 'Age_Class'] = 3
df1.loc[(df1['Age'] > 27) & (df1['Age'] <= 33), 'Age_Class'] = 4
df1.loc[(df1['Age'] > 33) & (df1['Age'] <= 40), 'Age_Class'] = 5
df1.loc[(df1['Age'] > 40) & (df1['Age'] <= 66), 'Age_Class'] = 6
df1.loc[df['Age'] > 66, 'Age_Class'] = 7

df1['Title'] = df1.Name.apply(lambda x: 'Mr' if 'Mr.' in x\
                            else 'Mrs' if 'Mrs.' in x\
                            else 'Miss' if 'Miss.' in x\
                            else 'Master' if 'Master.' in x\
                            else 'Miss' if 'Ms.' in x\
                            else 'Miss' if 'Mlle.' in x\
                            else 'Master' if 'Master.' in x\
                            else 'Mrs' if 'Mme' in x\
                            else 'Others'
                           )
df1 = pd.get_dummies(df1,columns=['Title'])

X1= df1[['Age_Class','SibSp','Parch','Fare_Class','Male','Relative_dummy','Embarked_C','Embarked_Q','Embarked_S',\
       'Pclass_1','Pclass_2','Pclass_3','Title_Mr','Title_Mrs','Title_Master','Title_Miss','Title_Mr','Title_Others']]
y1= df1[['Survived']]

X1 = np.matrix(X1.values)
y1 = np.array(y1.values).flatten()

In [28]:
accuracy = gridGB.score(X1, y1)
print('TestA AUC score is {:.4f}'.format(accuracy))

TestA AUC score is 0.8994
