In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the data and some data cleaning

In [2]:
# Load the train and test splits and stack them so that cleaning and
# feature engineering are applied to both in one pass.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# the test rows reuse the labels 0..len(test)-1 that already belong to train
# rows, and any later index-aligned column assignment silently copies train
# values into the test rows.
all_data = pd.concat([train, test], axis=0, ignore_index=True)
# Short preview of the combined frame (head(-10) would return everything
# except the last ten rows).
all_data.head(10)

In [3]:
# Number of rows in the training split
train.shape[0]

Check the missing values of the combined (train + test) data set.

In [4]:
# Count of missing entries in every column of the combined frame
all_data.isna().sum()

In [5]:
# Label passengers without a recorded cabin as 'No Cabin',
# then tally how often each cabin value occurs.
all_data['Cabin'] = all_data['Cabin'].fillna(value='No Cabin')
all_data['Cabin'].value_counts()

I am very curious about which cabin has the highest survival rate, so I calculate the survival rates of each cabin and rank them from high to low.

In [6]:
# Mean of 'Survived' per cabin in the training data, shown with the
# highest survival rate first.
survive_by_cabin = train.groupby('Cabin')[['Survived']].mean()
survive_by_cabin.sort_values(by='Survived', ascending=False)

In [7]:
# First few cabins whose survival rate is exactly 1 (everyone survived)
everyone_survived = survive_by_cabin['Survived'] == 1
survive_by_cabin.loc[everyone_survived].head()

We can see that some passengers have multiple cabins. However, those cabins share the same initial character, so we can extract that character as the "Cabin" information. Otherwise, creating dummy variables for "Cabin" would result in a very sparse data matrix.

In [8]:
# Reduce every cabin entry to the first character of its first
# space-separated token (e.g. 'C23 C25 C27' -> 'C', 'No Cabin' -> 'N').
all_data['Cabin'] = all_data['Cabin'].apply(lambda cabin: cabin.split(' ')[0][0])
# Inspect the reduced column on all_data; train.Cabin is not touched by this
# step, so displaying it would not reflect the transformation.
all_data['Cabin'].value_counts()

In this part I just want to make the passengers' names look cleaner, so I split each name into prefix, first name and last name.

In [9]:
# Parse 'Name' into its parts:
#   Last Name  - text before the first comma
#   Prefix     - text between the first comma and the following period
#   First Name - first whitespace-separated word after the first period
full_names = all_data['Name']
all_data['Last Name'] = [name.split(',')[0].strip() for name in full_names]
all_data['Prefix'] = [name.split(',')[1].split('.')[0].strip() for name in full_names]
all_data['First Name'] = [name.split('.')[1].split()[0].strip() for name in full_names]

In [10]:
# Frequency of each extracted title
all_data['Prefix'].value_counts()

At this point we can drop Name column and also Ticket since we already have Pclass and Cabin.

In [11]:
# Remove the 'Name' and 'Ticket' columns from the combined frame
all_data = all_data.drop(columns=['Name', 'Ticket'])

For data integration purposes, I use "Family" to represent the total number of family members travelling with a passenger, which equals his/her siblings/spouses plus parents/children.

In [12]:
# Family size = siblings/spouses aboard + parents/children aboard
all_data['Family'] = all_data['SibSp'].add(all_data['Parch'])

Check missing values for other features

In [13]:
# Names of the columns that still contain at least one missing value
has_missing = all_data.isnull().any()
all_data.columns[has_missing.to_numpy()]

Impute the mode of "Embarked" for the missing "Embarked" values and the median of "Fare" for the missing "Fare" values. We leave "Age" to be filled in later, since its imputation is trickier.

In [14]:
# Fill missing ports of embarkation with the most frequent port.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `all_data.Embarked` operates on an intermediate Series and is not guaranteed
# to update all_data (it does not under pandas Copy-on-Write).
most_common_port = all_data['Embarked'].value_counts().idxmax()
all_data['Embarked'] = all_data['Embarked'].fillna(most_common_port)

In [15]:
# Fill missing fares with the median fare of the combined data.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `all_data.Fare` operates on an intermediate Series and is not guaranteed
# to update all_data (it does not under pandas Copy-on-Write).
median_fare = all_data['Fare'].median()
all_data['Fare'] = all_data['Fare'].fillna(median_fare)

Again, check the missing values

In [16]:
# Re-count the missing entries per column after the imputations above
all_data.isna().sum()

Good! There are no more missing values in the features other than 'Age', so let us have a look at the cleaned dataset.

# Exploratory Data Analysis

First, I want to see if there is a different age distribution between survivors and non-survivors. 

In [17]:
# Stacked histogram of Age, coloured by survival, for the first 891 rows
sns.displot(data=all_data.iloc[:891], x='Age', hue='Survived', multiple='stack')

One can see from the histogram above that there are no huge differences between the two groups (judging by eye). For children (0-10), however, the survival rate is much higher than for the other age groups (almost 50%).

For distribution of fare, we can see it is not normal (right-skewed), therefore, we need to do a bit of transformation.

In [18]:
# Stacked histogram of Fare, coloured by survival, for the first 891 rows
sns.displot(data=all_data.iloc[:891], x='Fare', hue='Survived', multiple='stack')

Then we should look at gender differences.

In [19]:
# Stacked counts per Sex, coloured by survival, for the first 891 rows
sns.displot(data=all_data.iloc[:891], x='Sex', hue='Survived', multiple='stack')

It is clear that females have a higher survival rate than males. We should admire those people who give the chance of survival to children and women!

Let's see if you will have a higher chance to survive if you have more family members.

In [20]:
# Stacked histogram of Family size, coloured by survival, for the first 891 rows
sns.displot(data=all_data.iloc[:891], x="Family", hue='Survived', multiple='stack')

In [21]:
# Box plot of Family size for each Survived value, first 891 rows
sns.catplot(data=all_data.iloc[:891], x="Survived", y="Family", kind='box')

In the survivor group, passengers are more likely to survive if they are alone or have a relatively small family (2-3 members). Most non-survivors are alone or have only one family member, so they may have given their chance of survival to the other one.

In [22]:
# Stacked counts per port of embarkation, coloured by survival, first 891 rows
sns.displot(data=all_data.iloc[:891], x='Embarked', hue='Survived', multiple='stack')

Seems like people embarked from Cherbourg or Queenstown are more likely to be saved.

In [23]:
# Stacked counts per ticket class, coloured by survival, first 891 rows
sns.displot(data=all_data.iloc[:891], x='Pclass', hue='Survived', multiple='stack')

It is sad to see that lower-class passengers have a lower chance of survival.

In [24]:
# Correlation heatmap of the numeric columns for the first 891 rows.
# iloc[:, 1:] skips the first column; select_dtypes keeps only numeric columns,
# because DataFrame.corr() raises on string columns in pandas >= 2.0.
train_numeric = all_data[0:891].iloc[:, 1:].select_dtypes(include='number')
train_corr = train_numeric.corr()
# Diverging palette centred on zero so the sign of the correlation is visible
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(train_corr, vmax=.3, center=0, cmap=cmap)

It can be observed from the heatmap above that age and ticket class are negatively correlated with survival, while the family-related factors and fare are positively correlated with survival.

# Feature Engineering

One thing to do is to fill in the missing values for "Age"; the other is to create dummy variables for some of the categorical variables.

## Missing value imputation

For missing values there are different imputation methods, such as the mean or median for numerical variables and the mode for categorical variables (as I did for "Embarked" in the beginning). Here, I think we can go a step further: rather than just imputing the mean or median for age, we can take family information into consideration. For example, a child is more likely to have parents on board, no spouse, and a few siblings. So here I choose to use KNN to impute age.

In [25]:
from sklearn.impute import KNNImputer

# Impute missing ages from the 10 nearest neighbours in (Age, SibSp, Parch) space.
imputer = KNNImputer(n_neighbors=10)
age_features = ["Age", "SibSp", "Parch"]
imputed_values = imputer.fit_transform(all_data[age_features])
# fit_transform returns a plain array whose rows are in the same order as
# all_data, so assign its first column ("Age") positionally. Wrapping it in a
# new DataFrame would attach a fresh 0..n-1 index, and the column assignment
# would then align on index labels rather than row position, which misplaces
# values whenever all_data's index is not exactly 0..n-1.
all_data["Age"] = imputed_values[:, 0].astype(int)

In [26]:
# Missing-value count per column after the age imputation
all_data.isna().sum()

## Normalization of Numerical variables

From the EDA above, we know that "Fare" has a right-skewed distribution; therefore, we apply a log(1 + x) transformation to bring it closer to a normal distribution.

In [27]:
# Replace Fare with log(Fare + 1)
all_data['Fare'] = all_data['Fare'].add(1).pipe(np.log)

In [28]:
# Stacked histogram of the transformed Fare, coloured by survival, first 891 rows
sns.displot(data=all_data.iloc[:891], x='Fare', hue='Survived', multiple='stack')

Also, standardize "Age":

In [29]:
from sklearn.preprocessing import StandardScaler

# Standardize Age (zero mean, unit variance over the combined data).
# fit_transform returns an (n, 1) array in the row order of all_data; flatten
# it and assign positionally. Wrapping it in a new DataFrame would attach a
# fresh 0..n-1 index and make the assignment align on index labels instead of
# row position.
scaled_age = StandardScaler().fit_transform(all_data[["Age"]])
all_data["Age"] = scaled_age.ravel()

## Other feature engineering

In [30]:
# Collapse uncommon titles into 'Rare', then encode every title as an integer group.
rare_titles = ['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_codes = {"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3}
all_data["Prefix"] = (
    all_data["Prefix"]
    .replace(rare_titles, 'Rare')
    .map(title_codes)
    .astype(int)
)

# Target vector comes straight from the training split.
y_train_reduced = train["Survived"]

# Drop identifier / free-text columns and the target, then one-hot encode the
# categorical features one at a time so the dummy columns keep this order.
non_feature_columns = ['PassengerId', 'Survived', 'Last Name', 'First Name']
encoded_features = all_data.drop(columns=non_feature_columns)
for categorical in ["Prefix", "Cabin", "Embarked", "Sex", "Pclass"]:
    encoded_features = pd.get_dummies(encoded_features, columns=[categorical], prefix=categorical)

# Rows from position 891 onward form the test matrix; the first 891 rows form the training matrix.
X_test = encoded_features[891:]
X_train_reduced = encoded_features[0:891]

We can see that the distribution of the transformed "Fare" is much closer to normal than before.

# Modeling

## Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [32]:
# Logistic regression with default settings, fitted on the training matrix
logistic = LogisticRegression()
logistic.fit(X=X_train_reduced, y=y_train_reduced)

In [33]:
# Mean accuracy of the logistic model over 10 cross-validation folds
logistic_cv_scores = cross_val_score(
    estimator=logistic,
    X=X_train_reduced,
    y=y_train_reduced,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
logistic_cv_scores.mean()

## Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search for a random-forest classifier.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not used: for classifiers it was an alias of 'sqrt', and
# scikit-learn >= 1.3 rejects it as an invalid parameter value.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
grid_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# Seed the forest as well as the search so that a re-run reproduces the same
# candidate scores and the same selected model.
random_forest=RandomForestClassifier(random_state=42)
random_search_rf=RandomizedSearchCV(estimator = random_forest, param_distributions = grid_rf, n_iter = 100, cv = 3, verbose=2, random_state=42,n_jobs=-1)
random_search_rf.fit(X_train_reduced,y_train_reduced)

In [35]:
# Best mean cross-validated score found by the random-forest search
random_search_rf.best_score_

In [36]:
# Take the best forest from the search, fit it on the training matrix,
# and report its mean accuracy over 10 cross-validation folds.
random_forest_final = random_search_rf.best_estimator_
random_forest_final.fit(X_train_reduced, y_train_reduced)
rf_cv_scores = cross_val_score(
    estimator=random_forest_final,
    X=X_train_reduced,
    y=y_train_reduced,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
rf_cv_scores.mean()

## XGBoost

In [37]:
from xgboost import XGBClassifier

# Candidate hyper-parameter values for the XGBoost randomized search
grid_xgb = {
    "n_estimators": range(8, 20),
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='error')
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=grid_xgb,
    scoring="accuracy",
    n_iter=50,
    cv=3,
    random_state=42,
)
# The search is fitted on a C-contiguous array copy of the training matrix
xgb_train_matrix = np.ascontiguousarray(X_train_reduced)
random_search_xgb.fit(xgb_train_matrix, y_train_reduced)

In [38]:
# Keep the best estimator selected by the XGBoost randomized search
xgb_clf_final=random_search_xgb.best_estimator_

In [39]:
# Best mean cross-validated accuracy found by the XGBoost search
random_search_xgb.best_score_

In [40]:
# Mean accuracy of the selected XGBoost model over 10 cross-validation folds
xgb_cv_scores = cross_val_score(
    estimator=xgb_clf_final,
    X=X_train_reduced,
    y=y_train_reduced,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
xgb_cv_scores.mean()

## Support Vector Classifier

In [41]:
from sklearn.svm import SVC

# Candidate hyper-parameter values for the support-vector classifier search
grid_svc = {
    'gamma': [0.1, 0.5, 1, 5, 10],
    'C': [10, 1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf'],
}
sv_clf = SVC()
random_search_svc = RandomizedSearchCV(
    estimator=sv_clf,
    param_distributions=grid_svc,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search_svc.fit(X_train_reduced, y_train_reduced)
# Best mean cross-validated accuracy found by the search
random_search_svc.best_score_

In [42]:
# Keep the best estimator selected by the SVC randomized search
svc_final=random_search_svc.best_estimator_

In [43]:
# Fit the selected SVC on the training matrix and report its mean accuracy
# over 10 cross-validation folds.
svc_final.fit(X_train_reduced, y_train_reduced)
svc_cv_scores = cross_val_score(
    estimator=svc_final,
    X=X_train_reduced,
    y=y_train_reduced,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
svc_cv_scores.mean()

## Voting Classifier

In [44]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble of the four models built above
ensemble_members = [
    ('log', logistic),
    ('rf', random_forest_final),
    ('xgboost', xgb_clf_final),
    ('svc', svc_final),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_clf.fit(X_train_reduced, y_train_reduced)
# Mean accuracy of the ensemble over 10 cross-validation folds
voting_cv_scores = cross_val_score(
    estimator=voting_clf,
    X=X_train_reduced,
    y=y_train_reduced,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
voting_cv_scores.mean()

# Output predictions on test set

In [45]:
# Column labels of the test feature matrix
X_test.columns

In [46]:
# Predict survival for the test rows with the selected XGBoost model,
# feeding it a C-contiguous array copy of the test matrix.
xgb_test_matrix = np.ascontiguousarray(X_test)
Survived = xgb_clf_final.predict(xgb_test_matrix)

In [47]:
# Pair each test PassengerId with its XGBoost prediction and save as CSV
results = test[["PassengerId"]].assign(Survived=Survived)
results.to_csv("xgboost_results.csv", index=False)

In [48]:
# Pair each test PassengerId with the voting ensemble's prediction and save as CSV
voting_predictions = voting_clf.predict(X_test)
results2 = test[["PassengerId"]].assign(Survived=voting_predictions)
results2.to_csv("voting_clf_results.csv", index=False)

In [49]:
# Pair each test PassengerId with the random forest's prediction and save as CSV
rf_predictions = random_forest_final.predict(X_test)
results3 = test[["PassengerId"]].assign(Survived=rf_predictions)
results3.to_csv("random_forest_results.csv", index=False)

In [51]:
# Pair each test PassengerId with the SVC's prediction and save as CSV
svc_predictions = svc_final.predict(X_test)
results4 = test[["PassengerId"]].assign(Survived=svc_predictions)
results4.to_csv("svc_results.csv", index=False)