In [1]:
# import the necessary libraries
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

In [2]:
# Load the raw train and test CSV files from the working directory.
raw_train_data, raw_test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Display the raw training frame (the rendered table below is truncated by pandas).
raw_train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Display the raw test frame; it has the same columns as the training frame minus 'Survived'.
raw_test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# EDA

In [5]:
# Build the train and test feature frames and separate the target to be
# predicted ('Survived') from the training features.
y_train = raw_train_data['Survived']
train_data = raw_train_data.drop(columns='Survived')

# Independent copy of the raw test frame.
test_data = raw_test_data.copy()

In [6]:
# Positional index of each column of interest within train_data.
col_names = ("SibSp", "Parch", "Age", "Cabin", "Pclass")
SibSp_ix, Parch_ix, Age_ix, Cabin_ix, Pclass_ix = map(train_data.columns.get_loc, col_names)

In [7]:
# Summary statistics of the training data; they are read later when missing
# Age and Fare values are filled in.
age_avg, age_std = train_data['Age'].agg(['mean', 'std'])  # mean and std of age in train data

fare_avg = train_data['Fare'].mean()  # mean of fare in train data

In [8]:
# make a function to get the title from the name
def get_title(name):
    """Return the honorific found in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Full passenger name. Non-string input (for example a NaN standing in
        for a missing name) is treated as having no title.

    Returns
    -------
    str
        The matched title without the trailing period, or "" when nothing
        matches.
    """
    if not isinstance(name, str):
        return ""
    # Raw string literal: '\.' inside a plain string is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    return title_search.group(1) if title_search else ""

In [9]:
# The 16 most frequent ticket types in the training data, where a ticket's
# "type" is the first three characters of its ticket string.
ticket_prefix_counts = train_data['Ticket'].apply(lambda ticket: ticket[:3]).value_counts()
top_16 = ticket_prefix_counts.sort_values(ascending=False).head(16).index.tolist()

In [10]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Derive the engineered passenger features from the raw columns.

    ``transform`` performs the following steps:

    * fills missing ``Age`` values with random integers drawn from
      ``[age_avg - age_std, age_avg + age_std)`` and truncates Age to int
    * fills missing ``Fare`` values with ``fare_avg`` and truncates Fare to int
    * merges ``SibSp`` and ``Parch`` into a new column, ``Family``
    * converts ``Cabin`` to 1 if a cabin name is given and 0 if it is NaN
    * adds ``Title``, the title extracted from ``Name`` (anything other than
      the common titles becomes ``'Rare_title'``)
    * adds ``Ticket_Type``, the first three characters of ``Ticket`` (anything
      not in ``top_16`` becomes ``'Rare_ticket'``)
    * drops the columns that are no longer needed

    The module-level names ``age_avg``, ``age_std``, ``fare_avg``, ``top_16``
    and ``get_title`` are read at transform time; ``fit`` learns nothing.

    The input frame is left unmodified and the result keeps the input's index.

    NOTE(review): Age and Fare are truncated to integers here, before any
    later binning is applied to them -- confirm that this is intended.
    """

    # Titles kept as-is; every other title is grouped under 'Rare_title'.
    COMMON_TITLES = ('Mr', 'Mrs', 'Miss', 'Master', 'Dr')

    def fit(self, X, y=None):
        """Return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the engineered columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns Pclass, Sex, Embarked, SibSp, Parch,
            Cabin, Age, Fare, Name and Ticket.

        Returns
        -------
        pandas.DataFrame
            Columns Pclass, Sex, Embarked, Family, cabin, Title, Ticket_Type,
            Age, Fare (in that order), indexed like ``X``.
        """
        family = X['SibSp'] + X['Parch']
        cabin = X['Cabin'].notna().astype(int)

        # Work on a copy so the caller's frame is not modified and no chained
        # assignment is involved.
        age = X['Age'].copy()
        age_missing = age.isna()
        age_low, age_high = int(age_avg - age_std), int(age_avg + age_std)
        age[age_missing] = np.random.randint(age_low, age_high, size=int(age_missing.sum()))
        age = age.astype(int)

        fare = X['Fare'].fillna(fare_avg).astype(int)

        title = X['Name'].apply(get_title)
        title = title.where(title.isin(self.COMMON_TITLES), 'Rare_title')

        ticket_type = X['Ticket'].str[:3]
        ticket_type = ticket_type.where(ticket_type.isin(top_16), 'Rare_ticket')

        return pd.DataFrame(
            {
                'Pclass': X['Pclass'],
                'Sex': X['Sex'],
                'Embarked': X['Embarked'],
                'Family': family,
                'cabin': cabin,
                'Title': title,
                'Ticket_Type': ticket_type,
                'Age': age,
                'Fare': fare,
            },
            index=X.index,
        )

In [11]:
class age_fare_encoder(BaseEstimator, TransformerMixin):
    """Map the numeric Fare and Age columns onto ordinal bin labels.

    Fare labels: ``<= 7.91`` -> 0, ``(7.91, 14.454]`` -> 1,
    ``(14.454, 31]`` -> 2, ``> 31`` -> 3.

    Age labels: ``<= 16`` -> 0, ``(16, 32]`` -> 1, ``(32, 48]`` -> 2,
    ``(48, 64]`` -> 3, ``> 64`` -> 4.

    ``fit`` learns nothing. ``transform`` returns a new DataFrame and leaves
    its input unmodified, so it can be re-applied to the same frame safely.
    """

    # Upper (inclusive) edge of every bin except the last, open-ended one.
    FARE_BIN_EDGES = (7.91, 14.454, 31)
    AGE_BIN_EDGES = (16, 32, 48, 64)

    def fit(self, X, y=None):
        """Return ``self``; there is nothing to learn."""
        return self

    @staticmethod
    def _to_bin_labels(values, edges):
        """Return the integer bin label of each entry of ``values``.

        Raises ValueError if ``values`` contains missing entries, because a
        missing value has no meaningful bin.
        """
        numeric = pd.to_numeric(values)
        if numeric.isna().any():
            raise ValueError("cannot bin column %r: it contains missing values" % values.name)
        # right=True makes every edge the inclusive upper bound of its bin.
        labels = np.digitize(numeric.to_numpy(), edges, right=True)
        return pd.Series(labels, index=values.index).astype(int)

    def transform(self, X):
        """Return ``X`` with Fare and Age replaced by their bin labels.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain numeric ``Fare`` and ``Age`` columns without missing
            values.

        Returns
        -------
        pandas.DataFrame
            The remaining columns of ``X`` followed by the binned ``Fare`` and
            ``Age`` columns, indexed like ``X``.
        """
        fare = self._to_bin_labels(X['Fare'], self.FARE_BIN_EDGES)
        age = self._to_bin_labels(X['Age'], self.AGE_BIN_EDGES)
        return X.drop(columns=['Fare', 'Age']).assign(Fare=fare, Age=age)

In [12]:
# Apply both feature-engineering steps to the train and test frames.
# CombinedAttributesAdder fills missing ages with draws from NumPy's global
# random generator, so seed it first to make this cell reproducible.
np.random.seed(42)

train_data = CombinedAttributesAdder().transform(train_data)
train_data = age_fare_encoder().transform(train_data)

test_data = CombinedAttributesAdder().transform(test_data)
test_data = age_fare_encoder().transform(test_data)

In [13]:
# Display the engineered training features.
train_data

Unnamed: 0,Pclass,Sex,Embarked,Family,cabin,Title,Ticket_Type,Fare,Age
0,3,male,S,1,0,Mr,A/5,0,1
1,1,female,C,1,1,Mrs,PC,3,2
2,3,female,S,0,0,Miss,STO,0,1
3,1,female,S,1,1,Mrs,113,3,2
4,3,male,S,0,0,Mr,Rare_ticket,1,2
...,...,...,...,...,...,...,...,...,...
886,2,male,S,0,0,Rare_title,Rare_ticket,1,1
887,1,female,S,0,1,Miss,Rare_ticket,2,1
888,3,female,S,3,0,Miss,Rare_ticket,2,2
889,1,male,C,0,1,Mr,Rare_ticket,2,1


In [14]:
# Count the missing values left in each column after feature engineering.
train_data.isnull().sum()

Pclass         0
Sex            0
Embarked       2
Family         0
cabin          0
Title          0
Ticket_Type    0
Fare           0
Age            0
dtype: int64

In [15]:
# Selects particular columns from the dataframe.

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of a DataFrame."""

    def __init__(self, attribute_names):
        # Column label(s) used to index the frame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored column label(s)."""
        selected = X[self.attribute_names]
        return selected

In [16]:
# Numeric branch: takes only the numerical features and imputes the median
# for any missing value.
numeric_steps = [
    ("select_numeric", DataFrameSelector(["Family"])),
    ("imputer", SimpleImputer(strategy="median")),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [17]:
# Imputes the most frequent item of each column to its missing values.
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of every column with that column's most common value."""

    def fit(self, X, y=None):
        """Store, per column of ``X``, the first entry of its ``value_counts()``."""
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the values stored in ``fit``."""
        filled = X.fillna(self.most_frequent_)
        return filled

In [18]:
# This pipeline takes only the categorical columns, imputes the most frequent
# item to the missing values and then performs one-hot encoding (dropping the
# first category of each feature).
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so use whichever name this version accepts.
dense_output_arg = "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked", "cabin","Fare","Age",
                                          "Title","Ticket_Type"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(drop='first', **{dense_output_arg: False})),
    ])

In [19]:
# Combines the numerical and categorical pipelines; their outputs are
# concatenated column-wise. (FeatureUnion is imported in the first cell.)
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [20]:
# Fit the preprocessing pipeline on the training data only, then apply the
# already-fitted pipeline to the test data. Re-fitting on the test set would
# learn its own imputation values and one-hot categories, so the test columns
# would not be guaranteed to line up with the features the models are trained on.

X_train = preprocess_pipeline.fit_transform(train_data)

X_test = preprocess_pipeline.transform(test_data)

In [21]:
# Gradient-boosted tree classifier: fit on the full training set, then report
# the mean score over 10-fold cross-validation.
gbm_params = dict(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm_clf = xgb.XGBClassifier(**gbm_params)
gbm_clf.fit(X_train, y_train)

gbm_scores = cross_val_score(gbm_clf, X_train, y_train, cv=10)
gbm_scores.mean()

0.8350312109862672

In [22]:
# Support vector classifier fitted on the full training set
# (SVC is imported in the first cell).
svm_clf = SVC(gamma="auto").fit(X_train, y_train)

In [23]:
# Mean score of the SVM over 10-fold cross-validation
# (cross_val_score is imported in the first cell).
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.8271535580524345

In [29]:
# Random forest fitted on the full training set, followed by its mean score
# over 10-fold cross-validation (RandomForestClassifier is imported in the
# first cell).
rf_clf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0).fit(X_train, y_train)

tree_scores = cross_val_score(rf_clf, X_train, y_train, cv=10)
tree_scores.mean()

0.8305742821473159