# Titanic Survivor Prediction

Based on https://www.kaggle.com/c/titanic competition

# Import Data and Exploratory Analysis

In [1]:
# import the packages we will need to extend the capabilities of Python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import re
import string

In [2]:
# import the two data files and convert them into Pandas Dataframes
train = pd.read_csv("titanic_train.csv")
test = pd.read_csv("titanic_test.csv")

In [4]:
# get the dimensions of the training dataframe
train.shape

(891, 12)

In [5]:
# group the columns in the training data by type of data
train.dtypes.to_frame().transpose()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,int64,int64,int64,object,object,float64,int64,int64,object,float64,object,object


In [6]:
# display the first few rows of the training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Handling Missing Values

In [7]:
# print out a list of columns, sorted by their number of missing values
train.isna().sum().sort_values(ascending=False).head()

Cabin       687
Age         177
Embarked      2
Fare          0
Ticket        0
dtype: int64

In [8]:
# print out a list of columns, sorted by their number of missing values
test.isna().sum().sort_values(ascending=False).head()

Cabin       327
Age          86
Fare          1
Embarked      0
Ticket        0
dtype: int64

In [9]:
# replace missing values for the cabin column with an indicator value
for df in train, test:
    df['Cabin'].fillna(value='NA',inplace=True)

In [10]:
# impute an age value for the individuals with no age listed, based on their values for three other features
for df in train,test:
    df["AgeImputed"] = df.groupby(['Pclass', 'Sex','SibSp'])['Age'].transform(lambda x: x.fillna(x.mean()))

In [11]:
# drop the age column that is being replaced
for df in train,test:
    df.drop(['Age'],axis=1,inplace=True)

In [12]:
# replace the few remaining missing values in the age column with the average value for age
for df in train,test:
    df['AgeImputed'] = df['AgeImputed'].fillna(value=9.88)

In [13]:
# define a function for replacing missing values in a categorical column with the most common value
def fillcommon(cell):
    common = cell.value_counts().head(1)
    comdct = common.to_dict()
    comkey = [k for k,v in comdct.items()]
    cell.fillna(value=comkey[0],inplace=True)

In [14]:
# use the fillcommon function on the embarked categorical column
for df in train,test:
    fillcommon(df['Embarked'])

In [15]:
# replace missing values in the fare column with the average value
for df in train,test:
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

## Feature Engineering with Cabin

In [16]:
# creates a function that returns true or false depending on whether the family rented a cabin
def hasCabin(cell):
    if cell == 'NA':
        return False
    else:
        return True

for df in train,test:
    df['Has_Cabin'] = df['Cabin'].apply(hasCabin)

In [17]:
# family size
for df in train,test:
    df['FamSize'] = df.Parch + df.SibSp + 1

In [18]:
# a couple more measures of family size
for df in train,test:
    df['SmallFamily'] = df['FamSize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    df['LargeFamily'] = df['FamSize'].map(lambda s: 1 if 5 <= s else 0)

In [19]:
# creates a function that counts the number of cabins per family
def cabinsCount(lst):
    itms = lst.split()
    num = len(itms)
    if num >1:
        return num
    else:
        return 1
for df in train,test:
    df['CabinQuantForFam'] = df['Cabin'].apply(cabinsCount)
train['CabinQuantForFam'].value_counts()

1    867
2     16
3      6
4      2
Name: CabinQuantForFam, dtype: int64

In [20]:
# creates a function that extracts the most common first letters from the cabin column
def getLevel(lst):
    if lst[0] in ['C','B','D','E']:
    #if lst[0] in ['A', 'C', 'B', 'E', 'D', 'G', 'F', 'U', 'T']:
        return lst[0]
    else:
        return 'x'
for df in train,test:
    df['CabinLevel'] = df['Cabin'].apply(getLevel)
train['CabinLevel'].value_counts()

x    720
C     59
B     47
D     33
E     32
Name: CabinLevel, dtype: int64

In [21]:
# creates a function that finds the zone of the room number, which comes from the cabin column
def getRoom(lst):
    itms = lst.split()
    num = itms[0]
    pre = num[1:2]
    if pre=='1':
        return '100'
    elif pre=='2':
        return '200'
    elif pre=='3':
        return '300'
    else:
        return 'x'
for df in train,test:
    df['CabinRoomZone'] = df['Cabin'].apply(getRoom)
train['CabinRoomZone'].value_counts()

x      788
100     40
200     32
300     31
Name: CabinRoomZone, dtype: int64

In [22]:
# extract the full room number for each passenger
def RoomNum(lst):
    itms = lst.split()
    num = itms[0]
    room = num # [1:]
    return room
for df in train,test:
    df['RoomNumber'] = df['Cabin'].apply(RoomNum)
train['RoomNumber'].value_counts().head()

NA     687
C23      4
G6       4
F        4
B96      4
Name: RoomNumber, dtype: int64

In [23]:
# tabulate the number of people sharing the room for each family
traincounts = train['RoomNumber'].value_counts()
testcounts = test['RoomNumber'].value_counts()

In [24]:
# link each room number with the number of families sharing it
trdct = traincounts.to_dict() 
tedct = testcounts.to_dict() 

def TrainRoomCount(cell):
    cell = str(cell)
    if cell == 'NA':
        return 'NA'
    else:
        return trdct.get(cell)

def TestRoomCount(cell):
    cell = str(cell)
    if cell == 'NA':
        return 'NA'
    else:
        return tedct.get(cell)

train['NumFamInRoom'] = train['RoomNumber'].apply(TrainRoomCount)
test['NumFamInRoom'] = test['RoomNumber'].apply(TestRoomCount)

In [25]:
# creates a function that assigns the passengers to deck levels, based on their values in the cabin column
def getDeck(lst):
    if lst[0] in ['C','B','A']:
        return 'upper_deck'
    elif lst[0] in ['D','E','F']:
        return 'middle_deck'
    else:
        return 'lower_deck'
for df in train,test:
    df['CabinDeck'] = df['Cabin'].apply(getDeck)
train['CabinDeck'].value_counts()

lower_deck     692
upper_deck     121
middle_deck     78
Name: CabinDeck, dtype: int64

In [26]:
# drops the cabin column after its replacements have been created and added to the dataframe
for df in train,test:
    df.drop(['Cabin','RoomNumber'],axis=1,inplace=True)

## Categorical Conversion

In [27]:
# recodes the numeric values in the Pclass column with more descriptive categories
Pclass = {1:'first_class',2:'second_class',3:'third_class'}
for df in train,test:
    df['Pclass'] = df['Pclass'].map(Pclass)

In [28]:
# recodes the single-character values in the embarked column with more descriptive categories
embark = {'C':'Cherbourg','Q':'Queenstown','S':'Southampton'}
for df in train,test:
    df['Embarked'] = df['Embarked'].map(embark)

In [29]:
# recodes the numeric values in the Cabin Room Zone column with more descriptive categories
crz = {'x':'unknown','100':'one hundreds','200':'two hundreds','300':'three hundreds'}
for df in train,test:
    df['CabinRoomZone'] = df['CabinRoomZone'].map(crz)

## Feature Engineering with Ticket

In [30]:
# creates a function that extracts the most common first letters from the ticket column
def getLetter(text):
    # lower case
    text = text.lower()          
    # remove digits and punctuation
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub('', text)
    if text=='':
        return 'x'
    elif text[0] in ['p','s','c']:
        return text[0]
    else:
        return 'x'
for df in train,test:
    df['TicketLetter'] = df['Ticket'].apply(getLetter)
train['TicketLetter'].value_counts()

x    714
s     65
p     65
c     47
Name: TicketLetter, dtype: int64

In [31]:
# creates a function that extracts the numeric portion of the ticket number
def getNumber(num):
    # remove punctuation, a-z letters, and whitespace
    regex = re.compile(r"[%s%s%s]" % (string.punctuation,string.ascii_letters,string.whitespace))
    numbers = regex.sub('', num)
    return numbers
for df in train,test:
    df['TicketNum'] = df['Ticket'].apply(getNumber)
train['TicketNum'].value_counts().head()

2343       7
347082     7
1601       7
2144       6
3101295    6
Name: TicketNum, dtype: int64

In [32]:
# calculate the length of the ticket characters and numbers 
for df in train,test:
    df['TickLen'] = df['Ticket'].apply(len)
train['TickLen'].value_counts().head()

6     419
5     131
4     101
8      76
10     41
Name: TickLen, dtype: int64

In [33]:
# calculate the length of the ticket number
for df in train,test:
    df['TickNumLen'] = df['TicketNum'].apply(len)
train['TickNumLen'].value_counts().head()

6    439
5    242
4    154
7     24
8     20
Name: TickNumLen, dtype: int64

In [34]:
# creates a function that extracts the first digit of the ticket number
def getFirstNum(num):
    # remove punctuation
    regex = re.compile(r"[%s]" % (string.punctuation))
    num = regex.sub('', num)
    # detect numbers
    numbers = re.findall('[0-9]+', num)
    if len(numbers)>1:
        return 'x'
    elif len(numbers)==1:
        numstr = str(numbers[0])
        if numstr[0:1] in ['1','2','3']:
            return numstr[0:1]
        else:
            return 'x'
    else:
        return 'x'
for df in train,test:
    df['TicketFirstNum'] = df['Ticket'].apply(getFirstNum)
train['TicketFirstNum'].value_counts()

3    337
1    227
2    220
x    107
Name: TicketFirstNum, dtype: int64

In [35]:
# creates a function that extracts the first two digits of the ticket number
def getFirstTwoNum(num):
    # remove punctuation
    regex = re.compile(r"[%s]" % (string.punctuation))
    num = regex.sub('', num)
    # detect numbers
    numbers = re.findall('[0-9]+', num)
    if len(numbers)>1:
        return 'x'
    elif len(numbers)==1:
        numstr = str(numbers[0])
        if numstr[0:1] in ['1','2','3']:
            return numstr[0:2]
        else:
            return 'x'
    else:
        return 'x'
for df in train,test:
    df['TicketFirstTwoNum'] = df['Ticket'].apply(getFirstTwoNum)

In [36]:
# find the most common values for the first two digits of the ticket number
trtckcts = train['TicketFirstTwoNum'].value_counts().head(3) 
tetckcts = test['TicketFirstTwoNum'].value_counts().head(3) 

In [37]:
# link each room number with the number of families sharing it
trtckdct = trtckcts.to_dict() 
tetckdct = tetckcts.to_dict() 

def TrainTckFirstTwoNumCount(cell):
    cell = str(cell)
    if cell in trtckdct.keys():
        return cell
    else:
        return "other"

def TestTckFirstTwoNumCount(cell):
    cell = str(cell)
    if cell in tetckdct.keys():
        return cell
    else:
        return "other"

train['TicketFirstTwo'] = train['TicketFirstTwoNum'].apply(TrainTckFirstTwoNumCount)
test['TicketFirstTwo'] = test['TicketFirstTwoNum'].apply(TestTckFirstTwoNumCount)

In [38]:
# drop various ticket columns after their replacements have been added to the dataframe
for df in train,test:
    df.drop(['Ticket','TicketNum','TicketFirstTwoNum'],axis=1,inplace=True)

## Feature Engineering Names and Titles

In [39]:
# calculate the length of the name field for each family
for df in train,test:
    df['NameLen'] = df['Name'].apply(len)

In [40]:
# creates a function that assigns the various family head titles to a few basic categories
def getTitle(name):
    if pd.notna(name):
        regex = re.compile(r"\b\w+[.]")
        title = regex.findall(name)
        #return title[0]
        if title[0] in ['Mr.','Miss.','Mrs.']:
            return title[0]
        elif title[0] in ['Master.','Rev.','Major.','Col.','Don.','Jonkheer.','Sir.','Capt.']:
            return 'Upper_Class_Male'
        elif title[0] in ['Mlle.','Ms.']:
            return 'Miss.'
        elif title[0] in ['Mme.','Countess.','Dr.','Lady.']:
            return 'Mrs.'
        else:
            return np.nan
for df in train,test:
    df['FamilyHeadTitle'] = df['Name'].apply(getTitle)
train['FamilyHeadTitle'].value_counts()

Mr.                 517
Miss.               185
Mrs.                135
Upper_Class_Male     54
Name: FamilyHeadTitle, dtype: int64

In [41]:
# creates a function that extracts the surname for further processing
def getSurname(text):
    # lower case
    text = text.lower()          
    # remove digits and punctuation
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub('', text)
    first = text.split()
    name = first[0]
    return name
for df in train,test:
    df['Surname'] = df['Name'].apply(getSurname)
train['Surname'].head()

0       braund
1      cumings
2    heikkinen
3     futrelle
4        allen
Name: Surname, dtype: object

In [42]:
# imports demographic data that can be added to the surnames
census1 = pd.read_csv('first third.csv')
census2 = pd.read_csv('second third.csv')
census3 = pd.read_csv('third third.csv')
census = pd.concat([census1, census2, census3], ignore_index=True)

In [43]:
# converts the surnames in the demographic data to lowercase
def makeLower(cell):
    cell = str(cell)
    cell = cell.lower()
    return cell
census['lower_name'] = census['name'].apply(makeLower)
census['lower_name'].head()

0       smith
1     johnson
2    williams
3       brown
4       jones
Name: lower_name, dtype: object

In [44]:
# drops the name column, once its replacement has been added, as well as a few other unnecessary columns
census = census.drop(['name','rank','count','prop100k','cum_prop100k'],axis=1)

In [45]:
# replaces missing values, for small sample sizes, with zeroes
for col in census:
    census[col] = census[col].replace(to_replace='(S)', value=0)

In [46]:
# joins the reprocessed census data to the train and test data, using the lower-cased surnames as a common key
train = train.join(census.set_index('lower_name'), on='Surname', how='left')
test = test.join(census.set_index('lower_name'), on='Surname', how='left')

In [47]:
# converts the values from several of the new columns from the census data into float numbers
numcols = ['pctwhite','pctblack','pctapi','pctaian','pct2prace','pcthispanic']
for df in train,test:
    for col in numcols:
        df[col] = df[col].astype('float')

In [48]:
# impute missing values in the new census columns, based on the values for three other features
for df in train,test:
    for col in numcols:
        df[col] = df.groupby(['Pclass', 'AgeImputed','Embarked'])[col].transform(lambda x: x.fillna(x.mean()))

In [49]:
# replace any remaining missing values in the new census columns with their average value
for df in train,test:
    for col in numcols:
        df[col].fillna(df[col].mean(),inplace=True)

In [50]:
# drop the name and surname columns, which are no longer needed for analysis
for df in train,test:
    df.drop(['Name','Surname'],axis=1,inplace=True)

## Engineering of Feature Interaction 

In [51]:
# create new features based on interactions between features
for df in train,test:
    df["AdultMale"] = ((df["Sex"] == "male")&(df["AgeImputed"]>18))*1
    df["AdultFemale"] = ((df["Sex"] == "female")&(df["AgeImputed"]>18))*1
    df["IsBaby"] = (df["AgeImputed"]<2)*1
    df["IsLoner"] = ((df["AgeImputed"] >= 18) & (df["SibSp"] == 0) & (df["Parch"]==0))*1
    df["IsChild"] = ((df["AgeImputed"]>2)&(df["AgeImputed"]<18))*1
    df['Immed&ExtendFam'] = ((df["SibSp"]>0)&(df["Parch"]>0))*1
    df["FamilyMan"] = ((df["Sex"] == "male")&(df["SibSp"]>0)&(df["Parch"]>0))*1
    df["FamilyWoman"] = ((df["Sex"] == "female")&(df["SibSp"]>0)&(df["Parch"]>0))*1

## Binning Numerical Data

In [52]:
# Binning numerical columns
for df in train,test:
    df['FareBin'] = pd.qcut(df.Fare, q=4, labels=False,duplicates='drop')
    df['AgeBin'] = pd.qcut(df.AgeImputed, q=4, labels=False,duplicates='drop')
    df['FamSizeBin'] = pd.qcut(df.FamSize, q=4, labels=False,duplicates='drop')
    df['TickLenBin'] = pd.qcut(df.TickLen, q=4, labels=False,duplicates='drop')
    df['TickNumLenBin'] = pd.qcut(df.TickNumLen, q=4, labels=False,duplicates='drop')
    df['pctwhiteBin'] = pd.qcut(df.pctwhite, q=4, labels=False,duplicates='drop')
    df['pctblackBin'] = pd.qcut(df.pctblack, q=4, labels=False,duplicates='drop')
    df['pcthispanicBin'] = pd.qcut(df.pcthispanic, q=4, labels=False,duplicates='drop')

In [53]:
# drop the columns that are no longer needed after binning
for df in train,test:
    df.drop(['Fare','AgeImputed','FamSize','TickLen','TickNumLen','pctwhite','pctblack',
             'pcthispanic'],axis=1,inplace=True)

## Collapsing Sex into One Column

In [54]:
# remove the duplicate sex column
for df in train,test:
    df['Sex'] = df.Sex.apply(lambda x: 0 if x == "female" else 1)

## Dummy Columns

In [55]:
# prepare a list of categorical columns to add dummy columns
train2 = train.drop(['Survived'], axis=1)
catCols = train2.select_dtypes(exclude=["number"])
categorical_features = list(catCols.columns)
test2 = test
catCols2 = test2.select_dtypes(exclude=["number"])
categorical_features2 = list(catCols2.columns)

In [56]:
# add dummy columns and drop the first one of each set to minimize multicollinearity
Xtraincat= pd.get_dummies(train2[categorical_features],drop_first=True)
train = pd.concat([train, Xtraincat], axis=1)
train = train.drop(categorical_features,axis=1)
Xtestcat= pd.get_dummies(test2[categorical_features],drop_first=True)
test = pd.concat([test, Xtestcat], axis=1)
test = test.drop(categorical_features,axis=1)

In [57]:
# remove columns from the training data that are not in the test data and print dimensions
# to ensure the training data has just one more column (the target) than the test data
testCols = list(test.columns)
trainCols = list(train.columns)
trainCols.remove('Survived')
extraTrain = list(set(trainCols)-set(testCols))
train = train.drop(extraTrain,axis=1)

## Split for Validation

In [58]:
# prepare the train data for validation of the model
X = train.drop(['Survived'], axis=1)
y = train['Survived']

In [59]:
# split the train data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

In [60]:
# Following code to deal with SettingWithCopyWarning
Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

## Evaluation

In [61]:
# instatiate the classifier object
xgb = XGBClassifier()

In [62]:
# run grid search for cross-validation purposes
param_grid = {
    'learning_rate':[0.095,0.1,0.105],
    'max_depth':[2,3,4],
    'n_estimators':[90,100,110]
}
gscv = GridSearchCV(xgb, param_grid, iid=False, cv=4, return_train_score=False)

In [63]:
# fit the model
gscv.fit(Xtrain, ytrain)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'learning_rate': [0.095, 0.1, 0.105], 'max_depth': [2, 3, 4], 'n_estimators': [90, 100, 110]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [64]:
# assign the best prediction to a variable
ypred = gscv.best_estimator_.predict(Xtest)

In [65]:
# print the summary statistics for the validation prediction
print ("accuracy score:",metrics.accuracy_score(ytest, ypred))
print()
print ("confusion matrix:")
print (metrics.confusion_matrix(ytest, ypred))
print()
print ("classification report:")
print (metrics.classification_report(ytest, ypred))

accuracy score: 0.8212290502793296

confusion matrix:
[[97  9]
 [23 50]]

classification report:
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       106
           1       0.85      0.68      0.76        73

   micro avg       0.82      0.82      0.82       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



## Prepare for Kaggle Submission

In [66]:
# prepare a DataFrame for the submission file
submission = pd.DataFrame(test['PassengerId'])
submission['Survived'] = gscv.best_estimator_.predict(test)

In [67]:
# convert the submission dataframe into a csv file, formatted as Kaggle requires
submission.to_csv('submission.csv',index=False,index_label=False)

In [68]:
print("The submission file has been saved to the same folder as this program.")

The submission file has been saved to the same folder as this program.
