In [1]:
import pandas as pd
import numpy as np

## Read the CSV

In [2]:
# loading the dataset
df = pd.read_csv("resources/train.csv")
test_data = pd.read_csv("resources/test.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Process features

Redoing the datasets in accordance with other peoples' feature engineering and selection:
https://medium.com/@praveen.orvakanti/this-will-help-you-score-95-percentile-in-the-kaggle-titanic-ml-competition-aa2b3fd1b79b

In [3]:
# check the number of missing values in the data
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# replace null values in 'Embarked' with the mode
df['Embarked'] = df['Embarked'].fillna("S")

In [5]:
# basic feature engineering of columns Name, Cabin, and addition of Family Size, Age Class, and Fare per Person courtesy of:
# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
# starting with titles: function that searches for substrings
#import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    print(big_string)
    return np.nan

# list of titles
title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

# recombine list to the four categories
df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

# replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        if x['Sex'] == 'Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
# titles now extracted from 'Name'
df['Title'] = df.apply(replace_titles, axis=1)

In [6]:
# replace null values in 'Cabin' with 'Unknown'
df['Cabin'] = df['Cabin'].fillna('Unknown')

In [7]:
# turning 'Cabin' number into Deck
deck_list = []
cabin_list = df['Cabin'].tolist()
cabin_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'U']
for cabin in cabin_list:
    j = list(cabin)[0]
    if j in cabin_labels:
        deck_list.append(j)

df["Deck"] = deck_list

In [8]:
# 'borrowing' script from kaggle user 'PandaBrenda' https://www.kaggle.com/brendan45774/titanic-top-solution
mean = df["Age"].mean()
std = test_data["Age"].std()
is_null = df["Age"].isnull().sum()

# compute random numbers between the mean, std and is_null
rand_age = np.random.randint(mean - std, mean + std, size = is_null)

In [9]:
# fill NaN values in Age column with random values generated
age_slice = df["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
df["Age"] = age_slice
df["Age"] = df["Age"].astype(int)

In [10]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Deck           0
dtype: int64

In [11]:
# creating new family_size column with linear combination of features
df['Family_Size'] = df['SibSp']+df['Parch']

# creating interaction term, 'Age'*'Class'
df['Age*Class'] = df['Age']*df['Pclass']

# create fare per person column
df['Fare_Per_Person'] = df['Fare']/(df['Family_Size']+1)

In [12]:
# create new column for travelled_alone or not
df.loc[df['Family_Size'] > 0, 'travelled_alone'] = 'No'
df.loc[df['Family_Size'] == 0, 'travelled_alone'] = 'Yes'

In [13]:
# convert Age and Fare values to bins
df['Age'] = df['Age'].astype(int)
df.loc[ df['Age'] <= 11, 'Age'] = 0
df.loc[(df['Age'] > 11) & (df['Age'] <= 18), 'Age'] = 1
df.loc[(df['Age'] > 18) & (df['Age'] <= 22), 'Age'] = 2
df.loc[(df['Age'] > 22) & (df['Age'] <= 27), 'Age'] = 3
df.loc[(df['Age'] > 27) & (df['Age'] <= 33), 'Age'] = 4
df.loc[(df['Age'] > 33) & (df['Age'] <= 40), 'Age'] = 5
df.loc[(df['Age'] > 40) & (df['Age'] <= 66), 'Age'] = 6
df.loc[ df['Age'] > 66, 'Age'] = 7
    
df['Age'] = df['Age'].astype(str)
df.loc[ df['Age'] == '0', 'Age'] = "Children"
df.loc[ df['Age'] == '1', 'Age'] = "Teens"
df.loc[ df['Age'] == '2', 'Age'] = "Youngsters"
df.loc[ df['Age'] == '3', 'Age'] = "Young Adults"
df.loc[ df['Age'] == '4', 'Age'] = "Adults"
df.loc[ df['Age'] == '5', 'Age'] = "Middle Age"
df.loc[ df['Age'] == '6', 'Age'] = "Senior"
df.loc[ df['Age'] == '7', 'Age'] = "Retired"

df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
df.loc[(df['Fare'] > 31) & (df['Fare'] <= 99), 'Fare']   = 3
df.loc[(df['Fare'] > 99) & (df['Fare'] <= 250), 'Fare']   = 4
df.loc[ df['Fare'] > 250, 'Fare'] = 5
df['Fare'] = df['Fare'].astype(int)
    
df['Fare'] = df['Fare'].astype(str)
df.loc[ df['Fare'] == '0', 'Fare'] = "Extremely Low"
df.loc[ df['Fare'] == '1', 'Fare'] = "Very Low"
df.loc[ df['Fare'] == '2', 'Fare'] = "Low"
df.loc[ df['Fare'] == '3', 'Fare'] = "High"
df.loc[ df['Fare'] == '4', 'Fare'] = "Very High"
df.loc[ df['Fare'] == '5', 'Fare'] = "Extremely High"

In [14]:
# drop features not to be included in our model
df = df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

In [15]:
# create binary encoded data from 'Sex' column with dummy encoding
df = pd.get_dummies(df, columns=["Pclass", "Sex", "Embarked", "Deck", "Title", "travelled_alone", "Age", "Fare"])

# drop redundant data
df = df.drop(["Sex_male", "travelled_alone_Yes"], axis=1)

In [16]:
# verification
df

Unnamed: 0,Survived,SibSp,Parch,Family_Size,Age*Class,Fare_Per_Person,Pclass_1,Pclass_2,Pclass_3,Sex_female,...,Age_Young Adults,Age_Youngsters,Fare_6,Fare_7,Fare_Extremely High,Fare_Extremely Low,Fare_High,Fare_Low,Fare_Very High,Fare_Very Low
0,0,1,0,1,66,3.62500,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
1,1,1,0,1,38,35.64165,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,78,7.92500,0,0,1,1,...,1,0,0,0,0,0,0,0,0,1
3,1,1,0,1,35,26.55000,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,105,8.05000,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,0,0,54,13.00000,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
887,1,0,0,0,19,30.00000,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0
888,0,1,2,3,72,5.86250,0,0,1,1,...,1,0,0,0,0,0,0,1,0,0
889,1,0,0,0,26,30.00000,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [17]:
# scale ONLY the numerical data:
# Capture all the numerical features so that we can scale them later
train_numerical_features = list(df.select_dtypes(include=['int64', 'float64', 'int32']).columns)
train_numerical_features

['Survived', 'SibSp', 'Parch', 'Family_Size', 'Age*Class', 'Fare_Per_Person']

In [18]:
del train_numerical_features[0]
train_numerical_features

['SibSp', 'Parch', 'Family_Size', 'Age*Class', 'Fare_Per_Person']

In [19]:
# Feature scaling - Standard scaler
from sklearn.preprocessing import StandardScaler
ss_scaler = StandardScaler()
train_df_ss = pd.DataFrame(data = df)
train_df_ss[train_numerical_features] = ss_scaler.fit_transform(train_df_ss[train_numerical_features])

In [20]:
# verification
train_df_ss

Unnamed: 0,Survived,SibSp,Parch,Family_Size,Age*Class,Fare_Per_Person,Pclass_1,Pclass_2,Pclass_3,Sex_female,...,Age_Young Adults,Age_Youngsters,Fare_6,Fare_7,Fare_Extremely High,Fare_Extremely Low,Fare_High,Fare_Low,Fare_Very High,Fare_Very Low
0,0,0.432793,-0.473674,0.059160,0.047322,-0.454798,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
1,1,0.432793,-0.473674,0.059160,-0.776535,0.438994,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1,-0.474545,-0.473674,-0.560975,0.400404,-0.334757,0,0,1,1,...,1,0,0,0,0,0,0,0,0,1
3,1,0.432793,-0.473674,0.059160,-0.864806,0.185187,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,-0.474545,-0.473674,-0.560975,1.194837,-0.331267,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,-0.474545,-0.473674,-0.560975,-0.305760,-0.193081,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
887,1,-0.474545,-0.473674,-0.560975,-1.335581,0.281499,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0
888,0,0.432793,2.008933,1.299429,0.223863,-0.392335,0,0,1,1,...,1,0,0,0,0,0,0,1,0,0
889,1,-0.474545,-0.473674,-0.560975,-1.129617,0.281499,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [21]:
# export as csv for use elsewhere
train_df_ss.to_csv("resources/train_transformed_solutions.csv", index=False, header=True)