In [1]:
import pandas as pd
import numpy as np

## Read the CSV

In [2]:
# loading the dataset
train_data = pd.read_csv("resources/train.csv")
df = pd.read_csv("resources/test.csv")
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## Process features

Redoing the datasets in accordance with other peoples' feature engineering and selection:
https://medium.com/@praveen.orvakanti/this-will-help-you-score-95-percentile-in-the-kaggle-titanic-ml-competition-aa2b3fd1b79b

In [3]:
# check the number of missing values in the data
df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [4]:
# replace null values in 'Fare' with the median
fare_median = df['Fare'].median()
df['Fare'] = df['Fare'].fillna(f'{fare_median}')

In [5]:
# basic feature engineering of columns Name, Cabin, and addition of Family Size, Age Class, and Fare per Person courtesy of:
# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
# starting with titles: function that searches for substrings
#import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    print(big_string)
    return np.nan

# list of titles
title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

# recombine list to the four categories
df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

# replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        if x['Sex'] == 'Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
# titles now extracted from 'Name'
df['Title'] = df.apply(replace_titles, axis=1)

In [6]:
# replace null values in 'Cabin' with 'Unknown'
df['Cabin'] = df['Cabin'].fillna('Unknown')

In [7]:
# turning 'Cabin' number into Deck
deck_list = []
cabin_list = df['Cabin'].tolist()
cabin_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'U']
for cabin in cabin_list:
    j = list(cabin)[0]
    if j in cabin_labels:
        deck_list.append(j)

df["Deck"] = deck_list

In [8]:
# 'borrowing' script from kaggle user 'PandaBrenda' https://www.kaggle.com/brendan45774/titanic-top-solution
mean = train_data["Age"].mean()
std = df["Age"].std()
is_null = df["Age"].isnull().sum()

# compute random numbers between the mean, std and is_null
rand_age = np.random.randint(mean - std, mean + std, size = is_null)

In [9]:
# fill NaN values in Age column with random values generated
age_slice = df["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
df["Age"] = age_slice
df["Age"] = df["Age"].astype(int)

In [10]:
df.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Deck           0
dtype: int64

In [11]:
df['Fare'] = df['Fare'].astype(float)

In [12]:
# creating new family_size column with linear combination of features
df['Family_Size'] = df['SibSp']+df['Parch']

# creating interaction term, 'Age'*'Class'
df['Age*Class'] = df['Age']*df['Pclass']

# create fare per person column
df['Fare_Per_Person'] = df['Fare']/(df['Family_Size']+1)

In [13]:
# create new column for travelled_alone or not
df.loc[df['Family_Size'] > 0, 'travelled_alone'] = 'No'
df.loc[df['Family_Size'] == 0, 'travelled_alone'] = 'Yes'

In [14]:
# convert Age and Fare values to bins
df['Age'] = df['Age'].astype(int)
df.loc[ df['Age'] <= 11, 'Age'] = 0
df.loc[(df['Age'] > 11) & (df['Age'] <= 18), 'Age'] = 1
df.loc[(df['Age'] > 18) & (df['Age'] <= 22), 'Age'] = 2
df.loc[(df['Age'] > 22) & (df['Age'] <= 27), 'Age'] = 3
df.loc[(df['Age'] > 27) & (df['Age'] <= 33), 'Age'] = 4
df.loc[(df['Age'] > 33) & (df['Age'] <= 40), 'Age'] = 5
df.loc[(df['Age'] > 40) & (df['Age'] <= 66), 'Age'] = 6
df.loc[ df['Age'] > 66, 'Age'] = 7
    
df['Age'] = df['Age'].astype(str)
df.loc[ df['Age'] == '0', 'Age'] = "Children"
df.loc[ df['Age'] == '1', 'Age'] = "Teens"
df.loc[ df['Age'] == '2', 'Age'] = "Youngsters"
df.loc[ df['Age'] == '3', 'Age'] = "Young Adults"
df.loc[ df['Age'] == '4', 'Age'] = "Adults"
df.loc[ df['Age'] == '5', 'Age'] = "Middle Age"
df.loc[ df['Age'] == '6', 'Age'] = "Senior"
df.loc[ df['Age'] == '7', 'Age'] = "Retired"

df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
df.loc[(df['Fare'] > 31) & (df['Fare'] <= 99), 'Fare']   = 3
df.loc[(df['Fare'] > 99) & (df['Fare'] <= 250), 'Fare']   = 4
df.loc[ df['Fare'] > 250, 'Fare'] = 5
df['Fare'] = df['Fare'].astype(int)
    
df['Fare'] = df['Fare'].astype(str)
df.loc[ df['Fare'] == '0', 'Fare'] = "Extremely Low"
df.loc[ df['Fare'] == '1', 'Fare'] = "Very Low"
df.loc[ df['Fare'] == '2', 'Fare'] = "Low"
df.loc[ df['Fare'] == '3', 'Fare'] = "High"
df.loc[ df['Fare'] == '4', 'Fare'] = "Very High"
df.loc[ df['Fare'] == '5', 'Fare'] = "Extremely High"

In [15]:
# drop features not to be included in our model
df = df.drop(["Name", "Ticket", "Cabin"], axis=1)

In [16]:
# create binary encoded data from 'Sex' column with dummy encoding
df = pd.get_dummies(df, columns=["Pclass", "Sex", "Embarked", "Deck", "Title", "travelled_alone", "Age", "Fare"])

# drop redundant data
df = df.drop(["Sex_male", "travelled_alone_Yes"], axis=1)

In [17]:
# add missing columns
df["Deck_T"] = 0

In [18]:
# scale ONLY the numerical data:
# Capture all the numerical features so that we can scale them later
train_numerical_features = list(df.select_dtypes(include=['int64', 'float64', 'int32']).columns)
train_numerical_features

['PassengerId',
 'SibSp',
 'Parch',
 'Family_Size',
 'Age*Class',
 'Fare_Per_Person',
 'Deck_T']

In [19]:
del train_numerical_features[0]
train_numerical_features

['SibSp', 'Parch', 'Family_Size', 'Age*Class', 'Fare_Per_Person', 'Deck_T']

In [20]:
# Feature scaling - Standard scaler
from sklearn.preprocessing import StandardScaler
ss_scaler = StandardScaler()
train_df_ss = pd.DataFrame(data = df)
train_df_ss[train_numerical_features] = ss_scaler.fit_transform(train_df_ss[train_numerical_features])

In [21]:
# verification
train_df_ss

Unnamed: 0,PassengerId,SibSp,Parch,Family_Size,Age*Class,Fare_Per_Person,Pclass_1,Pclass_2,Pclass_3,Sex_female,...,Age_Youngsters,Fare_6,Fare_7,Fare_Extremely High,Fare_Extremely Low,Fare_High,Fare_Low,Fare_Very High,Fare_Very Low,Deck_T
0,892,-0.499470,-0.400248,-0.553443,1.332321,-0.392544,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0.0
1,893,0.616992,-0.400248,0.105643,2.648654,-0.514300,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0.0
2,894,-0.499470,-0.400248,-0.553443,2.074868,-0.340280,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0.0
3,895,-0.499470,-0.400248,-0.553443,0.623526,-0.369108,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0.0
4,896,0.616992,0.619896,0.764728,0.117244,-0.497543,0,0,1,1,...,1,0,0,0,0,0,0,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,-0.499470,-0.400248,-0.553443,1.534834,-0.386334,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0.0
414,1306,-0.499470,-0.400248,-0.553443,-0.794063,2.450020,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0.0
415,1307,-0.499470,-0.400248,-0.553443,1.737346,-0.408834,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0.0
416,1308,-0.499470,-0.400248,-0.553443,1.534834,-0.386334,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0.0


In [22]:
# export as csv for use elsewhere
train_df_ss.to_csv("resources/test_transformed_solutions.csv", index=False, header=True)