In [1]:
import pandas as pd
import numpy as np

## Read the CSV

In [2]:
# loading the dataset
df = pd.read_csv("resources/train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Process features

In [3]:
# check the number of missing values in the data
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# drop rows with missing values in the 'Embarked' column
df = df.dropna(subset=['Embarked'])

In [5]:
# basic feature engineering of columns Name, Cabin, and addition of Family Size, Age Class, and Fare per Person courtesy of:
# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
# starting with titles: function that searches for substrings
#import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    print(big_string)
    return np.nan

# list of titles
title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

# recombine list to the four categories
df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

# replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        if x['Sex'] == 'Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
# titles now extracted from 'Name'
df['Title'] = df.apply(replace_titles, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] = df.apply(replace_titles, axis=1)


In [6]:
# replace null values in 'Cabin' with 'Unknown'
df['Cabin'] = df['Cabin'].fillna('Unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cabin'] = df['Cabin'].fillna('Unknown')


In [7]:
# turning 'Cabin' number into Deck
deck_list = []
cabin_list = df['Cabin'].tolist()
cabin_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'U']
for cabin in cabin_list:
    j = list(cabin)[0]
    if j in cabin_labels:
        deck_list.append(j)

df["Deck"] = deck_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Deck"] = deck_list


In [8]:
# replace null values in 'Age' by its median
#df['Age'] = df['Age'].fillna(df['Age'].median())

In [9]:
# 'borrowing' script from kaggle user 'PandaBrenda' https://www.kaggle.com/brendan45774/titanic-top-solution
mean = df["Age"].mean()
std = df["Age"].std()
is_null = df["Age"].isnull().sum()

# compute random numbers between the mean, std and is_null
rand_age = np.random.randint(mean - std, mean + std, size = is_null)

In [10]:
# fill NaN values in Age column with random values generated
age_slice = df["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
df["Age"] = age_slice
df["Age"] = df["Age"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = age_slice
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df["Age"].astype(int)


In [11]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Deck           0
dtype: int64

In [12]:
# creating new family_size column with linear combination of features
df['Family_Size'] = df['SibSp']+df['Parch']

# creating interaction term, 'Age'*'Class'
df['Age*Class'] = df['Age']*df['Pclass']

# create fare per person column
df['Fare_Per_Person'] = df['Fare']/(df['Family_Size']+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Family_Size'] = df['SibSp']+df['Parch']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age*Class'] = df['Age']*df['Pclass']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Fare_Per_Person'] = df['Fare']/(df['Family_Size']+1)


In [13]:
# verification
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Family_Size,Age*Class,Fare_Per_Person
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,Unknown,S,Mr,U,1,66,3.62500
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,Mrs,C,1,38,35.64165
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,Unknown,S,Miss,U,0,78,7.92500
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S,Mrs,C,1,35,26.55000
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,Unknown,S,Mr,U,0,105,8.05000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0000,Unknown,S,Mr,U,0,54,13.00000
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0000,B42,S,Miss,B,0,19,30.00000
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,17,1,2,W./C. 6607,23.4500,Unknown,S,Miss,U,3,51,5.86250
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0000,C148,C,Mr,C,0,26,30.00000


In [14]:
# drop features not to be included in our model
df = df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

In [15]:
# create binary encoded data from 'Sex' column with dummy encoding
df = pd.get_dummies(df)

# drop redundant data
df = df.drop(["Sex_male"], axis=1)

In [16]:
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Family_Size,Age*Class,Fare_Per_Person,Sex_female,...,Title_Mrs,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,0,3,22,1,0,7.2500,1,66,3.62500,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,38,1,0,71.2833,1,38,35.64165,1,...,1,0,0,1,0,0,0,0,0,0
2,1,3,26,0,0,7.9250,0,78,7.92500,1,...,0,0,0,0,0,0,0,0,0,1
3,1,1,35,1,0,53.1000,1,35,26.55000,1,...,1,0,0,1,0,0,0,0,0,0
4,0,3,35,0,0,8.0500,0,105,8.05000,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27,0,0,13.0000,0,54,13.00000,0,...,0,0,0,0,0,0,0,0,0,1
887,1,1,19,0,0,30.0000,0,19,30.00000,1,...,0,0,1,0,0,0,0,0,0,0
888,0,3,17,1,2,23.4500,3,51,5.86250,1,...,0,0,0,0,0,0,0,0,0,1
889,1,1,26,0,0,30.0000,0,26,30.00000,0,...,0,0,0,1,0,0,0,0,0,0


In [17]:
# export as csv for use elsewhere
df.to_csv("resources/train_transformed_2.csv", index=False, header=True)