# Classification Walkthrough: Titanic Dataset


In [36]:
import matplotlib as plt
import pandas as pd 
from sklearn import ensemble, preprocessing, tree
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import train_test_split, StratifiedKFold    
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.model_selection import LearningCurve
import numpy as np
import seaborn as sns

In [37]:
# Set up the Notebook
# `%matplotlib widget` selects the interactive (ipympl) figure backend.
%matplotlib widget
# NOTE(review): this only loads the autoreload extension; no `%autoreload`
# mode is selected in this cell — confirm whether `%autoreload 2` was intended.
%load_ext autoreload

# NOTE(review): the filter below silences every warning, not only a specific
# few. Consider narrowing it (`category=` / `message=`) so that deprecation
# notices from numpy / pandas / scikit-learn stay visible.
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# Load the raw Titanic data.
# `orig_df` must be an independent copy: a plain `orig_df = df` only creates a
# second name for the same object, so any in-place cleaning applied to `df`
# would silently rewrite the "original" frame as well.
df = pd.read_csv("datasets/titanic_data.csv")
orig_df = df.copy()
df.sample(5)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
994,3,0,"Mardirosian, Mr. Sarkis",male,?,0,0,2655,7.2292,F E46,C,?,?,?
990,3,0,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,?,S,?,?,?
610,3,0,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,?,S,?,?,"Sweden Akeley, MN"
1043,3,1,"Murphy, Miss. Margaret Jane",female,?,1,0,367230,15.5,?,Q,16,?,?
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


## Explore and Clean the Data

In [39]:
# Overview of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [40]:
# Summary statistics (count / unique / top / freq) of the object-dtype columns:
df.describe(include=[object])

Unnamed: 0,name,sex,age,ticket,fare,cabin,embarked,boat,body,home.dest
count,1309,1309,1309,1309,1309.0,1309,1309,1309,1309,1309
unique,1307,2,99,929,282.0,187,4,28,122,370
top,"Kelly, Mr. James",male,?,CA. 2343,8.05,?,S,?,?,?
freq,2,843,263,11,60.0,1014,914,823,1188,564


In [41]:
# (number of rows, number of columns)
df.shape

(1309, 14)

In [42]:
# Summary statistics of the numeric columns, limited to the first two of them.
df.describe().iloc[:,:2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [43]:
# Count the NaN values in each column:
df.isnull().sum()

pclass       0
survived     0
name         0
sex          0
age          0
sibsp        0
parch        0
ticket       0
fare         0
cabin        0
embarked     0
boat         0
body         0
home.dest    0
dtype: int64

In [44]:
# The count above reports no NaN values in this dataframe.
# Let's look at 20 random rows to check whether missing data is encoded
# in some form other than NaN.
df.sample(20)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
201,1,0,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S,?,175,"Dorchester, MA"
1162,3,1,"Ryan, Mr. Edward",male,?,0,0,383162,7.75,?,Q,14,?,?
226,1,0,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S,?,?,"Isleworth, England"
211,1,0,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,?,S,?,?,"Washington, DC"
48,1,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,?,C,6,?,"Washington, DC"
1003,3,1,"McCoy, Mr. Bernard",male,?,2,0,367226,23.25,?,Q,16,?,?
39,1,0,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C,?,208,"Omaha, NE"
31,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31.0,A31,C,7,?,"Glen Ridge, NJ"
751,3,0,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,?,S,?,?,"Bulgaria Chicago, IL"
929,3,0,"Kiernan, Mr. John",male,?,1,0,367227,7.75,?,Q,?,?,?


In [45]:
# The data marks missing values with a literal '?'; map it to NaN so that
# pandas' missing-data tools (isnull, dropna, ...) recognise them.
# `np.nan` is the canonical spelling: the `np.NaN` alias was removed in NumPy 2.0.
df.replace(to_replace='?', value=np.nan, inplace=True)

In [46]:
# Count the missing values per column again, now that '?' has become NaN:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [47]:
# Boolean mask with one entry per row: True when the row holds at least one NaN
mask = df.isnull().any(axis=1)
mask.head()

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [48]:
# `body` values of the first rows selected by the missing-data mask
df[mask].body.head()

0    NaN
1    NaN
2    NaN
3    135
4    NaN
Name: body, dtype: object

In [49]:
# value_counts() leaves null / NaN values out of the tally by default.
# If you want to include those, use dropna=False
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [50]:
# Default call: NaN entries are not counted
df.embarked.value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [51]:
# Same tally, this time with the missing values included
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In this case we have a couple of options for dealing with the missing data. We could, for instance, fill in S since it is the most common value, or dig into the data and try to find a pattern.

In [52]:
# Let's keep the `name` column in its own variable
# before dropping it from our dataframe,
# in case we want to perform NLP or extract data out of the text, like titles
name = df.name
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

## Feature Selection

In [53]:
# Remove the columns that will not be used as model features.
df = df.drop(
    labels=["name", "ticket", "home.dest", "boat", "body", "cabin"],
    axis="columns",
)

In [54]:
# Missing values per column in the remaining features
df.isnull().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [55]:
# Check whether only the categorical columns are still of dtype object
df.dtypes

pclass       int64
survived     int64
sex         object
age         object
sibsp        int64
parch        int64
fare        object
embarked    object
dtype: object

In [56]:
# Let's convert age and fare to float.
# `np.float` was merely an alias of the builtin `float` and was removed in
# NumPy 1.24, so the builtin is used directly.
df['fare'] = pd.to_numeric(df['fare'], errors='coerce')
df['age'] = df['age'].astype(float)
df[['age', 'fare']].dtypes

age     float64
fare    float64
dtype: object

In [57]:
# Now we can transform the categorical features
# such as sex and embarked into dummy variables
df = pd.get_dummies(df)
# Alternatively, we can pass drop_first=True to get_dummies
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [58]:
# sex_male and sex_female are perfectly inversely correlated, so we can
# remove one of them. embarked_C is likewise (almost entirely) determined by
# embarked_Q and embarked_S.
# Typically we remove any columns with perfect or very high correlation.
# Dropping embarked_C here as well keeps a fresh run consistent with the
# column listings recorded in this notebook (which contain no embarked_C) and
# with the drop_first=True encoding used by tweak_titanic.
df = df.drop(columns=['sex_male', 'embarked_C'])
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'embarked_Q', 'embarked_S'],
      dtype='object')

In [59]:
# Separate the target from the predictors:
# y is the series of `survived` labels, X holds every remaining column.
y = df["survived"]
X = df.drop(labels="survived", axis="columns")

## Sample Data

In [60]:
# Split data into training and testing data
frac = 0.3    # share of the rows held out for testing
rstate = 42   # fixed seed so the split is reproducible
# Use `frac` instead of repeating the literal, and bind the test labels to
# `y_test` (the previous `y_text` was a typo).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=frac, random_state=rstate
)

## Impute Data

In [61]:
# we impute missing values on the training set, and use the trained imputers to fill in the test dataset
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute

In [62]:
num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female']

# Fit the imputer on the training rows only, then reuse the fitted imputer to
# fill in the test rows, so nothing is learned from the test set.
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])

# The split frames are slices of X: work on explicit copies and replace whole
# columns, which avoids chained-assignment warnings and dtype clashes when
# float results are written into integer / boolean columns.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[num_cols] = imputed
X_test[num_cols] = imputer.transform(X_test[num_cols])

In [63]:
# Feature columns of the training frame after imputation
X_train.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female', 'embarked_Q',
       'embarked_S'],
      dtype='object')

## Normalize Data

In [71]:
# Column labels used to rebuild DataFrames after scaling.
# They are read from the training frame itself: the previous hand-written,
# comma-split string produced labels with embedded spaces and quotes, left out
# `parch`, and listed `embarked_C`, so scaled columns ended up mislabelled.
cols = list(X_train.columns)

In [72]:
# Create the scaler
sca = preprocessing.StandardScaler()

# Fit on the training data and transform it. The scaler returns a bare numpy
# array, so wrap it back into a DataFrame that reuses the frame's own column
# labels and row index (keeps features correctly named and rows aligned with
# y_train). The right-hand side is evaluated before X_train is rebound.
X_train = pd.DataFrame(
    sca.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)

# Ditto for the test data, using the statistics learned from the training set
X_test = pd.DataFrame(
    sca.transform(X_test), columns=X_test.columns, index=X_test.index
)

## Refactor

In [73]:
def tweak_titanic(df):
    """Turn a Titanic frame into a model-ready feature table.

    Steps:
      1. drop the columns that are not used as features,
      2. turn the '?' missing-value marker into NaN,
      3. make sure ``age`` and ``fare`` are numeric (they are read as
         object columns when the file contains '?'); values that cannot
         be parsed become NaN,
      4. one-hot encode the remaining categorical columns, dropping the
         first level of each.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``ticket``, ``home.dest``,
        ``boat``, ``body`` and ``cabin``; a KeyError is raised otherwise.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    cleaned = df.drop(
        columns=[
            "name",
            "ticket",
            "home.dest",
            "boat",
            "body",
            "cabin",
        ]
    ).replace("?", np.nan)
    # Without this, string-typed age / fare columns would be one-hot encoded
    # value by value. Already-numeric columns pass through unchanged.
    for col in ("age", "fare"):
        if col in cleaned.columns:
            cleaned[col] = pd.to_numeric(cleaned[col], errors="coerce")
    return pd.get_dummies(cleaned, drop_first=True)

In [74]:
def _with_columns(frame, columns, values):
    # Return a copy of `frame` in which each of `columns` is replaced by the
    # matching column of the 2-D array `values`.
    return frame.assign(
        **{name: values[:, pos] for pos, name in enumerate(columns)}
    )


def get_train_test_X_y(
    df, y_col, size=0.3, std_cols=None, random_state=42
):
    """Split a frame into train / test sets, then impute and optionally scale.

    The imputer and the scaler are fitted on the training rows only and then
    applied to the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that also contains the label column. It must contain
        the columns ``pclass``, ``age``, ``sibsp``, ``parch`` and ``fare``,
        which are the ones passed through the imputer.
    y_col : str
        Name of the label column.
    size : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    std_cols : list-like of str, optional
        Columns to standardize with ``StandardScaler``. Nothing is scaled
        when this is None or empty.
    random_state : int, default 42
        Seed for the split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    y = df[y_col]
    X = df.drop(columns=y_col)
    # `train_test_split` is imported by name at the top of the notebook; the
    # `model_selection` module itself is never imported there.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=random_state
    )

    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare",
    ]
    fi = impute.IterativeImputer()
    X_train = _with_columns(
        X_train, num_cols, fi.fit_transform(X_train[num_cols])
    )
    X_test = _with_columns(X_test, num_cols, fi.transform(X_test[num_cols]))

    # Explicit None / length test: a bare truthiness check raises for
    # array-like inputs such as a pandas Index.
    if std_cols is not None and len(std_cols) > 0:
        std_cols = list(std_cols)
        std = preprocessing.StandardScaler()
        X_train = _with_columns(
            X_train, std_cols, std.fit_transform(X_train[std_cols])
        )
        X_test = _with_columns(
            X_test, std_cols, std.transform(X_test[std_cols])
        )

    return X_train, X_test, y_train, y_test