In [164]:
import pandas as pd

In [165]:
# Read the three CSV inputs; the relative paths resolve against the
# notebook's current working directory.
titanic_train = pd.read_csv("train.csv")
titanic_test = pd.read_csv("test.csv")
gender_submission = pd.read_csv("gender_submission.csv")

## Prepare the Data for Machine Learning Algorithms

### Data Cleaning

**Numeric values**

In [166]:
# Count the missing values in each column of the training data.
titanic_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [167]:
from sklearn.impute import SimpleImputer

In [168]:
# Embarked is categorical, so the median imputation applied to the numeric
# columns below does not cover it. Only 2 rows are missing a value, so those
# rows are dropped instead of imputed.
titanic_train = titanic_train.dropna(subset=['Embarked'])

In [169]:
# Imputer that replaces missing entries with the median of their column.
imputer = SimpleImputer(strategy="median")

In [170]:
# Keep only the numeric columns; the median imputer is fitted on these.
# NOTE(review): this selection still contains PassengerId (an identifier) and
# Survived (presumably the prediction target) - confirm both are removed
# before the imputed array is used as model features.
titanic_train_num = titanic_train.select_dtypes(include="number")

In [171]:
# Fit on the numeric columns so the imputer learns one median per column.
imputer.fit(titanic_train_num)

SimpleImputer(strategy='median')

In [172]:
# The fitted medians are stored in the statistics_ attribute; compare them
# element-wise with the column medians computed by pandas as a sanity check.
imputer.statistics_ == titanic_train_num.median()

PassengerId    True
Survived       True
Pclass         True
Age            True
SibSp          True
Parch          True
Fare           True
dtype: bool

In [173]:
# Fill the missing values with the learned medians. transform returns a
# NumPy array, so the column names are not carried over.
X = imputer.transform(titanic_train_num)

In [174]:
# Display the imputed array.
X

array([[  1.    ,   0.    ,   3.    , ...,   1.    ,   0.    ,   7.25  ],
       [  2.    ,   1.    ,   1.    , ...,   1.    ,   0.    ,  71.2833],
       [  3.    ,   1.    ,   3.    , ...,   0.    ,   0.    ,   7.925 ],
       ...,
       [889.    ,   0.    ,   3.    , ...,   1.    ,   2.    ,  23.45  ],
       [890.    ,   1.    ,   1.    , ...,   0.    ,   0.    ,  30.    ],
       [891.    ,   0.    ,   3.    , ...,   0.    ,   0.    ,   7.75  ]])

*If you want to put it back into a pandas DataFrame:*

In [175]:
# index= keeps the original row labels, which have gaps after the dropna on Embarked
#titanic_train_df = pd.DataFrame(X, columns=titanic_train_num.columns, index=titanic_train_num.index)

**Text and Categorical attributes**

In [176]:
# Everything that is not numeric (text and categorical columns) is handled
# separately from the numeric features.
titanic_string = titanic_train.select_dtypes(exclude='number')
# Preview the first rows.
titanic_string.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [177]:
# Show the category frequencies of the two categorical columns to spot
# unexpected or rare labels.
print(
    titanic_string['Sex'].value_counts(),
    titanic_string['Embarked'].value_counts(),
    sep='\n',
)

male      577
female    312
Name: Sex, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [178]:
# Confirm that neither categorical column has missing values left.
print(
    titanic_string['Sex'].isnull().sum(),
    titanic_string['Embarked'].isnull().sum(),
    sep='\n',
)

0
0


In [179]:
# Keep only the two low-cardinality categorical columns; the other
# non-numeric columns (Name, Ticket, Cabin) are left out of the encoding.
titanic_string_cat = titanic_string[['Sex', 'Embarked']]

In [180]:
# View the selected categorical columns as a NumPy object array.
titanic_string_cat.to_numpy()

array([['male', 'S'],
       ['female', 'C'],
       ['female', 'S'],
       ...,
       ['female', 'S'],
       ['male', 'C'],
       ['male', 'Q']], dtype=object)

*PS: for OneHotEncoder, the hyperparameter handle_unknown='ignore' could be used instead of dropping NAs from the Embarked column at the beginning of the notebook. However, it cannot be used together with the drop parameter, because it would then be ambiguous whether an all-zeros row means the dropped category or an unknown one.*

In [187]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Sex and Embarked. drop='if_binary' collapses a two-category
# feature into a single column, so Sex yields one column while Embarked keeps
# one column per category.
enc = OneHotEncoder(drop='if_binary')
# Fit and encode in one step, then densify the result for display.
enc.fit_transform(titanic_string_cat).toarray()

array([[1., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.]])

*An alternative to OneHotEncoder, especially suited to variables whose categories have a natural order, like "good, normal, bad":*

In [184]:
# from sklearn.preprocessing import OrdinalEncoder

# ordinal_encoder = OrdinalEncoder()
# titanic_cats_encoded = ordinal_encoder.fit_transform(titanic_string_cat)
# titanic_cats_encoded[:5]
# ordinal_encoder.categories_