In [78]:
import pandas as pd

In [79]:
# Load the training and test splits from CSV files in the working directory.
titanic = pd.read_csv("train.csv")
titanic_test = pd.read_csv("test.csv")

In [80]:
# Preview the first rows of the training data.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [81]:
# Count the missing values in each column to see which ones need cleaning.
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Prepare the Data for Machine Learning Algorithms

### Data Cleaning

**Numeric values**

In [82]:
from sklearn.impute import SimpleImputer

In [83]:
# Embarked is a categorical column that gets encoded later on, so keep only
# the rows where it is present rather than imputing it.
titanic = titanic[titanic['Embarked'].notna()]

In [84]:
# Imputer that fills missing values with the median of each column.
imputer = SimpleImputer(strategy="median")

In [85]:
# Select the numeric columns to run through the imputer. The training frame
# loaded from train.csv is named `titanic`; no `titanic_train` is defined in
# this notebook, so that name would raise NameError on a fresh kernel.
titanic_age_fare = titanic[["Age", "Fare"]]

In [86]:
# Learn the per-column medians of Age and Fare.
imputer.fit(titanic_age_fare)

SimpleImputer(strategy='median')

In [98]:
# After fitting, the imputer keeps the medians it computed in its statistics_
# attribute; compare them element-wise with the medians pandas computes.
imputer.statistics_ == titanic_age_fare.median()

Age     True
Fare    True
dtype: bool

In [88]:
# Fill the missing Age/Fare values with the learned medians.
# By default transform() returns a NumPy array rather than a DataFrame.
titanic_imp_age_fare = imputer.transform(titanic_age_fare)

In [106]:
# Alias the imputed Age/Fare values as X.
X = titanic_imp_age_fare

*If you want to put it back into a pandas DataFrame:*

In [110]:
# titanic_df = pd.DataFrame(X, columns=['Age', 'Fare'])
# NOTE: also pass index=titanic_age_fare.index to keep the source frame's row
# labels instead of getting a fresh 0..n-1 index.

**Text and Categorical attributes**

In [99]:
# Preview the non-numeric columns, i.e. the candidates for categorical encoding.
titanic.select_dtypes(exclude='number').head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [100]:
# Look at the category frequencies of Sex and Embarked to spot rare or
# unexpected values before encoding.
print(
    titanic['Sex'].value_counts(),
    titanic['Embarked'].value_counts(),
    sep='\n',
)

male      577
female    312
Name: Sex, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [101]:
# Confirm that neither categorical column still contains missing values
# (prints the Sex count, then the Embarked count, one per line).
print(*titanic[['Sex', 'Embarked']].isnull().sum(), sep='\n')

0
0


In [102]:
# Keep only the two categorical features that will be one-hot encoded.
titanic_cat = titanic.loc[:, ['Sex', 'Embarked']]

In [103]:
# View the selected categorical columns as a NumPy array.
titanic_cat.to_numpy()

array([['male', 'S'],
       ['female', 'C'],
       ['female', 'S'],
       ...,
       ['female', 'S'],
       ['male', 'C'],
       ['male', 'Q']], dtype=object)

*PS: for OneHotEncoder, the hyperparameter handle_unknown='ignore' could be used instead of dropping the NAs from the Embarked column at the beginning of the notebook. However, it cannot be used together with the drop parameter, because an all-zero row would then be ambiguous: it could stand for either the dropped category or an unknown one.*

In [104]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode Sex and Embarked. drop='if_binary' keeps a single column for
# a feature with exactly two categories, so Sex yields one column and Embarked
# yields three. Fitting and transforming are done in one call.
enc = OneHotEncoder(drop='if_binary')
enc.fit_transform(titanic_cat).toarray()

array([[1., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [1., 1., 0., 0.],
       [1., 0., 1., 0.]])

*An alternative to OneHotEncoder, especially when the categories have a natural order such as "good, normal, bad":*

In [105]:
# from sklearn.preprocessing import OrdinalEncoder

# ordinal_encoder = OrdinalEncoder()
# titanic_cats_encoded = ordinal_encoder.fit_transform(titanic_cat)
# titanic_cats_encoded[:5]
# ordinal_encoder.categories_