In [286]:
import pandas as pd
import numpy as np

In [287]:
# Read the training and test splits (train first, then test) from the CSV
# files located two directories above the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../../train.csv", "../../test.csv")
)

In [288]:
# Remove columns that are not kept as model inputs.
# 'Cabin' is removed because it has many NaN values.
# NOTE(review): 'PassengerId' is also removed from test_data; if a later step
# needs the ids (e.g. to label predictions), capture them before this cell.
cols = ['Ticket', 'Name', 'Cabin', 'PassengerId']  # each label listed once

# Reassign instead of using inplace=True so the cell reads as a plain
# transformation of its inputs.
train_data = train_data.drop(columns=cols)
test_data = test_data.drop(columns=cols)

In [289]:
# Display the dtype of every column currently in train_data.
train_data.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [290]:
# Per column: True if train_data has at least one missing value in it.
train_data.isnull().any()

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool

In [291]:
#fill NAN
def replace_numeric(columns: list, dataset: pd.DataFrame):
    """Fill missing values in the given columns with each column's mean.

    ``dataset`` is modified: every listed column that contains at least one
    missing value is replaced by a copy in which the gaps hold the mean of
    that column's non-missing values. Columns without missing values are
    left untouched.

    Args:
        columns: Labels of the columns to impute.
        dataset: DataFrame to modify.

    Returns:
        None.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``dataset``.
    """
    for col in columns:
        series = dataset[col]
        if series.isna().any():
            # Series.mean() skips NaN by default, so no explicit dropna() is
            # needed. Assign the filled column back to the frame rather than
            # calling fillna(inplace=True) on `dataset[col]`: that chained
            # form acts on a temporary object and does not update `dataset`
            # when pandas Copy-on-Write is active.
            dataset[col] = series.fillna(series.mean())

def replace_categorical(columns: list, dataset: pd.DataFrame):
    """Fill missing values in the given columns with each column's mode.

    ``dataset`` is modified: every listed column that contains at least one
    missing value is replaced by a copy in which the gaps hold the column's
    most frequent non-missing value (the first one returned by
    ``Series.mode()`` when several values tie). A column with no non-missing
    values has no mode and is left unchanged.

    Args:
        columns: Labels of the columns to impute.
        dataset: DataFrame to modify.

    Returns:
        None.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``dataset``.
    """
    for col in columns:
        series = dataset[col]
        if not series.isna().any():
            continue
        modes = series.mode()  # NaN is excluded by default
        if modes.empty:
            # Entirely-missing column: nothing to impute with.
            continue
        # Assign the filled column back to the frame rather than calling
        # fillna(inplace=True) on `dataset[col]`: that chained form acts on a
        # temporary object and does not update `dataset` when pandas
        # Copy-on-Write is active.
        dataset[col] = series.fillna(modes.iloc[0])

# Columns to impute, grouped by the strategy applied to them.
numeric_cols = ['Age', 'Fare', 'Parch', 'SibSp']
categorical_cols = ['Embarked', 'Sex']

# Mean-impute the numeric columns of the training split, then the test split.
for _split in (train_data, test_data):
    replace_numeric(numeric_cols, _split)

# Mode-impute the categorical columns of the training split, then the test split.
for _split in (train_data, test_data):
    replace_categorical(categorical_cols, _split)

In [292]:
# Number of missing values in each column of train_data.
train_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [293]:
# Number of missing values in each column of test_data.
test_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [294]:
# Map categorical columns to numeric indicator (one-hot) columns.
# For each categorical column, the indicator columns are derived from the
# training split and the test split is aligned to them, so both frames end up
# with the same indicator columns in the same order even if a category is
# absent from one split (absent categories become all-zero columns).
for _col in ('Sex', 'Embarked'):
    _train_dummies = pd.get_dummies(train_data[_col], dtype=float)
    _test_dummies = pd.get_dummies(test_data[_col], dtype=float).reindex(
        columns=_train_dummies.columns, fill_value=0.0
    )
    train_data = pd.concat([train_data, _train_dummies], axis=1)
    test_data = pd.concat([test_data, _test_dummies], axis=1)

# Drop the original categorical columns now that they are encoded.
# NOTE(review): 'Fare' is dropped here as well, exactly as before. The former
# float conversion of 'Fare' was removed because the column was discarded
# immediately afterwards. Confirm that excluding 'Fare' is intended.
train_data = train_data.drop(columns=['Sex', 'Embarked', 'Fare'])
test_data = test_data.drop(columns=['Sex', 'Embarked', 'Fare'])

In [295]:
# Display the dtype of every column in train_data after encoding.
train_data.dtypes

Survived      int64
Pclass        int64
Age         float64
SibSp         int64
Parch         int64
female      float64
male        float64
C           float64
Q           float64
S           float64
dtype: object

In [296]:
# Display the dtype of every column in test_data after encoding.
test_data.dtypes

Pclass      int64
Age       float64
SibSp       int64
Parch       int64
female    float64
male      float64
C         float64
Q         float64
S         float64
dtype: object