# Libraries importing and configuration

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic train and test splits from the Kaggle input directory.
train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

# Preview the first rows of the training split.
train_data.head()

# Removing irrelevant features

Some features carry no useful information for the classification models, so we'll remove 'PassengerId' and 'Name', which aren't relevant for determining whether a passenger survived.

In [None]:
# Drop the identifier-like columns by building a new frame instead of deleting
# in place. errors='ignore' keeps the cell re-runnable: a second execution
# finds the columns already gone and is a no-op instead of raising KeyError.
train_data = train_data.drop(columns=['Name', 'PassengerId'], errors='ignore')

# Handling missing data I

In [None]:
# Count the missing values in each column of the training data.
train_data.isna().sum()

As the Embarked feature has only a small number of missing values, we can remove the rows containing them.

In [None]:
# Keep only the rows where Embarked is present. The explicit .copy() makes the
# result an independent frame, so later column assignments on train_data do not
# trigger SettingWithCopyWarning or write to a temporary slice.
train_data = train_data.loc[train_data["Embarked"].notna()].copy()

# Textual data encoding

Most of the available machine learning algorithms can't handle textual data, so we'll be transforming it into numerical data.

First we'll see how many unique values each of the textual features has, in order to choose the right encoding for each of them. If we aren't careful with the encoding, the model can suffer from the curse of dimensionality or learn the features in the wrong way.

In [None]:
# For every column, report how many distinct values it holds (NaN counts as a
# value) together with its dtype, to help choose an encoding.
for col, series in train_data.items():
    n_unique = len(pd.unique(series))
    print(f"{col}: ", n_unique, f" ({series.dtype})")

As can be seen, some of the features would lead to high-dimensional data if we applied one-hot encoding. So, we'll apply ordinal encoding to the Ticket and Cabin features.

In [None]:
# Replace missing Cabin values with the literal string 'NaN' so the encoder
# treats "unknown cabin" as its own category. fillna on a fresh frame is used
# instead of chained indexing (df['Cabin'][mask] = ...), which writes to a
# temporary object and is silently ignored under pandas Copy-on-Write.
train_data = train_data.assign(Cabin=train_data['Cabin'].fillna('NaN'))

# Map each distinct Ticket / Cabin string to a numeric code.
ord_enc = OrdinalEncoder()
train_data[['Ticket', 'Cabin']] = ord_enc.fit_transform(train_data[['Ticket', 'Cabin']])

For the Embarked and Sex features we'll use one-hot encoding, because with ordinal encoding the model could learn an order relationship between the values, and, as we know, no such relationship exists between the values of these features.

In [None]:
# Expand the nominal Sex and Embarked columns into one indicator column per value.
train_data = pd.get_dummies(
    train_data,
    columns=['Sex', 'Embarked'],
)
train_data.head()

# Handling missing data II

For the rest, we'll be using the kNN imputer to fill the missing data.

In [None]:
# Fill the remaining missing values from the 5 nearest neighbours, then wrap
# the imputed array back into a DataFrame with the original column names.
knn_imputer = KNNImputer(n_neighbors=5)
imputed_values = knn_imputer.fit_transform(train_data)
train_data = pd.DataFrame(imputed_values, columns=train_data.columns)
train_data.head()

In [None]:
# Re-count the missing values per column after the imputation step.
train_data.isna().sum()

# Checking for outliers

One way to check whether an attribute has outliers is to look at the statistical summary of the data. If a feature shows a large discrepancy between the mean and the median, it's likely that it has outliers.

In [None]:
# Summary statistics for every column, used to compare against the medians.
train_data.describe()

In [None]:
# Per-column median, to compare against the means in the summary.
train_data.median()

As can be seen, only the Fare feature has a significant difference between the mean and median, so we'll remove all rows where the fare is 2.4 or more standard deviations away from the mean.

In [None]:
# Standardise Fare and keep only the rows whose absolute z-score is below 2.4.
fare_zscores = stats.zscore(train_data['Fare'])
within_range = np.abs(fare_zscores) < 2.4
train_data = train_data[within_range]
train_data.describe()

# Feature scaling

For many machine learning algorithms it is important for the features to be on the same scale, so we'll apply MinMax scaling to the features with high variance.

In [None]:
# Per-column population variance (ddof=0, the value np.var produced).
# DataFrame.var is called directly with axis=0 because np.var(DataFrame)
# forwards axis=None, which recent pandas versions deprecate and plan to turn
# into a single scalar over the whole frame; that would break the per-column
# lookup below.
variance = train_data.var(axis=0, ddof=0)
print(variance)

# Columns whose variance exceeds 2 are the ones selected for scaling.
highvar_cols = [col for col in train_data.columns if variance[col] > 2]
print(highvar_cols)

In [None]:
# Rescale only the high-variance columns to the [0, 1] range, writing the
# result into a copy so train_data itself keeps its original values.
minmax_scal = MinMaxScaler(feature_range=(0.0,1.0))
train_data_scaled = train_data.copy()
train_data_scaled[highvar_cols] = minmax_scal.fit_transform(train_data[highvar_cols])
train_data_scaled.head()

# Correlation analysis

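As we can see below, there are no features with high correlation (apart from the complementary one-hot columns, such as the two Sex indicator columns, which are strongly negatively correlated by construction).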

In [None]:
# Pairwise plot grid over the columns of the scaled training data.
sns.pairplot(train_data_scaled)
plt.show()

In [None]:
# Pairwise correlation matrix of the scaled features.
train_data_scaled.corr()

In [None]:
# Render the correlation matrix as a heatmap on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(train_data_scaled.corr(), ax=ax)
plt.show()

In [None]:
def preprocess_titanic(data):
    """Run the notebook's full preprocessing pipeline on a raw Titanic frame.

    Steps, in order:
      1. Drop the 'Name' and 'PassengerId' columns.
      2. Drop rows whose 'Embarked' value is missing.
      3. Replace missing 'Cabin' values with the literal string 'NaN'.
      4. Ordinal-encode 'Ticket' and 'Cabin'.
      5. One-hot encode 'Sex' and 'Embarked'.
      6. Fill remaining missing values with a 5-neighbour KNN imputer
         (the result gets a fresh default index).
      7. Drop rows whose 'Fare' absolute z-score is 2.4 or more.
      8. MinMax-scale to [0, 1] every column whose population variance
         exceeds 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns 'Name', 'PassengerId',
        'Embarked', 'Cabin', 'Ticket', 'Sex' and 'Fare'. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric frame with the transformations above applied.

    Raises
    ------
    KeyError
        If 'Name' or 'PassengerId' (or another required column) is missing.

    Notes
    -----
    Every encoder, imputer and scaler is fit on ``data`` itself, so the codes
    and scales produced by separate calls (e.g. train vs. test) are not
    comparable with each other.
    """
    # drop() returns a new frame, so the caller's DataFrame keeps its columns
    # (the previous `del data[...]` mutated the argument in place).
    data = data.drop(columns=['Name', 'PassengerId'])

    # .copy() detaches the filtered rows from the source frame so the column
    # assignments below act on a real frame rather than a slice.
    data = data[data['Embarked'].notna()].copy()

    # fillna instead of chained indexing, which is a no-op under Copy-on-Write.
    data['Cabin'] = data['Cabin'].fillna('NaN')

    text_cols = ['Ticket', 'Cabin']
    data[text_cols] = OrdinalEncoder().fit_transform(data[text_cols])

    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])

    imputed = KNNImputer(n_neighbors=5).fit_transform(data)
    data = pd.DataFrame(imputed, columns=data.columns)

    # Keep rows whose Fare lies within 2.4 standard deviations of the mean.
    data = data[np.abs(stats.zscore(data['Fare'])) < 2.4].copy()

    # Population variance per column; explicit axis avoids the deprecated
    # axis=None behaviour that np.var(DataFrame) relies on.
    variance = data.var(axis=0, ddof=0)
    highvar_cols = [col for col in data.columns if variance[col] > 2]

    # Nothing to scale when no column exceeds the threshold; the scaler would
    # otherwise fail on an empty column selection.
    if highvar_cols:
        scaler = MinMaxScaler(feature_range=(0.0, 1.0))
        data[highvar_cols] = scaler.fit_transform(data[highvar_cols])
    return data

In [None]:
# Run the packaged pipeline on a freshly loaded copy of the training CSV and
# preview the result.
teste = preprocess_titanic(pd.read_csv('../input/titanic/train.csv'))
teste.head()

In [None]:
# First rows of the step-by-step result, presumably shown for comparison with
# the function's output in the previous cell.
train_data_scaled.head()