In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder

import warnings
# Install a blanket "ignore" filter so no warnings are displayed from here on.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Read Data

In [None]:
# Single place to change the competition data location; both CSVs live here.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2021'

# Load the raw training and test tables.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# (rows, columns) of the freshly loaded train and test frames.
train.shape, test.shape

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Print per-column dtypes and non-null counts for the training data.
train.info()

In [None]:
# Keep the 'Survived' label as the target and remove it from the training
# features so that train has no label column from here on.
target = train.loc[:, 'Survived']
train.drop(columns = 'Survived', inplace = True)

## Missing Values

In [None]:
# Total number of missing cells in train and in test.
train.isna().sum().sum(), test.isna().sum().sum()

In [None]:
# Fraction of missing values per column, first for train and then for test.
train.isnull().mean(), test.isnull().mean()

In [None]:
# Visualise where values are missing: one heatmap per frame, with samples on
# the rows and features on the columns; cells where isnull() is True get the
# contrasting colour.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Title each panel so the train and test maps can be told apart at a glance.
for ax, frame, label in ((ax1, train, 'train'), (ax2, test, 'test')):
    sns.heatmap(frame.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)
    ax.set_title(f'Missing values - {label}')


## Impute

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so both sets receive the
# same preprocessing; the first train.shape[0] rows are the training rows.
df = pd.concat((train, test), ignore_index = True)

In [None]:
# categorical: names of every object-dtype column, in column order
feat_cat = df.columns[df.dtypes == 'object'].tolist()
feat_cat

In [None]:
# Replace missing categorical values with the 'NONE' placeholder, then cast to str.
# The fill has to happen first: astype(str) turns NaN into the literal string
# 'nan', after which fillna() has nothing left to replace.
for col in feat_cat:
    df.loc[:, col] = df[col].fillna('NONE').astype(str)

In [None]:
# numerical: dtype of every column that is not object-typed (a Series indexed
# by column name, not a plain list of names)
feat_num = df.dtypes.loc[lambda dtypes: dtypes != "object"]
feat_num

In [None]:
# Fill missing numeric values with statistics computed on the training rows
# only (mean for Age, median for Fare).
# Assign the result back instead of calling fillna(inplace=True) on df['Age']:
# the chained in-place form updates a temporary Series and leaves df untouched
# when pandas Copy-on-Write is active, and raises a FutureWarning on pandas 2.x.
df['Age'] = df['Age'].fillna(train['Age'].mean())
df['Fare'] = df['Fare'].fillna(train['Fare'].median())

## Categorical Interactions

In [None]:
# All unordered pairs of the listed categorical columns; with two names this is
# the single pair ('Sex', 'Embarked').
comb = list(itertools.combinations(['Sex', 'Embarked'], 2))

In [None]:
# For every column pair, add an interaction column named "<first>_<second>"
# whose values are the two string values joined by an underscore.
for first, second in comb:
    left = df[first].astype(str)
    right = df[second].astype(str)
    df.loc[:, f'{first}_{second}'] = left + '_' + right

## Polynomial Features

In [None]:
# Degree-2 expansion without the constant column; because interaction_only is
# False, squared terms are generated in addition to the cross term.
pf = PolynomialFeatures(degree = 2, interaction_only = False, include_bias = False)

In [None]:
# Expand Age and Fare into their polynomial terms and append the result to df
# as generically named columns f_1 ... f_n.
feat_poly = pf.fit_transform(df[['Age', 'Fare']])
n = feat_poly.shape[1]
poly_columns = [f'f_{i}' for i in range(1, n+1)]
df_transformed = pd.DataFrame(feat_poly, columns = poly_columns)
df = pd.concat((df, df_transformed), axis = 1)

## Label Encoder

In [None]:
def label_encode(df, column):
    """Add an integer-encoded copy of ``column`` to ``df`` and return its name.

    A LabelEncoder is fitted on the distinct values of ``df[column]`` and the
    encoded values are written to a new column called ``"<column>_le"``.
    ``df`` is modified in place.

    Parameters
    ----------
    df : DataFrame holding ``column``; receives the new encoded column.
    column : name of the column to encode.

    Returns
    -------
    The name of the newly created column.
    """
    encoded_name = f"{column}_le"
    distinct_values = df[column].unique().tolist()
    encoder = LabelEncoder().fit(distinct_values)
    df[encoded_name] = encoder.transform(df[column])
    return encoded_name

In [None]:
# Label-encode each categorical feature in turn and collect the names of the
# encoded columns that were added to df.
feat_le = [label_encode(df, name) for name in ('Sex', 'Embarked', 'Sex_Embarked')]

## Drop Columns

In [None]:
# Remove the identifier/free-text columns together with the raw columns that
# now have encoded or polynomial counterparts.
dropped_columns = [
    'PassengerId', 'Name', 'Ticket', 'Cabin',   # not kept as features
    'Sex', 'Embarked', 'Sex_Embarked',          # superseded by their *_le encodings
    'Age', 'Fare',                              # already present among the f_* columns
]
df.drop(columns = dropped_columns, inplace = True)

In [None]:
# Preview the fully engineered feature table.
df.head()

In [None]:
# Undo the earlier concatenation: the first n_train rows of df are the training
# rows, everything after them belongs to the test set.
n_train = train.shape[0]
train = df.iloc[:n_train]
test = df.iloc[n_train:]

In [None]:
# (rows, columns) of the processed train and test frames after the split.
train.shape, test.shape