In [None]:
# Ideas
## binning age
## split names, check families
## investigate if 'SibSp' and 'Parch' have some relationship? binning?
## verify if tickets have some pattern
## Try some relationship test for Children age < XX and 'SibSp', 'Parch'
## imputation for NANs
## Family size = SibSp + Parch + 1
## Has Cabin? Which type: A, B, C, D, etc.
## No Ticket?


## Skewed variables
### 'SibSp', 'Parch', 'Fare'

# Tabular Playground Series - Apr-2021
https://www.kaggle.com/c/tabular-playground-series-apr-2021

## Libraries and Settings

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

import missingno as msno
import re


# Print the installed seaborn and pandas versions.
print("seaborn", sns.__version__)
print("pandas", pd.__version__)

from pathlib import Path

# Parent of the current working directory (not referenced elsewhere in this section).
PATH = Path.cwd().parent
# Directory holding the competition CSV files; used as a string prefix when loading.
DATA_PATH = "../input/tabular-playground-series-apr-2021/"
# Presumably the random seed for later steps; not referenced in this section (TODO confirm).
SEED = 42

## Load the data

In [None]:
# Read the training split from the competition input directory.
train = pd.read_csv(DATA_PATH + "train.csv")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the report.
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Read the test split from the competition input directory.
test = pd.read_csv(DATA_PATH + "test.csv")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the report.
test.info()

In [None]:
# Preview the first rows of the test frame.
test.head()

## Preprocess

In [None]:
# Imputation
# Skewed variables: should be transformed for linear models
## 'SibSp', 'Parch', 'Fare'

from sklearn.preprocessing import KBinsDiscretizer

def impute(df, col, by, agg_func="mean"):
    """Fill missing values of ``col`` with a per-group aggregate.

    Rows where ``col`` is missing are marked with 1 in the ``flag_na`` column.
    Each missing value is replaced by ``agg_func`` computed over the observed
    (non-missing) values of ``col`` within the row's ``by`` group. Rows whose
    group has no observed value fall back to ``agg_func`` over all observed
    values of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    col : str
        Name of the column to impute.
    by : str or list of str
        Column name(s) defining the groups.
    agg_func : str or callable, default "mean"
        Aggregation passed to pandas ``agg``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col`` filled and ``flag_na`` updated.
    """
    by_cols = list(by) if isinstance(by, (list, tuple)) else [by]

    missing = df[col].isna()
    df.loc[missing, "flag_na"] = 1

    # Aggregates are computed on observed values only.
    observed = df.loc[~missing]
    global_agg = observed[col].agg(agg_func)
    group_agg = observed.groupby(by_cols, as_index=False)[col].agg(agg_func)

    # A left merge keeps the row order of the left frame but returns a fresh
    # 0..n-1 index, so the looked-up values are re-attached to df's own index
    # positionally before being used as fill values.
    looked_up = df[by_cols].merge(group_agg, on=by_cols, how="left")[col]
    group_fill = pd.Series(looked_up.to_numpy(), index=df.index)

    # Plain assignment instead of a chained ``fillna(inplace=True)``, which may
    # operate on a temporary copy and leave ``df`` unchanged.
    df[col] = df[col].fillna(group_fill).fillna(global_agg)
    return df

def preprocess(df):
    """Impute missing values in ``Embarked``, ``Age`` and ``Fare``.

    A ``flag_na`` column is initialised to 0; ``impute`` sets it to 1 on rows
    where ``Age`` or ``Fare`` was missing. Missing ``Embarked`` values are
    replaced by the sentinel string "-999" (these rows are not flagged).
    ``Age`` is imputed per (Pclass, Sex) group and ``Fare`` per
    (Pclass, Sex, Embarked) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Embarked``, ``Age``, ``Fare``, ``Pclass`` and ``Sex`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, imputed frame; the input frame is left unmodified.
    """
    # Work on a copy so re-running the calling cell does not see a frame that
    # was already altered by a previous run.
    df = df.copy()

    # imputation
    df["flag_na"] = 0
    # Plain assignment instead of a chained ``fillna(inplace=True)``, which may
    # operate on a temporary copy and leave ``df`` unchanged.
    df["Embarked"] = df["Embarked"].fillna("-999")
    df = impute(df, "Age", ["Pclass", "Sex"])
    df = impute(df, "Fare", ["Pclass", "Sex", "Embarked"])
    return df

## Feature Engineering

In [None]:
def ord_discretize(df, col, n_bins=10, sufix="_disc"):
    """Add an ordinal-binned version of ``col`` to ``df``.

    A ``KBinsDiscretizer`` with ordinal encoding is fitted on the values of
    ``col`` in the frame passed in (so the bin edges are specific to that
    frame) and the resulting bin codes are stored in ``col + sufix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place and also returned.
    col : str
        Name of the numeric column to discretize.
    n_bins : int, default 10
        Number of bins passed to ``KBinsDiscretizer``.
    sufix : str, default "_disc"
        Suffix appended to ``col`` to name the new column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column added.
    """
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode="ordinal")
    discrete_values = discretizer.fit_transform(df[col].to_numpy().reshape(-1, 1))
    # Assign the flat array directly so values are placed by position. Wrapping
    # it in a new Series would give it a 0..n-1 index and the assignment would
    # then align by label, producing NaN/misplaced bins on any other index.
    df[col + sufix] = discrete_values.ravel()
    return df

def get_ticket_letters(x):
    """Return the ASCII letters found in ``x``, concatenated in order.

    ``x`` is a string; when it contains no ASCII letters the sentinel
    "-999" is returned instead.
    """
    letter_runs = re.findall("[a-zA-Z]+", x)
    # The pattern never yields empty matches, so an empty list means no letters.
    if not letter_runs:
        return "-999"
    return "".join(letter_runs)

def feature_engineering(df):
    """Derive name, family, age, cabin and ticket features.

    Columns added: ``LastName``, ``FirstName``, ``FamilySize``, ``IsChild``,
    ``IsOld``, the discretized age column produced by ``ord_discretize``,
    ``CabinType``, ``NoTicket`` and ``TicketInfo``. Missing ``Ticket`` values
    are replaced by "0".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Name``, ``SibSp``, ``Parch``, ``Age``, ``Cabin`` and
        ``Ticket`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns; the input frame is not modified.
    """
    # Work on a copy and assign columns by name, so calling this on an already
    # processed frame overwrites the features instead of duplicating columns.
    df = df.copy()

    # First and last names: split on the first comma only, and always produce
    # exactly two parts, so names with zero or several commas do not raise.
    # NOTE: FirstName keeps whatever follows the comma verbatim (including a
    # leading space, if present).
    name_parts = df["Name"].str.split(",", n=1, expand=True).reindex(columns=[0, 1])
    df["LastName"] = name_parts[0]
    df["FirstName"] = name_parts[1]

    # Family Size
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Age Features (comparisons with a missing age are False, i.e. flag 0)
    df["IsChild"] = (df["Age"] <= 12).astype(int)
    df["IsOld"] = (df["Age"] >= 65).astype(int)
    df = ord_discretize(df, "Age")

    # Cabin Type: first character of the cabin label. A missing cabin becomes
    # "n" (first character of "nan"); slicing avoids an IndexError on "".
    df["CabinType"] = df["Cabin"].map(lambda cabin: str(cabin)[:1])

    # Tickets Information. Missing tickets are NaN, which never compares equal
    # to None, so missingness is detected with isna() before filling.
    df["NoTicket"] = df["Ticket"].isna().astype(int)
    df["Ticket"] = df["Ticket"].fillna("0")
    df["TicketInfo"] = df["Ticket"].apply(get_ticket_letters)

    return df

In [None]:
# Run the training frame through imputation, then feature construction.
train = (
    train
    .pipe(preprocess)
    .pipe(feature_engineering)
)

# Column summary followed by a preview of the processed training frame.
train.info()
train.head()

In [None]:
# Run the test frame through imputation, then feature construction.
test = (
    test
    .pipe(preprocess)
    .pipe(feature_engineering)
)

# Column summary followed by a preview of the processed test frame.
test.info()
test.head()

In [None]:
# Persist both processed frames as CSV files in the current working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; confirm downstream readers expect it.
train.to_csv(path_or_buf="train_processed.csv")
test.to_csv(path_or_buf="test_processed.csv")