# Problem definition

The dataset used for this competition is synthetic but based on a real dataset (in this case, the actual Titanic data!) and was generated using a CTGAN.

Data description: 

| Variable        | Definition           | Key  |
|---------------|:-------------|------:|
|survival |	Survival | 0 = No, 1 = Yes |
|pclass |	Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
|sex |	Sex	 ||
|Age |	Age in years	 ||
|sibsp |	# of siblings / spouses aboard the Titanic	 ||
|parch |	# of parents / children aboard the Titanic	 ||
|ticket |	Ticket number	 ||
|fare |	Passenger fare	 ||
|cabin |	Cabin number	| |
|embarked |	Port of Embarkation	| C = Cherbourg, Q = Queenstown, S = Southampton |

<br>

Where `survival` will be our target variable! 🎯

<br>

Check out: 

  ➜ [Tuning of a LightGBM with Bayesian Optimization using the `tidymodels` framework in R](https://www.kaggle.com/gomes555/tps-apr2021-r-eda-lightgbm-bayesopt)

  ➜ [AutoML (lgbm + catboost) with mljar](https://www.kaggle.com/gomes555/tps-apr2021-autoboost-mljar)
<br>

  ➜ [Simple CatBoost + Preprocess](https://www.kaggle.com/gomes555/tps-apr2021-simple-catboost)
  
  ➜ [CatBoost + Pseudo Data + MovingThreshold](https://www.kaggle.com/gomes555/tps-apr2021-catboost-pseudo-movingthreshold)

<p align="right"><span style="color:firebrick">Don't forget to upvote if you liked the notebook! ✌️</span></p>

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot

import category_encoders as ce

from sklearn.pipeline import Pipeline

from sklearn.feature_selection import RFE
from boruta import BorutaPy

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
# Directory that holds the competition CSV files.
path = "../input/tabular-playground-series-apr-2021/"
# path = "" # local

# train.csv and test.csv are read with their first column as the index.
train, test = (
    pd.read_csv(f"{path}{split}.csv", index_col=0) for split in ("train", "test")
)
# The sample submission keeps the default integer index.
submission = pd.read_csv(f"{path}sample_submission.csv")

# Preprocess + feature engineering

In [None]:
# Compute SameFirstName: the number of rows, within the same split, that share
# the part of `Name` before the first ', ' (stored as `FirstName`).
for frame in (train, test):
    frame['FirstName'] = frame['Name'].apply(lambda full_name: full_name.split(', ')[0])
    # Column of ones that is summed per group to obtain the counts.
    frame['n'] = 1
    gb = frame.groupby('FirstName')
    df_names = gb['n'].sum()
    # Look every FirstName up in the per-name totals of its own split.
    frame['SameFirstName'] = frame['FirstName'].map(df_names)

# To preprocess
# Stack train and test so that every transformation is applied to both.
data = pd.concat([train, test], axis=0)

# Before fill missing
# Flag rows that have at least one missing feature. The target is left out of
# the check: after the concat every test row has a NaN `Survived`, which would
# otherwise set the flag to 1 for the entire test set.
features_only = data.drop(columns=['Survived'], errors='ignore')
data['AnyMissing'] = features_only.isnull().any(axis=1).astype(int)

# Family
# Siblings/spouses + parents/children + the passenger.
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = np.where(data['FamilySize'] <= 1, 1, 0)

# Cabin
# 1 when a cabin value is present, 0 when it is missing (checked with
# `notna()` instead of comparing the Python type of each cell to `float`).
data['Has_Cabin'] = data['Cabin'].notna().astype(int)

# Keep only the first character of the cabin string. Missing cabins, and
# cabins whose first character is blank, fall back to the placeholder 'X'.
deck = data['Cabin'].fillna('X').str[0].str.strip()
deck = deck.where(deck.notna() & (deck != ''), 'X')

# Integer code per first character; values that are not in the map are left
# untouched by `replace`.
cabin_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5,
             'F': 6, 'G': 7, 'T': 1, 'X': 8}
data['Cabin'] = deck.replace(cabin_map)

# Embarked
# Missing ports receive their own label "No".
# Disabled alternatives kept for reference:
#   - fill with the mode: train.Embarked.mode().item()
#   - integer coding via np.select: S -> 0, Q -> 1, C -> 2, No -> -1
data['Embarked'] = data['Embarked'].where(data['Embarked'].notna(), "No")

# Name
# Everything after the first ', ' in `Name`. The split limit is passed as
# `n=1`: pandas 2.0 made every argument after the pattern keyword-only, so the
# positional form raises a TypeError there.
data['SecondName'] = data['Name'].str.split(', ', n=1, expand=True)[1] # to try
# 1 when the same FirstName already occurred in an earlier row of `data`
# (the first occurrence itself is 0).
data['IsFirstNameDublicated'] = data['FirstName'].duplicated().astype(int)

# Fare
# Missing fares are filled with the median fare of the training rows.
data['Fare'] = data['Fare'].fillna(train['Fare'].median())
# Quartiles of this dataset, for reference (pd.qcut(train['Fare'], 4)):
# [(0.679, 10.04] < (10.04, 24.46] < (24.46, 33.5] < (33.5, 744.66]]
# The bin edges used below are the ones from the original Titanic data.
# Right-closed bins: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
fare_edges = [7.91, 14.454, 31]
data['Fare'] = np.digitize(data['Fare'], bins=fare_edges, right=True).astype(int)

# Fix Ticket
# Numeric part: the first run of digits in the ticket, 0 when there is none.
# `expand=False` makes `extract` return a Series, so a single column is
# assigned rather than a one-column DataFrame.
data['TicketNum'] = data['Ticket'].str.extract(r'(\d+)', expand=False).\
                    astype('float64').\
                    fillna(0) # to_try
# Textual part: the ticket with dots, digits and spaces removed (one raw-string
# character class replaces the three chained substitutions). Tickets that end
# up empty, and missing tickets, become 'X'.
data['Ticket'] = data['Ticket'].str.replace(r'[.\d ]', '', regex=True).\
                    replace(r'^\s*$', 'X', regex=True).\
                    fillna('X')

#data['Ticket'] = data['Ticket'].astype('category').cat.codes # to_try

# Age
# Fill missing ages with the mean age of the row's (Pclass, Sex) group.
# Known ages are kept as they are: the previous np.select call had no
# `default`, so every row that did not match a "missing age" condition -- i.e.
# every passenger with a known age -- was overwritten with 0.
group_mean_age = data.groupby(['Pclass', 'Sex'])['Age'].transform('mean')
data['Age'] = data['Age'].fillna(group_mean_age)

# Age bands with right-closed edges at 16, 32, 48 and 64.
# Rows matching no condition (e.g. an age that is still NaN) get np.select's
# default of 0.
age = data['Age']
conditions = [
    age.le(16),
    age.gt(16) & age.le(32),
    age.gt(32) & age.le(48),
    age.gt(48) & age.le(64),
    age.gt(64),
]
choices = [0, 1, 2, 3, 4]

data["AgeCut"] = np.select(conditions, choices)

# Sex
# Binary encoding: 1 for 'male', 0 for any other value.
data['Sex'] = np.where(data['Sex']=='male', 1, 0)

# Drop columns
data = data.drop(columns=['n'])

# Transform object to category
# Assign through `data[col] = ...` so the column is replaced. Since pandas 2.0
# `data.loc[:, col] = ...` writes the values into the existing column, which
# leaves the dtype as `object` and silently undoes the conversion.
for col in data.columns[data.dtypes=='object'].tolist():
    data[col] = data[col].astype('category')

In [None]:
# Splitting into train and test
# `data` is train stacked on top of test, so the first `n_train` rows are the
# training rows and the rest are the test rows (which carry no target).
n_train = train.shape[0]
train = data.iloc[:n_train]
test = data.iloc[n_train:].drop(columns=['Survived'])

In [None]:
# Show the first three preprocessed training rows.
train.head(3)

# Feature Selection (all data)

In [None]:
# Encoder applied to the categorical columns before feature selection.
encoder = ce.CatBoostEncoder()
# Estimator used inside RFE. It is seeded like every other random component in
# this notebook (forest, CV splitter, Boruta) so that feature rankings are
# repeatable between runs.
clf1 = DecisionTreeClassifier(random_state=314)
# Estimator handed to Boruta.
clf2 = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state= 314)

In [None]:
# Feature frame: every column except the target.
X = train.drop(columns="Survived")
# Target as a flat NumPy array.
y = train["Survived"].to_numpy().ravel()

## RFE

In [None]:
def get_models():
    """Build one encoder -> RFE -> classifier pipeline per candidate feature count.

    The candidate counts are 2, 4, 6, ... below ``train.shape[1]``.

    Returns:
        dict mapping the feature count (as a string) to its ``Pipeline``.
    """
    return {
        str(n_keep): Pipeline(steps=[
            ('encoder', encoder),
            ('rfe', RFE(estimator=clf1, n_features_to_select=n_keep)),
            ('clf', clf1),
        ])
        for n_keep in range(2, train.shape[1], 2)
    }
 
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the accuracy of every fold.

    Uses 10 stratified folds repeated 3 times with a fixed seed; errors raised
    while fitting or scoring are propagated (``error_score='raise'``).
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )

### Find the best number of features

In [None]:
%%time
 
# Cross-validate one pipeline per candidate number of features (all rows).
models = get_models()

results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # label, mean accuracy and its standard deviation across folds
    print(f'>{name} {np.mean(scores):.3f} ({np.std(scores):.3f})')
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

### Get features

In [None]:
# `results[k]` holds the CV scores of the pipeline labelled `names[k]`, and the
# labels are the candidate feature counts 2, 4, 6, ... The best count therefore
# has to be read from `names`; `argmax + 2` is only right when the first
# candidate wins.
best_idx = int(np.argmax([np.mean(fold_scores) for fold_scores in results]))
n_features = int(names[best_idx])

# Refit encoder + RFE on all rows with the chosen number of features.
rfe = RFE(estimator=clf1, n_features_to_select=n_features)
pipe_rfe = Pipeline(steps=[('encoder', encoder), ('rfe',rfe)])
pipe_rfe.fit(X, y)

# One row per input column: whether RFE kept it, and its ranking.
fitted_rfe = pipe_rfe.named_steps.rfe
results_rfe = pd.DataFrame({
    'Index': range(X.shape[1]),
    'Column': X.columns,
    'Selected_RFE': fitted_rfe.support_,
    'Rank_RFE': fitted_rfe.ranking_
})

results_rfe.sort_values('Rank_RFE')

## Boruta

### Get features

In [None]:
# Encode the text columns so that Boruta receives their encoded values.
cat_cols = ['Name', 'Ticket', 'Embarked', 'FirstName', 'SecondName']
X_transform = X.copy()
# Replace the columns with `[]` assignment. Since pandas 2.0
# `.loc[:, cols] = ...` writes into the existing columns and keeps their old
# dtype, so the encoded floats would stay in object/category columns and
# `to_numpy()` could return an object array.
X_transform[cat_cols] = encoder.fit_transform(X_transform[cat_cols], y)
X_transform = X_transform.to_numpy()

In [None]:
# Run Boruta with the random forest on the encoded matrix (all rows).
boruta_feature_selector = BorutaPy(
    clf2,
    n_estimators='auto',
    verbose=1,
    random_state=4242,
    max_iter=50,
    perc=90,
)
boruta_feature_selector.fit(X_transform, y)

In [None]:
# One row per input column: whether Boruta kept it, and its ranking.
boruta_rank = boruta_feature_selector.ranking_
boruta_keep = boruta_feature_selector.support_

results_boruta = pd.DataFrame({
    'Index': range(X.shape[1]),
    'Column': X.columns,
    'Selected_Boruta': boruta_keep,
    'Rank_Boruta': boruta_rank,
})

results_boruta.sort_values('Rank_Boruta')

In [None]:
# RFE and Boruta verdicts side by side, joined on the shared key columns.
results_rfe.merge(results_boruta, on=['Index', 'Column'])

# Feature Selection (by `sex`)

In [None]:
# Split the training rows by the encoded Sex column (1 = male, 0 = female).
male_rows = train[train.Sex == 1]
female_rows = train[train.Sex == 0]

X_m = male_rows.drop(columns="Survived")
y_m = male_rows["Survived"]

X_f = female_rows.drop(columns="Survived")
y_f = female_rows["Survived"]

# The same split for the test rows.
test_m = test[test.Sex == 1]
test_f = test[test.Sex == 0]

## Female

### RFE

#### Find the best number of features

In [None]:
%%time
 
# Cross-validate one pipeline per candidate number of features (female rows).
models = get_models()

results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X_f, y_f)
    results.append(scores)
    names.append(name)
    # label, mean accuracy and its standard deviation across folds
    print(f'>{name} {np.mean(scores):.3f} ({np.std(scores):.3f})')
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

#### Get features

In [None]:
# `results[k]` holds the CV scores of the pipeline labelled `names[k]`, and the
# labels are the candidate feature counts 2, 4, 6, ... The best count therefore
# has to be read from `names`; `argmax + 2` is only right when the first
# candidate wins.
best_idx = int(np.argmax([np.mean(fold_scores) for fold_scores in results]))
n_features = int(names[best_idx])

# Refit encoder + RFE on the female rows with the chosen number of features.
rfe = RFE(estimator=clf1, n_features_to_select=n_features)
pipe_rfe = Pipeline(steps=[('encoder', encoder), ('rfe',rfe)])
pipe_rfe.fit(X_f, y_f)

# One row per input column: whether RFE kept it, and its ranking.
# The index is sized from X_f itself rather than from the all-rows frame X.
fitted_rfe = pipe_rfe.named_steps.rfe
results_rfe = pd.DataFrame({
    'Index': range(X_f.shape[1]),
    'Column': X_f.columns,
    'Selected_RFE': fitted_rfe.support_,
    'Rank_RFE': fitted_rfe.ranking_
})

results_rfe.sort_values('Rank_RFE')

### Boruta

#### Get features

In [None]:
# Encode the text columns of the female rows so that Boruta receives their
# encoded values.
cat_cols = ['Name', 'Ticket', 'Embarked', 'FirstName', 'SecondName']
X_transform_f = X_f.copy()
# Replace the columns with `[]` assignment. Since pandas 2.0
# `.loc[:, cols] = ...` writes into the existing columns and keeps their old
# dtype, so the encoded floats would stay in object/category columns and
# `to_numpy()` could return an object array.
X_transform_f[cat_cols] = encoder.fit_transform(X_transform_f[cat_cols], y_f)
X_transform_f = X_transform_f.to_numpy()

In [None]:
# Run Boruta with the random forest on the encoded matrix (female rows).
boruta_feature_selector = BorutaPy(
    clf2,
    n_estimators='auto',
    verbose=1,
    random_state=4242,
    max_iter=50,
    perc=90,
)
boruta_feature_selector.fit(X_transform_f, y_f)

In [None]:
# One row per input column: whether Boruta kept it, and its ranking.
# The index is sized from X_f itself; it previously used the all-rows frame X,
# which only worked because both frames have the same columns.
results_boruta = pd.DataFrame({
    'Index': range(X_f.shape[1]),
    'Column': X_f.columns,
    'Selected_Boruta': boruta_feature_selector.support_,
    'Rank_Boruta': boruta_feature_selector.ranking_
})

results_boruta.sort_values('Rank_Boruta')

In [None]:
# RFE and Boruta verdicts side by side, joined on the shared key columns.
results_rfe.merge(results_boruta, on=['Index', 'Column'])

## Male

### RFE

#### Find the best number of features

In [None]:
%%time
 
# Cross-validate one pipeline per candidate number of features (male rows).
models = get_models()

results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X_m, y_m)
    results.append(scores)
    names.append(name)
    # label, mean accuracy and its standard deviation across folds
    print(f'>{name} {np.mean(scores):.3f} ({np.std(scores):.3f})')
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

#### Get features

In [None]:
# `results[k]` holds the CV scores of the pipeline labelled `names[k]`, and the
# labels are the candidate feature counts 2, 4, 6, ... The best count therefore
# has to be read from `names`; `argmax + 2` is only right when the first
# candidate wins.
best_idx = int(np.argmax([np.mean(fold_scores) for fold_scores in results]))
n_features = int(names[best_idx])

# Refit encoder + RFE on the male rows with the chosen number of features.
rfe = RFE(estimator=clf1, n_features_to_select=n_features)
pipe_rfe = Pipeline(steps=[('encoder', encoder), ('rfe',rfe)])
pipe_rfe.fit(X_m, y_m)

# One row per input column: whether RFE kept it, and its ranking.
# The index is sized from X_m itself rather than from the all-rows frame X.
fitted_rfe = pipe_rfe.named_steps.rfe
results_rfe = pd.DataFrame({
    'Index': range(X_m.shape[1]),
    'Column': X_m.columns,
    'Selected_RFE': fitted_rfe.support_,
    'Rank_RFE': fitted_rfe.ranking_
})

results_rfe.sort_values('Rank_RFE')

### Boruta

#### Get features

In [None]:
# Encode the text columns of the male rows so that Boruta receives their
# encoded values.
cat_cols = ['Name', 'Ticket', 'Embarked', 'FirstName', 'SecondName']
X_transform_m = X_m.copy()
# Replace the columns with `[]` assignment. Since pandas 2.0
# `.loc[:, cols] = ...` writes into the existing columns and keeps their old
# dtype, so the encoded floats would stay in object/category columns and
# `to_numpy()` could return an object array.
X_transform_m[cat_cols] = encoder.fit_transform(X_transform_m[cat_cols], y_m)
X_transform_m = X_transform_m.to_numpy()

In [None]:
# Run Boruta with the random forest on the encoded matrix (male rows).
boruta_feature_selector = BorutaPy(
    clf2,
    n_estimators='auto',
    verbose=1,
    random_state=4242,
    max_iter=50,
    perc=90,
)
boruta_feature_selector.fit(X_transform_m, y_m)

In [None]:
# One row per input column: whether Boruta kept it, and its ranking.
# The index is sized from X_m itself; it previously used the all-rows frame X,
# which only worked because both frames have the same columns.
results_boruta = pd.DataFrame({
    'Index': range(X_m.shape[1]),
    'Column': X_m.columns,
    'Selected_Boruta': boruta_feature_selector.support_,
    'Rank_Boruta': boruta_feature_selector.ranking_
})

results_boruta.sort_values('Rank_Boruta')

In [None]:
# RFE and Boruta verdicts side by side, joined on the shared key columns.
results_rfe.merge(results_boruta, on=['Index', 'Column'])