# Problem definition

The dataset used for this competition is synthetic: it is based on a real dataset (in this case, the actual Titanic data!) and was generated using a CTGAN.

Data description: 

| Variable        | Definition           | Key  |
|---------------|:-------------|------:|
|survival |	Survival | 0 = No, 1 = Yes |
|pclass |	Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
|sex |	Sex	 ||
|Age |	Age in years	 ||
|sibsp |	# of siblings / spouses aboard the Titanic	 ||
|parch |	# of parents / children aboard the Titanic	 ||
|ticket |	Ticket number	 ||
|fare |	Passenger fare	 ||
|cabin |	Cabin number	| |
|embarked |	Port of Embarkation	| C = Cherbourg, Q = Queenstown, S = Southampton |

<br>

Here, `survival` will be our target variable! 🎯

<br>

Check out: 

  ➜ [Tuning of a LightGBM with Bayesian Optimization using the `tidymodels` framework in R](https://www.kaggle.com/gomes555/tps-apr2021-r-eda-lightgbm-bayesopt)

  ➜ [AutoML (lgbm + catboost) with mljar](https://www.kaggle.com/gomes555/tps-apr2021-autoboost-mljar)
<br>

<p align="right"><span style="color:firebrick">Don't forget to upvote if you liked the notebook! ✌️</span></p>

In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [None]:
# Load the competition files. The train and test frames use the first CSV
# column as their index; the sample submission keeps the default index.
train_csv = '../input/tabular-playground-series-apr-2021/train.csv'
test_csv = '../input/tabular-playground-series-apr-2021/test.csv'
submission_csv = '../input/tabular-playground-series-apr-2021/sample_submission.csv'

train = pd.read_csv(train_csv, index_col=0)
test = pd.read_csv(test_csv, index_col=0)
submission = pd.read_csv(submission_csv)

In [None]:
# Inspiration: https://www.kaggle.com/bagusbpg/my-12th-notebook
#
# Hand-crafted features. Every transformation is applied identically to
# `train` and `test`, and every imputation value is computed on `train` only.


def _bin_label(value, bins, default):
    """Return the label of the first ``(threshold, label)`` pair reached by *value*.

    ``bins`` must be ordered from the highest threshold to the lowest.
    ``default`` is returned when no threshold is reached; NaN compares False
    against every threshold, so NaN also maps to ``default``.
    """
    for threshold, label in bins:
        if value >= threshold:
            return label
    return default


# Decade buckets: 80 -> '80s', 70 -> '70s', ..., 10 -> '10s' (below 10 -> '0s').
AGE_BINS = [(age, f'{age}s') for age in range(80, 0, -10)]

# How often a surname occurs in train + test combined (below 2 -> 'UltraRare').
NAME_COUNT_BINS = [
    (512, 'UltraCommon'),
    (256, 'VeryCommon'),
    (128, 'ModeratelyCommon'),
    (64, 'Common'),
    (32, 'SlightlyCommon'),
    (16, 'SlightlyRare'),
    (8, 'Rare'),
    (4, 'ModeratelyRare'),
    (2, 'VeryRare'),
]

# Family size = SibSp + Parch + 1 (below 2 -> 'Alone'). The sizes are whole
# numbers, so ">= 2" after failing ">= 3" is the same as "== 2".
FAM_SIZE_BINS = [
    (12, 'VeryBig'),
    (8, 'Big'),
    (5, 'Medium'),
    (3, 'Small'),
    (2, 'Couple'),
]

# Fare buckets (below 5 -> 'UltraPoor').
FARE_BINS = [
    (640, 'CrazyRich'),
    (320, 'UltraRich'),
    (160, 'VeryRich'),
    (80, 'Rich'),
    (40, 'SlightlyRich'),
    (20, 'SlightlyPoor'),
    (10, 'Poor'),
    (5, 'VeryPoor'),
]

# Flag rows that have at least one missing value. This has to run before any
# imputation below, otherwise the information is lost.
for df in (train, test):
    df['AnyMissing'] = df.isnull().any(axis=1).astype(int)

# Imputation values come from the training data and are reused for the test
# data, so both frames are filled with the same numbers.
age_mean_input = train['Age'].mean()
fare_mean_input = train['Fare'].mean()

for df in (train, test):
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form acts on a temporary object and is deprecated in
    # recent pandas (it has no effect under Copy-on-Write).
    df['Age'] = df['Age'].fillna(age_mean_input)
    # Numeric interaction, computed before Age is replaced by its bucket label.
    # NOTE: not part of the feature lists used further down in the notebook.
    df['Age_Pclass'] = df['Age'] * df['Pclass']
    df['Age'] = df['Age'].apply(lambda v: _bin_label(v, AGE_BINS, '0s'))

    # Surname = run of letters directly before the first comma in `Name`.
    df['FamName'] = df['Name'].str.extract(r'([A-Za-z]+),', expand=False)

    fam_size = df['SibSp'] + df['Parch'] + 1
    df['FamSize'] = fam_size.apply(lambda v: _bin_label(v, FAM_SIZE_BINS, 'Alone'))

    df['Fare'] = df['Fare'].fillna(fare_mean_input)
    df['Fare'] = df['Fare'].apply(lambda v: _bin_label(v, FARE_BINS, 'UltraPoor'))

    # Text before the first comma (stripped) and text after it (unstripped).
    name_parts = df['Name'].str.split(',')
    df['FirstName'] = name_parts.str[0].str.strip()
    df['SecondName'] = name_parts.str[1]

# Replace each surname by a label describing how common it is across both
# frames. `Series.append` was removed in pandas 2.0, hence `pd.concat`.
FamName = pd.concat([train['FamName'], test['FamName']]).value_counts()
FamName = FamName.apply(lambda n: _bin_label(n, NAME_COUNT_BINS, 'UltraRare'))
for df in (train, test):
    # `map` leaves NaN for rows where no surname could be extracted (a plain
    # `FamName[x]` lookup would raise KeyError); such rows have a count of
    # zero and therefore get the rarest label.
    df['FamName'] = df['FamName'].map(FamName).fillna('UltraRare')



In [None]:
def initial_prep(data):
    """Clean the raw categorical columns and cast text columns to ``category``.

    The frame is modified in place and also returned, so the function works
    both as ``initial_prep(df)`` and as ``df.pipe(initial_prep)``.

    Steps:
      * ``Cabin``    -> first character of the cabin string (stripped),
        ``'X'`` when the value is missing.
      * ``Ticket``   -> ticket string with dots, digits and spaces removed,
        ``'X'`` when nothing is left or the value is missing.
      * ``Embarked`` -> missing values replaced by ``'X'``.
      * every object/string column -> pandas ``category`` dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Cabin``, ``Ticket`` and ``Embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Open ideas (not implemented):
    # TODO: impute Age with a linear model on Pclass?
    # TODO: impute Fare from Pclass?
    # TODO: does Embarked need a real imputation?
    # TODO: create a NameCount feature?
    # TODO: numeric CabinNum (cabin string without its first character)?
    # TODO: numeric TicketNum (first run of digits in the ticket)?

    # `.str[:1]` keeps the first character like `x[0]`, but does not raise
    # on an empty string.
    data['Cabin'] = data['Cabin'].fillna('X').str[:1].str.strip()

    # Dots, digits and spaces are removed in a single pass (raw string, so
    # `\d` is not an invalid Python escape). Whatever ends up empty or
    # whitespace-only, and anything that was missing, becomes 'X'.
    ticket_text = data['Ticket'].str.replace(r'[.\d ]', '', regex=True)
    data['Ticket'] = ticket_text.replace(r'^\s*$', 'X', regex=True).fillna('X')

    data['Embarked'] = data['Embarked'].fillna('X')

    # Assign with `data[col] = ...`: `data.loc[:, col] = ...` writes into the
    # existing column in place on pandas >= 2.0, which silently keeps the
    # old dtype and discards the categorical conversion.
    for col in data.columns:
        col_dtype = data[col].dtype
        if pd.api.types.is_object_dtype(col_dtype) or isinstance(col_dtype, pd.StringDtype):
            data[col] = data[col].astype('category')

    return data

In [None]:
# Column roles: `lab_cols` are the columns handed to CatBoost as categorical
# features (see `cat_features` in kfold_prediction); `num_cols` stay numeric.
num_cols = ['SibSp', 'Parch']
lab_cols = ['Age','FirstName', 'Ticket', 'Fare', 'Pclass', 'Sex', 'Cabin', 'Embarked','FamSize', 'FamName', 'AnyMissing']
target = 'Survived'

# Categorical columns first, numeric columns last.
feature_cols = lab_cols + num_cols

# Clean the training frame, then split it into the design matrix and labels.
train = initial_prep(train)
X, y = train[feature_cols], train[target]

# The test frame gets the same cleaning and the same column selection.
test = initial_prep(test)[feature_cols]

In [None]:
def kfold_prediction(X, y, X_test, K):
    """Average CatBoost class-1 probabilities for ``X_test`` over K folds.

    ``X``/``y`` are split with a shuffled ``KFold`` (fixed seed 314). For each
    fold a fresh ``CatBoostClassifier`` is fitted on the training part, with
    both the training part and the held-out part passed as ``eval_set``. The
    fold's probabilities for ``X_test`` contribute ``1/K`` to the result, and
    the fold's accuracy on the held-out part (threshold 0.5) is printed.

    The categorical feature list is read from the module-level ``lab_cols``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    y : pandas.Series
        Training labels, aligned with ``X``.
    X_test : pandas.DataFrame
        Rows to predict.
    K : int
        Number of folds.

    Returns
    -------
    numpy.ndarray
        One averaged probability per row of ``X_test``.
    """
    test_proba = np.zeros(len(X_test))
    splitter = KFold(n_splits=K, shuffle=True, random_state=314)

    for i, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"\n FOLD {i} ...")
        X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
        X_val, y_val = X.iloc[holdout_rows], y.iloc[holdout_rows]

        # Up to 10000 trees, stopping once the eval metric has not improved
        # for 500 rounds; progress is logged every 500 iterations.
        params = dict(
            loss_function='Logloss',
            eval_metric='AUC',
            early_stopping_rounds=500,
            n_estimators=10000,
            cat_features=lab_cols,
            verbose=500,
            random_seed=314,
        )

        model_fit = CatBoostClassifier(**params).fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            use_best_model=True,
            plot=False,
        )

        # Each fold contributes an equal share to the final test prediction.
        test_proba += model_fit.predict_proba(X_test)[:, 1] / K

        # Held-out accuracy for this fold (reported only, not returned).
        val_proba = model_fit.predict_proba(X_val)[:, 1]
        acc = accuracy_score(y_val, (val_proba >= 0.5).astype(int))
        print(f"\n Accuracy: {acc} !")

    return test_proba

In [None]:
# Store the averaged 8-fold probabilities. Plain column assignment replaces
# the column (and its dtype); `submission.loc[:, 'Survived'] = ...` instead
# tries to write the floats into the existing integer column in place, which
# recent pandas versions warn about or reject.
submission['Survived'] = kfold_prediction(X, y, test, 8)

In [None]:
# Turn the probabilities into hard 0/1 labels (threshold 0.5). The column is
# replaced rather than written through `.loc[:, ...]`, so it really becomes
# an integer column and the CSV contains 0/1 instead of 0.0/1.0.
submission['Survived'] = (submission['Survived'] >= 0.5).astype(int)

submission.to_csv('submission.csv', index=False)