# <center>Tabular Playground Series - April/2021</center>
## <center>EDA - 3 Models - Voting - Pseudo Labelling</center>

Topics and notebooks that inspired this notebook.

-Topics:
* [How TicketNumber and CabinNumber are helping in prediction?](https://www.kaggle.com/c/tabular-playground-series-apr-2021/discussion/233445)
* [Pseudolabelling - Tips and tricks](https://www.kaggle.com/c/tabular-playground-series-apr-2021/discussion/231738)

-Notebooks:
* [TPS Apr 2021 pseudo labeling/voting ensemble (Submission file used as Pseudo Label)](https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble)
* [TPS-Apr2021 Catboost Run Pseudo label](https://www.kaggle.com/gomes555/tps-apr2021-catboost-run-pseudo-label)
* [LightAutoML interpretable model - AutoWoE](https://www.kaggle.com/alexryzhkov/lightautoml-interpretable-model-autowoe)

## Importing Libraries and Datasets

In [None]:
import pandas as pd       
import matplotlib as mat
import matplotlib.pyplot as plt    
import numpy as np
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from hyperopt.pyll.base import scope
from hyperopt import hp, fmin, tpe, Trials
from hyperopt import space_eval

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition training set, using PassengerId as the row index.
df_train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv', index_col= 'PassengerId')
#X_train = df_train.copy().drop('Survived', axis = 1)
# Target column, kept as its own Series.
Y_train = df_train['Survived']

# Load the competition test set with the same index column.
X_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv', index_col = 'PassengerId')

#Using Submission File as Pseudo Label
# NOTE(review): unlike the two frames above this file is read without index_col,
# so its 'Survived' values are later matched to the test rows by position only.
pseudo_label = pd.read_csv('../input/tps-apr-2021-pseudo-labeling-voting-ensemble/voting_submission.csv')

In [None]:
# Pseudo-labelling: copy the test features and attach the 'Survived' values from the
# external submission file (matched by row position), then expose them as a target Series.
df_test = X_test.copy()
df_test['Survived'] = pseudo_label['Survived'].tolist()
Y_test = df_test['Survived']

In [None]:
# Show the training frame as the cell output.
df_train

## Exploring the Data

In [None]:
# Summary statistics of the training frame's columns.
df_train.describe()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
X_test.info()

In [None]:
# Print how often each distinct value occurs in the three text columns.
for col in ('Cabin', 'Ticket', 'Name'):
    frequencies = df_train[col].value_counts()
    print(frequencies)

In [None]:
# Missing cabins become the literal string "None"; CabinType is the first character
# of the cabin string, so rows without a cabin end up with the letter 'N'.
for frame in (df_train, X_test):
    frame['Cabin'] = frame['Cabin'].fillna("None")
    frame['CabinType'] = frame['Cabin'].str[0]

df_train['CabinType'].value_counts()

In [None]:
# Bar chart of the target distribution, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x="Survived", data=df_train, palette="BuPu", ax=ax)

ax.set_xlabel("Status", fontsize=12)
ax.set_ylabel("N_Passengers", fontsize=12)  # label typo ("Passangers") corrected
ax.set_title("Survived vs Deceased", fontsize=13)
ax.set_ylim(0, 100000)
# Target value 0 is shown as 'Deceased', 1 as 'Survived'.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Deceased', 'Survived'])

# Write each bar's count slightly above the bar.
for bar in ax.patches:
    ax.annotate(bar.get_height(), (bar.get_x() + 0.32, bar.get_height() + 3000))

plt.show()

In [None]:
plt.figure(figsize=(6,4))

# value_counts() orders by frequency, but the legend below is hard-coded as
# (0 -> 'Deceased', 1 -> 'Survived'). Sorting by label keeps wedges and legend
# in the same order regardless of which class is more frequent.
survival_counts = df_train['Survived'].value_counts().sort_index()
survival_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%')

plt.legend(labels=['Deceased', 'Survived'])
plt.show()

In [None]:
cont_features = ['Age', 'Fare']

# One stacked panel per continuous feature showing its overall distribution.
plt.figure(figsize=(12,7))
for panel, feature in enumerate(cont_features, start=1):
    plt.subplot(2, 1, panel)
    sns.distplot(df_train[feature])
    plt.ylabel('')
plt.show()

In [None]:
# Per continuous feature: overlay the density of deceased (0) and survived (1) passengers.
plt.figure(figsize=(12,7))
outcomes = ((0, 'Deceased'), (1, 'Survived'))
for panel, feature in enumerate(cont_features, start=1):
    plt.subplot(2, 1, panel)
    for flag, outcome in outcomes:
        subset = df_train.loc[df_train['Survived'] == flag, feature]
        sns.kdeplot(subset, label=outcome, shade=True)
    plt.ylabel('')
    plt.legend()
plt.show()

In [None]:
# Bucket Age and Fare into fixed intervals and print how many rows land in each bucket.
for source, target, edges in (
    ('Age', 'AgeBin', [0, 20, 40, 60, 100]),
    ('Fare', 'FareBin', [0, 20, 40, 60, 80, 100, 200, 300, 1000]),
):
    df_train[target] = pd.cut(df_train[source], edges)
    print(df_train[target].value_counts())

In [None]:
plt.figure(figsize=(12,4))

# Record which rows have no age bin *before* casting to str. Depending on the pandas
# version the cast either keeps NaN or yields the literal text 'nan'; in the latter
# case a fillna('None') afterwards has nothing to fill, the 'None' bar below stays
# empty and the later label-to-int mapping meets an unmapped value.
age_missing = df_train['AgeBin'].isna()
df_train['AgeBin'] = df_train['AgeBin'].astype('str').mask(age_missing, 'None')

# Display order of the bars: rows without a bin first, then the bins in ascending order.
AgeBin_order = ['None', '(0, 20]', '(20, 40]', '(40, 60]', '(60, 100]']

sns.countplot(data = df_train, x = 'AgeBin', hue="Survived", palette = 'rocket', order = AgeBin_order)

plt.show()

In [None]:
# Relative frequency of each Survived value within every age bin.
df_train['Survived'].groupby(df_train['AgeBin']).value_counts(normalize = True)

In [None]:
plt.figure(figsize=(12,4))

# Record which rows have no fare bin *before* casting to str. Depending on the pandas
# version the cast either keeps NaN or yields the literal text 'nan'; in the latter
# case a fillna('None') afterwards has nothing to fill, the 'None' bar below stays
# empty and the later label-to-int mapping meets an unmapped value.
fare_missing = df_train['FareBin'].isna()
df_train['FareBin'] = df_train['FareBin'].astype('str').mask(fare_missing, 'None')

# Display order of the bars: rows without a bin first, then the bins in ascending order.
FareBin_order = ['None', '(0, 20]', '(20, 40]', '(40, 60]', '(60, 80]', '(80, 100]', '(100, 200]', '(200, 300]', '(300, 1000]']

sns.countplot(data = df_train, x = 'FareBin', hue="Survived", palette = 'rocket', order = FareBin_order)

plt.show()

In [None]:
# Relative frequency of each Survived value within every fare bin.
df_train['Survived'].groupby(df_train['FareBin']).value_counts(normalize = True)

In [None]:
disc_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'CabinType']
# Give passengers without an embarkation port their own 'None' category.
df_train['Embarked'] = df_train['Embarked'].fillna('None')

# 3x2 grid: one count plot per discrete feature.
plt.figure(figsize=(15,11))
for position, feature in enumerate(disc_features, start=1):
    plt.subplot(3, 2, position)
    sns.countplot(data=df_train, x=feature, palette='rocket')
    plt.ylabel("")

plt.show()

In [None]:
# Same 3x2 grid of discrete features, with each bar split by the Survived value.
plt.figure(figsize=(15,11))
for position, feature in enumerate(disc_features, start=1):
    plt.subplot(3, 2, position)
    sns.countplot(data=df_train, x=feature, hue="Survived", palette='rocket')
    plt.ylabel("")

plt.show()

In [None]:
# Relative frequency of each Survived value per embarkation port and per cabin letter.
for feature in ('Embarked', 'CabinType'):
    rates = df_train.groupby(feature)['Survived'].value_counts(normalize=True)
    print(rates)

In [None]:
# IsAlone is 0 when the passenger has at least one sibling/spouse/parent/child
# aboard (SibSp + Parch > 0) and 1 otherwise.
relatives_aboard = df_train['SibSp'] + df_train['Parch']
df_train['IsAlone'] = (~(relatives_aboard > 0)).astype('int64')

plt.figure(figsize=(12,4))
sns.countplot(data=df_train, x='IsAlone', hue="Survived", palette='rocket')
plt.show()


## Preparing Training Data and Creating Models

In [None]:
# Feature frame: everything in df_train except the target (drop returns a new frame).
X_train = df_train.drop(columns='Survived')
X_train

In [None]:
# Label -> integer code tables. They are kept as module-level names because the
# test-set encoding further down reuses them.
Sex_map = {'male': 0, 'female': 1}

AgeBin_map = {label: code for code, label in enumerate(
    ['None', '(0, 20]', '(20, 40]', '(40, 60]', '(60, 100]'])}

FareBin_map = {label: code for code, label in enumerate(
    ['None', '(0, 20]', '(20, 40]', '(40, 60]', '(60, 80]',
     '(80, 100]', '(100, 200]', '(200, 300]', '(300, 1000]'])}

Embarked_map = {label: code for code, label in enumerate(['None', 'S', 'Q', 'C'])}

# 'N' is the first letter of the "None" placeholder used for missing cabins.
CabinType_map = dict(zip('NABCDEFGT', range(9)))

# Replace every categorical column by its integer code.
for column, mapping in (
    ('Sex', Sex_map),
    ('AgeBin', AgeBin_map),
    ('FareBin', FareBin_map),
    ('Embarked', Embarked_map),
    ('CabinType', CabinType_map),
):
    X_train[column] = X_train[column].map(mapping).astype('int')

In [None]:
# Drop the free-text columns and the raw Age/Fare values (their binned versions stay).
X_train_enc = X_train.drop(columns=['Name', 'Ticket', 'Cabin', 'Age', 'Fare'])
X_train_enc

In [None]:
#categorical_ft for catboost
categorical_ft = ['Sex', 'Pclass', 'AgeBin', 'FareBin', 'Embarked', 'CabinType', 'IsAlone']

def cv_function (X_train, Y_train, model):
    """Run 5-fold stratified cross-validation and return out-of-fold class predictions.

    For every fold the model is refit on the training part; the held-out part is
    used both as the early-stopping evaluation set and for scoring. The accuracy
    of each fold and the mean accuracy over all folds are printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    Y_train : pandas.Series
        Target, aligned row-for-row with ``X_train``.
    model : estimator
        Object whose ``fit`` accepts ``early_stopping_rounds``, ``eval_set`` and
        ``verbose`` and which provides ``predict``. ``CatBoostClassifier``
        instances additionally receive ``cat_features=categorical_ft``.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``X_train``, taken from the fold in which
        that row was held out.
    """
    kfold = StratifiedKFold(n_splits = 5)
    accuracies = []

    # Size the out-of-fold buffer from the data instead of a hard-coded row count.
    cv_pred = np.zeros(len(X_train))

    for train_idx, test_idx in kfold.split(X=X_train, y=Y_train):
        xtrain, ytrain = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        xtest, ytest = X_train.iloc[test_idx], Y_train.iloc[test_idx]

        # fit model for current fold
        fit_kwargs = {'early_stopping_rounds': 100,
                      'eval_set': [(xtest, ytest)],
                      'verbose': False}
        # Detect CatBoost by type: comparing the estimator with the string
        # 'catboost_model' is never true, so cat_features was never passed.
        if isinstance(model, CatBoostClassifier):
            print('Catboost')
            fit_kwargs.update(cat_features = categorical_ft, plot = False)
        model.fit(xtrain, ytrain, **fit_kwargs)

        #create predictions
        preds = model.predict(xtest)
        cv_pred[test_idx] = preds

        # calculate and append accuracy
        fold_accuracy = metrics.accuracy_score(ytest,preds)
        print("ACC: {0:0.4f}". format(fold_accuracy))
        accuracies.append(fold_accuracy)

    print (np.mean(accuracies))
    return cv_pred

In [None]:
# XGBoost member of the ensemble. n_estimators acts as an upper bound here because
# the fit calls in this notebook pass early_stopping_rounds.
xgb_model = XGBClassifier (n_estimators = 1000, learning_rate = 0.01, max_depth = 7
                           , subsample = 0.8, colsample_bytree = 0.8, min_child_weight = 3
                           , random_state = 42, eval_metric = 'logloss')

In [None]:
#xgb_cvpred = cv_function(X_train_enc, Y_train, xgb_model) 

#0.77967999 #1000/0.01/100 max_depth = 7, subsample = 0.8, colsample_bytree = 0.8, min_child_weight = 3

In [None]:
# LightGBM member of the ensemble (fixed seed; other settings left at library defaults).
lgbm_model = LGBMClassifier(n_estimators = 1500, learning_rate = 0.03, random_state = 42)

In [None]:
#lgbm_cvpred = cv_function(X_train_enc, Y_train, lgbm_model)

#0.77950999 #1500/0.03/100

In [None]:
# CatBoost member of the ensemble (fixed seed; other settings left at library defaults).
catboost_model = CatBoostClassifier (n_estimators = 1500, random_state = 42)

In [None]:
#catboost_cvpred = cv_function(X_train_enc, Y_train, catboost_model)
#0.78004000 #1500/0.0?/100

In [None]:
#Just checking
#print("ACC: {0:0.6f}".format(metrics.accuracy_score(Y_train,xgb_cvpred)))
#print("ACC: {0:0.6f}".format(metrics.accuracy_score(Y_train,lgbm_cvpred)))
#print("ACC: {0:0.6f}".format(metrics.accuracy_score(Y_train,catboost_cvpred)))

#voting_cvpred = xgb_cvpred + lgbm_cvpred + catboost_cvpred
#voting_cvpred = np.where(voting_cvpred > 2, 1, 0)
#voting_cvpred
#print("ACC: {0:0.6f}".format(metrics.accuracy_score(Y_train,voting_cvpred)))

## Preparing the Test Data

In [None]:
# Work on a copy so the test-set encoding below does not modify X_test itself.
X_test_enc = X_test.copy()
X_test_enc

In [None]:
# Bin Age and Fare with the same edges used for the training frame.
# The missing-value mask is taken from the categorical result *before* the str cast:
# depending on the pandas version the cast either keeps NaN or yields the literal
# text 'nan', and in the latter case a fillna('None') afterwards would be a no-op,
# leaving a label that the label-to-int maps do not contain.
test_age_bin = pd.cut(X_test_enc['Age'], [0, 20, 40, 60, 100])
X_test_enc['AgeBin'] = test_age_bin.astype('str').mask(test_age_bin.isna(), 'None')

test_fare_bin = pd.cut(X_test_enc['Fare'], [0, 20, 40, 60, 80, 100, 200, 300, 1000])
X_test_enc['FareBin'] = test_fare_bin.astype('str').mask(test_fare_bin.isna(), 'None')

# 0 when the passenger has relatives aboard (SibSp + Parch > 0), 1 otherwise.
X_test_enc['IsAlone'] = (X_test_enc['SibSp'] + X_test_enc['Parch']).apply(lambda x: 0 if x > 0 else 1)


X_test_enc.info()

In [None]:
# Missing embarkation ports get the 'None' label so Embarked_map can encode them.
X_test_enc['Embarked'] = X_test_enc['Embarked'].fillna('None')

# Encode the categorical test columns with the label -> integer tables built
# for the training frame.
for column, mapping in (
    ('Sex', Sex_map),
    ('AgeBin', AgeBin_map),
    ('FareBin', FareBin_map),
    ('Embarked', Embarked_map),
    ('CabinType', CabinType_map),
):
    X_test_enc[column] = X_test_enc[column].map(mapping).astype('int')

In [None]:
# Drop the free-text columns and the raw Age/Fare values (their binned versions stay).
X_test_enc = X_test_enc.drop(columns=['Name', 'Ticket', 'Cabin', 'Age', 'Fare'])
X_test_enc.info()

## Prediction

In [None]:
#PseudoLabel Step: Joining Train and Test sets
# Stack the encoded training rows on top of the encoded test rows.
X = pd.concat([X_train_enc, X_test_enc], axis=0)
X


In [None]:
# Matching target: true training labels followed by the pseudo labels of the test rows.
Y = pd.concat([Y_train, Y_test], axis=0)
Y

In [None]:
def prediction (X_train, Y_train, model, X_test):
    """Fit ``model`` on 5 stratified folds and average its class-1 probabilities on ``X_test``.

    For every fold the model is refit on the training part, with the held-out
    part as the early-stopping evaluation set. After each fit, the predicted
    probability of class 1 for ``X_test`` is added to a running average and the
    accuracy on the held-out part is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    Y_train : pandas.Series
        Target, aligned row-for-row with ``X_train``.
    model : estimator
        Object whose ``fit`` accepts ``early_stopping_rounds``, ``eval_set`` and
        ``verbose`` and which provides ``predict`` and ``predict_proba``.
        ``CatBoostClassifier`` instances additionally receive
        ``cat_features=categorical_ft``.
    X_test : pandas.DataFrame
        Rows to score; passed unchanged to ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        Mean class-1 probability over the folds, one value per row of ``X_test``.
    """
    kfold = StratifiedKFold(n_splits = 5)

    y_pred = np.zeros(len(X_test))

    for train_idx, val_idx in kfold.split(X=X_train, y=Y_train):
        xtrain, ytrain = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        xval, yval = X_train.iloc[val_idx], Y_train.iloc[val_idx]

        # fit model for current fold
        fit_kwargs = {'early_stopping_rounds': 100,
                      'eval_set': [(xval, yval)],
                      'verbose': False}
        # Detect CatBoost by type: comparing the estimator with the string
        # 'catboost_model' is never true, so cat_features was never passed.
        if isinstance(model, CatBoostClassifier):
            print('Catboost')
            fit_kwargs.update(cat_features = categorical_ft, plot = False)
        model.fit(xtrain, ytrain, **fit_kwargs)

        #create predictions
        # Each fold contributes 1/n_splits of its class-1 probability.
        y_pred += model.predict_proba(X_test)[:, 1]/kfold.n_splits
        print(y_pred)

        val_pred = model.predict(xval)
        # calculate and append accuracy
        fold_accuracy = metrics.accuracy_score(yval,val_pred)
        print("ACC: {0:0.4f}". format(fold_accuracy))

    return y_pred

In [None]:
#xgb_pred = prediction (X_train_enc, Y_train, xgb_model, X_test_enc)
#lgbm_pred = prediction (X_train_enc, Y_train, lgbm_model, X_test_enc)
#catboost_pred = prediction (X_train_enc, Y_train, catboost_model, X_test_enc)

#PseudoLabel
# Fit each model on the combined frame X / target Y and collect its fold-averaged
# class-1 probabilities for the encoded test rows.
xgb_pred = prediction (X, Y, xgb_model, X_test_enc)
lgbm_pred = prediction (X, Y, lgbm_model, X_test_enc)
catboost_pred = prediction (X, Y, catboost_model, X_test_enc)

In [None]:
# Turn each model's averaged probability into a hard 0/1 vote (1 when above 0.5).
xgb_finalpred = (xgb_pred > 0.5).astype(int)
lgbm_finalpred = (lgbm_pred > 0.5).astype(int)
catboost_finalpred = (catboost_pred > 0.5).astype(int)

In [None]:
#Voting
# Each model contributes one 0/1 vote, so the sum lies between 0 and 3.
# Predict 1 when a majority (at least 2 of the 3 models) votes 1; a `> 2`
# threshold would only fire when all three models agree.
vote_total = xgb_finalpred + lgbm_finalpred + catboost_finalpred
final_pred = np.where(vote_total >= 2, 1, 0)

In [None]:
# Pair every test PassengerId (the index of X_test) with its final vote and
# write the two-column submission file.
output = (
    pd.Series(final_pred, index=X_test.index, name='Survived')
    .reset_index()
)
output.to_csv('submission.csv', index=False)

output