In [None]:
import lightgbm
import xgboost as xgb
# Scikit Learn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import clone
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

def replaceNullValue(val, new_val):
    '''
    Return ``new_val`` when ``val`` is null (as judged by ``pd.isnull``);
    otherwise return ``val`` unchanged.
    '''
    return new_val if pd.isnull(val) else val

def encodeBinaryLabel(val, one_val):
    '''
    Encode a label as 1 when it equals ``one_val`` and as 0 otherwise.

    Raises ValueError when ``val`` is null (as judged by ``pd.isna``), so
    missing labels are never silently encoded as 0.
    '''
    if pd.isna(val):
        raise ValueError('Null value found!')
    return 1 if val == one_val else 0

def createCabinFt(val):
    '''
    Reduce a cabin value to its first character.

    Null values (as judged by ``pd.isna``) map to the placeholder string 'NULL'.
    '''
    if pd.isna(val):
        return 'NULL'
    return val[0]

# Path to the competition's sample submission CSV; passed to quickSubmission
# below, which reads it and fills in its 'Survived' column.
sample_sub_filename = '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv'

def quickSubmission(sample_sub_path, test_df, model, output_filename):
    '''
    Write a submission CSV from a fitted model's predictions.

    Reads the CSV at ``sample_sub_path`` as a template, predicts on every
    column of ``test_df`` except 'PassengerId', stores the predictions in the
    template's 'Survived' column, and writes the result to ``output_filename``
    without the index. Returns nothing.
    '''
    submission = pd.read_csv(sample_sub_path)
    features = test_df.drop(columns = ['PassengerId']).values
    # hard class labels via predict (not predict_proba)
    submission['Survived'] = model.predict(features)
    submission.to_csv(output_filename, index = False)

def create_folds(dataframe, target_label, n_splits = 5, random_state = None):
    '''
    Return a shuffled copy of ``dataframe`` with a 'kfold' column that assigns
    every row to one stratified validation fold.

    The input frame is left untouched: shuffling produces a new frame and the
    'kfold' column is only added to that new frame.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data to split; must contain the column ``target_label``.
    target_label : str
        Column whose values are binned with ``pd.cut`` and used as the
        stratification labels.
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.
    random_state : int or None, default None
        Seed for the row shuffle. ``None`` keeps the shuffle unseeded; pass an
        integer to get the same fold assignment on every run.

    Returns
    -------
    pd.DataFrame
        Shuffled rows with a fresh 0..n-1 index and a 'kfold' column holding
        the index of the fold in which the row is used for validation.
    '''
    # sample() returns a new frame, so everything below leaves the caller's data alone
    data = dataframe.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    data['kfold'] = -1

    # number of bins from Sturges' rule: 1 + log2(n), rounded down
    bin_num = int(np.floor(1 + np.log2(len(data))))
    # kept as a separate Series so no temporary column has to be added and dropped
    bins = pd.cut(data[target_label], bins = bin_num, labels = False)

    kfold = StratifiedKFold(n_splits = n_splits)
    for f, (t_, v_) in enumerate(kfold.split(X = data, y = bins.values)):
        # v_ holds the positional row indices of this fold's validation rows;
        # they are valid labels because the index was reset above
        data.loc[v_, 'kfold'] = f
    return data

def run_folds_proba(dataframe, fold, drop_cols, model, target_col = 'target'):
    '''
    Fit ``model`` on every fold except ``fold`` and print the ROC AUC obtained
    on the held-out fold.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data with a 'kfold' column and the target column.
    fold : int
        Value of 'kfold' to hold out for validation.
    drop_cols : list of str or None
        Columns to exclude from the features. The list is not modified.
        The target column is always excluded; 'kfold' is NOT excluded
        automatically, so include it here if it should not be a feature.
    model : estimator
        Object exposing ``fit(x, y)`` and ``predict_proba(x)``.
    target_col : str, default 'target'
        Name of the label column.

    Returns
    -------
    The same ``model`` object, after fitting.
    '''
    # Imported here because the notebook's import cell does not bring in roc_auc_score.
    from sklearn.metrics import roc_auc_score

    # Build a private list so the caller's drop_cols does not grow on every call.
    feature_drop_cols = list(drop_cols or [])
    if target_col not in feature_drop_cols:
        feature_drop_cols.append(target_col)

    df_train = dataframe[dataframe.kfold != fold].reset_index(drop = True)
    df_val = dataframe[dataframe.kfold == fold].reset_index(drop = True)
    x_train = df_train.drop(labels = feature_drop_cols, axis = 1).values
    y_train = df_train[target_col].values
    x_val = df_val.drop(labels = feature_drop_cols, axis = 1).values
    y_val = df_val[target_col].values
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_val)
    # AUC is computed from column 1 of predict_proba's output
    print(roc_auc_score(y_val, y_pred[:, 1]))
    return model

In [None]:
# Load the competition's training and test CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')

# Preview the first rows of the training data.
train.head()

### Feature Engineering

Categorical:

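- Pclass: could be passed as an integer or one-hot encoded (one-hot encoding is used below)
- Sex: binary encoding
- Ticket: no encoding chosen yet; the column is dropped below
- Cabin: mostly null; only the first letter is kept as a new `cabin_letter` feature, which is one-hot encoded below
- Embarked: one-hot encoding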

Continuous:
- Age
- SibSp (number of siblings/spouses on board)
- Parch (number of parents/children on board)
- Fare


In [None]:
# Report the null count in train and test for every column that has missing
# values in either set. Only columns present in test are inspected.

for col in test.columns:
    train_nulls = train[col].isnull().sum()
    test_nulls = test[col].isnull().sum()
    if train_nulls > 0 or test_nulls:
        print(col + ')')
        print('train:', train_nulls)
        print('test:', test_nulls)
        print('\n')

In [None]:
# Derive a 'cabin_letter' feature (first character of Cabin, or 'NULL' when
# Cabin is missing) in both sets.
train['cabin_letter'] = train['Cabin'].apply(createCabinFt)
test['cabin_letter'] = test['Cabin'].apply(createCabinFt)

# Sorted distinct letters per set; prints True when train and test contain
# exactly the same set of letters.
train_cabins = sorted(train['cabin_letter'].unique().tolist())
test_cabins = sorted(test['cabin_letter'].unique().tolist())
print(train_cabins == test_cabins)

In [None]:
# Impute missing Age and Fare values in both sets with the training-set mean:
# the Age mean is truncated to an integer, the Fare mean is rounded to
# 2 decimal places.
average_age = int(train['Age'].mean())
average_fare = round(train['Fare'].mean(), 2)

train['Age'] = train['Age'].fillna(average_age)
test['Age'] = test['Age'].fillna(average_age)

train['Fare'] = train['Fare'].fillna(average_fare)
test['Fare'] = test['Fare'].fillna(average_fare)

# standard deviations of the (already imputed) training Age and Fare columns
age_stdev = train['Age'].std()
fare_stdev = train['Fare'].std()

# missing Embarked values become their own category, 'N'
train['Embarked'] = train['Embarked'].fillna('N')
test['Embarked'] = test['Embarked'].fillna('N')

def dropColsFromDf(col_name, df):
    '''
    Return ``df`` without the column ``col_name``.

    When ``df`` has no such column, the same ``df`` object is returned
    unchanged instead of raising.
    '''
    if col_name not in df.columns.tolist():
        return df
    return df.drop(columns = col_name)

# NOTE: this section is not idempotent. Re-running it on already-processed
# frames would re-encode Sex (every value becomes 0) and get_dummies would no
# longer find its input columns; reload the data before re-running.

# drop Cabin from both sets since it is mostly null
train = dropColsFromDf('Cabin', train)
test = dropColsFromDf('Cabin', test)

# also dropping Name 
train = dropColsFromDf('Name', train)
test = dropColsFromDf('Name', test)

# also dropping Ticket 
train = dropColsFromDf('Ticket', train)
test = dropColsFromDf('Ticket', test)

# encode Sex field: 1 for 'female', 0 otherwise (raises on null values)
train['Sex'] = train['Sex'].apply(lambda x: encodeBinaryLabel(x, 'female'))
test['Sex'] = test['Sex'].apply(lambda x: encodeBinaryLabel(x, 'female'))

one_hot_fts = ['Pclass', 'Embarked', 'cabin_letter']

train = pd.get_dummies(train, columns = one_hot_fts, dummy_na=False)
test = pd.get_dummies(test, columns = one_hot_fts, dummy_na=False)

# get_dummies only creates columns for the categories present in the frame it
# is given, so train and test can end up with different dummy columns. The
# models consume raw .values arrays, so the columns must match in number and
# order. Align test to train's feature columns (everything except the label):
# dummies missing from test are added as all-zero columns and dummies seen
# only in test are dropped.
feature_cols = [c for c in train.columns if c != 'Survived']
test = test.reindex(columns = feature_cols, fill_value = 0)

In [None]:
# Preview the training frame after imputation, column drops and encoding.
train.head()

In [None]:
# Standardize the numeric features one column at a time. Each scaler is fitted
# on the training column only and then applied to both train and test.
norm_fts = ['Age', 'SibSp', 'Parch', 'Fare']

for ft in norm_fts:
    scaler = StandardScaler().fit(train[[ft]])
    train[ft] = scaler.transform(train[[ft]])
    test[ft] = scaler.transform(test[[ft]])

In [None]:
# Preview the training frame after standardization.
train.head()

### Train and Validation Sets

In [None]:
# create train and validation sets
x = train.drop(labels = ['PassengerId', 'Survived'], axis = 1).values
y = train['Survived'].values

# Fixed seed so the hold-out split, and therefore the validation report, is
# the same on every run.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.25, random_state = 42)

# Work on a copy so `train` itself never gains a 'kfold' column; otherwise
# re-running this cell would pull 'kfold' into `x` as an extra feature that
# the test set does not have.
df = create_folds(train.copy(), 'Survived')

print('Kfold counts: \n', df.kfold.value_counts())

### Model Builds

Scores with default params:

XGBoost Classifier: 0.77948

LGBM Classifier: 0.78457

LGBM Classifier w/ Cabin Ft: 0.79014

In [None]:
#############################
# Random Forest
#############################
# rf_mdl = RandomForestClassifier()
# rf_mdl.fit(x_train, y_train)
# val_predictions = rf_mdl.predict(x_val)
# print(classification_report(y_true = y_val, y_pred = val_predictions))

#############################
# XGBoost Classifier
#############################
# xgb_mdl = xgb.XGBClassifier()
# xgb_mdl.fit(x_train, y_train)
# val_predictions = xgb_mdl.predict(x_val)
# print(classification_report(y_true = y_val, y_pred = val_predictions))

In [None]:
# LGBM: fit a default LightGBM classifier on the training split and print the
# classification report for its predictions on the validation split.
lgbm_mdl = lightgbm.LGBMClassifier().fit(x_train, y_train)
val_predictions = lgbm_mdl.predict(x_val)
print(classification_report(y_val, val_predictions))

### Retrain and Submit

In [None]:
# Refit a default LightGBM classifier on the full training data, then write
# its test-set predictions to the submission file.
lgbm_mdl = lightgbm.LGBMClassifier().fit(x, y)

quickSubmission(sample_sub_filename, test, lgbm_mdl, 'lgbm_default_submission.csv')