# Tabular Playground series - August 2021

The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using [CTGAN](https://github.com/sdv-dev/CTGAN). The original dataset deals with calculating the loss associated with loan defaults. Although the features are anonymized, they have properties relating to real-world features.

## Baseline model
The sample_submission.csv scores a Root Mean Squared Error of 10.53201 on the public leaderboard. The aim is to perform an initial EDA, build a few baseline models and begin hyperparameter tuning.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Review files in the folder: print the full path of every file found
# anywhere under /kaggle/input
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Switch on setting to allow all outputs to be displayed
# (every bare expression in a cell is shown, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# EDA

In [None]:
# Create the initial training DataFrame
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')

In [None]:
# Review the first five observations
train.head()

In [None]:
# Check the test dataset
test.head()

In [None]:
# Separate the predictors from the target: drop the `id` identifier and the
# `loss` target from the training features, and keep `loss` as y
X = train.drop(['id', 'loss'], axis=1)
y = train['loss']
# Keep the test ids aside (they are written to the submission file), then
# drop them from the test features
test_id = test['id']
test_x = test.drop(['id'], axis=1)

# Review train and test after dropping id
X.head()
test_x.head()

In [None]:
# Shape of the dataframe
print(train.shape)
# Find the number of rows within a dataframe
print(len(train))
# Extracting information from the shape tuple
print(f'Number of rows: {train.shape[0]} \nNumber of columns: {train.shape[1]}')

In [None]:
# Review the high level summary details for each variable
train.describe()

In [None]:
# Variable types
train.dtypes.value_counts()
# All variables are numeric so don't have to worry about working with strings

## Missing values review

In [None]:
# Proportion of missing values by column
def isnull_prop(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps each column name to ``[missing_count, missing_fraction]``, in
        column order. The fraction is NaN when ``df`` has no rows.
    """
    total_rows = df.shape[0]
    # Count the nulls of all columns in a single pass instead of scanning
    # every column twice (once for the count, once for the fraction).
    missing_counts = df.isnull().sum()
    missing_val_dict = {}
    for col, n_missing in missing_counts.items():
        # Guard the division so an empty frame yields NaN without a
        # divide-by-zero RuntimeWarning.
        fraction = n_missing / total_rows if total_rows else float("nan")
        missing_val_dict[col] = [n_missing, fraction]
    return missing_val_dict

# Apply the missing value method
null_dict = isnull_prop(train)

In [None]:
# Create a dataframe of the missing value information
df_missing = pd.DataFrame.from_dict(null_dict, orient="index", columns=['missing', 'miss_percent'])
df_missing.loc[(df_missing['missing'] > 0)]

## Target variable - Loss

In [None]:
# Method - review the distribution of the target variable
def sns_displot(df, col):
    """Plot the histogram of ``df[col]`` with its mean and median marked.

    Also prints the mean, median, minimum and maximum of the column.
    """
    values = df[col]
    # (position, colour, legend label) for each vertical marker
    centre_lines = (
        (values.mean(), 'g', "mean"),
        (values.median(), 'y', "median"),
    )

    # histogram first, then one translucent vertical line per statistic
    sns.displot(values, kde=False)
    for position, colour, name in centre_lines:
        plt.axvline(x=position, linewidth=3, color=colour, label=name, alpha=0.5)

    # title, axis labels and legend
    plt.title(f'Distribution of {col}', size=14)
    plt.xlabel(f'{col}')
    plt.ylabel("Count")
    plt.legend(["mean", "median"])

    print(f'Mean {col} value {df[col].mean()} \n Median {col} value {df[col].median()} \n Min {col} value {df[col].min()} \n Max {col} value {df[col].max()}')

In [None]:
sns_displot(train, 'loss')

There is a positive skew, as the mean is greater than the median. A large proportion of the values are zero, so many rows have not experienced a loss at all.

In [None]:
# Share of rows with a loss of exactly zero, followed by the frequency of
# every distinct loss value
zero_loss_rows = train.loc[train.loss == 0, "loss"]
print(f'Average rate of zero: {zero_loss_rows.count() / train.shape[0]}')
print(f'{train.loss.value_counts()}')

Before we get into model predictions, we need to understand which independent variables help to predict whether a loss occurred. By first converting the target variable to a binary indicator (loss / no loss), we can explore which models and features help to predict the occurrence of a loss before we aim to predict the loss value.

# Dimension Reduction - Binary classifier

In [None]:
# Import modules
# Preprocessing
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler

# Classifiers
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Performance metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Dimension Reduction techniques
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from yellowbrick.model_selection import feature_importances
from sklearn.feature_selection import SelectFromModel

In [None]:
# Lets create the binary y_target variable
y_target = np.where(y > 0, 1, 0)
y_target.view()
print(f'Proportion of loss values {np.average(y_target)}')

In [None]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Define the pipeline: standardise, project onto 10 principal components,
# then fit a logistic regression on the binary target
steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=10)), 
        ('m', LogisticRegression())
]
model = Pipeline(steps=steps)
# Evaluate the model with 10-fold stratified CV repeated 3 times; the whole
# pipeline is passed in, so scaler and PCA are refitted inside every fold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y_target, scoring='accuracy', cv=cv, n_jobs=-1)
# Report the mean and standard deviation of the accuracy across all folds
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
# Lets try a Random Forest (100 trees, depth 3) on the standardised features
# and rank the features by importance
rf = RandomForestClassifier(n_estimators=100, max_depth=3,
                                 bootstrap=True, n_jobs=-1,
                                 random_state=0)
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

rf.fit(X_scaled, y_target)

# Importances labelled with the column names, most important first
feature_imp = pd.Series(rf.feature_importances_, 
                        index=X.columns).sort_values(ascending=False)

print('Feature importances: ', rf.feature_importances_)
# print() shows the object returned by barplot; the chart is drawn as a side effect
print(sns.barplot(x=feature_imp, y=feature_imp.index))
plt.xlabel('Feature Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title("Visualizing Important Features", fontsize=15, pad=15)

In [None]:
# Keep the features whose importance in the forest fitted above (on the
# standardised data) is at least 0.05. prefit=True makes the selector reuse
# that fitted forest; without it, fit_transform(X, ...) trained a fresh clone
# on the unscaled X, so the model fitted on X_scaled was never used here.
selector = SelectFromModel(rf, threshold=0.05, prefit=True)
features_important = selector.transform(X_scaled)

# Same selection expressed as a DataFrame with the original column names
X_vars = X.loc[:, selector.get_support()]
X_vars.head()

In [None]:
# Lets repeat the Random Forest importance ranking with the standard scaler
# switched off, i.e. fitted directly on the raw features
rf = RandomForestClassifier(n_estimators=100, max_depth=3,
                                 bootstrap=True, n_jobs=-1,
                                 random_state=0)

rf.fit(X, y_target)

# Importances labelled with the column names, most important first
feature_imp = pd.Series(rf.feature_importances_, 
                        index=X.columns).sort_values(ascending=False)

print('Feature importances: ', rf.feature_importances_)
# print() shows the object returned by barplot; the chart is drawn as a side effect
print(sns.barplot(x=feature_imp, y=feature_imp.index))
plt.xlabel('Feature Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title("Visualizing Important Features", fontsize=15, pad=15)

In [None]:
# Keep the features whose importance in the forest fitted above (on the
# unscaled data) is at least 0.05. prefit=True reuses that fitted forest
# instead of training an identical clone (same data, same random_state)
# a second time.
selector = SelectFromModel(rf, threshold=0.05, prefit=True)
# plain array in, plain array out, as fit_transform returned before
features_important = selector.transform(X.to_numpy())

# Same selection expressed as a DataFrame with the original column names
X_vars1 = X.loc[:, selector.get_support()]
X_vars1.head()

In [None]:
# Create the train_test_split (70% train / 30% hold-out) on the features
# selected from the unscaled random forest. The target is the original `loss`
# value (y), not the binary y_target used during feature selection.
X_train, X_test, y_train, y_test = train_test_split(X_vars1, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
# Alternative kept for reference: the same split on X_vars
# X_train, X_test, y_train, y_test = train_test_split(X_vars, 
#                                                     y, 
#                                                     test_size=0.3, 
#                                                     random_state=42)

# Model analysis - Negative Binomial

In [None]:
# Create the evaluation metric - RMSE
from sklearn.metrics import mean_squared_error as mse

def rmse(actual, predicted):
    """Return the root mean squared error between ``actual`` and ``predicted``."""
    return np.sqrt(mse(actual, predicted))

In [None]:
# Train the NB2 (negative binomial) GLM on the training data set
import statsmodels.api as sm

# NOTE(review): X_train is passed as-is, so the model has no intercept term
# unless one of the selected columns is constant — confirm this is intended
# (statsmodels provides sm.add_constant for adding one).
neg_bin = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial()).fit()

# Print the training summary
print(neg_bin.summary())

In [None]:
# Lets predict with a negative binomial model
y_pred = np.round(neg_bin.predict(X_test))
rmse(y_test, y_pred)

In [None]:
y_pred.view()
y_test.head()

# Model analysis - Zero Inflated Poisson regression

It might make more sense to split the challenge into two separate issues. First predict if there was a loss or not. Assign 0 to loss values of 0 and 1 to all other values. Then a second element of the task would be to predict the loss for values greater than zero.
***
1. The first challenge would be a binary logistic regression task
2. Perform a poisson regression to predict the losses greater than 0
***
This task can be achieved by using the Zero-Inflated Poisson Regression

In [None]:
# Train the Zero Inflated Poisson model; the same selected features are used
# for the count part (exog) and for the logit zero-inflation part (exog_infl)
zip_reg = sm.ZeroInflatedPoisson(endog=y_train, exog=X_train, exog_infl=X_train, inflation='logit').fit()
print(zip_reg.summary())

# Poisson

In [None]:
# Poisson Regression
from sklearn.linear_model import PoissonRegressor

poisson_glm = Pipeline([
    ("scaler", StandardScaler()),
    ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300))
])
poisson_glm.fit(X_train, y_train)
y_pred = poisson_glm.predict(X_test)
rmse(y_test, y_pred)

In [None]:
# Predictions of the Zero Inflated Poisson model (zip_reg) on the hold-out set;
# the hold-out features are passed for both the count and the inflation part
zip_predictions = zip_reg.predict(X_test,exog_infl=X_test)
# Round to whole numbers before scoring
predicted_counts=np.round(zip_predictions)
print(f'RMSE : {rmse(y_test, predicted_counts)}')

In [None]:
# # Work to do to get this working correctly
# fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True)
# fig.subplots_adjust(bottom=0.2)
# n_bins = 20
# for row_idx, label, df in zip(range(2),
#                               ["train", "test"],
#                               [y_train, y_test]):
#     df.hist(bins=np.linspace(-1, 30, n_bins),
#                          ax=axes[row_idx, 0])

#     axes[row_idx, 0].set_title("Data")
#     axes[row_idx, 0].set_yscale('log')
#     axes[row_idx, 0].set_xlabel("y (observed Frequency)")
#     axes[row_idx, 0].set_ylim([1e1, 5e5])
#     axes[row_idx, 0].set_ylabel(label + " samples")

#     for idx, model in enumerate([dummy, ridge, poisson_glm]):
#         y_pred = model.predict(X_test)

#         pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
#                                ax=axes[row_idx, idx+1])
#         axes[row_idx, idx + 1].set(
#             title=model[-1].__class__.__name__,
#             yscale='log',
#             xlabel="y_pred (predicted expected Loss)"
#         )
# plt.tight_layout();

## Make submission

In [None]:
# Create model submission method using the test_x and test_id variables
def submission(model, csv_name, features=None, ids=None):
    """Write a submission CSV with the predictions of ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    csv_name : str
        Suffix of the output file, ``Submission_file_{csv_name}.csv``.
    features : pandas.DataFrame, optional
        Rows to predict on. Defaults to the notebook-level ``test_x``. Pass a
        column subset when the model was fitted on selected features only.
    ids : array-like, optional
        Identifiers written to the ``id`` column. Defaults to the
        notebook-level ``test_id``.

    Returns
    -------
    None
        The result of ``DataFrame.to_csv`` when it is given a path.
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave exactly as before.
    if features is None:
        features = test_x
    if ids is None:
        ids = test_id

    pred = model.predict(features)
    df = pd.DataFrame(data={'id': ids, 'loss': pred})
    df = df.set_index('id')
    return df.to_csv(f"Submission_file_{csv_name}.csv")

In [None]:
# Restrict the test features to the columns the models were trained on
# (the columns of X_train, in the same order)
test_x_new = test_x.loc[:, X_train.columns]

In [None]:
# submission(neg_bin, "neg_bin_glm")

In [None]:
# Submission writer for models fitted on the selected feature subset
def submission_scaled(model, csv_name):
    """Write ``Submission_file_{csv_name}.csv`` from predictions on ``test_x_new``.

    ``test_x_new`` holds the test features restricted to the training columns.
    No scaling is applied here; any scaling has to happen inside ``model``.
    The file has ``id`` as index and the predictions in a ``loss`` column.
    """
    predictions = model.predict(test_x_new)
    out = pd.DataFrame(data={'id': test_id, 'loss': predictions}).set_index('id')
    return out.to_csv(f"Submission_file_{csv_name}.csv")

In [None]:
# submission_scaled(neg_bin, "neg_bin")
# submission_scaled(zip_reg, "zip_reg")
submission_scaled(poisson_glm, "poisson")