### Imports and Initial Setup

Importing modules/libraries and defining useful functions. 

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

# trying out xgboost's regression 
from xgboost import XGBRegressor

# Date stamp (YYYYMMDD) that can be appended to the names of exported objects
from datetime import datetime
current_tmstmp = datetime.now().strftime('%Y%m%d')

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Competition data: labelled training rows and unlabelled test rows
test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/test.csv')
train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')

train.head()

In [None]:
def listCols(string_identifier, dataframe):
    """Return the column names of ``dataframe`` that contain ``string_identifier``.

    The match is a plain substring test (``string_identifier in name``) and the
    returned list keeps the column order of the frame.
    """
    return [name for name in dataframe.columns if string_identifier in name]

def checkForMissingCats(train, test, categorical_feat_list):
    """Print every categorical feature whose distinct values differ between frames.

    For each name in ``categorical_feat_list`` the distinct values of that
    column in ``train`` and in ``test`` are compared. The feature name is
    printed when at least one value occurs on only one side. Nothing is
    returned.
    """
    for feature in categorical_feat_list:
        train_values = list(train[feature].unique())
        test_values = list(test[feature].unique())
        # Values seen on one side but not on the other
        only_in_train = [val for val in train_values if val not in test_values]
        only_in_test = [val for val in test_values if val not in train_values]
        if only_in_train or only_in_test:
            print(feature)

def encodeBinaryLabel(val, one_val):
    """Encode ``val`` as 1 when it equals ``one_val`` and as 0 otherwise.

    Raises:
        ValueError: if ``val`` is null according to ``pd.isna``.
    """
    if pd.isna(val):
        raise ValueError('Null value found!')
    return 1 if val == one_val else 0

def create_folds(dataframe, n_splits=5, random_state=None):
    """Return a shuffled copy of ``dataframe`` with a stratified ``kfold`` column.

    The continuous ``target`` column is cut into ``floor(1 + log2(n_rows))``
    equal-width bins, and those bin labels are used as the strata for
    ``StratifiedKFold``. Each row's ``kfold`` value is the index of the fold
    in which that row belongs to the validation split.

    Args:
        dataframe: frame that contains a numeric ``target`` column. It is not
            modified.
        n_splits: number of folds passed to ``StratifiedKFold``.
        random_state: forwarded to ``DataFrame.sample`` for the shuffle;
            ``None`` keeps the shuffle unseeded.

    Returns:
        A new frame with a fresh ``0..n-1`` index and a ``kfold`` column.
    """
    # Shuffle into a new frame first so the fold column is only ever written
    # to the copy, never to the caller's frame
    data = dataframe.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    data['kfold'] = -1
    bin_num = int(np.floor(1 + np.log2(len(data))))
    # Keep the bin labels in a standalone Series; they are only needed as strata
    bins = pd.cut(data['target'], bins = bin_num, labels = False)
    kfold = StratifiedKFold(n_splits = n_splits)
    for f, (_, v_) in enumerate(kfold.split(X = data, y = bins.values)):
        # v_ holds the positional indices of this fold's validation rows,
        # which equal the index labels after reset_index
        data.loc[v_, 'kfold'] = f
    return data

def run_folds(dataframe, fold, drop_cols, model):
    """Fit ``model`` on every fold except ``fold`` and report RMSE on ``fold``.

    Args:
        dataframe: frame with ``kfold`` and ``target`` columns.
        fold: value of ``kfold`` to hold out for validation.
        drop_cols: names of non-feature columns to exclude from the feature
            matrix. ``target`` is always excluded as well. The list is not
            modified.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The fitted ``model``.
    """
    # Build a separate list instead of appending to the caller's, so a list
    # reused across folds does not accumulate extra 'target' entries
    feature_drops = list(drop_cols)
    if 'target' not in feature_drops:
        feature_drops.append('target')
    df_train = dataframe[dataframe.kfold != fold].reset_index(drop = True)
    df_val = dataframe[dataframe.kfold == fold].reset_index(drop = True)
    x_train = df_train.drop(labels = feature_drops, axis = 1).values
    y_train = df_train['target'].values
    x_val = df_val.drop(labels = feature_drops, axis = 1).values
    y_val = df_val['target'].values
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    mse = mean_squared_error(y_true = y_val, y_pred = y_pred)
    sq_mse = np.sqrt(mse)
    print('Processing fold: ' + str(fold))
    print('Square Root of MSE: ' + str(sq_mse))
    return model

### Categorical and Continuous Features

It seems that this month's Tabular Playground competition has continuous and categorical features. Some of the categorical columns may be binary so I'll check for those and handle those differently. 

In [None]:
# Names of the train columns that contain 'cat' and 'cont' respectively
# (listCols does a substring match on the column name)
cat_feats = listCols('cat', train)
cont_feats = listCols('cont', train)

Imbalanced Categorical Features:

- cat0
- cat2
- cat4
- cat6
- cat7

It seems like some of the categorical features are not evenly distributed. I'm wondering if it might make sense to remove some of these. 

In [None]:
# 2x2 grid of per-category row counts for four of the features listed above
fig, axes = plt.subplots(2, 2, figsize = (12,6))

sns.countplot(x = 'cat0', data = train, ax = axes[0, 0])
sns.countplot(x = 'cat2', data = train, ax = axes[0, 1])
sns.countplot(x = 'cat4', data = train, ax = axes[1, 0])
sns.countplot(x = 'cat6', data = train, ax = axes[1, 1])

In [None]:
# Row counts per category for cat7 (the fifth feature from the list above)
sns.countplot(x = 'cat7', data = train)

In [None]:
# Pairwise correlation matrix of the continuous features, drawn as a heatmap
corr = train[cont_feats].corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

In [None]:
# Distribution of the regression target, using 1000 histogram bins
train['target'].plot(kind = 'hist', bins = 1000)

### Data Prep

In [None]:
# For each dataset, print the null count of every column that has nulls;
# when no column has any, print the dataset's all-clear message instead
for frame, all_clear in (
    (train, 'There are no null values in train set.'),
    (test, 'There are no null values in the test set.'),
):
    counter = 0
    for col in frame.columns:
        null_count = frame[col].isnull().sum()
        if null_count > 0:
            print(col, null_count)
            counter += 1
    if counter == 0:
        print(all_clear)

In [None]:
# Check for any categories that do not exist in both the train and test csv's.
# The helper prints the name of each feature whose distinct values differ.

checkForMissingCats(train = train, test = test, categorical_feat_list = cat_feats)

Since cat6 has different categories between the two datasets, I'm going to combine both dataframes into one and create a new column, train_test_id, that I'll use later to separate the train and test sets. 

In [None]:
# Tag every row with its origin so the combined frame can be split apart later
train['train_test_id'] = 'train'
test['train_test_id'] = 'test'
# The test rows have no target. Use NaN rather than an empty string so the
# combined 'target' column keeps a float dtype instead of becoming object
test['target'] = np.nan

train_and_test = [train, test]

# Columns are aligned by name; the original row indices are kept as they are
df = pd.concat(train_and_test)

df.head()

I'll encode the binary categorical features as 0/1 values and use the get_dummies method to one-hot encode the others. 

In [None]:
# Collect the categorical columns that have exactly two distinct values,
# printing the distinct-value count of every categorical column on the way
binary_cols = []

for col in df.columns:
    if 'cat' not in col:
        continue
    n_levels = df[col].nunique()
    print(col, n_levels)
    if n_levels == 2:
        binary_cols.append(col)


def _a_to_one(value):
    # 'A' becomes 1, any other non-null value becomes 0
    return encodeBinaryLabel(value, 'A')


for i in binary_cols:
    print(i, df[i].unique())
    df[i] = df[i].apply(_a_to_one)

In [None]:
# Every categorical column that was not binary-encoded gets indicator columns
one_hot_cols = [ft for ft in df.columns if 'cat' in ft and ft not in binary_cols]

df = pd.get_dummies(df, columns = one_hot_cols, dummy_na = False)

df.head()

In [None]:
# Separate the combined frame again using the origin tag, then drop the tag
# (and, for the test rows, the placeholder target column)
from_train = df['train_test_id'] == 'train'
from_test = df['train_test_id'] == 'test'

train = df.loc[from_train].drop(columns = ['train_test_id'])
test = df.loc[from_test].drop(columns = ['train_test_id', 'target'])

train.head()

In [None]:
# Continuous features are the columns whose name contains 'cont'
num_features = [name for name in train.columns if 'cont' in name]

# One scaler per column: statistics are learned from train only and then
# applied to both train and test
for col in num_features:
    prep = StandardScaler().fit(train[[col]])
    train[col] = prep.transform(train[[col]])
    test[col] = prep.transform(test[[col]])

train.head()

Splitting train into training and validation sets. 

In [None]:
seed = 7
np.random.seed(seed)

# Feature matrix: every column except the row id and the label
features = train.drop(columns = ['id', 'target'])
x = features.values
y = train['target'].values

# Hold out 25% of the rows for validation
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state = seed, test_size = 0.25)

### Model Builds

As of mid-Feb, my best score has been with an XGBoost Regression model using default params. Going forward, I'm going to try to gain more experience with XGBoost and try out different parameter settings. 

In [None]:
# Baseline: XGBoost regressor with default hyper-parameters
model = XGBRegressor()

# The hold-out split is the early-stopping eval set (50 rounds of patience).
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor rather than on fit() - confirm against the installed version.
model.fit(x_train, y_train, eval_set = [(x_val, y_val)], early_stopping_rounds = 50, verbose = False)

In [None]:
# RMSE of the default model on the hold-out split. This is the same split
# that drove early stopping, so the figure may be optimistic.
y_pred = model.predict(x_val)
mse = mean_squared_error(y_true = y_val, y_pred = y_pred)
np.sqrt(mse)

In [None]:
# Hand-picked hyper-parameters: shallower trees, a larger minimum child
# weight, a split-gain threshold (gamma) and an alpha penalty
tuned_model = XGBRegressor(max_depth = 4, min_child_weight = 5, gamma = 0.5, alpha = 1)

# Same early-stopping setup as the default model.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# constructor rather than on fit() - confirm against the installed version.
tuned_model.fit(x_train, y_train, eval_set = [(x_val, y_val)], early_stopping_rounds = 50, verbose = False)

# RMSE of the tuned model on the hold-out split
tuned_y_pred = tuned_model.predict(x_val)
mse = mean_squared_error(y_true = y_val, y_pred = tuned_y_pred)
np.sqrt(mse)

In [None]:
# Score the test rows with the tuned model (the id column is not a feature)
prepped_test = test.drop(columns = ['id']).values
predictions = tuned_model.predict(prepped_test)

# Fill the sample submission with the predictions.
# NOTE(review): assumes its rows are in the same order as the test set - confirm.
sub = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv')
sub['target'] = predictions

# File name carries the run date stamp
submission_path = 'tuned_xgb_reg_predictions_{ts}.csv'.format(ts = current_tmstmp)
sub.to_csv(submission_path, index = False)

A Linear Regression model scored 0.86. As of 02/06, the best score is 0.84191. 

In [None]:
# lin_reg = LinearRegression()

# lin_reg.fit(x_train, y_train)

# y_pred = lin_reg.predict(x_val)
# mse = mean_squared_error(y_true = y_val, y_pred = y_pred)
# np.sqrt(mse)

# prepped_test = test.drop(labels = ['id'], axis = 1)
# predictions = lin_reg.predict(prepped_test)

# sub = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv')
# sub['target'] = predictions

# sub.to_csv('lin_reg_predictions.csv', index = False)

In [None]:
# rf_reg = RandomForestRegressor()

# rf_reg.fit(x_train, y_train)

# y_pred = rf_reg.predict(x_val)
# mse = mean_squared_error(y_true = y_val, y_pred = y_pred)
# np.sqrt(mse)

# prepped_test = test.drop(labels = ['id'], axis = 1)
# predictions = rf_reg.predict(prepped_test)

# sub = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv')
# sub['target'] = predictions

# sub.to_csv('rf_reg_predictions.csv', index = False)

In [None]:
# Rebind df to a shuffled copy of train carrying a stratified 'kfold' column
df = create_folds(train)

# Number of rows assigned to each fold
df.kfold.value_counts()

In [None]:
# Non-feature columns to exclude from the feature matrix in every fold
drops = ['id', 'kfold']

# Fitted model per fold, keyed 'model_<fold>'
model_dict = {}

for f in range(df['kfold'].nunique()):
    model_var = 'model_' + str(f)
    mdl = run_folds(
        dataframe = df,
        fold = f,
        # Pass a fresh copy each time so the shared `drops` list cannot be
        # altered from one fold to the next
        drop_cols = list(drops),
        model = XGBRegressor(max_depth = 4, min_child_weight = 5, gamma = 0.5, alpha = 1)
    )
    model_dict[model_var] = mdl

### End

After the competition ended, my best submission for the XGBoost Regression model scored 0.84719 on the private leaderboard. I'll wrap up this notebook by exporting the model using joblib in case I would like to work with it again in the future. 

In [None]:
import joblib

# Persist the fitted regressor to disk.
# NOTE(review): `model` is the default-parameter XGBRegressor fitted earlier;
# the submission file was produced by `tuned_model` - confirm which one is
# meant to be exported.
joblib.dump(model, 'xgb_regr_tab_feb_2021.bin')