In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

If you're taking the time to look at this file, thank you! This is a synopsis of the relatively basic approach I took to February's competition. It was my first real work for a Kaggle competition, and I think I learned a lot from it. I've cleaned up my work, and this is essentially what it boiled down to. I would love to hear any comments you might have, especially if there's anything I could've done better.

Beginning with reading in the data

In [None]:
# Read the competition training set; the trailing expression shows (rows, columns)
train_csv_path = r'../input/tabular-playground-series-feb-2021/train.csv'
df = pd.read_csv(train_csv_path)
df.shape

There are apparently 26 columns, so listing the column names will be easier than trying to look at a big table.

In [None]:
# Column labels of the training frame, in their original order
columns = df.columns.tolist()
columns

In [None]:
# Number of missing entries in each column of the dataset
df.isnull().sum()

Given there are 14 continuous variables (cont0 through cont13) and one continuous target, I want to examine what the distributions look like. Below I've created boxplots for the different variables.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 5 x 3 grid of axes: one boxplot for the target plus one per continuous column
fig, axes = plt.subplots(5, 3, figsize=(9, 13), dpi=60)
plt.subplots_adjust(left=2, bottom=2, right=3, top=3)

# The grid is filled row by row: target first, then cont0 .. cont13
boxplot_columns = ['target'] + ['cont%d' % n for n in range(14)]
for panel, column_name in zip(axes.flat, boxplot_columns):
    sns.boxplot(ax=panel, x=column_name, data=df)

Looking at these plots, it definitely appears this data has already been normalized in some way.

After looking at this data, I wondered if there was any correlation between target and one of the continuous variables. Below is a correlation matrix for the different variables, and while there's some correlation between the variables, there's not any linear relationship between the target variable and the continuous variables.

In [None]:
from matplotlib.pyplot import figure 

# Makes the figure larger
figure(num=None, figsize=(20, 15), dpi=80)

# Calculating the correlation values on the numeric columns only.
# The frame also holds the non-numeric cat* columns; older pandas skipped them
# silently, but pandas >= 2.0 raises instead, so select numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Creating the graph with the correlation value printed in every visible cell
sns.heatmap(corr, mask=mask, annot=True)

At this point, I spent a while looking at the histograms of some of these distributions, in addition to trying several linear models for the dataset, such as least squares, ridge regression, lasso regression, and elastic net. Below is the pandas profile report I used to look through the variables individually before moving on to using XGBoost.

In [None]:
from pandas_profiling import ProfileReport

# Build the profiling report for the training frame.
# NOTE(review): the original passed 'EDA' positionally. In pandas-profiling the
# second positional parameter appears to be `minimal`, not `title`, so the string
# switched on minimal mode and never set the title -- confirm against the
# installed version. Both are now spelled out: the title is set as intended and
# minimal mode (what actually ran before) is kept; drop `minimal=True` for the
# full report.
df_profile = ProfileReport(df, title='EDA', minimal=True)
df_profile

In [None]:
# For XGB, the categorical variables are one-hot encoded.

# Names of the categorical columns: cat0 through cat9
cat_columns = ['cat' + str(idx) for idx in range(10)]

# Replace each categorical column with one indicator column per category
df_one_hot = pd.get_dummies(df, columns=cat_columns)

df_one_hot.shape

I tried using Optuna and a few other functions to find the best hyperparameters, but in the end, I created a custom function to report the best parameters. The code I used is below.

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

def depthandlearning(learn, depth, min_child_weight, n_estimators):
    """Fit one XGBRegressor configuration and score it on the hold-out split.

    The function reads the module-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test`` objects, so the train/test split must exist before it is
    called.

    Parameters
    ----------
    learn
        Value passed to XGBRegressor as ``learning_rate``.
    depth
        Value passed as ``max_depth``.
    min_child_weight
        Value passed as ``min_child_weight``.
    n_estimators
        Value passed as ``n_estimators``.

    Returns
    -------
    list
        ``[test_score, rmse, learn, depth, min_child_weight, n_estimators]``,
        where ``test_score`` is ``model.score`` on the hold-out split and
        ``rmse`` is the root mean squared error of the hold-out predictions.
    """
    model = XGBRegressor(base_score=0.5,
                         booster='gbtree',
                         colsample_bylevel=1,
                         colsample_bynode=1,
                         colsample_bytree=1,
                         gamma=0,
                         importance_type='gain',
                         learning_rate=learn,  # Testing learning rate
                         max_delta_step=0,
                         max_depth=depth,  # Testing the depth
                         min_child_weight=min_child_weight,  # Testing the child weight
                         missing=None,
                         n_estimators=n_estimators,  # Testing the number of estimators
                         n_jobs=1,
                         nthread=None,
                         objective='reg:squarederror',
                         random_state=0,
                         reg_alpha=0,
                         reg_lambda=1,
                         scale_pos_weight=1,
                         seed=None,
                         silent=None,
                         subsample=1,
                         verbosity=0)  # Silenced here; raise it to inspect training output

    # Fit the model on the training split
    model.fit(X_train, y_train)

    # Predict the outputs for the hold-out split
    y_pred = model.predict(X_test)

    # Hold-out score as reported by the estimator's own score method
    test_score = model.score(X_test, y_test)

    # RMSE -- this was the metric used for the competition.
    # Taking the square root of the MSE avoids the `squared=False` argument,
    # which newer scikit-learn releases no longer accept.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Scores first, followed by the hyperparameters that produced them
    return [test_score, rmse, learn, depth, min_child_weight, n_estimators]

    
from itertools import product

# Features and target. `id` (presumably just a row identifier) is dropped
# together with the target so the search uses the same feature set as the
# final model cell, which drops both columns.
X = df_one_hot.drop(['id', 'target'], axis=1)
Y = df_one_hot['target']

# 30% testing, 70% training; the fixed seed makes reruns use the same split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Scale the data for better results -- small increase in performance.
# The scaler is fitted on the training split only and then applied everywhere.
scaler = StandardScaler()
scaler.fit(X_train)
X = scaler.transform(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# These are some of the last values I used
# The learning rate is very high
# I originally was testing 0.01, but these performed better
learn = [0.2, 0.2125, 0.225]
depth = [4, 5, 6]
min_child_weight = [.5, .75, 1]
n_estimators = [100, 200, 500]

# Start below any attainable score: the hold-out score can be zero or negative,
# and starting at 0 would then leave the best_* names undefined for the prints.
best_score = float('-inf')

# Try every combination, in the same order as four nested loops would
for learn_value, depth_value, weight_value, estimators_value in product(
        learn, depth, min_child_weight, n_estimators):
    parameters = depthandlearning(learn_value,
                                  depth_value,
                                  weight_value,
                                  estimators_value)

    # Keep the combination with the highest hold-out score; its RMSE and
    # hyperparameters are stored alongside it (first best wins on ties)
    if parameters[0] > best_score:
        (best_score,
         best_score_std,  # holds the RMSE of the best-scoring combination
         best_learn,
         best_depth,
         best_min_child_weight,
         best_n_estimators) = parameters


print('Highest Score: %.4f' % best_score)
print('Highest Score RMSE: %.4f' % best_score_std)
print('Highest Score Learning: ', best_learn)
print('Highest Score Depth: ', best_depth)
print('Highest Score min_child_weight: ', best_min_child_weight)
print('Highest Score n_estimators: ', best_n_estimators)

Highest Score: 0.0925

Highest Score RMSE: 0.8475

Highest Score Learning:  0.2125

Highest Score Depth:  4

Highest Score min_child_weight:  0.5

Highest Score n_estimators:  200

Above are the parameters that scored the highest with the training dataset and the testing dataset. I'm going to run this one more time to examine the feature importance, and then I'll train the model on all the data available before trying to make predictions about the testing dataset for this competition.

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Refit a single model with the hyperparameters selected by the search above
model = XGBRegressor(base_score=0.5,
                     booster='gbtree',
                     colsample_bylevel=1,
                     colsample_bynode=1,
                     colsample_bytree=1,
                     gamma=0,
                     importance_type='gain',
                     learning_rate=.2125,
                     max_delta_step=0,
                     max_depth=4,
                     min_child_weight=.5,
                     missing=None,
                     n_estimators=200,
                     n_jobs=1,
                     nthread=None,
                     objective='reg:squarederror',
                     random_state=0,
                     reg_alpha=0,
                     reg_lambda=1,
                     scale_pos_weight=1,
                     seed=None,
                     silent=None,
                     subsample=1,
                     verbosity=1)

# Features are every column except the row id and the target
X = df_one_hot.drop(['id', 'target'], axis=1)
Y = df_one_hot['target']

# 30% testing, 70% training; the fixed seed makes the reported scores reproducible
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Scale the data for slightly better results.
# The scaler is fitted on the training split only and then applied everywhere.
scaler = StandardScaler()
scaler.fit(X_train)
X = scaler.transform(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

training_score = model.score(X_train, y_train)
print('Training Score: ', training_score)

test_score = model.score(X_test, y_test)
print('Test Score: ', test_score)

# RMSE (the competition metric). Taking the square root of the MSE avoids the
# `squared=False` argument, which newer scikit-learn releases no longer accept.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print("RMSE: %.4f" % rmse)

It's always interesting to see what the model values when making a prediction. Since we have so many columns, I'm only examining the top 15 most important variables.

In [None]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt 

# No figure is opened here: feat_imp creates its own, so a module-level
# figure() call would only leave an empty extra figure in the output.

def feat_imp(df, model, n_features):
    """Plot the largest feature importances of ``model`` as a bar chart.

    Parameters
    ----------
    df
        DataFrame whose columns are, in order, the features ``model`` was
        fitted on. Only the column names are used, as bar labels.
    model
        Fitted estimator exposing ``feature_importances_``.
    n_features
        Number of bars to draw; capped at the number of available features.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly one column per importance value.
        Names and importances are paired by position, so a mismatch would
        label the bars with the wrong features.
    """
    importances = model.feature_importances_
    if len(df.columns) != len(importances):
        raise ValueError(
            'df has %d columns but the model reports %d feature importances'
            % (len(df.columns), len(importances)))

    # Map each feature name to its importance, then rank names by importance
    d = dict(zip(df.columns, importances))
    ss = sorted(d, key=d.get, reverse=True)

    # Never ask for more bars than there are features
    n_features = min(n_features, len(ss))
    top_names = ss[:n_features]

    plt.figure(figsize=(8, 4))
    plt.title("Feature Importances", fontsize=16)
    plt.bar(range(n_features), [d[name] for name in top_names], color="r",
            edgecolor='black',
            align="center")

    plt.xlim(-1, n_features)
    plt.xticks(range(n_features), top_names, rotation='vertical')
    plt.xlabel('Features', fontsize=14)


# The model was fitted on the one-hot frame without `id` and `target`, so the
# same columns are passed here; passing df_one_hot itself would shift every
# label relative to its importance value.
feat_imp(df_one_hot.drop(['id', 'target'], axis=1), model, 15)

Interestingly, this particular dataset seems to find that several categorical values are the most important, and it values two of them significantly more than anything else.

I then trained the model on the entire training set for the competition and submitted the predicted results.