In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import warnings
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

In [None]:
# Load the competition files: the labelled training data, the unlabelled test
# data, and the sample submission used as the template for predictions.
data = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
final_test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")
sub = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv")

# Every test column except the first is used as a model feature.
# (presumably the first column is the row id; confirm against the CSV header)
columns = final_test.columns[1:]

# .copy() gives each feature frame its own data, so the column assignments made
# later in the notebook (label encoding) never write into a selection of the
# raw frame and do not raise SettingWithCopyWarning.
final_test = final_test[columns].copy()

target = data['target']
train = data[columns].copy()
train.head()

In [None]:
# Per-column count of missing values, printed as a plain list in column order,
# first for the full training frame and then for the final test features.
print('Null count in train data')
print(list(data.isnull().sum()))
print()
print()

# Heading spelling fixed ("cout" -> "count") so both headings read the same.
print('Null count in final test data')
print(list(final_test.isnull().sum()))

There are no missing values.

We have 10 categorical and 14 continuous variables.

Let's check whether each categorical variable takes the same set of values in the training and final test data.

In [None]:

# For each categorical column cat0..cat9, compare the sorted distinct values
# found in the training data with those found in the final test data, and
# report every column where the two lists differ.
for cat_idx in range(10):
    cat_name = 'cat{}'.format(cat_idx)
    train_cat = sorted(data[cat_name].unique())
    final_test_cat = sorted(final_test[cat_name].unique())

    # Nothing to report when both frames contain exactly the same levels.
    if train_cat == final_test_cat:
        continue

    print('cat{} has different values'.format(cat_idx))
    print(train_cat,' in training data')
    print(final_test_cat,' in final testing data')

Only variable cat6 had a value in the training data that wasn't in the testing data. Let's see how often G occurs in our training data.

In [None]:
# Frequency of every cat6 level in the training data; the row for 'G' (the
# level the text above reports as missing from the test data) shows how often
# it occurs.
data['cat6'].value_counts()

In [None]:
# Silence warnings from here on; simplefilter changes the process-wide filter,
# so this also affects every cell that runs afterwards.
warnings.simplefilter("ignore")

# Names of the continuous feature columns: cont0 .. cont13.
cont_col = ['cont{}'.format(n) for n in range(14)]

num_rows, num_cols = 7, 2
f, axes = plt.subplots(num_rows, num_cols, figsize=(10, 20))
f.suptitle('Continuous Feature Distributions', fontsize=16)

# axes.flat walks the grid row by row, so the k-th column is drawn on the same
# panel as axes[k // num_cols, k % num_cols].
for ax, column in zip(axes.flat, cont_col):
    # Each legend entry carries the skewness of the series it labels.
    train_label = "Train ~ %.2f" % train[column].skew()
    test_label = "Final Test ~ %.2f" % final_test[column].skew()

    # Train and final-test distributions are overlaid on one panel.
    sns.distplot(train[column], color="blue", label=train_label, ax=ax)
    sns.distplot(final_test[column], color="yellow", label=test_label, ax=ax)
    ax.legend(loc="best")

plt.tight_layout()
plt.show()

In [None]:
# Names of the categorical feature columns: cat0 .. cat9.
cat_col = ['cat{}'.format(n) for n in range(10)]
    


In [None]:
# Label-encode every categorical column of both feature frames.
label_encoder = preprocessing.LabelEncoder()

# Work on independent copies so the column assignments below never write into
# a selection of another frame (the pattern behind pandas'
# SettingWithCopyWarning).
train = train.copy()
final_test = final_test.copy()

for col in cat_col:
    # The mapping is always learned from the raw training column in `data`,
    # so this assignment gives the same result however often the cell runs.
    train[col] = label_encoder.fit_transform(data[col])

    # Encode the test column with that same mapping. A column that is already
    # numeric was encoded by an earlier run of this cell; transforming it again
    # would fail on the integer codes, so it is left untouched.
    if not pd.api.types.is_numeric_dtype(final_test[col]):
        final_test[col] = label_encoder.transform(final_test[col])

In [None]:
# Inspect one encoded column: map each cat6 code to its row count, then show
# which codes are present.
cat6_counts = train['cat6'].value_counts()
val_distn = cat6_counts.to_dict()
val_distn.keys()

In [None]:

    
# Overlay the train and final-test distributions of every categorical column,
# one panel per column, with each series' skewness shown in the legend.
num_rows, num_cols = 4,3

f, axes = plt.subplots(num_rows, num_cols, figsize=(15, 15))
# Title spelling fixed ("Distribtuions" -> "Distributions").
f.suptitle('Categorical Feature Distributions', fontsize=16)

for index, column in enumerate(cat_col):
    # Fill the grid row by row.
    i,j = (index // num_cols, index % num_cols)
    g = sns.distplot(train[column], color="green",
                     label="Train ~ %.2f"%(train[column].skew()), ax=axes[i,j])

    g = sns.distplot(final_test[column], color="red",
                     label="Final Test ~ %.2f"%(final_test[column].skew()), ax=axes[i,j])
    g.legend(loc="best")

# The grid has more panels than there are columns to plot; hide the leftover
# panels so the figure does not end with empty axes.
for unused_ax in axes.flat[len(cat_col):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

We see that the categorical and continuous variables both follow the same distributions in the training and final test datasets.

In [None]:
# Row count per cat6 value in the training features (integer codes once the
# label-encoding cell above has run).
train['cat6'].value_counts()

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.20,
    random_state=2021,
)

In [None]:
# Display the training portion of the split as a sanity check.
x_train

Baseline xgboost

In [None]:
# baseline model. No feature engineering, only label encoding.
# The regressor is created with no arguments, i.e. with the library defaults.
base_xgb = xgb.XGBRegressor()

# Fit on the training portion of the split only; x_test / y_test stay unseen
# so they can be used for scoring afterwards.
base_xgb.fit(x_train, y_train)

Baseline RMSE

In [None]:
# Score the baseline on the held-out rows of the split.
initial_preds = base_xgb.predict(x_test)

# RMSE is computed as the square root of the MSE rather than by passing
# squared=False: that keyword was deprecated in scikit-learn 1.4 and removed in
# later releases, while this form works on every version.
np.sqrt(mean_squared_error(y_test, initial_preds))
# 0.8487978956208364  (value recorded from the original run)

In [None]:
# Predict the final test set with the baseline model, place the predictions in
# the submission template's target column, and write the file without the index.
base_preds = base_xgb.predict(final_test)
sub = sub.assign(target=base_preds)
sub.to_csv('baseline_sub.csv', index=False)