In [None]:
import pathlib
import collections
import itertools


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn import model_selection, linear_model, metrics
from sklearn.preprocessing import LabelEncoder

In [None]:
# Directory that holds the competition files; the listing of its entries is the cell output.
data_folder = pathlib.Path("../input/tabular-playground-series-feb-2021")
[entry for entry in data_folder.iterdir()]

In [None]:
# Paths to the three CSV files used by this notebook, all located in `data_folder`.
train_filepath = data_folder.joinpath("train.csv")
test_filepath = data_folder.joinpath("test.csv")
submission_filepath = data_folder.joinpath("sample_submission.csv")

In [None]:
# Load the train and test tables into DataFrames.
train_df, test_df = pd.read_csv(train_filepath), pd.read_csv(test_filepath)

# Report the number of columns in each table together with their names.
print(f"Total train feats: {len(train_df.columns)}, Features names: {list(train_df.columns)}\n")
print(f"Total test feats: {len(test_df.columns)}, Features names: {list(test_df.columns)}")

## Basic aggregate stats

In [None]:
# Summary statistics of the training frame; as the last expression of the cell,
# the resulting table is rendered as the cell output.
train_df.describe()

## Visualise data

#### Feature distributions

In [None]:
# Plot one panel per feature column: a KDE curve for continuous ("cont") features and
# a bar chart of value counts for categorical ("cat") features.
#
# Only the feature columns are laid out on the grid. Indexing the grid by the
# DataFrame column position (as enumerate(train_df.columns) would) leaves blank,
# framed panels for columns that are neither "cont" nor "cat" (e.g. "id", "target")
# and raises IndexError as soon as the frame has more columns than the grid has cells.
feature_columns = [
    column for column in train_df.columns if "cont" in column or "cat" in column
]

n_grid_cols = 4
n_grid_rows = max(1, -(-len(feature_columns) // n_grid_cols))  # ceiling division

fig = plt.figure(figsize=(15, 15))
# squeeze=False guarantees a 2-D array of axes even for a single row/column.
axes = fig.subplots(n_grid_rows, n_grid_cols, squeeze=False).ravel()

# Evaluation grid for the KDE; it does not depend on the column, so build it once.
# NOTE(review): assumes the continuous features lie roughly in [0, 1] — confirm.
x_pos = np.linspace(0, 1, 100)

for axis, column in zip(axes, feature_columns):
    if "cont" in column:
        kde = stats.gaussian_kde(train_df[column].values)(x_pos)

        axis.plot(x_pos, kde, color="b", linewidth=1.5)
        axis.fill_between(x_pos, kde, color="b", alpha=0.5)

        axis.set_title(f"Feature: {column}")
        axis.set_xlabel("x")
        axis.set_ylabel("kde")
    else:
        # Categorical feature: one bar per category, ordered by decreasing count.
        value_counts = train_df[column].value_counts()
        positions = range(len(value_counts))

        # Bars are placed at explicit integer positions so that the tick labels set
        # below always line up with their bars, whatever the dtype of the labels.
        axis.bar(positions, value_counts.values)

        axis.set_xticks(positions)
        axis.set_xticklabels(list(value_counts.index), rotation=-10)
        axis.set_title(f"Feature: {column}")
        axis.set_xlabel("Categories")
        axis.set_ylabel("count")

# Hide grid cells that did not receive a feature.
for unused_axis in axes[len(feature_columns):]:
    unused_axis.set_visible(False)

fig.suptitle("Feature distributions")
# Finalise the layout before displaying, reserving a strip at the top for the suptitle.
fig.tight_layout(rect=(0, 0, 1, 0.97))
# plt.show() renders with the notebook's inline backend; fig.show() only works with
# GUI backends and emits a warning otherwise.
plt.show()

There are a lot of under-represented categories in the data. Hopefully, the ML algorithm will learn the difference without us interfering a lot.

### Correlation matrix

In [None]:
# Pairwise correlations (pandas default method) between the numeric columns, "id" excluded.
# The categorical columns have not been label-encoded yet at this point (presumably
# they still hold string labels), and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0, whereas older versions silently dropped them. Selecting the
# numeric columns explicitly gives the same result on every pandas version.
df_corr = train_df.drop(columns=["id"]).select_dtypes(include="number").corr()

# Keep only the strictly lower triangle: entries on or above the diagonal
# (row position <= column position) are set to 0, in one vectorised step instead of
# a Python loop of element-wise .loc assignments.
df_corr = pd.DataFrame(
    np.tril(df_corr.to_numpy(), k=-1),
    index=df_corr.index,
    columns=df_corr.columns,
)

In [None]:
# Draw `df_corr` as a heatmap with a colour bar and one labelled tick per row/column.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(1, 1, 1)

img = ax.imshow(df_corr.values, cmap="plasma")
fig.colorbar(img, ax=ax)

ax.set(
    title="Continuous feature correlations",
    xlabel="continuous features",
    ylabel="continuous features",
)

# Tick positions first, then their labels, for each axis in turn.
ax.set_xticks(range(len(df_corr.index)))
ax.set_xticklabels(df_corr.index, rotation=20)

ax.set_yticks(range(len(df_corr.columns)))
ax.set_yticklabels(df_corr.columns, rotation=20)

fig.tight_layout()
fig.show()

Since the data is not highly correlated, we can safely move on to training the regressor without much feature engineering dedicated to reducing redundancy in the data.

## Transform features/ encode categorical features

In [None]:
def label_encoder(train_df, test_df):
    """
    Label-encode the categorical columns of the train and test frames.

    A column is treated as categorical when its name contains "cat". For each such
    column a single ``LabelEncoder`` is fitted on the train and test values stacked
    together, so both frames share one label -> integer mapping and a label that
    occurs in only one of the frames can still be transformed.

    Inspired by: https://www.kaggle.com/rizdelhi/tabular-playground-competition-feb-21#Read-in-the-data-files

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to encode. They are left untouched; encoded copies are returned.
        Every "cat" column of ``train_df`` must also be present in ``test_df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``: copies of the inputs whose "cat" columns
        hold the integer codes produced by the encoder.
    """
    # Work on copies so the caller's frames are not mutated as a side effect.
    train_encoded, test_encoded = train_df.copy(), test_df.copy()

    for column in train_encoded.columns:
        if "cat" not in column:
            continue

        lbl = LabelEncoder()
        lbl.fit(np.hstack((train_encoded[column].values, test_encoded[column].values)))

        # Plain column assignment replaces the column, so it takes the integer dtype
        # of the codes. `df.loc[:, column] = codes` writes into the existing column
        # in pandas >= 2.0, which can leave it with its original (object) dtype.
        train_encoded[column] = lbl.transform(train_encoded[column].values)
        test_encoded[column] = lbl.transform(test_encoded[column].values)

    return train_encoded, test_encoded

In [None]:
# Replace both frames with their label-encoded versions ("cat" columns become integer codes).
train_df, test_df = label_encoder(train_df, test_df)

### Train a base linear regressor

In [None]:
# Feature matrix: every training column except "id" and "target".
x_train = train_df.drop(columns=["id", "target"]).to_numpy()
# Regression target: the "target" column as an array.
y_train = train_df["target"].to_numpy()

In [None]:
# Baseline model: a linear regression with default settings, fitted on all training rows.
reg = linear_model.LinearRegression()
reg.fit(X=x_train, y=y_train)

In [None]:
# Predict on the same kind of input the model was fitted on: a plain NumPy array of
# every test column except "id". Passing the DataFrame itself makes scikit-learn warn
# that X has feature names while the estimator was fitted without them.
# NOTE(review): assumes the remaining test columns are in the same order as the
# training feature columns — confirm.
x_test = test_df.drop(columns=["id"]).values
y_preds = reg.predict(x_test)

# Load the sample submission and overwrite its "target" column with the predictions.
# Plain column assignment replaces the column rather than writing into the existing
# one, so the predictions keep their float dtype whatever dtype the sample file had.
df_submission = pd.read_csv(submission_filepath)
df_submission["target"] = y_preds

In [None]:
# Save the predictions under the sample submission's base name (relative path, so the
# file lands in the current working directory), without the DataFrame index.
submission_filename = submission_filepath.name
df_submission.to_csv(path_or_buf=submission_filename, index=False)