In [None]:
import sys
import pathlib
import itertools
import collections

import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt

from sklearn import linear_model

In [None]:
# Locations of the competition files, all relative to one configurable folder.
data_folder = pathlib.Path("../input/tabular-playground-series-jan-2021")
train_file, test_file, sample_file = (
    data_folder.joinpath(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Load the raw train and test tables with pandas' default CSV settings.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file)
)

In [None]:
# Report how many rows each split contains.
n_train_rows, n_test_rows = len(train_df), len(test_df)
print(f"total data points in train set: {n_train_rows}")
print(f"total data points in test set: {n_test_rows}")

In [None]:
# Remove the "id" column from both frames so it is not fed to the model as a
# feature (the model below is fit on every remaining non-target column).
# errors="ignore" keeps this cell idempotent: re-running it after "id" has
# already been dropped is a no-op instead of raising KeyError.
train_df = train_df.drop(columns="id", errors="ignore")
test_df = test_df.drop(columns="id", errors="ignore")

In [None]:
# Display pandas' default summary statistics for the training frame
# (left as the last expression so the notebook renders it as a table).
train_df.describe()

### Visualise the data

First, we look at the correlations between features, i.e. how much redundancy there is among them.

In [None]:
# Absolute pairwise correlation between the features (target excluded).
feature_corr = train_df.drop(columns="target").corr()
df_corr = feature_corr.abs()

In [None]:
# Zero out the diagonal and the upper triangle so each feature pair appears
# only once in the heatmap. The matrix keeps only entries whose column
# position is strictly smaller than their row position.
# A single vectorised mask replaces the element-by-element .loc writes and
# leaves the cell idempotent (re-running gives the same matrix).
strict_lower = np.tril(np.ones(df_corr.shape, dtype=bool), k=-1)
df_corr = df_corr.where(strict_lower, 0.0)

In [None]:
# Heatmap of the (masked) absolute feature correlations.
fig, ax = plt.subplots(figsize=(10, 10))

im = ax.imshow(df_corr.values, cmap="plasma")
fig.colorbar(im, ax=ax)

ax.set_title("feature correlations!!")
# imshow draws matrix rows along the y axis and matrix columns along the
# x axis, so x ticks are labelled from the columns and y ticks from the index.
ax.set_xticks(range(len(df_corr.columns)))
ax.set_xticklabels(df_corr.columns, rotation=90)
ax.set_yticks(range(len(df_corr.index)))
ax.set_yticklabels(df_corr.index)

# plt.show() is the notebook-friendly way to render; Figure.show() targets
# GUI backends and emits a warning under a non-GUI (inline) backend.
plt.show()

From the heatmap above, we can infer that the features from cont8 onwards are highly correlated with other features. These features are expendable and can be discarded. However, we will keep them for our primitive model and drop them only if necessary.

## Feature distributions

Visualising feature distributions

In [None]:
# One Gaussian-KDE panel per column of train_df. Note that "target" has not
# been dropped from train_df at this point, so it gets a panel as well.
plot_columns = list(train_df.columns)
n_grid_cols = 3
# Size the grid from the number of columns so that none is silently skipped
# (15 columns still yields the original 5x3 layout).
n_grid_rows = max(1, int(np.ceil(len(plot_columns) / n_grid_cols)))

fig = plt.figure(figsize=(20, 15))
axes = fig.subplots(n_grid_rows, n_grid_cols, squeeze=False).ravel()

for ax, column in zip(axes, plot_columns):
    distribution = train_df.loc[:, column].values
    # Evaluate the density over the observed range of this column instead of
    # assuming every column lives in [0, 1].
    x = np.linspace(distribution.min(), distribution.max(), 100)

    kde = gaussian_kde(distribution)(x)
    ax.fill_between(x, kde, alpha=0.3, color="b", linestyle="--")
    ax.set_title(f"distribution for columns: {column}")
    ax.set_ylabel("kde")

# Hide any panels left over when the grid has more slots than columns.
for unused_ax in axes[len(plot_columns):]:
    unused_ax.set_visible(False)

fig.suptitle("Feature distributions")
fig.tight_layout()
# plt.show() instead of fig.show(): the latter warns under non-GUI backends.
plt.show()

## Target distribution
Now we do the same for our target.

In [None]:
# Gaussian-KDE estimate of the target's distribution.
fig, ax = plt.subplots(figsize=(10, 10))

distribution = train_df.loc[:, "target"].values
# Evaluate the density over the observed target range; a fixed [0, 1] grid
# would only be correct if the target were known to lie in the unit interval.
x = np.linspace(distribution.min(), distribution.max(), 100)
kde = gaussian_kde(distribution)(x)

ax.fill_between(x, kde, alpha=0.5, color="b")

ax.set_title("Target distribution")
ax.set_ylabel("kde")
ax.set_xlabel("x")

fig.tight_layout()
# plt.show() instead of fig.show(): the latter warns under non-GUI backends.
plt.show()

## Train a model

In [None]:
# Split the training frame: X is the feature matrix as a NumPy array,
# y is the "target" column (kept as a pandas Series).
y = train_df["target"]
X = train_df.drop(columns="target").values

In [None]:
# Baseline model: linear regression with scikit-learn's default settings.
# fit() returns the fitted estimator itself, so construction and fitting chain.
reg = linear_model.LinearRegression().fit(X, y)
reg

## Make predictions

In [None]:
# Predict on the test features (same column layout as X: "id" already removed).
preds = reg.predict(test_df.values)

# Use the sample submission as a template, indexed by "id", and overwrite
# its "target" column with the model's predictions.
sample_submission = pd.read_csv(sample_file, index_col="id")
sample_submission.loc[:, "target"] = preds

In [None]:
# Write the submission into the current working directory, reusing the sample
# file's name. Despite its name, `sample_file_stem` holds the full file name
# including the ".csv" extension (Path.name, not Path.stem).
sample_file_stem = sample_file.name
# `out_filename` was previously never defined, so this cell raised NameError
# on a fresh kernel.
out_filename = sample_file_stem
# "id" is the frame's index (set via index_col when the sample was read), so
# the index must be written out; index=False would drop the id column and
# leave only "target" in the file.
sample_submission.to_csv(out_filename, index=True)