In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def display_scores(cross_val_scores):
    """Print a short summary of a set of cross-validation scores.

    Parameters
    ----------
    cross_val_scores
        Object holding the scores; it must provide ``mean()`` and ``std()``
        methods (for example a NumPy array).

    Returns
    -------
    None
        The raw scores, their mean and their standard deviation are written
        to standard output, one per line.
    """
    report_lines = (
        ('Scores: ', cross_val_scores),
        ('Mean: ', cross_val_scores.mean()),
        ('Standard Dev: ', cross_val_scores.std()),
    )
    for label, value in report_lines:
        print(label, value)

import os
# Collect the full path of every file found under the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
# Directory holding the competition CSV files (also used later to read the sample submission).
home_path = '/kaggle/input/tabular-playground-series-jan-2021/'

# Read the training and test tables from that directory.
train, test = (
    pd.read_csv(home_path + csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

# Preview the first rows of the training data.
train.head()

In [None]:
# Number of rows in the training set.
print('count(*) from train: ', len(train.index))

# Distribution of the target. `sns.distplot` is deprecated in seaborn and slated
# for removal; `histplot` with a KDE overlay on a density scale is its replacement.
target_ax = sns.histplot(train['target'].values, kde = True, stat = 'density')
target_ax.set(xlabel = 'target', ylabel = 'density', title = 'Distribution of target (train)');

The target distribution looks skewed: most values sit toward the right, with a long tail of low values (i.e. it is left-skewed). It may therefore be necessary to remove the samples that have a target value less than 4.

In [None]:
# Pairwise correlations between all columns of the training data except `id`.
columns_for_corr = train.drop(columns = ['id'])
corr = columns_for_corr.corr()

# Draw the correlation matrix as a heatmap, labelling both axes with the column names.
plt.figure(figsize = (10, 8))
sns.heatmap(
    corr,
    xticklabels = corr.columns,
    yticklabels = corr.columns,
)

In [None]:
# Fix the random seed used for the split (and NumPy's global generator).
seed = 7
np.random.seed(seed)

# Feature matrix: every column except the row id and the target; y is the target.
x = train.drop(columns = ['id', 'target']).values
y = train['target'].values

# Hold out 15% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.15,
    random_state = seed,
)

# Fit an ordinary linear regression and score it on the held-out rows.
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)

mse = mean_squared_error(y_true = y_test, y_pred = y_pred)

# Root mean squared error on the hold-out set.
np.sqrt(mse)

In [None]:
# Baseline kept for reference (default hyper-parameters):
# rf_reg = RandomForestRegressor(random_state=42)
# rf_reg.fit(x_train, y_train)
# y_pred = rf_reg.predict(x_test)
# mse = mean_squared_error(y_true = y_test, y_pred = y_pred)
# np.sqrt(mse)

# Tuned forest. `random_state` is set explicitly so that re-running this cell
# reproduces the same model and score; `n_jobs = -1` only parallelises tree
# building and does not change the result for a fixed seed.
rf_reg_v2 = RandomForestRegressor(
    max_depth = 14,
    n_estimators = 250,
    random_state = seed,
    n_jobs = -1,
)

rf_reg_v2.fit(x_train, y_train)

# Predictions on the held-out rows; later cells reuse `y_pred` for error analysis.
y_pred = rf_reg_v2.predict(x_test)

mse = mean_squared_error(y_true = y_test, y_pred = y_pred)

# Root mean squared error on the hold-out set.
np.sqrt(mse)

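A Linear Regression model trained on 70% of the original data scored 0.72783 on the leaderboard, while the Random Forest regressor scored 0.70981. (Note: the split in the code above uses `test_size = 0.15`, i.e. it trains on 85% of the data, so the 70% figure presumably refers to an earlier run.)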

In [None]:
# Final model: same hyper-parameters as the tuned forest, trained on ALL labelled rows.
# `oob_score = True` makes the forest keep out-of-bag predictions, i.e. each row is
# predicted only by trees that did not see it during training.
rf_reg_final = RandomForestRegressor(
    max_depth = 14,
    n_estimators = 250,
    oob_score = True,
    random_state = seed,
    n_jobs = -1,
)

rf_reg_final.fit(x, y)

# Score THIS model. The previous version re-used `y_pred` from the hold-out model,
# so the number shown here merely repeated the earlier cell. Scoring on `x_test`
# would be misleading too, because those rows are now part of the training data;
# the out-of-bag predictions give an honest estimate instead.
mse = mean_squared_error(y_true = y, y_pred = rf_reg_final.oob_prediction_)

# Out-of-bag root mean squared error of the final model.
np.sqrt(mse)

In [None]:
# Put the hold-out predictions next to the true hold-out targets.
test_pred_df = pd.DataFrame({'y_predictions': y_pred, 'y_test': y_test})

# Signed error: positive when the model predicts more than the true value.
over_prediction = test_pred_df['y_predictions'] - test_pred_df['y_test']

# Summarise the rows that are over-predicted by at least this much.
# NOTE(review): an earlier note here cited 12,574 rows for differences >= 0.5,
# but the filter uses 2 — confirm which threshold is intended.
over_prediction_threshold = 2
test_pred_df[over_prediction >= over_prediction_threshold].describe()

It seems that roughly 100 records (those whose true target is at least 2 below the prediction) were responsible for the skew seen in the target distribution. When you remove these records, the distribution becomes more even.

Submission

In [None]:
# Select the test features by name, in exactly the column order used to build
# the training matrix `x` (all training columns except `id` and `target`).
# A missing column raises a KeyError here instead of silently shifting features.
feature_columns = train.columns.drop(['id', 'target'])
real_test = test[feature_columns]
ids = test['id'].values

# The model was fitted on a plain NumPy array, so predict on one as well; passing
# a DataFrame makes scikit-learn warn about feature names it never saw in `fit`.
real_pred = rf_reg_final.predict(real_test.values)

In [None]:
sample_submission = pd.read_csv(home_path + 'sample_submission.csv')

# Work on a copy so the template frame stays untouched and this cell can be re-run.
sub = sample_submission.copy()

# Predictions are assigned by position, so the template rows must be in the same
# order as the test rows. Check this whenever the template carries an id column.
if 'id' in sub.columns:
    assert np.array_equal(sub['id'].values, ids), \
        'sample_submission ids are not aligned with the test set ids'

sub['target'] = real_pred

print(len(sub.index))

sub.to_csv('rf_reg_v2_submission.csv', index = False)