In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook is based in part on the following tutorial by Jonas Albrecht. Many thanks!
https://www.kaggle.com/jonas0/beginner-friendly-february-tabular-tutorial/notebook

# Data Preprocessing

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-feb-2021/train.csv",
        "/kaggle/input/tabular-playground-series-feb-2021/test.csv",
    )
)

In [None]:
# Preview the first rows of each split; the "\n" argument leaves a blank
# line between the two previews.
train_preview = train_data.head()
test_preview = test_data.head()
print(train_preview, "\n")
print(test_preview)

In [None]:
# Report the (rows, columns) shape of each split.
for caption, frame in (("train set shape", train_data), ("test set shape", test_data)):
    print(caption, frame.shape)

In [None]:
# Print pandas' built-in summary (DataFrame.info) of the training frame.
train_data.info()

In [None]:
# Show every column name, then group the feature columns by type:
# 'cat0'..'cat9' are the categorical features, 'cont0'..'cont13' the numerical ones.
print(train_data.columns)
cat_features = [f"cat{idx}" for idx in range(10)]
num_features = [f"cont{idx}" for idx in range(14)]

# Exploratory Data Analysis

In [None]:
# modules for Exploratory Data Analysis (EDA) 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# We will plot a grouped (side-by-side) bar chart to compare the means of the
# numerical features in the train and test datasets.
# We see that the train dataset and the test dataset have practically the same mean for each numerical feature:

# Per-column means for the train and the test set.
# DataFrame.mean() is used rather than np.mean(frame): np.mean forwards axis=None
# to pandas, and on pandas >= 2.0 that reduces over BOTH axes and returns a single
# scalar, which would give every bar the same height.
train_means = train_data[num_features].mean()
test_means = test_data[num_features].mean()

# code for plotting double bar chart taken from the matplotlib documentation:
# https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/barchart.html

x = np.arange(len(num_features))  # one group position per numerical feature
width = 0.35  # width of a single bar; the two bars of a group sit at x -/+ width/2
fig, ax = plt.subplots(1, 1, figsize=(11, 5))
rects1 = ax.bar(x - width/2, train_means, width, label='train data')
rects2 = ax.bar(x + width/2, test_means, width, label='test_data')
ax.set_ylabel('Means')
ax.set_title('Comparing the means of numerical data in train and test sets')
ax.set_xticks(x)
ax.set_xticklabels(num_features)
ax.legend(loc='lower right')
plt.show();

In [None]:
# Histogram of the target distribution; the dashed red vertical line marks the mean.
target_values = train_data.target
target_mean = np.mean(target_values)
plt.hist(target_values, bins=50)
plt.axvline(x=target_mean, linestyle='--', color='red')
plt.title('Histogram of target values')
plt.xlabel('target values')
plt.ylabel('frequency')
plt.show();

In [None]:
# One scatter plot per numerical feature: feature value (x) against target (y).
for feature in num_features:
    plt.figure(figsize=(10, 6))
    plt.scatter(train_data[feature], train_data.target, marker='x')
    plt.title(feature)
    plt.show()

In [None]:
# The scatter plots suggest a single point whose target is near 0.
# Select every row with target below 1.0 and display it.
near_zero_target = train_data.target < 1.0
outlier = train_data.loc[near_zero_target]
outlier

In [None]:
# Remove the outlier row(s) found above.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the rows are already gone is a no-op rather
# than a KeyError for missing index labels.
train_data = train_data.drop(index=outlier.index, errors='ignore')

In [None]:
# Compare how often each category value occurs in the train and test data,
# using a grouped (side-by-side) bar chart per categorical feature.
#
# The counts of both splits are first combined into one table indexed by category
# value. Plotting the two value_counts() results separately on the same axes would
# draw the bars on top of each other, and because each result is ordered by its own
# frequencies, bars at the same position could belong to different categories.
for feature in cat_features:
    counts = (
        pd.DataFrame({
            'train data': train_data[feature].value_counts(),
            'test data': test_data[feature].value_counts(),
        })
        .fillna(0)       # a value missing from one split counts as 0 there
        .astype(int)
        .sort_index()    # stable, alphabetical category order on the x axis
    )
    ax = counts.plot(kind='bar', color=['blue', 'orange'])
    ax.set_xlabel('categories')
    ax.set_ylabel('count')
    ax.set_title(feature)
    ax.legend()
    plt.show()

In [None]:
# Some categorical features have very few occurrences of certain values, perhaps even none.
# Report every categorical feature whose set of observed values differs between
# the train set and the test set.
mismatched_features = [
    name for name in cat_features
    if set(train_data[name]) != set(test_data[name])
]
for name in mismatched_features:
    print("mismatch in:", name)

In [None]:
# The mismatch is in cat6; print its unique values for the train set, then the test set.
for frame in (train_data, test_data):
    print(frame.cat6.unique())

In [None]:
# Rows with the "G" label in cat6 occur only in the training set, so we drop them.
# The value counts and shape are printed before and after to show the effect.
keep_rows = train_data.cat6 != "G"
print(train_data.cat6.value_counts())
print(train_data.shape)
train_data = train_data[keep_rows]
print(train_data.cat6.value_counts())
print(train_data.shape)

In [None]:
# Finally, explore the distribution of the target values within each category
# of every categorical feature (one seaborn catplot per feature).
for feature in cat_features:
    sns.catplot(data=train_data, x=feature, y="target")

In [None]:
# Correlation matrix (as a heatmap) of the numeric columns.
# The string-valued categorical columns are excluded explicitly: on pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises instead.
fig, ax = plt.subplots(1, 1, figsize=(9, 6))
correlation_mat = train_data.select_dtypes(include='number').corr()
sns.heatmap(correlation_mat, ax=ax);

# Feature Engineering

In [None]:
# One-hot encode the categorical features.
train_encoded = pd.get_dummies(train_data)

# Encoding each split independently yields one dummy column per value *observed in
# that split*, so the two frames are not guaranteed to share columns or column order.
# Reindex the test frame to the training layout (minus the target): dummy columns
# absent from the test data are added as all-zero, and columns unknown to the
# training data are dropped, so the model always sees the same feature layout.
test_encoded = pd.get_dummies(test_data).reindex(
    columns=train_encoded.columns.drop('target', errors='ignore'),
    fill_value=0,
)

# Train Random Forest Model

In [None]:
# Model, metric and cross-validation utilities from scikit-learn.
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): despite the `rmse` alias, mean_squared_error returns the plain
# (non-rooted) MSE unless configured otherwise — confirm before reporting it as RMSE.
from sklearn.metrics import mean_squared_error as rmse
from sklearn.model_selection import cross_val_score

In [None]:
# Split the encoded training frame into features and target.
# The 'id' column is not a feature, so it is removed from both feature matrices;
# the test ids are kept aside for the submission file.
non_feature_columns = ['target', 'id']
X_train = train_encoded.drop(columns=non_feature_columns)
y_train = train_encoded['target']
test_id = test_encoded['id']
X_test = test_encoded.drop(columns='id')

In [None]:
# Fit a small random forest regressor (10 trees, otherwise default settings) to the
# train data and predict on the test data.
# random_state is fixed so that repeated runs of the notebook build the same forest
# and therefore produce the same predictions / submission file.
rfreg = RandomForestRegressor(n_estimators=10, random_state=42)
rfreg.fit(X_train, y_train)
y_pred = rfreg.predict(X_test)

In [None]:
# Assemble the submission table (test id + predicted target) and write it to
# 'submission.csv' without the DataFrame index.
submission = test_id.to_frame(name='id').assign(target=y_pred)
submission.to_csv('submission.csv', index=False)