# Imports and load data

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

%matplotlib inline

In [None]:
# Directory holding the competition CSV files (note the trailing slash).
DATA_PATH = '/kaggle/input/tabular-playground-series-feb-2021/'
# Load the training set; DATA_PATH already ends in '/', so the joined path is
# the same string as DATA_PATH + 'train.csv'.
train_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))

In [None]:
# Preview the first ten rows of the training data.
train_df.head(10)

# Exploring the data

In [None]:
# Summary statistics of the training frame.
train_df.describe()

In [None]:
# List the column names of the training frame.
train_df.columns

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Frequency of each level of the first categorical feature.
train_df['cat0'].value_counts()

In [None]:
# Summary statistics of the regression target.
train_df['target'].describe()

In [None]:
# Histogram of the target values.
train_df['target'].hist()

In [None]:
# Line plot of the target against the row index.
train_df['target'].plot()

In [None]:
# Display the training frame itself.
train_df

In [None]:
# One bar chart per categorical column showing how often each level occurs.
cat_cols = [f'cat{i}' for i in range(10)]
for col in cat_cols:
    plt.figure(figsize=(8, 4))
    level_counts = train_df[col].value_counts()
    level_counts.plot(kind='bar', color='Red', stacked=True)
    plt.title(col)
    plt.grid()
    plt.show()

### Correlation Matrix

In [None]:
import plotly.express as px

# Column-name lists used here and by later cells.
cont_features = [f'cont{i}' for i in range(14)]
cat_features = [f'cat{i}' for i in range(10)]
all_features = cont_features + cat_features

# Pearson correlation is only defined for numeric columns. The `cat*` columns
# hold string labels at this point, so including them raises a ValueError in
# pandas >= 2.0 (older versions silently dropped them). Restricting the frame
# to the continuous features plus the target gives the same matrix on every
# pandas version.
corr = train_df[cont_features + ['target']].corr()
fig = px.imshow(corr)
fig.show()

Awkwardly enough, there doesn't appear to be much correlation between the dependent variable (`target`) and any of the independent variables.

This might be why the baseline model I trained last time didn't achieve a very good score (an RMSE of about 86).

### Countplot to see distribution of features

In [None]:
import seaborn as sns

# Draw a 2x5 grid of count plots, one panel per categorical feature.
plt.style.use("ggplot")
plt.figure(figsize=(25,20))
for i, feature in enumerate(cat_features):
    plt.subplot(2, 5, i + 1)
    # Pass the column as `x=` explicitly: since seaborn 0.12 the first
    # positional argument of countplot is `data`, not `x`, so the positional
    # form no longer plots the level counts of the column.
    sns.countplot(x=train_df[feature])

In [None]:
# Categorical columns removed before modelling.
features = ['cat4','cat0', 'cat2', 'cat6', 'cat7']
# Drop them in one call. errors='ignore' keeps the cell idempotent: the
# original per-column in-place drop raised KeyError when the cell was run a
# second time, because the columns were already gone.
train_df = train_df.drop(columns=features, errors='ignore')

In [None]:
# Preview the first rows of the training frame at this point in the notebook.
train_df.head()

### Some nice scatterplots of highly correlated variables

# Training a baseline RandomForestRegressor

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# Separate the predictors from the regression target.
X = train_df.drop(columns=['target'])
y = train_df['target']


In [None]:
# Replace each remaining categorical column with its integer category codes.
cat_cols = ['cat1','cat3', 'cat5', 'cat8', 'cat9']

for col in cat_cols:
    as_category = X[col].astype('category')
    X[col] = as_category.cat.codes

In [None]:

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shapes of the four pieces produced by the train/validation split.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: a random forest with library defaults, using all cores (n_jobs=-1)
# and a fixed seed.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

In [None]:
# Fit the baseline forest on the training split.
model.fit(X_train, y_train)

In [None]:
# Built-in score of the fitted model on the hold-out split
# (for scikit-learn regressors this is R^2, not an error metric).
model.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Predict on the validation split (the following cell repeats this call
# before computing the error metrics).
y_preds = model.predict(X_valid)

In [None]:
# Validation error of the baseline model: MSE and its square root (RMSE).
y_preds = model.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=y_preds)
rmse = np.sqrt(mse)

In [None]:
# Show the baseline validation MSE and RMSE computed in the previous cell.
mse, rmse

## Tuning the Hyperparameters

In [None]:
# First look at the parameters used for the baseline Random Forest:
# get_params() returns the estimator's settings; pprint prints them readably.
from pprint import pprint 
pprint(model.get_params())

In [None]:

# 'rmse' is not a valid RandomForestRegressor split criterion, so the original
# criterion='rmse' would raise as soon as this model was fitted. The default
# squared-error criterion already picks the splits that minimise (R)MSE, so
# the argument is simply omitted.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

### Randomized search CV

In [None]:
# Size of the training split before it is subsampled in the next cell.
X_train.shape, y_train.shape

In [None]:
# Keep only the first 2400 training rows (positional slice) -- presumably to
# keep the randomized search below tractable; TODO confirm the intended size.
X_train = X_train.iloc[:2400]
y_train = y_train.iloc[:2400]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' (which for regressors meant "use all features") was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes every candidate that
# samples it fail. The fraction 1.0 is the equivalent setting and is accepted
# by old and new versions alike.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. It is seeded so that every candidate
# forest is reproducible: without it the search draws the same parameter
# combinations each run (its own random_state) but scores them with
# differently-seeded forests, so best_params_ can change between runs.
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
# Reload the full training set and remove the categorical columns that are
# excluded from modelling.
features = ['cat4','cat0', 'cat2', 'cat6', 'cat7']
df = pd.read_csv(DATA_PATH + 'train.csv').drop(columns=features)



In [None]:
# Split the reloaded frame into target and predictors.
y = df['target']
X = df.drop(columns='target')
# Swap each remaining categorical column for its integer category codes.
cat_cols = ['cat1','cat3', 'cat5', 'cat8', 'cat9']
encoded_columns = {col: X[col].astype('category').cat.codes for col in cat_cols}
X = X.assign(**encoded_columns)

In [None]:


# Tuned hyperparameters, hard-coded here (presumably copied from
# rf_random.best_params_ of an earlier run -- TODO confirm).
best_params = {
    'n_estimators': 600,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 10,
    'bootstrap': True,
}
rf_best = RandomForestRegressor(n_jobs=-1, random_state=42, **best_params)


In [None]:
# Recreate the 80/20 train/validation split on the full reloaded data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit the tuned forest on the training split.
rf_best.fit(X_train, y_train)

In [None]:
# Built-in score of the tuned model on the hold-out split
# (for scikit-learn regressors this is R^2, not an error metric).
rf_best.score(X_valid, y_valid)

In [None]:
# Recompute the error metrics for the tuned model. `mse` and `rmse` were last
# assigned from the baseline model's predictions, so displaying them without
# recomputing would report the baseline's error as if it belonged to rf_best.
y_preds = rf_best.predict(X_valid)
mse = mean_squared_error(y_valid, y_preds)
rmse = np.sqrt(mse)
mse, rmse

# Evaluating baseline model

In [None]:
# Absolute training-set sizes at which the learning curve is evaluated.
train_sizes = [1, 375, 750, 1500, 3000, 6000, 12000, 24000]


from sklearn.model_selection import learning_curve

# 5-fold learning curve for a forest with the tuned hyperparameters. Scores
# are negated MSE (higher is better). shuffle=True now comes with a fixed
# random_state, so the subsets drawn for each training size -- and therefore
# the curve -- are the same on every run.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=RandomForestRegressor(
        n_jobs=-1,
        n_estimators=600,
        min_samples_split=5,
        min_samples_leaf=1,
        max_features='sqrt',
        max_depth=10,
        bootstrap=True,
        random_state=42,
    ),
    X=X,
    y=y,
    train_sizes=train_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
    shuffle=True,
    random_state=42,
)

In [None]:
# Dump the raw per-fold scores (one row per training size, one column per fold).
separator = '-' * 70  # visual divider to make the output easy to read
print('Training scores:\n\n', train_scores)
print('\n', separator)
print('\nValidation scores:\n\n', validation_scores)

In [None]:
# The scores are negated MSE, so flip the sign and average over the folds to
# get one mean MSE per training-set size.
train_scores_mean = (-train_scores).mean(axis=1)
validation_scores_mean = (-validation_scores).mean(axis=1)

mean_train_by_size = pd.Series(train_scores_mean, index=train_sizes)
mean_validation_by_size = pd.Series(validation_scores_mean, index=train_sizes)

print('Mean training scores\n\n', mean_train_by_size)
print('\n', '-' * 20)  # separator
print('\nMean validation scores\n\n', mean_validation_by_size)

## Plotting the learning curves 

In [None]:
import matplotlib.pyplot as plt

# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and
# the old name was later removed, so use whichever name this install provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Mean training and validation MSE against the number of training rows.
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
# The curves come from a RandomForestRegressor, not a linear regression.
plt.title('Learning curves for a random forest regressor', fontsize = 18, y = 1.03)
plt.legend()
plt.ylim(0, 2)

# Prepare the submission 

In [None]:
# Load the competition test set.
test = pd.read_csv(DATA_PATH + 'test.csv')

In [None]:
# Read in the sample submission, using its 'id' column as the index,
# and preview the first rows.
submission = pd.read_csv(DATA_PATH + 'sample_submission.csv', index_col='id')
submission.head()

In [None]:
# reload data
df = pd.read_csv(DATA_PATH + 'train.csv')
X = df.drop('target', axis=1)
y = df['target']
# prepare categorical variables by encoding them
cat_cols = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9']

for col in cat_cols: 
    X[col] = X[col].astype('category')
    
for col in cat_cols: 
    X[col] = X[col].cat.codes

In [None]:

# prepare categorical variables by encoding them for the test set
cat_cols = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9']

# Encode the test set with the category list of the *training* data (`df` is
# the raw reloaded train frame). Encoding test on its own assigns codes by the
# sorted levels present in test only, so whenever the two files do not contain
# exactly the same levels the same label maps to different integers in train
# and test. Levels never seen in training become -1.
for col in cat_cols:
    if pd.api.types.is_numeric_dtype(test[col]):
        continue  # already encoded by an earlier run of this cell
    train_categories = df[col].astype('category').cat.categories
    test[col] = pd.Categorical(test[col], categories=train_categories).codes

test.head()

In [None]:
# Build a fresh, unfitted forest with the tuned hyperparameters instead of
# aliasing rf_best: `model = rf_best` followed by fit() silently refits the
# already-validated estimator on a different feature set.
model = RandomForestRegressor(**rf_best.get_params())
model.fit(X, y)
# Select the test columns in the training order so the features line up with
# what the model was fitted on.
preds = model.predict(test[X.columns])
# Submission frame: one prediction per test id.
submission = pd.DataFrame(
    {'id' : test['id'],
     'target': preds
    })

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('random_forest.csv', index=False)

In [None]:
# Read the written file back and display it.
pd.read_csv('./random_forest.csv')