# Importing Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error

# Loading and Viewing Data

In [None]:
# Locations of the competition CSV files.
data_dir = '/kaggle/input/tabular-playground-series-jan-2021'
train_path, test_path, sam_sub_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Load the training set, the test set and the sample submission, in that order.
df, test_df, sam_sub_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sam_sub_path)
)

# remove target outliers from train: every row with target <= 5 is dropped
df = df.drop(index=df.index[df['target'] <= 5])

df.head()

In [None]:
# Feature matrix: every column of df except 'target' and 'id'.
data = df.drop(columns=['target', 'id']).to_numpy()
# Regression target as a plain array.
target = df['target'].to_numpy()

# 70/30 hold-out split taken in row order (shuffle=False, so no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    shuffle=False,
    random_state=17,
)

# Training Tools

In [None]:
def plot_feature_importance(tree_grid, n_cols=10):
    """Show a horizontal bar chart of the most important features.

    Importances come from ``tree_grid.best_estimator_.feature_importances_``
    and are paired with the column names of the module-level ``df`` (all
    columns except 'target' and 'id'). They are normalized to sum to one,
    sorted in descending order, and the first ``n_cols`` are plotted.

    Args:
        tree_grid: Fitted search object exposing ``best_estimator_``.
        n_cols: Number of top features to display.
    """
    feature_names = df.drop(columns=['target', 'id']).columns.tolist()
    importances = tree_grid.best_estimator_.feature_importances_

    f_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
    f_imp = f_imp.sort_values('importance', ascending=False).reset_index()
    f_imp['importance_normalized'] = f_imp['importance'] / f_imp['importance'].sum()

    # Top rows after sorting; their positions are reversed so the most
    # important feature gets the highest y coordinate.
    top = f_imp.head(n_cols)
    y_positions = top.index[::-1].tolist()

    ax = plt.subplot()
    ax.barh(y_positions, top['importance_normalized'], align='center', edgecolor='k')
    ax.set_yticks(y_positions)
    ax.set_yticklabels(top['feature'])
    plt.show()


def train_by_grid_search(train_set, test_set, model, params, cv, n_cols=10):
    """Tune ``model`` with ``GridSearchCV`` and return the best estimator.

    Candidates are scored with negative mean squared error, using all
    available cores (``n_jobs=-1``) and verbose progress output.

    Args:
        train_set: Passed to ``fit`` as the feature matrix ``X``.
        test_set: Passed to ``fit`` as ``y`` -- despite the name, this is the
            target vector belonging to ``train_set``.
        model: Estimator to tune.
        params: Parameter grid forwarded to ``GridSearchCV``.
        cv: Cross-validation strategy forwarded to ``GridSearchCV``.
        n_cols: Not used by this function; kept so existing calls still work.

    Returns:
        A tuple ``(best_estimator, search)`` where ``search`` is the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=-1,
        verbose=10,
    )
    search.fit(train_set, test_set)
    return search.best_estimator_, search


def print_info(model, greed):
    """Print the train/test RMSE of ``model`` and the search's best parameters.

    Scores are computed against the module-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` arrays.

    Args:
        model: Fitted estimator exposing ``predict``.
        greed: Fitted grid-search object exposing ``best_params_``.
    """
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Arguments follow the documented (y_true, y_pred) order.
    train_score = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    test_score = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    best_params = greed.best_params_
    print(f'Train Score = {train_score}')
    print(f'Test Score = {test_score}')
    print('Best Params:', best_params)

# Hyperparameter Optimization using GridSearchCV

In [None]:
# 5-fold cross-validation with seeded shuffling.
kf = KFold(n_splits=5, shuffle=True, random_state=17)

# `None` is a valid max_depth but not a valid n_estimators (it must be an
# integer), so it is left out of the n_estimators grid: every candidate that
# used it failed to fit and only produced warnings.
params = {
    'max_depth': [None, 2, 3, 5, 7, 8, 10],
    'n_estimators': [15, 50, 100, 125, 150, 200],
}

# `model` is the best estimator found, `greed` the fitted search object.
model, greed = train_by_grid_search(
    X_train, y_train,
    model=RandomForestRegressor(random_state=17), params=params, cv=kf)

In [None]:
# Report train/test RMSE of the tuned model together with the best grid parameters.
print_info(model=model, greed=greed)

In [None]:
# Plot the 14 highest-ranked features of the best estimator.
plot_feature_importance(greed, n_cols=14)

# Training Final Model

In [None]:
# Refit on the full training data with the best grid parameters. The seed used
# during the search is kept so the final model is reproducible; a
# `random_state` found by the search (if any) would take precedence.
final_params = {'random_state': 17, **greed.best_params_}
result_model = RandomForestRegressor(**final_params).fit(data, target)

In [None]:
# NOTE: result_model was fit on all of `data`, which includes the X_test rows,
# so the "Test Score" printed here is not a held-out estimate.
print_info(model=result_model, greed=greed)

# Make Prediction and Save Submission

In [None]:
# Feature matrix for the competition test set ('id' column removed).
test = test_df.drop('id', axis=1).values
submission = pd.DataFrame(data={'id': test_df['id'], 'target': result_model.predict(test)})

# Check that the ids line up with the sample submission row by row, and fail
# loudly instead of writing a misaligned file.
if not np.array_equal(sam_sub_df['id'].to_numpy(), submission['id'].to_numpy()):
    raise ValueError('submission ids do not match the sample submission ids')

# An empty output_dir writes into the current working directory.
output_dir = ''
submission.to_csv(os.path.join(output_dir, 'submission.csv'), index=False)