# Import

In [None]:
# Pandas
import pandas as pd
# Numpy
import numpy as np
# Visualisation
import plotly.express as px

In [None]:
# Load the competition tables; the `id` column becomes the row index of each frame.
Train = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/train.csv',
    index_col='id',
)
Test = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/test.csv',
    index_col='id',
)

# Analytics

## Visualise sample data

In [None]:
# Rank columns by the absolute value of their correlation with `loss`.
# After the descending sort, row 0 is expected to be `loss` itself
# (self-correlation of 1), so the slice keeps the nine rows that follow it.
highest_corr = (
    Train.corr()[['loss']]
    .abs()
    .sort_values('loss', ascending=False)
    .iloc[1:10]
)

In [None]:
# Keep only the most loss-correlated columns plus the target itself.
virtualise_data = Train[list(highest_corr.index) + ['loss']]

# Parallel-coordinates plot of a reproducible 15,000-row sample, coloured by `loss`.
fig = px.parallel_coordinates(
    virtualise_data.sample(15000, random_state=125),
    color="loss",
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=2,
    # The colour range is taken from the full frame rather than the sample.
    # Series.min()/max() are vectorised (and skip NaN), unlike the builtin
    # min()/max(), which walk the column element by element in Python.
    range_color=[
        virtualise_data['loss'].min(),
        virtualise_data['loss'].max(),
    ],
)
fig.show()

## Normalise

In [None]:
from sklearn.preprocessing import PowerTransformer

# Separate the target from the predictors; `y_train` stays a one-column frame.
y_train = Train[['loss']]
X_train = Train.drop(columns='loss')

# Fit the power transform on the training predictors, then apply it.
# The fitted `pt` is kept so the test set can be transformed the same way.
pt = PowerTransformer().fit(X_train)
X_train = pt.transform(X_train)

## Feature selection

In [None]:
# Re-attach the target to the transformed features.
# `pd.DataFrame(X_train)` may carry a fresh 0..n-1 row index (e.g. when
# `X_train` is the transformer output), while `y_train` keeps the `id` index
# from the CSV. `pd.concat(axis=1)` aligns on index labels, so the feature
# rows are relabelled positionally with `y_train.index` first; otherwise the
# pairing is only correct when the two indexes happen to coincide.
normalise_data = pd.concat(
    [pd.DataFrame(X_train).set_index(y_train.index), y_train],
    axis=1,
)
# Rank columns by absolute correlation with `loss`; row 0 after the sort is
# expected to be `loss` itself, so keep the nine rows that follow it.
highest_corr = (
    normalise_data.corr()[['loss']]
    .abs()
    .sort_values('loss', ascending=False)
    .iloc[1:10]
)
highest_corr

In [None]:
# Keep only the most loss-correlated transformed columns plus the target.
virtualise_data = normalise_data[list(highest_corr.index) + ['loss']]

# Parallel-coordinates plot of a reproducible 15,000-row sample, coloured by `loss`.
fig = px.parallel_coordinates(
    virtualise_data.sample(15000, random_state=1254),
    color="loss",
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=2,
    # The colour range is taken from the full frame rather than the sample.
    # Series.min()/max() are vectorised (and skip NaN), unlike the builtin
    # min()/max(), which walk the column element by element in Python.
    range_color=[
        virtualise_data['loss'].min(),
        virtualise_data['loss'].max(),
    ],
)
fig.show()

# Set up training process

## Split data

### Set up KFold

In [None]:
from sklearn.model_selection import KFold

# Config variable: number of cross-validation folds.
N_FOLD = 20

kf = KFold(n_splits=N_FOLD)
kf.get_n_splits(X_train)

print(kf)

# Wrap the transformed features in a DataFrame so rows can be selected by fold.
X_train = pd.DataFrame(X_train)

# Walk through the folds once to inspect the split.
for train_index, test_index in kf.split(X_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    # KFold yields integer *positions*, so both frames are sliced with
    # `.iloc`. Label-based `.loc` on the features only worked while the
    # frame's index happened to be 0..n-1.
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

### Make it class-based

In [None]:
from copy import deepcopy

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

def RMSE(prediction, actual):
    """Return the root-mean-squared error between predictions and targets.

    The square root is taken explicitly instead of passing
    ``squared=False`` to ``mean_squared_error``: that keyword was deprecated
    and later removed from scikit-learn, whereas this form does not depend
    on it.

    Args:
        prediction: Predicted values.
        actual: Ground-truth values, same length as ``prediction``.

    Returns:
        The square root of the mean squared error.
    """
    # scikit-learn's signature is (y_true, y_pred); the squared error is
    # symmetric in its two inputs, so the argument order does not change
    # the result.
    return np.sqrt(mean_squared_error(actual, prediction))

# Select features
# `pd.concat(axis=1)` aligns rows on index labels. The feature frame and
# `y_train` (indexed by `id`) may be labelled differently, so the features are
# relabelled positionally with `y_train.index` before joining; column labels
# are left untouched so they still match the columns of `X_train`.
normalise_data = pd.concat(
    [pd.DataFrame(X_train).set_index(y_train.index), y_train],
    axis=1,
)
# Rank columns by absolute correlation with `loss`; row 0 after the sort is
# expected to be `loss` itself, so keep the ten rows that follow it.
highest_corr = (
    normalise_data.corr()[['loss']]
    .abs()
    .sort_values('loss', ascending=False)
    .iloc[1:11]
)
selected_features = list(highest_corr.index)

class Trainer:
    """Fit one copy of a model per K-fold split and average their predictions.

    ``train_results`` maps ``'Fold {k}'`` to a dict holding the fitted
    ``model`` copy plus its ``train_score`` and ``test_score`` dicts.
    """

    def __init__(self, n_fold, model, X_train, y_train, features=None):
        """Store the data, the model and the fold splitter.

        Args:
            n_fold: Number of folds passed to ``KFold``.
            model: Estimator exposing ``fit(X, y, **kwargs)`` and
                ``predict(X)``; it is refitted on every fold.
            X_train: Feature DataFrame.
            y_train: Targets (DataFrame or Series) whose rows are in the
                same order as ``X_train``.
            features: Optional list of column labels fed to the model. When
                omitted, the module-level ``selected_features`` is looked up
                each time it is needed.
        """
        self.N_FOLD = n_fold
        self.kf = KFold(n_fold)
        self.X_train = X_train
        self.y_train = y_train
        self.model = model
        self.features = features
        self.train_results = {}

    def _feature_columns(self):
        """Return the column labels the model is trained and scored on."""
        if self.features is not None:
            return list(self.features)
        # Backward-compatible fallback to the notebook-level selection.
        return selected_features

    @staticmethod
    def evaluate(prediction, actual, metrics: list):
        """Score predictions with every metric.

        Args:
            prediction: Predicted values.
            actual: Ground-truth values.
            metrics: Callables invoked as ``metric(prediction, actual)``.

        Returns:
            Dict mapping each metric's ``__name__`` to its score.
        """
        return {metric.__name__: metric(prediction, actual) for metric in metrics}

    def train(self, model_fit_args: dict = None, metrics=None, sample_size=10000):
        """Fit the model on a sample of each training fold and record scores.

        Args:
            model_fit_args: Extra keyword arguments forwarded to
                ``model.fit``. Defaults to none.
            metrics: Callables passed to ``evaluate``. Defaults to none.
            sample_size: Maximum number of training rows drawn per fold;
                capped at the fold size so small folds do not raise.

        Side effects:
            Fills ``self.train_results`` and prints each fold's entry.
        """
        # ``None`` defaults avoid sharing one mutable object across calls.
        fit_kwargs = {} if model_fit_args is None else model_fit_args
        metrics = [] if metrics is None else metrics
        features = self._feature_columns()

        for k, (train_index, test_index) in enumerate(self.kf.split(self.X_train)):
            X_train_fold, X_test_fold = self.X_train.iloc[train_index], self.X_train.iloc[test_index]
            y_train_fold, y_test_fold = self.y_train.iloc[train_index], self.y_train.iloc[test_index]

            # Draw the sample as row *positions* and apply them to both
            # frames, so features and targets stay paired even when their
            # index labels differ.
            n_rows = min(sample_size, len(X_train_fold))
            sample_positions = (
                pd.Series(np.arange(len(X_train_fold)))
                .sample(n_rows, random_state=1234)
                .to_numpy()
            )
            X_train_fold_sample = X_train_fold.iloc[sample_positions][features]
            y_train_fold_sample = y_train_fold.iloc[sample_positions]

            # Use the model handed to this trainer, not a same-named global.
            self.model.fit(X_train_fold_sample, y_train_fold_sample, **fit_kwargs)

            train_prediction = self.model.predict(X_train_fold_sample)
            train_score = self.evaluate(train_prediction, y_train_fold_sample, metrics)

            test_prediction = self.model.predict(X_test_fold[features])
            test_score = self.evaluate(test_prediction, y_test_fold, metrics)

            fold_key = f'Fold {k}'
            # Snapshot the fitted model: the next fold refits the same object.
            self.train_results[fold_key] = {
                'model': deepcopy(self.model),
                'train_score': train_score,
                'test_score': test_score,
            }
            print(fold_key)
            print(self.train_results[fold_key])
            print('-' * 36)

    def predict(self, X):
        """Average the predictions of every stored fold model.

        Args:
            X: DataFrame containing at least the feature columns.

        Returns:
            Series of per-row mean predictions, indexed 0..n-1 in the row
            order of ``X`` (the index of ``X`` is not carried over).

        Raises:
            RuntimeError: If ``train`` has not produced any fold models.
        """
        if not self.train_results:
            raise RuntimeError('No fitted fold models; call train() before predict().')

        X_selected = X[self._feature_columns()]
        predictions = pd.DataFrame()
        for k, fold in self.train_results.items():
            predictions[k] = fold['model'].predict(X_selected)
        return predictions.mean(axis=1)
        

## Make models

In [None]:
from sklearn.ensemble import RandomForestRegressor

# `criterion` is left at its default, which is the mean-squared-error split
# criterion. Spelling it as 'mse' is rejected by newer scikit-learn releases
# (the value was renamed to 'squared_error'), while the new spelling is
# unknown to older ones; relying on the default works on both.
model = RandomForestRegressor(
    max_depth=2,
    random_state=12,
    bootstrap=True,
)

# from sklearn.ensemble import HistGradientBoostingRegressor
# model = HistGradientBoostingRegressor()


# Train

In [None]:
# Cross-validate the model, reporting RMSE and MAE for every fold.
trainer = Trainer(
    n_fold=N_FOLD,
    model=model,
    X_train=X_train,
    y_train=y_train,
)
trainer.train(metrics=[RMSE, mean_absolute_error])

# Submission

In [None]:
# Transform the test features with the transformer fitted on the training
# data, then keep only the selected feature columns.
X_valid = pd.DataFrame(pt.transform(Test))[selected_features]

# Start from the sample submission (indexed by `id`) and overwrite `loss`
# with the averaged fold predictions. `.values` assigns by position, so the
# rows of the sample submission are presumed to be in the same order as
# `Test` -- TODO confirm.
submission = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    index_col='id',
)
submission['loss'] = trainer.predict(X_valid).values
submission.to_csv('submission.csv')