In [1]:
import gc
import time
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random Forest

In [2]:
train = pd.read_csv("./dataset/preprocess/train.csv")
test = pd.read_csv("./dataset/preprocess/test.csv")

## Feature selection

In [3]:
features = train.columns.tolist()
features.remove('card_id')
features.remove('target')
featureSelect = features[:]

corr = []
for fea in featureSelect:
    corr.append(abs(train[[fea, 'target']].fillna(0).corr().values[0][1]))

se = pd.Series(corr, index=featureSelect).sort_values(ascending=False)
feature_select = ['card_id'] + se[:300].index.tolist()

train = train[feature_select + ['target']]
test = test[feature_select]

## Grid Search

In [7]:
features = train.columns.tolist()
features.remove('card_id')
features.remove('target')

parameter_space = {
    "n_estimators": [79, 80, 81], 
    "min_samples_leaf": [29, 30, 31],
    "min_samples_split": [2, 3],
    "max_depth": [9, 10],
    "max_features": ["auto", 80]
}

clf = RandomForestRegressor(
    criterion="squared_error",
    n_jobs=15,
    random_state=22)

In [8]:
grid = GridSearchCV(clf, parameter_space, cv=2, scoring="neg_mean_squared_error")
grid.fit(train[features].values, train['target'].values)

72 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\hydon\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\hydon\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\hydon\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\hydon\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

In [9]:
grid.best_estimator_

In [10]:
np.sqrt(-grid.best_score_)

3.6900889856014247

In [11]:
grid.best_estimator_.predict(test[features])



array([-3.42895506, -1.05271922, -0.34647055, ...,  0.71331227,
       -2.40402906,  0.29249733])

In [13]:
test['target'] = grid.best_estimator_.predict(test[features])
test[['card_id', 'target']].to_csv("./result/submission_randomforest.csv", index=False)

