## Simple implementation of a Random Forest for House Price Prediction with RandomizedSearchCV to tune hyperparameters
- In this notebook I try to stick to a simple and direct implementation and tuning of a model

In [None]:
import numpy as np 
import pandas as pd 


import os
# List the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Dataset
- In this implementation I used a cleaned dataset published for the house price prediction competition.
- Link: https://www.kaggle.com/chandramoulinaidu/house-price-prediction-cleaned-dataset

In [None]:
# Show every column and every row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Use the fully-qualified option name: the bare "max_rows" pattern matches more
# than one option in recent pandas releases (e.g. styler.render.max_rows) and
# raises OptionError.
pd.set_option('display.max_rows', None)

# Load the cleaned train and test sets from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/house-price-prediction-cleaned-dataset/Cleaned train.csv')
df_test = pd.read_csv('/kaggle/input/house-price-prediction-cleaned-dataset/Cleaned test.csv')

In [None]:
# Preview the first rows of the training data.
df_train.head()

## Separate the data into features, target and test set

In [None]:
# Name of the target column in the training data.
# NOTE(review): spelled 'Saleprice' here but 'SalePrice' in the submission
# frame — confirm against the CSV header.
target_col = 'Saleprice'

# Features are every training column except the target; the target is kept apart.
X = df_train.drop(columns=[target_col]).copy()
y = df_train[target_col].copy()

# Keep the test ids aside and work on a copy of the test frame.
test_id = df_test['Id']
X_test = df_test.copy()

X.head()

In [None]:
# Preview the first values of the target.
y.head()

In [None]:
# Preview the first rows of the test features.
X_test.head()

## Data encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode every categorical (object-dtype) column as integer codes.
#
# One encoder per column is fitted on the union of the train and test values,
# so a given category maps to the same integer in both frames. Fitting the two
# frames independently can assign different codes to the same category
# whenever their sets of categories differ, which silently corrupts the test
# predictions.
for col in X.columns:
    if X[col].dtype != 'object':
        continue

    label = LabelEncoder()
    # Cast to str so mixed or missing values sort consistently inside the encoder.
    train_values = X[col].astype(str)

    if col in X_test.columns:
        test_values = X_test[col].astype(str)
        label.fit(pd.concat([train_values, test_values]))
        X_test[col] = label.transform(test_values)
    else:
        label.fit(train_values)

    X[col] = label.transform(train_values)

# Object columns that exist only in the test frame have no train counterpart
# to align with, so they are encoded on their own (as before).
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = LabelEncoder().fit_transform(X_test[col].astype(str))

X.head()

## Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation. The split is seeded so the
# validation score and the tuned hyperparameters are reproducible between runs,
# like the other seeded steps of this notebook.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

## Modeling

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score 

# Baseline: a shallow forest (max depth 2) with a fixed seed.
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

# Predictions on the held-out validation split; these are what the reported
# error is computed on (not the training data).
y_pred_valid = rf.predict(X_valid)
# Baseline predictions for the test set.
y_pred_test = rf.predict(X_test)
print('Mean squared error on validation dataset:', mean_squared_error(y_valid, y_pred_valid))

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor; the
# 'auto' string itself was deprecated and later removed from scikit-learn, so
# it makes the search fail on current versions.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)

In [None]:
# Randomized search over `random_grid`: 100 sampled candidates, each scored
# with 3-fold cross-validation on negative mean squared error.
search_options = {
    'estimator': rf,
    'param_distributions': random_grid,
    'n_iter': 100,
    'cv': 3,
    'scoring': 'neg_mean_squared_error',
    'verbose': 0,
    'random_state': 42,
    'n_jobs': -1,
}
rf_random = RandomizedSearchCV(**search_options)
# Run the search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the randomized search.
best_params = rf_random.best_params_
print(best_params)

In [None]:
# Refit a forest with the best hyperparameters found by the search.
best_rf = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)

# Evaluate the tuned model on the held-out validation split so it can be
# compared with the baseline error.
y_pred_valid = best_rf.predict(X_valid)
print('Mean squared error on validation dataset (tuned model):', mean_squared_error(y_valid, y_pred_valid))

# Final predictions for the test set.
y_pred_test = best_rf.predict(X_test)

In [None]:
# Build the submission frame from the saved test ids and the predictions.
# X_test is deliberately left untouched: adding a 'SalePrice' column to it
# would change the feature set and break any later call to predict(X_test).
submit = pd.DataFrame({'Id': test_id, 'SalePrice': y_pred_test})
submit.head()

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('submission.csv',index=False)
