## Random Forest Regression

+ Tuning Hyperparameters using Grid Search

In [None]:
# importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the training features. `data` is the frame returned by read_csv and
# `df` is a DataFrame built from it; `df` is the name the analysis below uses.
data = pd.read_csv("../input/amsterdam-airbnb/train.csv")
df = pd.DataFrame(data)
df.shape

In [None]:
# Preview the first rows of the training features
df.head()

In [None]:
col = df.columns  # column labels of the training frame, reused by the NaN-percentage loops

In [None]:
# Print, for every training column, the percentage of its values that are NaN

for i in col:
  nan_pct = df[i].isna().mean() * 100
  print(i, "\t-\t", nan_pct)
  

> Since no column has a significant share of missing values, there is no need to drop any column here.

> Now start analysis with numerical data. Main objective is to determine the columns fit for predictions by checking their skewness.


In [None]:
# Load the training target. The file is read without a header row, so the
# column label is assigned here.
y_train = pd.read_csv("../input/amsterdam-airbnb/y_train.csv", header=None)
y_train.columns = ["price"]
y_train.head()

In [None]:
# Correlation matrix of the features together with the target price.
# Numeric/bool columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently inside DataFrame.corr() and raises instead.
train_with_price = pd.concat([df, y_train], axis=1, sort=False)
cormap = train_with_price.select_dtypes(include=["number", "bool"]).corr()

# Draw on the explicitly created axes so the figure size is guaranteed to apply;
# the trailing ';' suppresses the Axes repr in the cell output.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(cormap, cmap="YlGnBu", annot=True, ax=ax);

In [None]:
# Helper that picks out the entries most strongly correlated with a target

def get_corelated_col(cor_dat, threshold):
  """Return the correlations whose absolute value exceeds ``threshold``.

  Parameters
  ----------
  cor_dat : pd.Series
      Correlation values indexed by feature name, e.g. one column of a
      correlation matrix.
  threshold : float
      An entry is kept only when ``abs(value) > threshold`` (strict).

  Returns
  -------
  pd.DataFrame
      One row per retained feature, in the order they appear in ``cor_dat``,
      with a single column named ``'corr value'``.
  """
  # Vectorised mask instead of a per-label Python loop. NaN correlations
  # compare as False and are therefore dropped, as before.
  strong = cor_dat.abs() > threshold
  return cor_dat[strong].to_frame(name='corr value')


In [None]:
# Keep the entries of the "price" correlation column whose absolute value is above 0.30
top_corelated_values = get_corelated_col(cormap["price"], 0.30)
top_corelated_values

In [None]:
# Feature matrix: the strongly correlated columns, minus the last index entry
# (presumably the target "price" itself, since it was concatenated last).
feature_cols = top_corelated_values.index[:-1]
final_df = df[feature_cols]
final_df.head()

In [None]:
# Pairwise relationships between the selected features
sns.pairplot(final_df)
plt.tight_layout()

In [None]:
# Hold out 10% of the training rows for evaluating the models.
# NOTE: this rebinds y_train to the training part of the split, so the cell is
# not idempotent - re-run the cell that loads y_train before re-running this one.

from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every score computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    final_df, y_train, test_size=0.1, random_state=42
)

In [None]:
# Baseline: a random forest with default hyperparameters. It is inspected with
# get_params() and later compared against the grid-searched model.

from sklearn.ensemble import RandomForestRegressor

# No tuning arguments are passed (in particular no ccp_alpha pruning), so the
# forest really uses the library defaults; random_state only fixes the seed.
rfr = RandomForestRegressor(random_state=42)
# y_train is a single-column frame; ravel it to the 1-D target scikit-learn
# expects, which avoids a DataConversionWarning.
rfr.fit(X_train, np.ravel(y_train))

In [None]:
# Show the hyperparameters the baseline forest is using
rfr.get_params()

In [None]:
# Build the grid of hyperparameter values to search over
# (5 * 2 * 5 * 5 * 2 = 500 combinations).

from sklearn.model_selection import GridSearchCV

n_estimators = [100, 150, 200, 250, 300]
# 1.0 means "consider all features at each split". That is what 'auto' meant
# for regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it fails parameter validation.
max_features = [1.0, 'sqrt']
max_depth = [30, 35, 40, 45, 50]
min_samples_leaf = [12, 14, 16, 18, 20]
bootstrap = [True, False]


grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}


In [None]:
# Exhaustive grid search with 5-fold cross-validation.
# random_state makes every candidate forest reproducible, so score differences
# come from the hyperparameters rather than from random seeding.
new_rfr = RandomForestRegressor(random_state=42)

# Every grid combination is fitted once per fold; n_jobs=-1 spreads those
# independent fits over all available cores.
rfr_grid = GridSearchCV(estimator=new_rfr, param_grid=grid, cv=5, verbose=2, n_jobs=-1)

# Ravel the single-column target to 1-D to avoid a DataConversionWarning on every fit.
rfr_grid.fit(X_train, np.ravel(y_train))

In [None]:
# Hyperparameter combination selected by the grid search
rfr_grid.best_params_

> To determine if Grid Search yielded a better model, we compare the base model with the best grid search model.

In [None]:
# Predictions of the base model on the held-out split

y_pred_1 = rfr.predict(X_test)

y_pred_1[:5]

In [None]:
# Evaluate the base model's predictions against the held-out targets

from sklearn import metrics

mae_1 = metrics.mean_absolute_error(y_test, y_pred_1)
mse_1 = metrics.mean_squared_error(y_test, y_pred_1)  # computed once, reused for the RMSE
r2_1 = metrics.r2_score(y_test, y_pred_1)

print('Mean Absolute Error:', mae_1)
print('Mean Squared Error:', mse_1)
print('Root Mean Squared Error:', np.sqrt(mse_1))
print('R2 Value:', r2_1)

In [None]:
# Predictions of the best grid-search model on the held-out split

y_pred_2 = rfr_grid.best_estimator_.predict(X_test)

y_pred_2[:5]

In [None]:
# Evaluate the grid-search model's predictions against the held-out targets

from sklearn import metrics

mae_2 = metrics.mean_absolute_error(y_test, y_pred_2)
mse_2 = metrics.mean_squared_error(y_test, y_pred_2)  # computed once, reused for the RMSE
r2_2 = metrics.r2_score(y_test, y_pred_2)

print('Mean Absolute Error:', mae_2)
print('Mean Squared Error:', mse_2)
print('Root Mean Squared Error:', np.sqrt(mse_2))
print('R2 Value:', r2_2)

> Since the R2 score rises, Grid Search helped in tuning the hyperparameters of the Random Forest and made it more accurate (though the improvement is very small here).

***

> Now lets predict prices for test dataset

In [None]:
# Load the test features. test_df is built from test_data (the test file just
# read), not from the training frame `data`.
test_data = pd.read_csv("../input/amsterdam-airbnb/test.csv")
test_df = pd.DataFrame(test_data)
test_df.shape

In [None]:
# Preview the first rows of the test features
test_df.head()

In [None]:
# Print, for every column name in `col`, the percentage of NaN values in the test frame

for i in col:
  nan_pct = test_df[i].isna().mean() * 100
  print(i, "\t-\t", nan_pct)
  

In [None]:
# Load the test-set prices from y_test.csv. The frame is kept as read from the
# file rather than being rebuilt from y_train.
# NOTE: this rebinds y_test, which until here held the target of the 10% hold-out split.
y_test = pd.read_csv("../input/amsterdam-airbnb/y_test.csv", header=None)
y_test.columns = ["price"]
y_test.head()

In [None]:
# Select from the test frame the same correlated columns, again dropping the
# last index entry of top_corelated_values.
test_feature_cols = top_corelated_values.index[:-1]
final_test_df = test_df[test_feature_cols]
final_test_df.head()

In [None]:
# Predictions of the best grid-search model for the test dataset

y_pred = rfr_grid.best_estimator_.predict(final_test_df)

y_pred[:5]