# Random Forest Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
df = pd.read_csv('../datasets/transformed_ebay_data.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127778 entries, 0 to 127777
Data columns (total 61 columns):
 #   Column                                                  Non-Null Count   Dtype  
---  ------                                                  --------------   -----  
 0   Unnamed: 0                                              127778 non-null  int64  
 1   num__Price                                              127778 non-null  float64
 2   num__Feedback Score                                     127778 non-null  float64
 3   cat__Listing Type_Auction                               127778 non-null  float64
 4   cat__Listing Type_AuctionWithBIN                        127778 non-null  float64
 5   cat__Listing Type_FixedPrice                            127778 non-null  float64
 6   cat__Listing Type_StoreInventory                        127778 non-null  float64
 7   cat__Shipping Type_Calculated                           127778 non-null  float64
 8   cat__Shipping Type_Calcu

In [None]:
# Drop unnecessary columns 
exclude_indices = list(range(38, 55)) + list(range(56, 60))
df = df.drop(df.columns[exclude_indices], axis=1)
print(df.info())

# Export the DataFrame to a CSV file
df.to_csv('transformed_ebay_data_cleaned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127778 entries, 0 to 127777
Data columns (total 40 columns):
 #   Column                                                  Non-Null Count   Dtype  
---  ------                                                  --------------   -----  
 0   Unnamed: 0                                              127778 non-null  int64  
 1   num__Price                                              127778 non-null  float64
 2   num__Feedback Score                                     127778 non-null  float64
 3   cat__Listing Type_Auction                               127778 non-null  float64
 4   cat__Listing Type_AuctionWithBIN                        127778 non-null  float64
 5   cat__Listing Type_FixedPrice                            127778 non-null  float64
 6   cat__Listing Type_StoreInventory                        127778 non-null  float64
 7   cat__Shipping Type_Calculated                           127778 non-null  float64
 8   cat__Shipping Type_Calcu

In [4]:
# Prepare the independent and dependent variables
X = df.iloc[:, :-2].values
y = df.iloc[:, -2].values

In [5]:
print(X)

[[ 0.00000000e+00 -7.29313003e-02 -3.02147387e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 1.00000000e+00 -6.40655606e-02 -3.02490671e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.00000000e+00 -6.48018158e-02 -3.03911520e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 1.27775000e+05 -4.51376666e-02 -2.24890895e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 1.27776000e+05 -4.49536028e-02 -2.24890895e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 1.27777000e+05 -4.50456347e-02 -2.24890895e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [6]:
print(y)

[ 7.         61.04166667 29.74976852 ... 91.04166667 91.04166667
 91.04166667]


## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Random Forest Regression model on the whole dataset

In [8]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [9]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 34.12  31.04]
 [967.44 122.04]
 [783.8   30.  ]
 ...
 [ 61.21  61.04]
 [202.39  61.04]
 [699.15 458.04]]


## Evaluating the Model Performance

In [10]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5292616459116259

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters grid for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [6, 8, 10, None],     # Maximum depth of trees
    'min_samples_split': [2, 5, 10],   # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],     # Minimum samples to be at a leaf node
    'bootstrap': [True, False]         # Whether to use bootstrap samples
}

# Initialize GridSearchCV with RandomForestRegressor
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,         # 5-fold cross-validation
                           n_jobs=-1,    # Use all processors for parallel computation
                           verbose=2)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
