# Random Forest with hyperparameter tuning

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

In [3]:
train_data.head()

Unnamed: 0,date,campaign,adgroup,ad,impressions,clicks,cost,conversions,revenue
0,01-08-2020,campaign 1,adgroup 1,ad 1,24,6,0.08,0,0.0
1,01-08-2020,campaign 1,adgroup 2,ad 1,1,0,0.0,0,0.0
2,01-08-2020,campaign 1,adgroup 3,ad 1,13,4,0.04,0,0.0
3,01-08-2020,campaign 1,adgroup 4,ad 1,5,4,0.08,0,0.0
4,01-08-2020,campaign 1,adgroup 1,ad 2,247,126,1.29,4,925.71


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4571 entries, 0 to 4570
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         4571 non-null   object 
 1   campaign     4571 non-null   object 
 2   adgroup      4571 non-null   object 
 3   ad           4571 non-null   object 
 4   impressions  4571 non-null   int64  
 5   clicks       4571 non-null   int64  
 6   cost         4571 non-null   float64
 7   conversions  4571 non-null   int64  
 8   revenue      4571 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 321.5+ KB


In [5]:
train_data.drop('date', axis=1, inplace=True)
train_data.drop('campaign', axis=1, inplace=True)
test_data.drop('date', axis=1, inplace=True)
test_data.drop('campaign', axis=1, inplace=True)
train_data.drop('ad', axis=1, inplace=True)
test_data.drop('ad', axis=1, inplace=True)

In [6]:
test_data.head()

Unnamed: 0,adgroup,cost,impressions,clicks,conversions
0,adgroup 1,0.58,121,49,1
1,adgroup 3,0.17,22,12,0
2,adgroup 4,0.05,5,3,0
3,adgroup 2,0.01,2,1,0
4,adgroup 2,0.01,3,1,0


In [7]:
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)
train_data.head()

Unnamed: 0,impressions,clicks,cost,conversions,revenue,adgroup_adgroup 1,adgroup_adgroup 2,adgroup_adgroup 3,adgroup_adgroup 4
0,24,6,0.08,0,0.0,1,0,0,0
1,1,0,0.0,0,0.0,0,1,0,0
2,13,4,0.04,0,0.0,0,0,1,0
3,5,4,0.08,0,0.0,0,0,0,1
4,247,126,1.29,4,925.71,1,0,0,0


In [8]:
test_data.columns

Index(['cost', 'impressions', 'clicks', 'conversions', 'adgroup_adgroup 1',
       'adgroup_adgroup 2', 'adgroup_adgroup 3', 'adgroup_adgroup 4'],
      dtype='object')

In [9]:
X_train = train_data.drop(['revenue'], axis='columns')
y_train = train_data['revenue']
X_test = test_data

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.46415901,  0.77539778,  3.2172016 , ..., -0.43488609,
        -0.75944098, -0.4807078 ],
       [-0.4662263 , -0.21308039,  0.64060838, ..., -0.43488609,
         1.31675802, -0.4807078 ],
       [-0.46683135, -0.38281907,  0.01386949, ..., -0.43488609,
        -0.75944098,  2.08026581],
       ...,
       [-0.46637756, -0.18312651,  0.57097072, ..., -0.43488609,
        -0.75944098, -0.4807078 ],
       [-0.46678093, -0.34288056,  0.08350714, ..., -0.43488609,
         1.31675802, -0.4807078 ],
       [-0.46541955,  0.23622787,  1.6851732 , ..., -0.43488609,
        -0.75944098, -0.4807078 ]])

In [11]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [12]:
print('Parameters currently in use:\n')
print(rf.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [13]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [14]:
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [15]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [18]:
rf_tuned = RandomForestRegressor(n_estimators= 200,
                                min_samples_split= 5,
                                min_samples_leaf= 4,
                                max_features= 'auto',
                                max_depth= 10,
                                bootstrap= True)
rf_tuned.fit(X_train_scaled, y_train)

RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=5,
                      n_estimators=200)

In [19]:
preds = rf_tuned.predict(X_test_scaled)

In [20]:
preds = preds.astype('int64')
preds

array([ 178,    0,    0,    0,    0,    0,  154,    0,  170,    0,  737,
          0,    0,  516,    0,  183,    0,  157,    0,  183,    0,    0,
          0,  737,    0,    0,  997,  157,    0,    0,  558,    0, 1073,
        523,  148,    0,  157,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  517,  159,
        151,    0,    0,    0,  994,    0,  520,    0,  512,    0,    0,
        743,  154,    0,    0,  508,    0,    0,    0,  168,  182,  997,
          0,    0,    0,  746,  153,    0,  182,    0,    0,  997, 1439,
          0,  523,  450,  328,    0,  842,  182,    0,    0,    0,    0,
          0,  182,    0,    0,  153,    0, 2394,    0,  461,    0,  176,
          0, 1010,    0,  522,    0, 2063,    0, 2811,  747,  182,    0,
          0,  738,    0,    0,    0,  737,    0,  154,  157,    0,    0,
          0,    0, 1024,    0,    0,    0,  753,    0,  184,    0,    0,
          0,  164,    0,    0,  486,  735,    0,   

In [21]:
prediction = pd.DataFrame(preds, columns=['revenue']).to_csv('prediction3.csv', index=False)