## Finding the feature importance using random forest algorithm

In [1]:
## importing the required libraries

import pandas as pd
import numpy as np

In [2]:
## importing the dataset
df = pd.read_csv('advertising.csv')

In [3]:
df.shape

(200, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [5]:
df = df.iloc[:,1:]

In [6]:
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [7]:
X, y = df.iloc[:,:-1], df.iloc[:,-1]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=43756)

In [9]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

rfr = RandomForestRegressor(n_estimators=500,n_jobs=-1,max_depth=3)

etr = ExtraTreesRegressor(n_estimators=500,n_jobs=-1,max_depth=3)

In [10]:
rfr.fit(X_train,y_train)

RandomForestRegressor(max_depth=3, n_estimators=500, n_jobs=-1)

In [11]:
etr.fit(X_train,y_train)

ExtraTreesRegressor(max_depth=3, n_estimators=500, n_jobs=-1)

In [12]:
from sklearn.metrics import r2_score

def r2_adj(r):
    n = df.shape[0]
    m = df.shape[1] - 1
    r_adj = 1 - ((1-r**2)*((n-1)/(n-m-1)))
    return r_adj

In [13]:
## r2 adjusted for the random forest algorithm

print(f"The r2 score for random forest regressor is {r2_score(y_test,rfr.predict(X_test))}")
print(f"The r2 adjusted score for random forest regressor is {r2_adj(r2_score(y_test,rfr.predict(X_test)))}\n")

print(f"The r2 score for extra trees regressor is {r2_score(y_test,etr.predict(X_test))}")
print(f"The r2 adjusted score for extra trees regressor is {r2_adj(r2_score(y_test,etr.predict(X_test)))}")

The r2 score for random forest regressor is 0.9436825518443894
The r2 adjusted score for random forest regressor is 0.8888613008798582

The r2 score for extra trees regressor is 0.9247054099755954
The r2 adjusted score for extra trees regressor is 0.8528619334305543


In [14]:
## finding the importance of the features using random forest regressor

for name, score in zip(df.iloc[:,:-1].columns,rfr.feature_importances_):
    print(f"The importance of the feature '{name}' is {np.round(score,3)}")

The importance of the feature 'TV' is 0.651
The importance of the feature 'Radio' is 0.343
The importance of the feature 'Newspaper' is 0.005
