In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split # train test split package
from sklearn.linear_model import LinearRegression # Linear Regression model
from sklearn.ensemble import RandomForestRegressor # RF Regression
from sklearn.tree import DecisionTreeRegressor # DT Regression
from sklearn.metrics import r2_score, mean_squared_error as mse # r2_score, how much of our independent variable describes the dependent var?
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Outline:

- Preprocessing
- Justify X and Y
- Split Training and Testing Data
- Fit / Train the Data USING MULTIPLE REGRESSION MODELS
- Evaluate which has the highest R2 and lowest RMSE
- Try to perform Grid Search CV (if the regression model is NOT Linear Regression)
- Update the regression model with the best n_estimators and max_depth
- Try to perform K-Fold Cross Validation
- Conclude the regression models and provide recommendations based on the given data

In [None]:
# Read the raw advertising data once and keep it untouched in df_ori;
# all later cleaning happens on the working copy df.
DATA_PATH = '/kaggle/input/advertising.csv/Advertising.csv'
df_ori = pd.read_csv(DATA_PATH)
df = df_ori.copy()
df

In [None]:
# One-hot encode the frame with pandas' get_dummies; drop_first=True drops the
# first level of each encoded column so the dummies are not redundant.
# NOTE(review): the next cell keeps only TV/radio/newspaper/sales, so any dummy
# columns created here are discarded again.
df = pd.get_dummies(df, drop_first = True)
df

In [None]:
# Keep only the three advertising channels and the sales target, in that order
# (the target must stay last: the x/y split below slices by position).
selected_columns = ['TV', 'radio', 'newspaper', 'sales']
df = df[selected_columns]
df

In [None]:
# Mean-impute: replace every missing value with the mean of its own column.
column_means = df.mean()
df = df.fillna(column_means)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) to check the cleaned data.
df.info()

# Justify X and Y

In [None]:
# Features: every column except the last one.
x = df.iloc[:, :-1]
# Target: the last column, sliced (not indexed) so it stays a one-column DataFrame.
y = df.iloc[:, -1:]

In [None]:
# Display the feature frame.
x

In [None]:
# Display the target frame (single column).
y

# Split Training and Testing Data

In [None]:
# Hold out a test set. random_state=1 fixes the shuffle so the split is the same
# on every run; no test_size is passed, so scikit-learn's default split size is used.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 1)

In [None]:
# Display the training features.
x_train

In [None]:
# Display the held-out test features.
x_test

In [None]:
# Sanity check: features and target must have the same number of rows in each split.
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(len(features), len(target))

**Linear Regression**

In [None]:
# Fit an ordinary least-squares linear model on the training split.
# fit() is left as the last expression so the notebook shows the fitted estimator.
lrr = LinearRegression()
lrr.fit(x_train, y_train)

In [None]:
# Predict sales for the held-out test features with the linear model.
# NOTE: y_pred is reassigned by later model cells, so its contents depend on run order.
y_pred = lrr.predict(x_test)

In [None]:
# Print the true test targets, a blank-looking separator line, then the predictions.
print(y_test.values, "         ", y_pred, sep="\n")

In [None]:
# R² (coefficient of determination) of the current y_pred against the test targets.
r2_score(y_test, y_pred)

In [None]:
# Root mean squared error: the square root of the MSE, so it is in the same units as sales.
mse(y_test, y_pred)**0.5

In [None]:
# Fitted slope for each feature, in the column order of x_train.
lrr.coef_

In [None]:
# Fitted intercept: the predicted sales when every feature is zero.
lrr.intercept_

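Sales = 0.05 TV + 0.2 Radio + 0.003 Newspaper + 2.87
* For every 1-unit increase in TV spend, predicted sales increase by 0.05 (holding the other channels constant)
* For every 1-unit increase in Radio spend, predicted sales increase by 0.2 (holding the other channels constant)
* For every 1-unit increase in Newspaper spend, predicted sales increase by 0.003 (holding the other channels constant)
* If we don't spend any promotional budget, the predicted sales are 2.87 (this is the intercept, i.e. a baseline level, not an increase)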

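This Linear Regression model has an R² of about 0.95 (it explains roughly 95% of the variance in sales on the test set) and an RMSE of about 1. Note that R² is not a classification accuracy, and the error computed above is the square root of the MSE (the RMSE), not the MSE itself.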


**Random Forest Regressor**

In [None]:
# Fit a small random forest (5 trees, depth <= 10) and score it on the test split.
# random_state pins the bootstrap/feature sampling so the printed scores are the
# same on every run; without it the numbers quoted in the conclusion drift.
rfr = RandomForestRegressor(max_depth = 10, n_estimators = 5, random_state = 1)
# y_train is a one-column DataFrame; flatten it to 1-D, which is what the forest
# expects for a single target (a column vector triggers a DataConversionWarning).
rfr.fit(x_train, np.ravel(y_train))
y_pred = rfr.predict(x_test) # we UPDATE the Y_pred here, so be careful
print(r2_score(y_test, y_pred))   # R² on the test split
print(mse(y_test, y_pred)**0.5)   # RMSE on the test split

**Decision Tree Regressor**

In [None]:
# Fit an unconstrained decision tree and score it on the test split.
# random_state is fixed because the tree permutes features at each split, so ties
# between equally good splits would otherwise be broken differently on each run.
dtr = DecisionTreeRegressor(random_state = 1)
dtr.fit(x_train, y_train)
y_pred = dtr.predict(x_test) # we UPDATE the Y_pred here, be careful
print(r2_score(y_test, y_pred))   # R² on the test split
print(mse(y_test, y_pred)**0.5)   # RMSE on the test split

CONCLUSION

Random Forest Regressor is the regression model with the highest R² and the lowest RMSE, with an R² of about 0.97 and an RMSE of about 0.8


# Feature Importances

In [None]:
# Feature Importances Linear Regression
# Permutation importance: how much the test score drops when one feature is shuffled.
from sklearn.inspection import permutation_importance
print("Feature Importances: ")
pimp = permutation_importance(lrr, x_test, y_test, random_state = 1)

# Sort features by mean importance (ascending) and keep at most the ten largest.
mean_importance = pimp.importances_mean
ranked = np.argsort(mean_importance)[-10:]
for i in ranked:
    print(x.columns[i], mean_importance[i])

In [None]:
# Feature Importances Random Forest Regressor
# Permutation importance: how much the test score drops when one feature is shuffled.
from sklearn.inspection import permutation_importance
print("Feature Importances: ")
pimp = permutation_importance(rfr, x_test, y_test, random_state = 1)

# Sort features by mean importance (ascending) and keep at most the ten largest.
mean_importance = pimp.importances_mean
ranked = np.argsort(mean_importance)[-10:]
for i in ranked:
    print(x.columns[i], mean_importance[i])

In [None]:
# Feature Importances Decision Tree Regressor
# Permutation importance: how much the test score drops when one feature is shuffled.
from sklearn.inspection import permutation_importance
print("Feature Importances: ")
pimp = permutation_importance(dtr, x_test, y_test, random_state = 1)

# Sort features by mean importance (ascending) and keep at most the ten largest.
mean_importance = pimp.importances_mean
ranked = np.argsort(mean_importance)[-10:]
for i in ranked:
    print(x.columns[i], mean_importance[i])

# Best Parameters

In [None]:
# Hyper-parameter grid explored for the random forest (5 x 6 = 30 combinations).
parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]
    }

from sklearn.model_selection import GridSearchCV
# 5-fold grid search over the forest. A freshly seeded estimator is used instead
# of the earlier rfr object: the grid overrides n_estimators and max_depth anyway,
# and the fixed random_state makes the ranking of combinations reproducible.
cv = GridSearchCV(RandomForestRegressor(random_state = 1), parameters, cv = 5, n_jobs = -1) # Random Forest Regression Model
# Flatten the one-column target to 1-D so every fit avoids the column-vector warning.
cv.fit(x_train, np.ravel(y_train))

# The best parameters are reported by display(cv) at the end of this cell; a bare
# `cv.best_params_` in the middle of a cell shows nothing, so it was removed.

def display(results):
    """Print a text report of a fitted search object.

    Reads ``results.best_params_`` and, from ``results.cv_results_``, the
    ``mean_test_score``, ``std_test_score`` and ``params`` entries, then prints
    the best parameters followed by one "mean + or -std" line per candidate.

    NOTE(review): in a notebook this name shadows IPython's built-in ``display``.
    """
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    cv_table = results.cv_results_
    candidates = zip(
        cv_table['mean_test_score'],
        cv_table['std_test_score'],
        cv_table['params'],
    )
    for avg, spread, combo in candidates:
        print(f'{round(avg,3)} + or -{round(spread,3)} for the {combo}')

# Print the grid-search report using the notebook's own display() defined above
# (not IPython's display helper, which that definition shadows).
display(cv)

In [None]:
# Rebuild the forest with the hyper-parameters the grid search actually selected,
# instead of hard-coding values copied from an earlier run (they can go stale when
# the search result changes). random_state keeps the refit reproducible.
rfr = RandomForestRegressor(**cv.best_params_, random_state = 1)
# Flatten the one-column target to 1-D, which is what the forest expects.
rfr.fit(x_train, np.ravel(y_train))


In [None]:
# Score the tuned forest on the test split.
predict_Y = rfr.predict(x_test)
# Use the predictions just computed. The original scored `y_pred`, which at this
# point still holds the decision tree's predictions, so it reported the wrong model.
r2_score(y_test, predict_Y)