In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split # train test split package
from sklearn.linear_model import LinearRegression # Linear Regression model
from sklearn.ensemble import RandomForestRegressor # RF Regression
from sklearn.tree import DecisionTreeRegressor # DT Regression
from sklearn.metrics import r2_score, mean_squared_error as mse # r2_score, how much of our independent variable describes the dependent var?

# An RMSE of 8 means our predictions typically deviate from the actual values by about 8 units in either direction, e.g. for a prediction of 90 the actual value typically lies between 82 and 98

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of
# every file found, so the dataset location can be copied from the output.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw CSV once into `df_ori` and leave it untouched; all later
# preprocessing is done on the independent copy `df`.
df_ori = pd.read_csv(
    '/kaggle/input/dummy-advertising-and-sales-data/Dummy Data HSS.csv'
)
df = df_ori.copy(deep = True)
df

In [None]:
# One-hot encode the categorical columns. `drop_first = True` removes the
# first level of each category, which then acts as the baseline.
df = pd.get_dummies(data = df, drop_first = True)
df

In [None]:
# Keep only the modelling columns, with the target `Sales` placed last so
# that positional slicing can separate features from target later on.
df = df.loc[:, ['TV', 'Radio', 'Social Media',
                'Influencer_Mega', 'Influencer_Micro', 'Influencer_Nano',
                'Sales']]
df

In [None]:
# Rows with a missing target cannot be used for supervised learning:
# filling `Sales` with its mean would invent labels and bias both the fit
# and the evaluation, so those rows are dropped instead.
df = df.dropna(subset = ['Sales'])
# The remaining gaps are in predictor columns only; fill each with its
# column mean.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split, so a little test-set information leaks into training.
df = df.fillna(df.mean())

In [None]:
# Summarise dtypes and non-null counts to check that no missing values
# remain after the fill step.
df.info()

# REGRESSION

* Justify X and Y
* Split Training and Testing Data
* Train (Fit) Data
* Prediction Linear Regression
* Evaluate the Results of Linear Regression 

----

* Prediction using Random Forest
* Prediction using Decision Tree
* Choose the Best Regression Model

----

P.s. We can perform Cross Validation (LATER)

In [None]:
# Re-display the preprocessed frame before choosing features and target.
df

In [None]:
# Features are every column except the last; the target is the last column
# (`Sales`), kept as a single-column DataFrame by slicing with `-1:`.
x, y = df.iloc[:, :-1], df.iloc[:, -1:]

In [None]:
# Hold out 25 % of the rows for testing (scikit-learn's default size, spelled
# out here); `random_state` makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.25, random_state = 1
)

In [None]:
# Ordinary least-squares model, trained on the training split only.
lrr = LinearRegression()
lrr.fit(X = x_train, y = y_train)

In [None]:
# Predict the target for the held-out test features with the fitted linear model.
y_pred = lrr.predict(x_test)

In [None]:
# Print the actual test targets, a separator line, and then the predictions.
print(y_test.values, "-----", y_pred, sep = "\n")

In [None]:
# R^2 on the test split: the share of the variance in y_test that the
# predictions explain (1.0 would be a perfect fit).
r2_score(y_test, y_pred)

In [None]:
# RMSE on the test split: square root of the mean squared error
# (`mse` is sklearn's mean_squared_error, aliased at import).
mse(y_test, y_pred)**0.5

In [None]:
# Show the modelling frame again as a column reference for the
# coefficients inspected in the next cells.
df

In [None]:
# Fitted slope for each feature, in the column order of x_train.
lrr.coef_

In [None]:
# Fitted intercept (constant term) of the linear model.
lrr.intercept_

Sales = 3.5 TV + 0.1 Radio + 0.06 Soc Media + 0.5 Mega + 0.4 Micro + 0.4 Nano - 0.30

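* For every 1-unit increase in TV budget, predicted sales increase by 3.5, holding the other variables constant
* For every 1-unit increase in Soc Media budget, predicted sales increase by only 0.06, holding the other variables constant
* If we do not spend any promotional budget (and use the baseline influencer type that `drop_first` removed), the model predicts sales of -0.30 units — this is the intercept, not a decrease of 0.3 units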

Our Linear Regression model achieves an R² of 0.992 (99.2% of the variance in Sales explained) and an RMSE of 8.2 on the test set, and yields the regression equation above.

In [None]:
# Random forest baseline. `random_state` pins the bootstrap sampling so the
# scores printed below are the same on every Restart & Run All.
rfr = RandomForestRegressor(max_depth = 16, n_estimators = 5, random_state = 1)
# y_train is a single-column DataFrame; the forest expects a 1-D target and
# emits a DataConversionWarning otherwise, so flatten it for the fit.
rfr.fit(x_train, np.ravel(y_train))
y_pred = rfr.predict(x_test) # NOTE: this overwrites the linear-regression y_pred
print(r2_score(y_test, y_pred))  # R^2 on the test split
print(mse(y_test, y_pred)**0.5)  # RMSE on the test split

In [None]:
# Single decision tree baseline. `random_state` is set because the tree
# shuffles features while searching for splits, so ties between equally
# good splits could otherwise be broken differently from run to run.
dtr = DecisionTreeRegressor(random_state = 1)
dtr.fit(x_train, y_train)
y_pred = dtr.predict(x_test) # NOTE: this overwrites the previous model's y_pred
print(r2_score(y_test, y_pred))  # R^2 on the test split
print(mse(y_test, y_pred)**0.5)  # RMSE on the test split

Conclusion:

* Linear Regression is the regression model with the highest R² and the lowest RMSE on the test set
* We can use Linear Regression if we want to predict our Sales based on the Ad Budget and Type of Influencer

# Feature Importances

Which feature is the most important for predicting sales?

In [None]:
# Permutation importance of each feature for the linear model, measured on
# the test split. Features are printed in ascending order of mean
# importance (most important last); `[-10:]` caps the list at ten entries.
from sklearn.inspection import permutation_importance
print("Feature Importances: ")
pimp = permutation_importance(lrr, x_test, y_test, random_state = 1)

importance_means = pimp.importances_mean
for col_idx in np.argsort(importance_means)[-10:]:
    print(x.columns[col_idx], importance_means[col_idx])

In [None]:
# Permutation importance of each feature for the random forest, measured on
# the test split. Features are printed in ascending order of mean
# importance (most important last); `[-10:]` caps the list at ten entries.
from sklearn.inspection import permutation_importance
print("Feature Importances: ")
pimp = permutation_importance(rfr, x_test, y_test, random_state = 1)

importance_means = pimp.importances_mean
for col_idx in np.argsort(importance_means)[-10:]:
    print(x.columns[col_idx], importance_means[col_idx])

To guard against overfitting and underfitting, we can use cross-validation: it scores each model on several different train/validation splits instead of a single one

In [None]:
# Hyper-parameter grid for the random forest: every combination of these
# values (5 x 6 = 30 candidates) is scored with cross-validation.
parameters = {
    "n_estimators": [5, 10, 50, 100, 250],
    "max_depth": [2, 4, 8, 16, 32, None],
}

from sklearn.model_selection import GridSearchCV

# Search over a freshly constructed, seeded forest so the selected
# parameters are reproducible. The grid supplies n_estimators and max_depth
# for every candidate, so nothing else needs to be configured here.
# CHANGE THE MODEL HERE TO ANOTHER REGRESSOR (NOT LINEAR REGRESSION) IF NEEDED
cv = GridSearchCV(
    RandomForestRegressor(random_state = 1),
    parameters,
    cv = 20,      # 20-fold cross-validation on the training split
    n_jobs = -1,  # use all available cores
)
# y_train is a single-column DataFrame; flatten it so each of the many
# forest fits does not raise a DataConversionWarning.
cv.fit(x_train, np.ravel(y_train))

# The best parameters are reported by display(cv) below; a bare
# `cv.best_params_` expression in the middle of a cell shows nothing.

# for my eyes only: see this
# https://www.datasciencelearner.com/how-to-improve-accuracy-of-random-forest-classifier/

def display(results):
    """Print a text summary of a fitted hyper-parameter search.

    Prints the best parameter set first, then one line per candidate with
    its mean and standard deviation of the test score (rounded to three
    decimals) and the parameters that produced them.

    Parameters
    ----------
    results : object
        Must expose ``best_params_`` and a ``cv_results_`` mapping with the
        keys ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``.

    Returns
    -------
    None

    NOTE(review): in a notebook session this name shadows IPython's built-in
    ``display`` helper for every cell run after this definition.
    """
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    scores = results.cv_results_
    rows = zip(scores['mean_test_score'], scores['std_test_score'], scores['params'])
    for avg, spread, candidate in rows:
        print(f'{round(avg, 3)} + or -{round(spread, 3)} for the {candidate}')

# Print the best parameters and the mean/std test score of every candidate
# evaluated by the grid search.
display(cv)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model on the training split.
# The score of a regressor is R^2 (requested explicitly via `scoring`), so
# the figures printed below are R^2 values, not classification accuracy.
# The variable keeps its original name `accuracies` for compatibility.
accuracies = cross_val_score(estimator = lrr, X = x_train, y = y_train,
                             scoring = "r2", cv = 10, n_jobs = -1)
print("Mean R^2 (10-fold CV): {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Using cross-validation (repeatedly training and testing on different splits of the training data), we find that the R² is consistently about 99.36%, with a standard deviation of 0.54%.

Overall conclusion of our model is:

* Linear Regression is the best regression model, with an average cross-validated R² of 99.36%
* TV, Radio, and Micro influencers should be applied in our promotional activities because they have the highest feature importance
* We should not use Social Media and the other influencer types, as they have the lowest importance

Outline:

* Preprocessing
* Justify X and Y
* Split Training and Testing Data
* Fit / Train the Data USING MULTIPLE REGRESSION MODELS
* Evaluate which has the highest R2 and lowest RMSE
* Try to perform Grid Search CV (if the regression model is NOT Linear Regression)
* Update the regression model with the best n_estimators and max_depth
* Try to perform K-Fold Cross Validation
* Conclude the regression models and provide recommendations based on the given data

P.S. Find a dataset that is suitable for regression (PAY ATTENTION TO THE DATA TYPES; IN PARTICULAR, THE DEPENDENT VARIABLE MUST BE RATIO/INTERVAL).