## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the CSV, then split it positionally: every column except the last
# becomes the feature matrix X, and the last column becomes the target y.
dataset = pd.read_csv('Dummy Marketing and Sales Data.csv')
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Sanity check: features and target must have the same number of rows.
print(len(X), len(y), sep="\n")

4572
4572


In [None]:
# Convert the 1-D target vector into a single-column 2-D array, which is the
# layout scikit-learn transformers such as SimpleImputer expect.
# The row count is inferred with -1 instead of being hard-coded, so this cell
# keeps working if the dataset size changes.
print(y.ndim)
y = y.reshape(-1, 1)
print(y.ndim)

1
2


## Taking Care of Missing Data

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values in the three numeric feature columns (indices 0-2) with
# the column mean. Column 3 holds string categories and is left untouched.
# NOTE(review): the imputer is fitted on the full dataset before the train/test
# split, so test rows influence the learned means. Consider fitting on the
# training split only.
imputer = SimpleImputer(strategy='mean')
X[:, 0:3] = imputer.fit_transform(X[:, 0:3])

In [None]:
# Preview the feature matrix after imputation (NumPy elides the middle rows).
print(X)

[[16.0 6.566230788 2.907982773 'Mega']
 [13.0 9.237764567 2.409567204 'Mega']
 [41.0 15.88644602 2.913410175 'Mega']
 ...
 [44.0 19.80007236 5.096191875 'Micro']
 [71.0 17.5346403 1.94087322 'Macro']
 [42.0 15.96668752 5.046547629 'Micro']]


In [None]:
# Confirm the feature matrix dimensions as (rows, columns).
print(X.shape)

(4572, 4)


In [None]:
# Fill missing target values with the mean of the observed targets.
# A dedicated imputer is used so that the feature imputer fitted earlier keeps
# its learned column means instead of being refitted on y.
# NOTE(review): imputing the target can bias the model and its evaluation;
# dropping rows with a missing target is the more common choice. Confirm intent.
imputer1 = SimpleImputer(strategy='mean')
y = imputer1.fit_transform(y)

In [None]:
# Preview the target column after imputation.
print(y)

[[ 54.73275715]
 [ 46.67789698]
 [150.1778288 ]
 ...
 [163.6314574 ]
 [253.6104113 ]
 [148.2024141 ]]


In [None]:
# Confirm the target is still a single-column 2-D array.
print(y.shape)

(4572, 1)


## Encoding Categorical data


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 3. The remaining columns are
# passed through unchanged and end up after the encoded columns.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = ct.fit_transform(X)

In [None]:
# Preview the encoded feature matrix: one-hot columns first, numeric columns last.
print(X)

[[0.0 1.0 0.0 ... 16.0 6.566230788 2.907982773]
 [0.0 1.0 0.0 ... 13.0 9.237764567 2.409567204]
 [0.0 1.0 0.0 ... 41.0 15.88644602 2.913410175]
 ...
 [0.0 0.0 1.0 ... 44.0 19.80007236 5.096191875]
 [1.0 0.0 0.0 ... 71.0 17.5346403 1.94087322]
 [0.0 0.0 1.0 ... 42.0 15.96668752 5.046547629]]


## Splitting the dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Preview the training features.
print(X_train)

[[1.0 0.0 0.0 ... 47.0 15.72822743 2.771904176]
 [0.0 0.0 1.0 ... 25.0 11.09535068 0.337279593]
 [1.0 0.0 0.0 ... 16.0 5.768998355 0.167732483]
 ...
 [0.0 0.0 0.0 ... 12.0 1.578916632 4.637683176]
 [0.0 0.0 1.0 ... 39.0 9.429549985 0.617180797]
 [0.0 0.0 1.0 ... 21.0 4.192298884 1.011604124]]


In [None]:
# Preview the held-out test features.
print(X_test)

[[0.0 0.0 1.0 ... 45.0 17.1275201 1.462856663]
 [0.0 0.0 0.0 ... 74.0 29.52017002 2.333157382]
 [0.0 1.0 0.0 ... 84.0 28.25584373 5.699994101]
 ...
 [0.0 0.0 1.0 ... 74.0 28.1170026 5.894168882]
 [0.0 1.0 0.0 ... 66.0 23.16259764 2.051333546]
 [0.0 0.0 1.0 ... 60.0 11.36731934 0.820350637]]


In [None]:
# Preview the training targets.
print(y_train)

[[166.9720246 ]
 [ 90.70924854]
 [ 58.28261531]
 ...
 [ 41.7006822 ]
 [138.2677026 ]
 [ 75.3411398 ]]


In [None]:
# Preview the held-out test targets.
print(y_test)

[[157.65616  ]
 [264.5922333]
 [298.8823429]
 ...
 [260.1810487]
 [238.3714378]
 [213.482651 ]]


## Training the multiple linear regression model on the training set

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split. fit() returns the estimator
# itself, so the bare name on the last line displays the same repr as before.
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression()

## Predicting the test set results

In [None]:
# Predict the target for every row of the test set. The bare name on the last
# line makes the notebook display the resulting array.
y_pred = regressor.predict(X_test)
y_pred

array([[160.63808743],
       [264.30092753],
       [299.41775266],
       ...,
       [264.40256896],
       [235.24337786],
       [212.02724325]])

In [None]:
# Show predictions (left column) side by side with the true test targets
# (right column) by joining two single-column arrays along axis 1.
print(
    np.concatenate(
        (
            y_pred.reshape(len(y_pred), 1),
            y_test.reshape(len(y_test), 1),
        ),
        axis=1,
    )
)

[[160.63808743 157.65616   ]
 [264.30092753 264.5922333 ]
 [299.41775266 298.8823429 ]
 ...
 [264.40256896 260.1810487 ]
 [235.24337786 238.3714378 ]
 [212.02724325 213.482651  ]]


## Predicting a single value

In [None]:
# Predict for one hand-built row. The row must follow the encoded feature
# layout used for training: one-hot category columns first, then the numeric
# passthrough columns in their original order.
# NOTE(review): assumes four one-hot columns followed by three numeric features
# (7 values in total); confirm against the fitted ColumnTransformer.
regressor.predict([[0, 0, 1, 0, 17, 9.237764567, 1.40599815]])

array([[61.29213079]])

## Getting the coefficients and intercept of the trained model

In [None]:
# Show the fitted parameters: one coefficient per encoded feature column, plus
# the intercept term.
print(f"Coefficient: {regressor.coef_}")
print(f"Intercept: {regressor.intercept_}")

Coefficient: [[-0.38970715  0.16574184  0.13698929  0.08697602  3.49781683  0.17773415
   0.08453186]]
Intercept: [-0.06846253]


Therefore, using the coefficients and intercept printed above, the equation of our multiple linear regression model is:

$$\textrm{Sales} = -0.06846253 - 0.38970715 \times \textrm{Dummy Variable 1} + 0.16574184 \times \textrm{Dummy Variable 2} + 0.13698929 \times \textrm{Dummy Variable 3} + 0.08697602 \times \textrm{Dummy Variable 4} + 3.49781683 \times \textrm{TV} + 0.17773415 \times \textrm{Radio} + 0.08453186 \times \textrm{Social Media}$$

Here Dummy Variables 1–4 are the one-hot encoded columns of the categorical feature; in the encoded output shown earlier, Dummy Variables 1, 2 and 3 correspond to the categories 'Macro', 'Mega' and 'Micro' respectively.

## Evaluating the model on the test set (R² score)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9949051741337412