# Advertising data sales prediction

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/advertising-dataset/advertising.csv


In [2]:
data = pd.read_csv("/kaggle/input/advertising-dataset/advertising.csv")

In [3]:
data.head(3)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0


In [4]:
data.dtypes

TV           float64
Radio        float64
Newspaper    float64
Sales        float64
dtype: object

In [5]:
data.describe()

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,15.1305
std,85.854236,14.846809,21.778621,5.283892
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,11.0
50%,149.75,22.9,25.75,16.0
75%,218.825,36.525,45.1,19.05
max,296.4,49.6,114.0,27.0


In [6]:
data.isnull().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

In [7]:
(data == 0).sum(axis=0)

TV           0
Radio        1
Newspaper    0
Sales        0
dtype: int64

the minimum value of radio is zero

# Simple linear regression

In [8]:
from sklearn.preprocessing import scale                # we can also use StandardScalar. 
X = scale(data.TV, with_mean=True, with_std=False).reshape(-1,1)
y = data.Sales   



In [9]:
import sklearn
import sklearn.linear_model as skl_lm
from sklearn.linear_model import LinearRegression

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)


In [11]:
regressor = LinearRegression()
regressor.fit(X, y)

In [12]:
# Predicting the test set results
y_pred = regressor.predict(X_test)

In [13]:
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

rsquare = r2_score(y_test, y_pred)

In [14]:
print('Root mean square error:', rmse)
print('R-square:', rsquare)

Root mean square error: 2.467800028212657
R-square: 0.8029184404682554


In [15]:
# Intercept and coef of the line
print('Intercept of the model:', round(regressor.intercept_, 2))
print('Coefficient of the line:', round(regressor.coef_[0], 2))

Intercept of the model: 15.13
Coefficient of the line: 0.06


# Multi Regression

In [16]:
x = data.drop(["Sales"], axis = 1)
y = data['Sales']

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 100)

In [18]:
# Importing the required preprocessing libraries:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [19]:
#defining numerical columns in data:
numeric_columns=x_train.select_dtypes(['float64','int64']).columns
numeric_columns

Index(['TV', 'Radio', 'Newspaper'], dtype='object')

In [20]:
#standardscaler for numerical columns
scaler = StandardScaler()
scaler.fit(x_train[numeric_columns])

X_train = pd.DataFrame(scaler.transform(x_train[numeric_columns]), columns=numeric_columns)
X_test = pd.DataFrame(scaler.transform(x_test[numeric_columns]), columns=numeric_columns)

In [21]:
X_train.head(2)

Unnamed: 0,TV,Radio,Newspaper
0,0.709886,0.110952,-0.813444
1,0.040943,1.218103,1.200207


In [22]:
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()  


In [23]:
X_train.shape

(140, 3)

In [24]:
X_test.shape

(60, 3)

In [25]:
mlr.fit(X_train, y_train)

In [26]:
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
list(zip(x, mlr.coef_))

Intercept:  15.005714285714287
Coefficients:


[('TV', 4.981018752198336),
 ('Radio', 1.659397711063883),
 ('Newspaper', 0.14181393313850532)]

In [27]:
y_mlr_pred= mlr.predict(X_test)

In [28]:
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_mlr_pred})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
126,6.6,9.352211
104,20.7,20.963446
99,17.2,16.488511
92,19.4,20.10971
111,21.8,21.671484


In [29]:
from sklearn import metrics
mae = metrics.mean_absolute_error(y_test, y_mlr_pred)
mse = metrics.mean_squared_error(y_test, y_mlr_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_mlr_pred))
print('R squared: {:.2f}'.format(mlr.score(x,y)*100))
print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', rmse)


R squared: -2810546.05
Mean Absolute Error: 1.227818356658941
Mean Square Error: 2.6360765623280646
Root Mean Square Error: 1.6235998775338907
