# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Read Data

In [2]:
# Load the shop-level dataset from the working directory and preview the first five rows
df = pd.read_csv(filepath_or_buffer='supershops.csv')
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
df.shape

(50, 5)

# Info of data

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Marketing Spend  50 non-null     float64
 1   Administration   50 non-null     float64
 2   Transport        49 non-null     float64
 3   Area             50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


# Statistical Info Of Data

In [5]:
df.describe()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
count,50.0,50.0,49.0,50.0
mean,73721.6156,121344.6396,215331.732449,112012.6392
std,45902.256482,28017.802755,119665.39155,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,134050.07,90138.9025
50%,73051.08,122699.795,214634.81,107978.19
75%,101602.8,144842.18,299737.29,139765.9775
max,165349.2,182645.56,471784.1,192261.83


# Check Null Value

In [6]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

# Handling Null Value

In [7]:
# Impute missing 'Transport' values with that column's mean.
# The fill is keyed to 'Transport' so the Transport mean can never be written
# into a different column (a frame-wide fillna would fill every NaN with it),
# and the result is reassigned instead of mutating the frame in place.
df = df.fillna({'Transport': df['Transport'].mean()})

In [8]:
# Re-count missing values per column to confirm the imputation left none behind
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

# Drop Duplicate Rows

In [9]:
# Remove exact duplicate rows. The result is reassigned (rather than using
# inplace=True) so the cell stays a plain expression with no hidden mutation.
df = df.drop_duplicates()

In [10]:
# Compare with the shape shown right after loading to see how many duplicate rows were dropped
df.shape

(50, 5)

# Separating Feature and Target

In [11]:
# Target: the column the regression should predict
y = df['Profit']
# Features: every remaining column
x = df.drop(columns=['Profit'])

In [12]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


In [13]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

# Data Preprocessing

In [14]:
# One-hot encode the categorical 'Area' column.
# drop_first=True omits the first category's indicator, which then acts as the
# baseline and avoids perfectly collinear dummy columns.
one = pd.get_dummies(x['Area'], prefix='Area', drop_first=True)
# Replace the raw 'Area' column with its indicator columns (appended on the right)
x = pd.concat([x.drop(columns='Area'), one], axis=1)

In [15]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,1,0
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,1,0
4,142107.34,91391.77,366168.42,0,1


# LinearRegression

In [16]:
# Linear regression estimator created with scikit-learn's default settings
lr = LinearRegression()

In [17]:
# Fit on every row of x / y — no train/test split is made, so the scores computed later are in-sample
lr.fit(x,y)

In [18]:
# Predict on the same feature matrix the model was fitted on (in-sample predictions)
y_pred = lr.predict(x)
y_pred

array([160252.69208592, 191618.10651463, 179860.16669243, 172713.69233356,
       168595.2539381 , 162084.63391357, 147471.23046668, 158994.91233694,
       151924.74262748, 152771.74693043, 131236.99025515, 132581.79301842,
       127995.57810622, 128357.59301606, 147009.56620418, 177290.51761512,
       119436.73193821, 132071.84946993, 129745.03141221, 121439.67901459,
       120740.126611  , 122763.54596284, 118735.15121259, 113960.46055083,
       107455.49561918, 100956.51954163, 106831.49025092, 121969.17762878,
       100541.31237225,  98797.54576974,  93533.98637471,  94272.02517658,
        92467.15493374,  98593.70095397,  95054.18617874,  91287.3636652 ,
        80218.07848302,  88997.16285717,  71711.86085248,  84538.06662301,
        78890.63476184,  75437.3255538 ,  72765.09451589,  58557.01329118,
        64109.90378563,  46012.06280743,  69095.38530268,  46309.82355645,
        43255.69216641,  49322.10475048])

In [19]:
# R^2 (coefficient of determination) on the data the model was fitted on.
# LinearRegression.score returns R^2, not a classification accuracy.
lr.score(x,y)

0.9046261977822221

In [20]:
# Intercept of the fitted model: the constant term added to the weighted feature sum
c = lr.intercept_
c

42015.03827491081

In [21]:
# Fitted coefficients, one per column of x and in the same column order
# (Marketing Spend, Administration, Transport, Area_Dhaka, Area_Rangpur)
m = lr.coef_
m

array([ 6.73161106e-01,  3.17129363e-02,  7.96307951e-02, -7.65160304e+02,
       -1.13750224e+03])

# Prediction Using model object

In [22]:
# Predict the profit for a single shop.
# The row is wrapped in a DataFrame that carries the training column names:
# the model was fitted on a DataFrame, and passing a bare list makes
# scikit-learn warn that the input has no valid feature names and silently
# rely on positional order.
y_pred_1 = lr.predict(
    pd.DataFrame([[142107.34, 91391.77, 366168.42, 0, 1]], columns=x.columns)
)



In [23]:
y_pred_1

array([168595.2539381])

# Prediction Using Coefficient and Intercept

In [24]:
# Unpack the five fitted coefficients in the order stored in m
m1, m2, m3, m4, m5 = m

# The intercept is already available as c

# Feature values of the shop to predict, in the same order as the coefficients
x1 = 142107.34
x2 = 91391.77
x3 = 366168.42
x4 = 0
x5 = 1

# Evaluate the linear equation by hand: weighted feature sum plus intercept
y_pred_2 = (
    m1 * x1
    + m2 * x2
    + m3 * x3
    + m4 * x4
    + m5 * x5
    + c
)
y_pred_2

168595.25393810222

So, in both cases the prediction is the same.

# r2_score

In [25]:
# R^2 of the in-sample predictions against the true profits, via sklearn.metrics
score = r2_score(y_true=y, y_pred=y_pred)
score

0.9046261977822221