## Simple Linear Regression

In [1]:
import pandas as pd

In [2]:
# Load the 50-startups CSV straight from its raw GitHub URL into a DataFrame.
path = r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/50_Startups.csv"
df = pd.read_csv(path)
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


## Data Cleaning

In [4]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(0)

## Separate X and Y features
    X: Independent feature => RND
    Y: Dependent feature => PROFIT

In [6]:
df['RND']

0     165349.20
1     162597.70
2     153441.51
3     144372.41
4     142107.34
5     131876.90
6     134615.46
7     130298.13
8     120542.52
9     123334.88
10    101913.08
11    100671.96
12     93863.75
13     91992.39
14    119943.24
15    114523.61
16     78013.11
17     94657.16
18     91749.16
19     86419.70
20     76253.86
21     78389.47
22     73994.56
23     67532.53
24     77044.01
25     64664.71
26     75328.87
27     72107.60
28     66051.52
29     65605.48
30     61994.48
31     61136.38
32     63408.86
33     55493.95
34     46426.07
35     46014.02
36     28663.76
37     44069.95
38     20229.59
39     38558.51
40     28754.33
41     27892.92
42     23640.93
43     15505.73
44     22177.74
45      1000.23
46      1315.46
47         0.00
48       542.05
49         0.00
Name: RND, dtype: float64

In [7]:
df[['RND']]

Unnamed: 0,RND
0,165349.2
1,162597.7
2,153441.51
3,144372.41
4,142107.34
5,131876.9
6,134615.46
7,130298.13
8,120542.52
9,123334.88


In [8]:
# Double brackets select a one-column DataFrame (2-D) instead of a Series,
# which is the tabular shape used for both the feature and the target below.
X = df[['RND']]      # independent feature
Y = df[['PROFIT']]   # dependent feature (target)

In [9]:
X.head()

Unnamed: 0,RND
0,165349.2
1,162597.7
2,153441.51
3,144372.41
4,142107.34


In [10]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


### There are no missing values and no duplicated rows in this dataset

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [12]:
# One-step preprocessing pipeline: fill missing values with each column's median.
# set_output(transform='pandas') makes the transformed result come back as a
# DataFrame (keeping column names) rather than a NumPy array.
pipe = make_pipeline(SimpleImputer(strategy='median')).set_output(transform='pandas')

In [13]:
pipe

0,1,2
,steps,"[('simpleimputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


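    fit_transform: fits on the data, then transforms that same data in one call
    transform: only applies the already-fitted changes (here, filling missing values with the learned median)
    fit: only learns the parameters from the data (here, each column's median); the data itself is not changed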

In [17]:
df.dtypes

RND       float64
ADMIN     float64
MKT       float64
STATE      object
PROFIT    float64
dtype: object

In [None]:
# Intentionally left commented out: the median imputer cannot handle the
# non-numeric STATE column, so fitting on the whole frame raises ValueError.
#pipe.fit_transform(df)

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'New York'

In [15]:
# Fit the imputer on the numeric feature frame and transform it in one call.
# NOTE(review): RND has no missing values, so X_pre should equal X; the model
# further down is fitted on X rather than X_pre — confirm which is intended.
X_pre = pipe.fit_transform(X)
X_pre.head()

Unnamed: 0,RND
0,165349.2
1,162597.7
2,153441.51
3,144372.41
4,142107.34


## Model Building : Linear Regression
    Equation of Line: y = B0 + B1.X
    B0 => y intercept
    B1 => Slope of the line

    ProfitPredicted = B0 + B1*RND

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Least-squares fit of PROFIT on RND. fit() returns the estimator itself, so
# construction and fitting are chained; the bare name then displays its repr.
model = LinearRegression().fit(X, Y)
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
# B0- y intercept
model.intercept_

array([49032.89914125])

In [None]:
# B1- slope of the line
model.coef_

array([[0.85429137]])

ProfitPredicted = 49032.9 + 0.8543*RND

RND=0
ProfitPredicted = 49032.9

RND increases by 1 unit
ProfitPredicted increases by about 0.854 units (the slope is added per unit of RND, not multiplied)


In [23]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [24]:
Ypreds = model.predict(X)
Ypreds

array([[190289.29389289],
       [187938.71118575],
       [180116.65707807],
       [172369.00320589],
       [170433.97345032],
       [161694.19683741],
       [164033.72501421],
       [160345.46724972],
       [152011.33380847],
       [154396.82286103],
       [136096.36397105],
       [135036.08586475],
       [129219.89081021],
       [127621.20411029],
       [151499.37407569],
       [146869.43093301],
       [115678.82583435],
       [129897.69412683],
       [127413.41482014],
       [122860.50313037],
       [114175.91374003],
       [116000.34693472],
       [112245.81324567],
       [106725.35677792],
       [114850.93206678],
       [104275.40289851],
       [113385.70276482],
       [110633.79960036],
       [105460.14271464],
       [105079.09459155],
       [101994.24845109],
       [101261.18102569],
       [103202.54108032],
       [ 96440.90176556],
       [ 88694.29012885],
       [ 88342.27936946],
       [ 73520.10196791],
       [ 86681.47714396],
       [ 663

In [25]:
Ypreds[0:5]

array([[190289.29389289],
       [187938.71118575],
       [180116.65707807],
       [172369.00320589],
       [170433.97345032]])

In [26]:
Ypreds[:5]

array([[190289.29389289],
       [187938.71118575],
       [180116.65707807],
       [172369.00320589],
       [170433.97345032]])

In [27]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


## Evaluation Metrics
    Mean Squared Error
    Mean Absolute Error
    RMSE
    R2 score (R squared, the coefficient of determination)

In [28]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [36]:
# Score the fitted line against the observed targets.
MSE = mean_squared_error(Y, Ypreds)    # mean of the squared residuals
MAE = mean_absolute_error(Y, Ypreds)   # mean of the absolute residuals
r2 = r2_score(Y, Ypreds)               # coefficient of determination
RMSE = MSE ** 0.5                      # square root puts the error back in the units of Y

# One print call, one metric per line.
print(
    f"Mean Squared Error: {MSE:.2f}",
    f"Root Mean Squared Error: {RMSE:.2f}",
    f"Mean Absolute Error: {MAE:.2f}",
    f"R2 squared: {r2*100:.2f}%",
    sep="\n",
)

Mean Squared Error: 85120931.33
Root Mean Squared Error: 9226.10
Mean Absolute Error: 6910.98
R2 squared: 94.65%


## The R2 score is greater than 80% (measured on the same data the model was trained on), so we can consider this model for final model building

In [37]:
rnd_sample = [160045,178000,187000,198650]

In [46]:
# Predict PROFIT for every hypothetical R&D spend in `rnd_sample`.
# The model was fitted on a DataFrame whose single column is 'RND', so each
# value is wrapped in a one-row DataFrame with that same column name. Passing
# a bare nested list ([[x]]) makes scikit-learn warn on every call that
# "X does not have valid feature names".
# NOTE(review): some of these spends exceed the largest RND in the training
# data, so those predictions are extrapolations.
preds = []
for x in rnd_sample:
    pred = model.predict(pd.DataFrame({'RND': [x]})).round(2)
    preds.append(pred)



In [44]:
rnd_sample

[160045, 178000, 187000, 198650]

In [47]:
print(preds)

[array([[185757.96]]), array([[201096.76]]), array([[208785.39]]), array([[218737.88]])]
