## Simple Linear Regression

### Problem Statement: Build a linear regression model that predicts the profit of startups on the basis of Administration charges

In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds Startups.csv. Set the STARTUPS_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location so existing runs are unaffected.
DATA_DIR = Path(
    os.environ.get(
        "STARTUPS_DATA_DIR",
        r"C:\Users\SAMRUDHI\OneDrive\Desktop\DATA SCIENCE\Data analytics\Datasets",
    )
)
path = DATA_DIR / "Startups.csv"

# Fail early with an actionable message instead of a bare read_csv error.
if not path.is_file():
    raise FileNotFoundError(
        f"Startups.csv not found at {path}. "
        "Set STARTUPS_DATA_DIR to the folder that contains it."
    )

df = pd.read_csv(path)
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


## Perform basic data quality checks

In [2]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(50, 5)

In [3]:
# Column dtypes and non-null counts, to check for missing values
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     str    
 4   PROFIT  50 non-null     float64
dtypes: float64(4), str(1)
memory usage: 2.1 KB


## Separate X and Y features

In [4]:
# Feature and target are both kept as single-column DataFrames (2-D),
# which is why the column name is wrapped in a list.
X = df.loc[:, ['ADMIN']]    # predictor: administration spend
Y = df.loc[:, ['PROFIT']]   # target: profit

In [5]:
# Preview the feature frame (ADMIN only)
X.head()

Unnamed: 0,ADMIN
0,136897.8
1,151377.59
2,101145.55
3,118671.85
4,91391.77


In [6]:
# Preview the target frame (PROFIT only)
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


## Ypredictions = B0 + B1.Admin

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Fit ordinary least squares for PROFIT = B0 + B1 * ADMIN.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, Y)
model

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [9]:
# NOTE(review): repeats the X.head() preview already shown after the X/Y split
X.head()

Unnamed: 0,ADMIN
0,136897.8
1,151377.59
2,101145.55
3,118671.85
4,91391.77


In [10]:
# B0 (intercept): the fitted model's predicted PROFIT when ADMIN is 0
model.intercept_

array([76974.47130542])

In [11]:
# B1 (slope / coefficient): change in predicted PROFIT per one-unit change in ADMIN
model.coef_

array([[0.2887492]])

## Profit_Predictions = B0 + B1.Admin

Profit_Predictions = 76974.47 + (0.289 * ADMIN)

- If ADMIN = 0, the predicted PROFIT is 76974.47 (the intercept, B0).
- For every 1-unit increase in ADMIN charges, the predicted PROFIT increases by about 0.289 (the slope, B1).

In [12]:
# R-squared of the ADMIN model, scored on the same data it was fitted on
# (there is no hold-out split, so this is a training score).
model.score(X,Y)

0.04028714077757223

## Performance Metrics
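- MSE (mean squared error) and RMSE (root mean squared error)
- MAE (mean absolute error)
- R2 (coefficient of determination)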

In [13]:
# Predicted PROFIT for every row of X, using the ADMIN-only model.
Ypreds = model.predict(X)
# NOTE(review): this displays the whole prediction array; the next cell shows just the first five rows.
Ypreds

array([[116503.6018596 ],
       [120684.62967237],
       [106180.1681897 ],
       [111240.87333494],
       [103363.77199475],
       [105795.88920124],
       [119478.02760551],
       [118996.16004913],
       [119916.94949302],
       [108355.49495555],
       [108908.4323531 ],
       [103478.93672662],
       [113738.12947398],
       [116098.56469098],
       [122177.41396119],
       [112379.98605074],
       [112085.66687627],
       [118865.50681004],
       [109942.6395962 ],
       [121301.54811819],
       [109853.56335476],
       [121376.42656134],
       [112427.8924309 ],
       [107509.99686616],
       [105641.87903918],
       [117270.33494157],
       [118593.62056111],
       [113895.25812743],
       [129713.23106914],
       [121162.35656519],
       [110365.79866475],
       [121067.02890353],
       [114286.53062157],
       [106732.23933965],
       [122508.46492169],
       [101531.85176777],
       [113661.8505972 ],
       [ 91782.43707469],
       [ 960

In [14]:
# First five predictions, for comparison with the actual PROFIT values shown next
Ypreds[:5]

array([[116503.6018596 ],
       [120684.62967237],
       [106180.1681897 ],
       [111240.87333494],
       [103363.77199475]])

In [15]:
# Actual PROFIT for the same first five rows
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [16]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [17]:
# Error metrics for the ADMIN-only model, computed against the data it was fitted on.
r2 = r2_score(Y, Ypreds)
mae = mean_absolute_error(Y, Ypreds)
mse = mean_squared_error(Y, Ypreds)
rmse = mse ** 0.5  # RMSE is the square root of MSE

# R-squared is reported as a percentage.
print(f"MSE:{mse}")
print(f"RMSE:{rmse}")
print(f"MAE:{mae}")
print(f"Rsquared:{r2*100}%")

MSE:1527955397.744143
RMSE:39089.07005473708
MAE:30659.814789071817
Rsquared:4.028714077757223%


## R squared is close to 0 (ADMIN explains only about 4% of the variance in PROFIT), so this model gives inaccurate predictions. We will not consider this model as our final model.

## ---------------------------------------------------------------------------------
## Build the model that predicts PROFIT for the startups by considering RND amount

The feature for this model is the RND column: `X1 = df[['RND']]`

In [18]:
# Second experiment: R&D spend as the single predictor of profit.
# Both are kept as single-column DataFrames (2-D).
X1 = df.loc[:, ['RND']]
Y1 = df.loc[:, ['PROFIT']]

In [19]:
# Preview the feature frame (RND only)
X1.head()

Unnamed: 0,RND
0,165349.2
1,162597.7
2,153441.51
3,144372.41
4,142107.34


In [20]:
# Preview the target frame (PROFIT only)
Y1.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [21]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares for PROFIT = B0 + B1 * RND.
# NOTE: this rebinds `model`, so the earlier ADMIN-only fit is no longer
# reachable under that name after this cell runs.
model = LinearRegression().fit(X1, Y1)
model

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [22]:
# NOTE(review): repeats the X1.head() preview already shown after the X1/Y1 split
X1.head()

Unnamed: 0,RND
0,165349.2
1,162597.7
2,153441.51
3,144372.41
4,142107.34


In [23]:
# B0 (intercept) of the RND model: predicted PROFIT when RND is 0
model.intercept_

array([49032.89914125])

In [24]:
# B1 (slope) of the RND model: change in predicted PROFIT per one-unit change in RND
model.coef_

array([[0.85429137]])

In [25]:
# R-squared of the RND model, scored on the same data it was fitted on
# (there is no hold-out split, so this is a training score).
model.score(X1,Y1)

0.9465353160804393

In [26]:
# Predicted PROFIT for every row of X1, using the RND-only model.
# NOTE: rebinds `Ypreds`, replacing the ADMIN-model predictions computed earlier.
Ypreds = model.predict(X1)
# NOTE(review): this displays the whole prediction array; the next cell shows just the first five rows.
Ypreds

array([[190289.29389289],
       [187938.71118575],
       [180116.65707807],
       [172369.00320589],
       [170433.97345032],
       [161694.19683741],
       [164033.72501421],
       [160345.46724972],
       [152011.33380847],
       [154396.82286103],
       [136096.36397105],
       [135036.08586475],
       [129219.89081021],
       [127621.20411029],
       [151499.37407569],
       [146869.43093301],
       [115678.82583435],
       [129897.69412683],
       [127413.41482014],
       [122860.50313037],
       [114175.91374003],
       [116000.34693472],
       [112245.81324567],
       [106725.35677792],
       [114850.93206678],
       [104275.40289851],
       [113385.70276482],
       [110633.79960036],
       [105460.14271464],
       [105079.09459155],
       [101994.24845109],
       [101261.18102569],
       [103202.54108032],
       [ 96440.90176556],
       [ 88694.29012885],
       [ 88342.27936946],
       [ 73520.10196791],
       [ 86681.47714396],
       [ 663

In [27]:
# First five RND-model predictions, for comparison with the actual PROFIT values shown next
Ypreds[:5]

array([[190289.29389289],
       [187938.71118575],
       [180116.65707807],
       [172369.00320589],
       [170433.97345032]])

In [28]:
# Actual PROFIT for the same first five rows
Y1.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [29]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [31]:
# Error metrics for the RND-only model, computed against the data it was fitted on.
r2 = r2_score(Y1, Ypreds)
mse = mean_squared_error(Y1, Ypreds)
rmse = mse ** 0.5  # RMSE is the square root of MSE
mae = mean_absolute_error(Y1, Ypreds)

# R-squared is reported as a percentage.
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"MSE: {mse}")
print(f"R2 score: {r2*100}%")

MAE: 6910.98435457961
RMSE: 9226.100548285232
MSE: 85120931.32706906
R2 score: 94.65353160804393%
