## Multiple Linear Regression

### Build a model that predicts PROFIT of startups on basis of RND and MKT amounts

## Step 1: Data Gathering

In [2]:
import pandas as pd
path = r"C:\Users\SAMRUDHI\OneDrive\Desktop\DATA SCIENCE\Data analytics\Datasets\Startups.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


## Step 2: Data Quality Checks

In [3]:
df.shape

(50, 5)

In [4]:
df.columns

Index(['RND', 'ADMIN', 'MKT', 'STATE', 'PROFIT'], dtype='str')

In [5]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     str    
 4   PROFIT  50 non-null     float64
dtypes: float64(4), str(1)
memory usage: 2.1 KB


In [6]:
# Check for duplicates
df.duplicated().sum()

np.int64(0)

In [7]:
df = df.drop_duplicates()

In [8]:
# Check for missing data
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

## Step3 : Separate X and Y features

    Y : Target feature, Dependent feature => Profit
    X : Independent features => RND, MKT

In [9]:
X = df[['MKT','RND']]
Y = df[['PROFIT']]

In [10]:
X.head()

Unnamed: 0,MKT,RND
0,471784.1,165349.2
1,443898.53,162597.7
2,407934.54,153441.51
3,383199.62,144372.41
4,366168.42,142107.34


In [11]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


## Build the model

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
model = LinearRegression()
model.fit(X,Y)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [14]:
model.score(X,Y)

0.9504503015559763

## Evaluation of the model

    MSE
    MAE
    RMSE
    R2 score => Coefficient of determination

## Ypredictions = B0 + B1.RND + B2.MKT

B0,B1,B2 are calculated using Gradient Descent algorithm

In [15]:
X.head(2)

Unnamed: 0,MKT,RND
0,471784.1,165349.2
1,443898.53,162597.7


In [16]:
model.intercept_

array([46975.86422072])

In [17]:
model.coef_

array([[0.02990788, 0.79658404]])

## Ypredictions = 46975 + 0.0299.MKT + 0.7965.RND

In [18]:
# Calculate the predictions
ypreds = model.predict(X)
ypreds

array([[192800.45862502],
       [189774.65948019],
       [181405.37809703],
       [173441.30884249],
       [171127.62321762],
       [162879.31081217],
       [158028.13045422],
       [160455.73887656],
       [152317.8036728 ],
       [154343.8139353 ],
       [135011.91472396],
       [134638.87007529],
       [129218.39657898],
       [127812.20546461],
       [150192.49179713],
       [146032.71543309],
       [117025.89184753],
       [130829.44473222],
       [128882.19882756],
       [115816.41833283],
       [116650.89209156],
       [118384.17070857],
       [114990.38463925],
       [109886.18521692],
       [112552.18715137],
       [102612.90924225],
       [110990.79288437],
       [114978.60515008],
       [103125.01275975],
       [102440.42409014],
       [ 99085.21956154],
       [ 98314.54885378],
       [ 98864.66225433],
       [ 97600.73044466],
       [ 90262.64121898],
       [ 89776.4942853 ],
       [ 75824.23391247],
       [ 87974.01451829],
       [ 686

In [19]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

mse = mean_squared_error(Y,ypreds)
mae = mean_absolute_error(Y,ypreds)
rmse = mse**(1/2)
r2 = r2_score(Y,ypreds)

# print the evaluation metrics
print(f"MSE : {mse}")
print(f"MAE : {mae}")
print(f"RMSE : {rmse}")
print(f"R2-score : {r2*100:.2f}%")

MSE : 78887897.00648756
MAE : 6499.319940113649
RMSE : 8881.885892449169
R2-score : 95.05%


## R2 score for the above model is around 95% which states that the model is providing accurate profit results

In [22]:
X.head(2)

Unnamed: 0,MKT,RND
0,471784.1,165349.2
1,443898.53,162597.7


In [23]:
preds = X.copy()
preds.head(2)

Unnamed: 0,MKT,RND
0,471784.1,165349.2
1,443898.53,162597.7


In [25]:
preds['PROFIT PREDICTED'] = ypreds.round(2)

In [26]:
preds.head(2)

Unnamed: 0,MKT,RND,PROFIT PREDICTED
0,471784.1,165349.2,192800.46
1,443898.53,162597.7,189774.66


In [27]:
## Save the results to a csv file
preds.to_csv("Profit Prediction using RND,MKT.csv",index=False)

## Out of sample predictions:

## New data from the business is shared with you. They want you to predict profit for these MKT, RND details

In [28]:
sample = [160000,185000]

In [29]:
new_data_predictions = model.predict([sample])
new_data_predictions



array([[199129.172404]])