In [None]:
#import the necessary library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the startup dataset from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='50_Startups.csv')
data.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Replace the string labels in the 'State' column with integer codes.
# NOTE(review): a linear model treats these codes as an ordered numeric
# magnitude; confirm that is intended, otherwise consider one-hot encoding.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(data['State'])
data['State'] = le.transform(data['State'])
data.head(n=5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [None]:
# Separate the target column ('Profit') from the remaining feature columns.
y = data['Profit']
X = data.drop('Profit', axis=1)

In [None]:
# Report the dimensions of the feature table and of the target.
for label, values in (("Shape of X:", X), ("Shape of y:", y)):
    print(label, values.shape)

Shape of X: (50, 4)
Shape of y: (50,)


In [None]:
# Convert the features and the target to plain NumPy arrays.
# np.asarray is used instead of `.values` so this cell can be re-run safely:
# on a second run X and y are already ndarrays (which have no `.values`
# attribute), and np.asarray simply hands them back unchanged.
X = np.asarray(X)
y = np.asarray(y)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of the training split (the labels say X / y, but the
# values printed are those of X_train / y_train).
for label, values in (("Shape of X:", X_train), ("Shape of y:", y_train)):
    print(label, values.shape)

Shape of X: (40, 4)
Shape of y: (40,)


In [None]:
# =======================================
# Multiple Linear Regression
# =======================================
# Fit a linear regression on the training split, then score it on the
# held-out test split.
model_multiple = LinearRegression()
model_multiple.fit(X_train, y_train)

# Predictions for the test rows; a later cell compares them with y_test.
y_pred_multiple = model_multiple.predict(X_test)

# Test-set error metrics.
mse_multiple = mean_squared_error(y_test, y_pred_multiple)
r2_multiple = r2_score(y_test, y_pred_multiple)

print("\nMultiple Linear Regression")
print(f"  Mean Squared Error: {mse_multiple:.4f}")
# The label previously held mis-encoded characters in place of the superscript two.
print(f"  R² Score: {r2_multiple:.4f}")


Multiple Linear Regression
  Mean Squared Error: 80929465.4910
  R² Score: 0.9001


In [None]:
# Show the fitted model's intercept term, rounded to two decimal places.
intercept_rounded = round(model_multiple.intercept_, 2)
print(intercept_rounded)

54080.72


In [None]:
# Inspect the class of the fitted estimator object (displayed as the cell's output).
type(model_multiple)

In [None]:
# Predict profit for the first five rows of the full, unsplit feature array X.
model_multiple.predict(X[:5])

array([192405.36723938, 188372.32164914, 183282.8185662 , 174012.86095334,
       173529.22121764])

In [None]:
# Pair each test-set target with the model's prediction and print the first five rows.
predictions = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred_multiple,
    }
)
print(predictions.head(5))

      Actual      Predicted
0  134307.35  126720.661507
1   81005.76   84909.089619
2   99937.59   98890.318549
3   64926.08   46479.312402
4  125370.37  129113.183188


In [None]:
# Serialization
# Model persistence (saving and loading trained models)
import pickle

In [None]:
# Save: the `with` statement closes the file automatically when the block ends;
# mode 'wb' opens the file for binary writing.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model_multiple, model_file)

In [None]:
# Load: open 'model.pkl' in binary read mode ('rb') and rebuild the model object.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you trust.
with open('model.pkl', mode='rb') as model_file:
    clf2 = pickle.load(model_file)

In [None]:
# Check the reloaded model (clf2) by predicting the first five rows of X.
clf2.predict(X[:5])

array([192405.36723938, 188372.32164914, 183282.8185662 , 174012.86095334,
       173529.22121764])

# **Multiple Linear Regression Analysis**
**Model Overview** Using R&D Spend, Administration, Marketing Spend, and State as inputs, Multiple Linear Regression is utilized to forecast profit. To guarantee compatibility with the regression model, the dataset was processed, encoding categorical variables (State) into numerical form.


The parameters of the model

**a) Intercept ($\beta_0$) = 54080.72**: the model forecasts a baseline profit of roughly 54080.72 units if all independent variables (R&D Spend, Administration, Marketing Spend, and the encoded State variable) are zero.

**b) R-Squared = 0.9001**: on the held-out test set, the independent variables account for 90.01% of the variation in profit. This suggests that the target variable and the predictors have a close relationship.

**c) Mean Squared Error = 80929465.4910**: this is the average squared difference between actual and predicted profits on the test set. Although this figure offers some insight into the model's error magnitude, its interpretation depends on the scale of the target variable; its square root (the RMSE) is roughly 8,996 profit units.