### **Creating Multiple Linear Regression From Scratch**

In [44]:
# Import the libraries used to build the synthetic dataset
import pandas as pd
import numpy as np

# Seed NumPy's legacy global generator so the synthetic data is identical on every run
np.random.seed(42)

# How many rows to synthesise
num_samples = 500

# Ordinal codes applied to the two categorical columns once the frame is built
education_codes = {"High School": 1, "Bachelor's": 2, "Master's": 3, "PhD": 4}
job_role_codes = {"Analyst": 1, "Engineer": 2, "Manager": 3, "Executive": 4}

# Draw the raw columns one at a time. Every draw consumes the same global random
# stream, so the statement order below fixes the values and must not be rearranged.
experience_years = np.random.randint(1, 21, size=num_samples)  # 1..20 years
education_level = np.random.choice(
    ["High School", "Bachelor's", "Master's", "PhD"],
    size=num_samples,
    p=[0.2, 0.5, 0.2, 0.1],
)
age = np.random.randint(22, 60, size=num_samples)  # 22..59 years old
job_role = np.random.choice(
    ["Analyst", "Engineer", "Manager", "Executive"],
    size=num_samples,
    p=[0.4, 0.3, 0.2, 0.1],
)
work_hours_per_week = np.random.randint(30, 60, size=num_samples)  # 30..59 hours
annual_income = np.random.randint(30000, 120000, size=num_samples)  # USD, independent of all other columns

# Assemble the columns in their final order
data = {
    "Experience_Years": experience_years,
    "Education_Level": education_level,
    "Age": age,
    "Job_Role": job_role,
    "Work_Hours_per_Week": work_hours_per_week,
    "Annual_Income": annual_income,
}
df = pd.DataFrame(data)

# Replace the category labels with their ordinal codes so they can enter the regression
df["Education_Level"] = df["Education_Level"].map(education_codes)
df["Job_Role"] = df["Job_Role"].map(job_role_codes)

# Target variable: a linear function of experience, education and weekly hours
# plus Gaussian noise (sd 5000). Age, Job_Role and Annual_Income play no part in it.
salary_noise = np.random.normal(0, 5000, size=num_samples)
df["Salary"] = (
    2000 * df["Experience_Years"]
    + 3000 * df["Education_Level"]
    + 500 * (df["Work_Hours_per_Week"] / 40)
    + salary_noise
)

df.head()


Unnamed: 0,Experience_Years,Education_Level,Age,Job_Role,Work_Hours_per_Week,Annual_Income,Salary
0,7,4,30,2,50,97843,28084.30881
1,20,2,43,2,32,69901,50897.324856
2,15,3,25,2,55,41205,49490.956361
3,11,3,47,1,59,85661,37374.291216
4,8,3,50,1,52,35906,23410.427602


### Feature engineering:
Convert `Salary` and `Annual_Income` from dollars to thousands of dollars (e.g. 20,000 → 20.0), rounded to one decimal place.

In [45]:

# Express Salary in thousands, kept to one decimal place.
# The column is overwritten in place, so re-running this cell would divide by 1000 again.
df["Salary"] = df["Salary"].div(1000).round(1)


In [46]:
# Express Annual_Income in thousands, kept to one decimal place.
# The column is overwritten in place, so re-running this cell would divide by 1000 again.
df['Annual_Income'] = df["Annual_Income"].div(1000).round(1)

In [47]:
df

Unnamed: 0,Experience_Years,Education_Level,Age,Job_Role,Work_Hours_per_Week,Annual_Income,Salary
0,7,4,30,2,50,97.8,28.1
1,20,2,43,2,32,69.9,50.9
2,15,3,25,2,55,41.2,49.5
3,11,3,47,1,59,85.7,37.4
4,8,3,50,1,52,35.9,23.4
...,...,...,...,...,...,...,...
495,5,2,58,1,37,67.8,14.3
496,3,2,25,4,34,88.5,20.8
497,11,3,49,2,57,94.4,29.3
498,11,2,41,2,45,56.8,28.6


In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

Equation of Multiple Linear Regression

$$y = m_1 X_1 + m_2 X_2 + \dots + m_n X_n + b$$

In [49]:
df

Unnamed: 0,Experience_Years,Education_Level,Age,Job_Role,Work_Hours_per_Week,Annual_Income,Salary
0,7,4,30,2,50,97.8,28.1
1,20,2,43,2,32,69.9,50.9
2,15,3,25,2,55,41.2,49.5
3,11,3,47,1,59,85.7,37.4
4,8,3,50,1,52,35.9,23.4
...,...,...,...,...,...,...,...
495,5,2,58,1,37,67.8,14.3
496,3,2,25,4,34,88.5,20.8
497,11,3,49,2,57,94.4,29.3
498,11,2,41,2,45,56.8,28.6


In [50]:
# Feature matrix: everything except the target (Salary) and Annual_Income,
# which was drawn at random and does not enter the Salary formula.
X = df.drop(
    columns=['Annual_Income', 'Salary'],
)

In [51]:
X

Unnamed: 0,Experience_Years,Education_Level,Age,Job_Role,Work_Hours_per_Week
0,7,4,30,2,50
1,20,2,43,2,32
2,15,3,25,2,55
3,11,3,47,1,59
4,8,3,50,1,52
...,...,...,...,...,...
495,5,2,58,1,37
496,3,2,25,4,34
497,11,3,49,2,57
498,11,2,41,2,45


In [52]:
# Target vector: the Salary column of df
y = df['Salary']

In [53]:
y

0      28.1
1      50.9
2      49.5
3      37.4
4      23.4
       ... 
495    14.3
496    20.8
497    29.3
498    28.6
499    42.1
Name: Salary, Length: 500, dtype: float64

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [55]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

$$w = (X^\top X)^{-1} X^\top y$$

- $X^\top$: Transpose of the feature matrix $X$.

- $(X^\top X)^{-1}$: Inverse of the product of the transpose of $X$ and $X$.

$$b = \bar{y} - \bar{X} \cdot w$$

- $\bar{y}$: Mean of the target values.

- $\bar{X}$: Mean of each feature column in the feature matrix.

In [56]:
class CustomModel:
    """Multiple linear regression fitted by ordinary least squares.

    The model is ``y = X @ coef_ + intercept_``. A column of ones is prepended
    to the feature matrix so the intercept is estimated together with the
    slopes in a single least-squares solve.

    Attributes
    ----------
    coef_ : numpy.ndarray or None
        One weight per feature; ``None`` until ``fit`` has been called.
    intercept_ : float or numpy.ndarray or None
        Bias term; ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X_train, y_train):
        """Estimate ``coef_`` and ``intercept_`` from training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features) or (n_samples,)
            Numeric feature matrix. A 1-D input is treated as a single feature.
        y_train : array-like of shape (n_samples,) or (n_samples, n_targets)
            Numeric target values.

        Raises
        ------
        ValueError
            If ``X_train`` has more than two dimensions, contains no rows, or
            has a different number of rows than ``y_train``.
        """
        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(y_train, dtype=float)

        # A single feature passed as a flat vector becomes one column
        if features.ndim == 1:
            features = features.reshape(-1, 1)
        if features.ndim != 2:
            raise ValueError("X_train must be a 1-D or 2-D array of features")
        if features.shape[0] == 0:
            raise ValueError("X_train must contain at least one sample")
        if targets.shape[:1] != features.shape[:1]:
            raise ValueError("X_train and y_train must have the same number of samples")

        # Leading column of ones carries the intercept
        design = np.insert(features, 0, 1.0, axis=1)

        # Least-squares solve instead of explicitly inverting X^T X: it gives the
        # same answer for well-conditioned data, but stays stable and still returns
        # a (minimum-norm) solution when features are collinear and X^T X is singular.
        betas, *_ = np.linalg.lstsq(design, targets, rcond=None)
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]

    def predict(self, X_test):
        """Return predictions ``X_test @ coef_ + intercept_``.

        Parameters
        ----------
        X_test : array-like whose last dimension has length n_features

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("CustomModel is not fitted yet; call fit() before predict()")
        y_pred = np.dot(X_test, self.coef_) + self.intercept_
        return y_pred

In [57]:
X_train.shape

(400, 5)

In [58]:
# Scratch check of the normal equation on the raw training features. No column of
# ones is prepended here, so the result holds one weight per feature and no intercept term.
np.linalg.inv(np.dot(X_train.T, X_train)).dot(X_train.T).dot(y_train)

array([ 2.00097939,  3.0188576 , -0.01545419,  0.17937832,  0.01589843])

In [59]:
# Instantiate the from-scratch regressor and fit it on the training split
model = CustomModel()
model.fit(X_train, y_train)

In [60]:
y_test

361    47.7
73     35.3
374    23.8
155    24.8
104    12.6
       ... 
347    16.8
86     11.3
75      8.5
438    17.4
15     15.1
Name: Salary, Length: 100, dtype: float64

In [61]:
# Predict Salary for the held-out rows (same units as y, i.e. thousands)
y_pred = model.predict(X_test)

In [62]:
# Multiply by 1000 to view the predictions in the original, un-rescaled Salary units
y_pred * 1000

array([46524.5759329 , 35889.93060519, 34712.26089128, 25844.12717488,
        8057.83609886,  7778.59836996, 15078.79208961, 13800.2226955 ,
       31889.88933776, 23950.76979129, 11777.76520863, 18711.66086978,
       10994.0993121 , 22073.17297875, 22423.6195355 , 15950.28197856,
       15745.74443221, 23764.5592944 , 26102.19816732, 41471.16366421,
       16620.39696483, 31683.66039716, 35373.13433638, 39007.84356017,
       16854.15621264, 41242.76434427, 31953.41880456,  8818.77307881,
       30122.89232714, 41579.92302251, 48038.03368771, 48985.77952958,
       27659.03993086,  7519.50609914, 40051.96441074, 14262.38987277,
       10269.34662477, 36040.87878654, 46068.74265248, 33839.62050983,
       37034.89367269, 26878.81274174, 11833.00127471, 13688.15164416,
       41788.49764129, 20857.12234341, 26854.63882074, 12256.86189762,
        8472.81356553, 39472.79777714, 23963.73471312, 29202.25387913,
       22960.11129135, 40931.10398682, 30290.20805407, 14534.74157733,
      

In [63]:
# Mean absolute error on the test split, in the units of Salary (thousands)
mean_absolute_error(y_test, y_pred)

3.948378663417113

In [64]:
# Coefficient of determination (R^2) of the predictions on the test split
r2_score(y_test, y_pred)

0.8482898787452964