# Multiple Linear Regression

1. Select dataset (multiple regression or polynomial regression) based on what you are assigned in the class 
2. Calculation from scratch: Coefficient, Intercept, R-squared, Adjusted R-squared. 
3. Used sklearn Library to calculate: Coefficient, Intercept, R-squared, Adjusted R-squared. 
4. Use Optimization method to optimize your calculation.

In [2]:
import numpy as np
import pandas as pd

In [3]:
#df = pd.read_csv('multiple_linear_regression_dataset.csv')
df = pd.read_csv('CO2_emission.csv')

In [4]:
df.head()

Unnamed: 0,Car,Model,Volume,Weight,CO2
0,Toyota,Aygo,1000,790,99
1,Mitsubishi,Space Star,1200,1160,95
2,Skoda,Citigo,1000,929,95
3,Fiat,500,900,865,90
4,Mini,Cooper,1500,1140,105


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Car     36 non-null     object
 1   Model   36 non-null     object
 2   Volume  36 non-null     int64 
 3   Weight  36 non-null     int64 
 4   CO2     36 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 1.5+ KB


In [6]:
# Using isnull() to check for missing values in the entire DataFrame
missing_values = df.isnull()

# If you want to check for missing values in a specific column, you can do it like this:
# Replace 'column_name' with the name of the column you want to check
# missing_values = df['column_name'].isnull()

# Count the number of missing values in each column (optional)
missing_count = df.isnull().sum()

# Count the total number of missing values in the DataFrame (optional)
total_missing_count = df.isnull().sum().sum()

# Display the results
#print(missing_values)
print(missing_count)
print("Total missing values:", total_missing_count)

Car       0
Model     0
Volume    0
Weight    0
CO2       0
dtype: int64
Total missing values: 0


### Calculation from scratch: Coefficient, Intercept, R-squared, Adjusted R-squared.

In [7]:
class MultipleLinearRegression:
    def __init__(self):
        self.coefficients = None
        self.intercept = None
    
    def fit(self, X, y):
        # Add a column of ones for the intercept term
        ones_column = np.ones((X.shape[0], 1))
        X = np.concatenate((ones_column, X), axis=1)

        # Compute the coefficients using ordinary least squares formula
        self.coefficients = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
        
        # Compute the intercept (bias term)
        self.intercept = self.coefficients[0]
        self.coefficients = self.coefficients[1:]
    
    def predict(self, X):
        if self.coefficients is None or self.intercept is None:
            raise ValueError("Model has not been trained. Call fit() first.")
        
        # Add a column of ones for the intercept term
        ones_column = np.ones((X.shape[0], 1))
        X = np.concatenate((ones_column, X), axis=1)
        
        # Predict the target variable
        y_pred = X.dot(np.concatenate(([self.intercept], self.coefficients)))
        return y_pred

    def r_squared(self, X, y):
        y_pred = self.predict(X)
        total_variation = ((y - np.mean(y))**2).sum()
        residual_variation = ((y - y_pred)**2).sum()
        return 1 - (residual_variation / total_variation)
    
    def adjusted_r_squared(self, X, y):
        r2 = self.r_squared(X, y)
        n = X.shape[0]  # Number of samples
        p = X.shape[1]  # Number of features
        return 1 - (1 - r2) * (n - 1) / (n - p - 1)

In [8]:
def split_data(dataframe, split_ratio):
    # Shuffle the DataFrame
    shuffled_df = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)

    # Calculate the number of samples for training and testing
    train_size = int(split_ratio * len(shuffled_df))
    
    # Split the DataFrame into training and test sets
    train_set = shuffled_df[:train_size]
    test_set = shuffled_df[train_size:]

    return train_set, test_set

In [9]:
df.columns

Index(['Car', 'Model', 'Volume', 'Weight', 'CO2'], dtype='object')

In [11]:
# Split the DataFrame into training and test sets with a 70/30 ratio
train_set, test_set = split_data(df, split_ratio=0.7)

# Print the shapes of the resulting datasets
#print("Train set shape:", train_set.shape)
#print("Test set shape:", test_set.shape)

X_train = train_set[['Volume', 'Weight']]
y_train = train_set['CO2']

# Create and train the model
model = MultipleLinearRegression()

model.fit(X_train, y_train)

# Predict using the trained model
X_test = test_set[['Volume', 'Weight']]
y_test = test_set['CO2']

# Make predictions on the test data
y_pred = model.predict(X_test)

# Get coefficients, intercept, R-squared, and adjusted R-squared
coefficients = model.coefficients
intercept = model.intercept
r_squared = model.r_squared(X_test, y_test)
adjusted_r_squared = model.adjusted_r_squared(X_test, y_test)

print("R-squared:", r_squared)
print("Adjusted R-squared:", adjusted_r_squared)

print("Coefficients:", coefficients)
print("Intercept:", intercept)

R-squared: -0.12476894620399981
Adjusted R-squared: -0.4059611827549998
Coefficients: [0.00578028 0.01183901]
Intercept: 78.68013916849986


### sklearn Library to calculate: Coefficient, Intercept, R-squared, Adjusted R-squared.

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
def adjusted_r_squared(r_squared, n, p):
    return 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

In [14]:
# Sample data for demonstration
# Replace these with your own dataset
# X should be a 2D array with multiple features (independent variables)
# y should be a 1D array with the target variable (dependent variable)

X = df[['Weight', 'Volume']]
y = df[['CO2']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the Multiple Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Calculate the adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = adjusted_r_squared(r2, n, p)

#print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Ajusted R-squared:", adjusted_r2)

# Coefficients and intercept of the model
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

R-squared: 0.41008714763391063
Ajusted R-squared: 0.2626089345423883
Coefficients: [[0.00720613 0.00482015]]
Intercept: [84.58784535]


### Optimization method to optimize calculation.

In [24]:
import numpy as np

In [38]:
class MultipleLinearRegressionOLS:
    def __init__(self):
        self.coefficients = None
        self.intercept = None
    
    def fit(self, X, y):
        # Add a column of ones for the intercept term
        ones_column = np.ones((X.shape[0], 1))
        X = np.concatenate((ones_column, X), axis=1)

        # Compute the coefficients using OLS formula
        self.coefficients = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
        self.intercept = self.coefficients[0]
    
    def predict(self, X):
        if self.coefficients is None:
            raise ValueError("Model has not been trained. Call fit() first.")
        
        # Add a column of ones for the intercept term
        ones_column = np.ones((X.shape[0], 1))
        X = np.concatenate((ones_column, X), axis=1)
        
        # Predict the target variable
        y_pred = X.dot(self.coefficients)
        return y_pred
    
    def r_squared(self, X, y):
        y_pred = self.predict(X)
        total_variation = ((y - np.mean(y))**2).sum()
        residual_variation = ((y - y_pred)**2).sum()
        return 1 - (residual_variation / total_variation)
    
    def adjusted_r_squared(self, X, y):
        r2 = self.r_squared(X, y)
        n = X.shape[0]  # Number of samples
        p = X.shape[1]  # Number of features
        return 1 - (1 - r2) * (n - 1) / (n - p - 1)

In [42]:
# Split the DataFrame into training and test sets with a 70/30 ratio
train_set, test_set = split_data(df, split_ratio=0.7)

X_train = train_set[['Volume', 'Weight']]
y_train = train_set['CO2']

# Predict using the trained model
X_test = test_set[['Volume', 'Weight']]
y_test = test_set['CO2']

# Create and fit the model
mode_ols = MultipleLinearRegressionOLS()
mode_ols.fit(X_train, y_train)

# Test predictions
y_pred = mode_ols.predict(X_test)
#print("Predicted values:", y_pred)

# Get coefficients, intercept, R-squared, and adjusted R-squared
coefficients = mode_ols.coefficients
intercept = mode_ols.intercept
r_squared = mode_ols.r_squared(X_test, y_test)
adjusted_r_squared = mode_ols.adjusted_r_squared(X_test, y_test)

print("R-squared:", r_squared)
print("Adjusted R-squared:", adjusted_r_squared)

print("Coefficients:[", coefficients[1], coefficients[2], "]")
print("Intercept:", intercept)

R-squared: -0.12476894620399981
Adjusted R-squared: -0.4059611827549998
Coefficients:[ 0.005780275855181177 0.011839008946877588 ]
Intercept: 78.68013916849986
