In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression


In [None]:
# Notes:
# This script demonstrates the process of performing multiple linear regression and feature selection
# using the f_regression function from sklearn. 
# f_regression fits a separate simple (univariate) linear regression of the dependent variable (GPA)
# on each feature (SAT, Rand 1,2,3) and tests whether that feature, on its own, is statistically significant.
# The output includes the F-statistic and p-value for each feature, helping us assess their relevance.


In [2]:

# Read the dataset from the relative data/ directory
# (change the path below if your copy of the CSV is stored elsewhere)
data = pd.read_csv('data/1.02. Multiple linear regression.csv')

# Preview the top five rows to check which columns were loaded
print("First 5 rows of the dataset:", data.head(), sep="\n")


First 5 rows of the dataset:
    SAT  Rand 1,2,3   GPA
0  1714           1  2.40
1  1664           3  2.52
2  1760           3  2.54
3  1685           3  2.74
4  1693           2  2.83


In [3]:

# Split the table into predictors (X) and the response (y)
X = data[['SAT', 'Rand 1,2,3']]  # the two candidate predictors
y = data['GPA']  # the quantity we want to predict

# Build the estimator and fit it in a single step; fit() hands back the fitted estimator
regressor = LinearRegression().fit(X, y)

# Show the fitted coefficients, header line first and the array on the next line
print("\nRegression Coefficients:", regressor.coef_, sep="\n")



Regression Coefficients:
[ 0.00165354 -0.00826982]


In [4]:

# Constant term of the fitted model, printed beneath its header
print("\nIntercept:", regressor.intercept_, sep="\n")

# score() evaluated on the training data gives the in-sample R-squared,
# i.e. the share of GPA variance the fitted model accounts for
r_squared = regressor.score(X, y)
print("\nR-squared of the regression model:", r_squared)



Intercept:
0.29603261264909486

R-squared of the regression model: 0.4066811952814282


In [5]:

# Run the univariate F-test on every column of X against y;
# the call yields one array of F-statistics and one array of p-values
f_stats, p_values = f_regression(X, y)

# Keep three decimals so the p-values are easy to read
rounded_p_values = p_values.round(3)

# Collect feature names, rounded F-statistics and rounded p-values into one table
result_table = pd.DataFrame({
    'Feature': X.columns,
    'F-statistic': f_stats.round(3),
    'p-value': rounded_p_values,
})

# Print the header and the table underneath it
print("\nFeature Selection Results (F-statistics and p-values):", result_table, sep="\n")



Feature Selection Results (F-statistics and p-values):
      Feature  F-statistic  p-value
0         SAT       56.048    0.000
1  Rand 1,2,3        0.176    0.676


In [None]:
# Final Interpretation Notes:
# - If the p-value is below 0.05, the feature is considered statistically significant.
# - If the p-value is above 0.05, the feature may not contribute significantly to the prediction of GPA.
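# - The F-statistic indicates how well the feature, taken on its own, explains the variance in the dependent variable
#   (f_regression tests each feature in a separate univariate regression, not jointly with the other features).
# - Features with higher F-statistics (and correspondingly lower p-values) are more important in explaining the variance of the dependent variable.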

# Interpretation of the output above:
# The p-value for 'SAT' is 0.000 after rounding (i.e. below 0.0005), which is below 0.05, so we consider it a statistically significant predictor for GPA.
# The p-value for 'Rand 1,2,3' is 0.676, which is above 0.05, so we may consider removing it from the model.
