<a href="https://colab.research.google.com/github/purvamarkam/ML_LAB/blob/main/Assignment7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from scipy.stats import zscore
from statsmodels.stats.outliers_influence import variance_inflation_factor

file_path = r"/content/linear_regression_3.csv"

df = pd.read_csv(file_path)


def calculate_vif(df):
    vif_data = pd.DataFrame()-+
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

X = df.drop(columns=['y'])
vif_df = calculate_vif(X)

high_vif_features = vif_df[vif_df["VIF"] > 10]["Feature"].tolist()
print("Features with high VIF (multicollinearity):", high_vif_features)

X_filtered = X.drop(columns=high_vif_features)
df_filtered = X_filtered.copy()
df_filtered["y"] = df["y"]

print("VIF Table After Feature Removal:\n", calculate_vif(X_filtered))

z_scores = df_filtered.apply(zscore)
threshold = 3
df_cleaned = df_filtered[(z_scores < threshold).all(axis=1)]


X_cleaned = df_cleaned.drop(columns=['y'])
y_cleaned = df_cleaned['y']
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

X_train_const = sm.add_constant(X_train)
model_ols = sm.OLS(y_train, X_train_const).fit()
influence = model_ols.get_influence()
cooks_d, _ = influence.cooks_distance

cooks_threshold = 4 / len(X_train)
influential_points = np.where(cooks_d > cooks_threshold)[0]
print("Number of influential points detected:", len(influential_points))


X_train_filtered = X_train.drop(X_train.index[influential_points])
y_train_filtered = y_train.drop(y_train.index[influential_points])


ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_filtered, y_train_filtered)

lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_filtered, y_train_filtered)

y_pred_ridge = ridge_model.predict(X_test)
y_pred_lasso = lasso_model.predict(X_test)

r2_ridge = r2_score(y_test, y_pred_ridge)
r2_lasso = r2_score(y_test, y_pred_lasso)

score_ridge = 10 * r2_ridge
score_lasso = 10 * r2_lasso

print("Final VIF Table After Removal:\n", calculate_vif(X_train_filtered))
print("R² Score (Ridge):", r2_ridge)
print("Final Score (Ridge):", score_ridge)
print("R² Score (Lasso):", r2_lasso)
print("Final Score (Lasso):", score_lasso)


Features with high VIF (multicollinearity): ['x1', 'x2', 'x4', 'x5', 'x9', 'x10', 'x11']
VIF Table After Feature Removal:
   Feature       VIF
0      x3  1.005677
1      x6  1.038902
2      x7  1.347616
3      x8  1.356499
Number of influential points detected: 59
Final VIF Table After Removal:
   Feature       VIF
0      x3  1.000699
1      x6  1.053353
2      x7  1.321973
3      x8  1.336733
R² Score (Ridge): 0.07066266989939474
Final Score (Ridge): 0.7066266989939474
R² Score (Lasso): 0.07080312322632731
Final Score (Lasso): 0.7080312322632731
