<a href="https://colab.research.google.com/github/rithima17/xai_lab_2471/blob/main/labA1(XAI).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=4, suppress=True)

def fit_linear_closed_form(X, y):
    """Fit ordinary least-squares linear regression with an intercept.

    A column of ones is prepended to ``X`` and the resulting least-squares
    problem is solved directly on that design matrix.

    Args:
        X: Array-like of shape (n_samples, n_features). A 1-D input is
            treated as a single feature column.
        y: Array-like of targets with n_samples entries.

    Returns:
        Tuple ``(intercept, coefs)`` where ``intercept`` is a Python float
        and ``coefs`` is a float ndarray holding one weight per feature.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    n = X.shape[0]
    X_design = np.hstack([np.ones((n, 1)), X])
    # Solve on the design matrix itself rather than forming X^T X: the normal
    # equations square the condition number, which loses precision (or the
    # intercept entirely) when features are large or un-centred. lstsq is
    # SVD-based and still returns the minimum-norm solution when the design
    # is rank-deficient, matching what the pseudo-inverse form computes.
    theta, *_ = np.linalg.lstsq(X_design, y, rcond=None)
    intercept = float(theta[0])
    coefs = np.array(theta[1:]).astype(float)
    return intercept, coefs

def explain_linear_with_shap_like(X, y, feature_names=None):
    """Fit a linear model and break each prediction into per-feature terms.

    The model is fitted with ``fit_linear_closed_form``. The contribution of
    feature ``j`` to row ``i`` is ``coef_j * (x_ij - mean(x_j))``, measured
    against the baseline ``mean(y)``. Whatever part of the prediction is not
    covered by ``baseline + sum(contributions)`` is reported separately; for
    a least-squares fit with an intercept it is zero up to floating-point
    error.

    Args:
        X: Array-like of shape (n_samples, n_features). A 1-D input is
            treated as a single feature column.
        y: Array-like of targets with n_samples entries.
        feature_names: Optional sequence with one name per feature column.
            Defaults to ``x1, x2, ...``.

    Returns:
        Tuple ``(df, meta)``.

        ``df`` is a DataFrame rounded to 4 decimals containing the feature
        columns, one ``SHAP_<name>`` column per feature,
        ``Residual_or_Intercept_Adjust``, ``Predicted``, ``Actual``,
        ``Baseline(mean_y)`` and ``Check_sum`` (baseline + contributions +
        residual, which equals ``Predicted`` by construction).

        ``meta`` is a dict with ``intercept``, ``coefs`` (name -> weight,
        rounded to 6 decimals), ``baseline_mean_y`` and ``mean_x``
        (name -> column mean, rounded to 6 decimals).

    Raises:
        ValueError: If ``feature_names`` does not have one entry per column.
    """
    # Work on plain ndarrays: a DataFrame X cannot be sliced with
    # ``contribs[:, i]``, and a Series y would be aligned by index label
    # (not by position) when assigned to the "Actual" column below.
    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    y = np.asarray(y).ravel()
    m = X.shape[1]

    if feature_names is None:
        feature_names = [f"x{i+1}" for i in range(m)]
    else:
        feature_names = list(feature_names)
        if len(feature_names) != m:
            raise ValueError(
                f"feature_names has {len(feature_names)} entries but X has {m} columns"
            )

    intercept, coefs = fit_linear_closed_form(X, y)
    y_pred = intercept + X.dot(coefs)
    baseline = float(np.mean(y))
    mean_x = np.mean(X, axis=0)
    contribs = (X - mean_x) * coefs  # SHAP-like contributions
    contribs_sum = np.sum(contribs, axis=1)
    # Gap between the prediction and baseline + contributions.
    residual = y_pred - (baseline + contribs_sum)

    df = pd.DataFrame(X, columns=feature_names)
    for i, fname in enumerate(feature_names):
        df[f"SHAP_{fname}"] = contribs[:, i]
    df["Residual_or_Intercept_Adjust"] = residual
    df["Predicted"] = y_pred
    df["Actual"] = y
    df["Baseline(mean_y)"] = baseline
    df["Check_sum"] = baseline + contribs_sum + residual
    return df.round(4), {
        "intercept": intercept,
        "coefs": dict(zip(feature_names, coefs.round(6))),
        "baseline_mean_y": baseline,
        "mean_x": dict(zip(feature_names, mean_x.round(6)))
    }

# ---------------- Q1 ----------------
# Single-feature fit: one "Emails" column against five target values.
print("\nQ1: LearnNow – Simple Linear Regression")
x_q1 = np.array([[1], [3], [2], [1], [3]], dtype=float)
y_q1 = np.array([80, 120, 95, 85, 130], dtype=float)
df_q1, meta_q1 = explain_linear_with_shap_like(
    x_q1, y_q1, feature_names=["Emails"]
)
# Report the fitted parameters, then the per-row breakdown table.
print(f"Intercept: {meta_q1['intercept']}")
print(f"Coefficient: {meta_q1['coefs']['Emails']}")
print(f"Baseline: {meta_q1['baseline_mean_y']}")
print(df_q1)

# ---------------- Q2 ----------------
# Two-feature fit: columns are "AdSpend" and "Discount", five rows.
print("\nQ2: ShopEase – Multiple Linear Regression")
X_q2 = np.array(
    [
        [200, 10],
        [300, 15],
        [250, 5],
        [150, 10],
        [100, 0],
    ],
    dtype=float,
)
y_q2 = np.array([1500, 2000, 1700, 1400, 1000], dtype=float)
df_q2, meta_q2 = explain_linear_with_shap_like(
    X_q2, y_q2, feature_names=["AdSpend", "Discount"]
)
# Report the fitted parameters, then the per-row breakdown table.
print(f"Intercept: {meta_q2['intercept']}")
print(f"Coefs: {meta_q2['coefs']}")
print(f"Baseline: {meta_q2['baseline_mean_y']}")
print(df_q2)

# ---------------- Q3 ----------------
# Fit on an 80% training split of the scikit-learn diabetes data, then apply
# the same per-feature breakdown to the held-out 20%.
print("\nQ3: Diabetes Dataset – Multiple Linear Regression")
diab = datasets.load_diabetes()
X = diab.data
y = diab.target
feature_names = diab.feature_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df_q3_train, meta_q3 = explain_linear_with_shap_like(X_train, y_train, feature_names)
print("Intercept:", meta_q3["intercept"])
print("Baseline:", meta_q3["baseline_mean_y"])
print("Coefs:", meta_q3["coefs"])
# Example explanation for the first few test samples.
# meta_q3["coefs"] and meta_q3["mean_x"] are rounded to 6 decimals for
# display, so rebuilding the model from them would inject rounding error into
# the predictions and the residual column. Use full-precision parameters.
intercept_q3, coefs_q3 = fit_linear_closed_form(X_train, y_train)
mean_x_train = X_train.mean(axis=0)
y_test_pred = intercept_q3 + X_test.dot(coefs_q3)
# Contributions are measured against the TRAINING means and baseline.
contribs_test = (X_test - mean_x_train) * coefs_q3
residual_test = y_test_pred - (meta_q3["baseline_mean_y"] + np.sum(contribs_test, axis=1))
df_test_q3 = pd.DataFrame(X_test, columns=feature_names)
for i, fn in enumerate(feature_names):
    df_test_q3[f"SHAP_{fn}"] = contribs_test[:, i]
df_test_q3["Residual_Adjust"] = residual_test
df_test_q3["Predicted"] = y_test_pred
df_test_q3["Actual"] = y_test
df_test_q3["Baseline"] = meta_q3["baseline_mean_y"]
print(df_test_q3.head())

# ---------------- Q4 ----------------
# Builds a small synthetic dataset from a known linear formula plus noise,
# then fits and explains it with the same helper used above.
print("\nQ4: Student Performance – Synthetic Dataset Example")
# Seed the global NumPy RNG so the draws below are reproducible. The order of
# the following random calls matters: reordering them changes every value.
np.random.seed(0)
n = 12
# Integer features; randint's upper bound is exclusive.
study_time = np.random.randint(1,6,size=n)
failures = np.random.randint(0,3,size=n)
health = np.random.randint(1,6,size=n)
absences = np.random.randint(0,11,size=n)
parent_edu = np.random.randint(0,3,size=n)
# Stack the five feature vectors as columns -> shape (n, 5), cast to float.
X_q4 = np.vstack([study_time, failures, health, absences, parent_edu]).T.astype(float)
# Target: intercept 50 and fixed per-feature weights, plus Gaussian noise
# with mean 0 and standard deviation 4.
y_q4 = 50 + 5*study_time - 8*failures + 2*health - 0.7*absences + 3*parent_edu + np.random.normal(0,4,size=n)
# Names listed in the same order as the columns of X_q4.
feature_names_q4 = ["study_time","failures","health","absences","parent_edu"]
df_q4, meta_q4 = explain_linear_with_shap_like(X_q4, y_q4, feature_names_q4)
print("Intercept:", meta_q4["intercept"])
print("Coefs:", meta_q4["coefs"])
print("Baseline:", meta_q4["baseline_mean_y"])
print(df_q4)


Q1: LearnNow – Simple Linear Regression
Intercept: 59.499999999999986
Coefficient: 21.25
Baseline: 102.0
   Emails  SHAP_Emails  Residual_or_Intercept_Adjust  Predicted  Actual  \
0     1.0       -21.25                          -0.0      80.75    80.0   
1     3.0        21.25                          -0.0     123.25   120.0   
2     2.0         0.00                          -0.0     102.00    95.0   
3     1.0       -21.25                          -0.0      80.75    85.0   
4     3.0        21.25                          -0.0     123.25   130.0   

   Baseline(mean_y)  Check_sum  
0             102.0      80.75  
1             102.0     123.25  
2             102.0     102.00  
3             102.0      80.75  
4             102.0     123.25  

Q2: ShopEase – Multiple Linear Regression
Intercept: 628.1481481487334
Coefs: {'AdSpend': np.float64(3.896296), 'Discount': np.float64(14.074074)}
Baseline: 1520.0
   AdSpend  Discount  SHAP_AdSpend  SHAP_Discount  \
0    200.0      10.0       