In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so files stored there can be read
# through ordinary filesystem paths.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime; confirm before running this notebook elsewhere.
drive.mount('/content/drive')
import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be shown either.
warnings.filterwarnings(action="ignore")

# Figure defaults applied to every plot in the notebook: 8x6 size, grid on.
plt.rcParams.update({"figure.figsize": (8, 6), "axes.grid": True})

In [None]:
# Location of the dataset on the mounted Google Drive.
CSV_PATH = "/content/drive/MyDrive/ML/student_marks.csv"

df = pd.read_csv(CSV_PATH)

print("First 5 rows:")
display(df.head())

print("\nInfo:")
# DataFrame.info() prints its report itself and returns None, so it must be
# called directly; wrapping it in display() renders a stray "None".
df.info()

print("\nDescribe:")
display(df.describe())

# Missing values per column, computed once and reused by every view below.
# It is displayed explicitly because a bare expression in the middle of a
# cell produces no output.
null_counts = df.isnull().sum()
print("\nMissing values per column:")
display(null_counts)

null_counts.plot(kind="bar", title="Missing values per column")
plt.show()

# A pie chart has no drawable wedges when every count is zero, so it is only
# produced when at least one value is actually missing.
if null_counts.sum() > 0:
    null_counts.plot(kind="pie", title="Share of missing values by column")
    plt.show()

null_counts.plot(kind="hist", title="Distribution of per-column missing counts")
plt.show()

In [None]:
# Rows without a target cannot be used for supervised training or for
# scoring: filling "student_marks" with the median would invent labels that
# the model is then both trained on and evaluated against. Drop them instead.
# reset_index gives the remaining rows a clean 0..n-1 index.
df = df.dropna(subset=["student_marks"]).reset_index(drop=True)

# Fill gaps in the single feature with its mean.
# NOTE(review): the mean is computed over all remaining rows, i.e. before the
# train/test split further down, so test rows influence the fill value.
# Moving imputation into the modelling pipeline would avoid that.
df["study_hours"] = df["study_hours"].fillna(df["study_hours"].mean())


def iqr_cap(series, k=1.5):
    """Clip a Series to its interquartile-range fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``series`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap. The input is not modified.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.Series
        A new Series in which values outside the fences are replaced by the
        nearest fence.
    """
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    spread = upper_quartile - lower_quartile
    return series.clip(
        lower=lower_quartile - k * spread,
        upper=upper_quartile + k * spread,
    )

# Switch for outlier capping; when False both columns are left as they are.
APPLY_IQR_CAPPING = True
if APPLY_IQR_CAPPING:
    # Clip each column to its own IQR fences (k=1.5).
    # NOTE(review): this runs on the full frame before the train/test split
    # and also alters the target column, so test labels are capped too.
    # Confirm that is intended.
    for column in ("study_hours", "student_marks"):
        df[column] = iqr_cap(df[column], k=1.5)


In [None]:
# Raw relationship between the feature and the target, one point per row.
fig, ax = plt.subplots()
ax.scatter(df["study_hours"], df["student_marks"], alpha=0.7)
ax.set(
    xlabel="Study Hours",
    ylabel="Marks",
    title="Scatter: Study Hours vs Marks",
)
plt.show()

def plot_hist(series, title, bins=20):
    """Draw a histogram of ``series`` on a new figure and show it.

    Parameters
    ----------
    series : pandas.Series
        Values to bin; its ``name`` attribute is used as the x-axis label.
    title : str
        Title placed above the plot.
    bins : int, default 20
        Passed through to the histogram call as its ``bins`` argument.
    """
    _, ax = plt.subplots()
    ax.hist(series, bins=bins, edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel(series.name)
    ax.set_ylabel("Count")
    plt.show()

# One histogram per column, in the order feature then target.
for column, heading in (
    ("study_hours", "Distribution: Study Hours"),
    ("student_marks", "Distribution: Student Marks"),
):
    plot_hist(df[column], heading)

In [None]:
# Feature matrix (double brackets keep it 2-D) and target vector as NumPy arrays.
X = df[["study_hours"]].to_numpy()
y = df["student_marks"].to_numpy()

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTrain size: {X_train.shape[0]} | Test size: {X_test.shape[0]}")

In [None]:
# Two-step model: expand the feature into polynomial terms, then fit a
# linear regression on the expanded terms.
poly_step = ("poly", PolynomialFeatures(include_bias=False))
regression_step = ("lr", LinearRegression())
pipe = Pipeline(steps=[poly_step, regression_step])

# Candidate polynomial degrees; the "poly__" prefix routes the value to the
# pipeline step named "poly".
param_grid = {"poly__degree": [1, 2, 3]}

# 5-fold cross-validated search, scored by negated RMSE (higher is better),
# with n_jobs=-1 for parallel execution.
search_options = {
    "scoring": "neg_root_mean_squared_error",
    "cv": 5,
    "n_jobs": -1,
}
grid = GridSearchCV(pipe, param_grid, **search_options)
grid.fit(X_train, y_train)

# Keep the winning pipeline and its degree for the evaluation and plotting cells.
best_degree = grid.best_params_["poly__degree"]
best_model = grid.best_estimator_

print(f"\nBest polynomial degree (CV): {best_degree}")
print(f"Best CV score (−RMSE): {grid.best_score_:.4f}")


In [None]:
# Predictions for the held-out rows; reused by the plotting cell.
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nTest Metrics:")
# Labels are left-padded to four characters so the colons line up.
for label, value in (("MAE", mae), ("RMSE", rmse), ("R²", r2)):
    print(f"  {label:<4} : {value:.4f}")

In [None]:
# (A) Train+Test scatter + fitted curve
x_all = df[["study_hours"]].values
# 200 evenly spaced inputs across the observed range, shaped as a column
# because the model expects a 2-D feature array.
x_plot = np.linspace(x_all.min(), x_all.max(), num=200)[:, np.newaxis]
y_plot = best_model.predict(x_plot)

fig, ax = plt.subplots()
ax.scatter(X_train, y_train, label="Train", alpha=0.7)
ax.scatter(X_test, y_test, label="Test", alpha=0.7)
ax.plot(x_plot, y_plot, "r-", linewidth=2, label=f"Fitted (degree={best_degree})")
ax.set(
    xlabel="Study Hours",
    ylabel="Marks",
    title="Fitted Regression (Train/Test)",
)
ax.legend()
plt.show()

# (B) Actual vs Predicted (Test)
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.8)

# Endpoints of the y = x reference line, spanning both actual and predicted values.
min_v = min(y_test.min(), y_pred.min())
max_v = max(y_test.max(), y_pred.max())
diagonal = [min_v, max_v]
ax.plot(diagonal, diagonal, "r--", label="Perfect fit")

ax.set(
    xlabel="Actual Marks",
    ylabel="Predicted Marks",
    title="Actual vs Predicted (Test)",
)
ax.legend()
plt.show()

# (C) Residual plot (Test)
# Positive residual = the model under-predicted that row.
residuals = y_test - y_pred

fig, ax = plt.subplots()
ax.scatter(y_pred, residuals, alpha=0.8)
ax.axhline(0, color="red", linestyle="--")  # zero-error reference line
ax.set(
    xlabel="Predicted Marks",
    ylabel="Residuals (Actual - Predicted)",
    title="Residuals vs Predicted (Test)",
)
plt.show()

# (D) Error distribution (Test residuals)
# Reuses the notebook's plot_hist helper. The Series name supplies the x-axis
# label, so the rendered figure is the same as a hand-built histogram.
plot_hist(
    pd.Series(residuals, name="Residual"),
    "Residuals Distribution (Test)",
    bins=20,
)

# (E) Sample prediction table
# X_test is 2-D, so it is flattened to 1-D before becoming a column.
comparison = (
    pd.DataFrame(
        {
            "study_hours": X_test.ravel(),
            "Actual": y_test,
            "Predicted": y_pred,
        }
    )
    .sort_values(by="study_hours")
    .reset_index(drop=True)
)

print("\nSample predictions (sorted by study_hours):")
display(comparison.head(10))