In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import numpy as np
import pandas as pd

# Load the raw dataset; `df` itself is never modified below.
df = pd.read_csv("dataset.csv")

# Work on a copy so the encoded values do not overwrite the raw frame.
data = df.copy()

# Object-dtype columns are the ones treated as categorical.
categorical_cols = data.select_dtypes(include=["object"]).columns

# One LabelEncoder per categorical column, keyed by column name. The fitted
# encoders are kept so the same string-to-integer mapping can be reused later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col])

# Separate the target column from the predictor columns.
y = data["Exam_Score"]
X = data.drop(columns="Exam_Score")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied, unchanged, to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, keyed by the name used to label their scores.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": xgb.XGBRegressor(random_state=42, objective="reg:squarederror"),
}

# Fit each candidate on the scaled training data, predict on the scaled test
# data, and record RMSE / MAE / R2 against the true test targets.
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    results[name] = dict(
        RMSE=np.sqrt(mean_squared_error(y_test, y_pred)),
        MAE=mean_absolute_error(y_test, y_pred),
        R2=r2_score(y_test, y_pred),
    )

results


{'Linear Regression': {'RMSE': 2.097948359816075,
  'MAE': 1.0156733842910806,
  'R2': 0.688619401927732},
 'Random Forest': {'RMSE': 2.201324350388354,
  'MAE': 1.1229500756429653,
  'R2': 0.6571769331321213},
 'XGBoost': {'RMSE': 2.1797948817576818,
  'MAE': 0.9731103028544139,
  'R2': 0.6638498902320862}}

In [4]:
# Restricting to only Linear Regression and XGBoost
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": xgb.XGBRegressor(random_state=42, objective="reg:squarederror"),
}

# Refit the remaining candidates on the scaled training data and score their
# predictions on the scaled test data (RMSE / MAE / R2).
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    results[name] = dict(
        RMSE=np.sqrt(mean_squared_error(y_test, y_pred)),
        MAE=mean_absolute_error(y_test, y_pred),
        R2=r2_score(y_test, y_pred),
    )

results

{'Linear Regression': {'RMSE': 2.097948359816075,
  'MAE': 1.0156733842910806,
  'R2': 0.688619401927732},
 'XGBoost': {'RMSE': 2.1797948817576818,
  'MAE': 0.9731103028544139,
  'R2': 0.6638498902320862}}

In [7]:
import joblib

# Persist the fitted artifacts to the working directory.
# Only the Linear Regression estimator goes into linear_regression_model.pkl:
# dumping the whole `models` dict would make the file load back as a dict of
# several estimators instead of a single model with .predict().
joblib.dump(models["Linear Regression"], "linear_regression_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']

In [9]:
import joblib

# Persist the Linear Regression model together with the fitted scaler and the
# per-column label encoders (the encoders are saved for later use in the backend).
joblib.dump(value=models["Linear Regression"], filename="linear_regression_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=label_encoders, filename="label_encoders.pkl")

"linear_regression_model.pkl, scaler.pkl, label_encoders.pkl saved successfully."

'linear_regression_model.pkl, scaler.pkl, label_encoders.pkl saved successfully.'