In [None]:
# Understanding steps of data processing and model training in a machine learning pipeline

In [12]:
from xgboost import XGBRegressor

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df=pd.read_csv("/Users/apple/xxxxx/yt_beast/placement.csv")

In [27]:
df.head(15)

Unnamed: 0,cgpa,package
0,6.89,3.26
1,5.12,1.98
2,7.82,3.25
3,7.42,3.67
4,6.94,3.57
5,7.89,2.99
6,6.73,2.6
7,6.75,2.48
8,6.09,2.31
9,8.31,3.51


In [4]:
from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test=train_test_split(df[["cgpa"]],df["package"],test_size=0.2,random_state=42)

In [15]:
models = {

    "ridge": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", Ridge())
        ]),
        "params": {
            "model__alpha": [0.01, 0.1, 1, 10, 100]
        }
    },

    "lasso": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", Lasso())
        ]),
        "params": {
            "model__alpha": [0.001, 0.01, 0.1, 1, 10]
        }
    },

    "elastic": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", ElasticNet())
        ]),
        "params": {
            "model__alpha": [0.01, 0.1, 1, 10],
            "model__l1_ratio": [0.1, 0.5, 0.9]
        }
    },

    "sgd": {
        "pipeline": Pipeline([
            ("scaler", StandardScaler()),
            ("model", SGDRegressor())
        ]),
        "params": {
            "model__alpha": [1e-4, 1e-3, 1e-2],
            "model__penalty": ["l2", "l1"]
        }
    },

    "rf": {
        "pipeline": Pipeline([
            ("model", RandomForestRegressor())
        ]),
        "params": {
            "model__n_estimators": [100, 200],
            "model__max_depth": [None, 5, 10]
        }
    },
    "xgb": {
        "pipeline": Pipeline([
        ("model", XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            n_jobs=-1
        ))
    ]),
    "params": {
        "model__n_estimators": [100, 300],
        "model__max_depth": [3, 5, 7],
        "model__learning_rate": [0.01, 0.1, 0.3],
        "model__subsample": [0.7, 1],
        "model__colsample_bytree": [0.7, 1]
    }
}}

In [19]:
import joblib

results = []
trained_models = {}

for name, config in models.items():

    grid = GridSearchCV(
        config["pipeline"],
        config["params"],
        cv=5,
        scoring="r2",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    trained_models[name] = best_model   # ‚≠ê store model

    y_pred = best_model.predict(X_test)

    results.append({
        "model": name,
        "best_params": grid.best_params_,
        "cv_r2": grid.best_score_,
        "test_r2": r2_score(y_test, y_pred),
        "test_mse": mean_squared_error(y_test, y_pred)
    })

In [20]:
results_df = pd.DataFrame(results)
results_df.sort_values(by="test_r2", ascending=False, inplace=True)

In [21]:
results_df

Unnamed: 0,model,best_params,cv_r2,test_r2,test_mse
2,elastic,"{'model__alpha': 0.01, 'model__l1_ratio': 0.1}",0.766838,0.773888,0.083883
1,lasso,{'model__alpha': 0.001},0.766886,0.773231,0.084127
0,ridge,{'model__alpha': 0.1},0.766884,0.773151,0.084157
3,sgd,"{'model__alpha': 0.001, 'model__penalty': 'l1'}",0.765701,0.76935,0.085567
5,xgb,"{'model__colsample_bytree': 0.7, 'model__learn...",0.724933,0.74851,0.093298
4,rf,"{'model__max_depth': 5, 'model__n_estimators':...",0.705022,0.734656,0.098438


In [22]:
best_model_name = results_df.iloc[0]["model"]
best_model = trained_models[best_model_name]

print("Best model:", best_model_name)

Best model: elastic


In [23]:
joblib.dump(best_model, "best_model.pkl")

['best_model.pkl']

In [24]:
import pandas as pd

new_data = pd.DataFrame({
    "cgpa": [6.5, 7.0, 7.8, 8.2, 9.1]
})

In [25]:
loaded_model = joblib.load("best_model.pkl")
loaded_model.predict(new_data)

array([2.70870955, 2.9928208 , 3.44739881, 3.67468781, 4.18608807])