In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [3]:
df=pd.read_csv("stud.csv")

In [4]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
x=df.drop(columns=['math_score'])
y=df['math_score']

In [7]:
num_features=x.select_dtypes(exclude='object').columns
cat_features=x.select_dtypes(include='object').columns

In [8]:
num_features

Index(['reading_score', 'writing_score'], dtype='object')

In [9]:
cat_features

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')

In [10]:
x.head(3)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93


In [11]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
num_transform=StandardScaler()
oh_transfomer=OneHotEncoder()
preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",oh_transfomer,cat_features),
        ('StandardScaler',num_transform,num_features)
    ]
)

In [12]:
x=preprocessor.fit_transform(x)

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape

((800, 19), (200, 19))

In [14]:
x_train

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.05694554,  0.45733301],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.94779033,  0.98406266],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.35894946,  1.18158627],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.49126664, -0.99117351],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -1.45063795, -0.99117351],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.4960025 ,  1.37910989]], shape=(800, 19))

evaluation part


In [15]:
from sklearn.metrics import mean_absolute_error


def evaluate_model(true,predict):
    mae=mean_absolute_error(true,predict)
    mse=mean_squared_error(true,predict)
    rmse=np.sqrt(mean_squared_error(true,predict))
    r2score=r2_score(true,predict)
    return mae,mse,rmse,r2score

In [16]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor

In [17]:
models={
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K-Neighbours Regression":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False),
    "AdaBoostRegressor":AdaBoostRegressor()
}

model_list=[]
r2_list=[]
for model_name, model in models.items():
    model.fit(x_train,y_train)
    
    y_pred=model.predict(x_test)
    
    model_test_mae,model_test_mse,model_train_rmse,model_r2=evaluate_model(y_test,y_pred)
    print(f"Model: {model_name}")
    print(f"MAE: {model_test_mae:.2f}")
    print(f"MSE: {model_test_mse:.2f}")
    print(f"RMSE: {model_train_rmse:.2f}")
    print(f"R2 Score: {model_r2:.2f}")
    print('..............................')
    model_list.append(model)
    r2_list.append(model_r2)
    print('...................................')


Model: LinearRegression
MAE: 4.21
MSE: 29.10
RMSE: 5.39
R2 Score: 0.88
..............................
...................................
Model: Lasso
MAE: 5.16
MSE: 42.51
RMSE: 6.52
R2 Score: 0.83
..............................
...................................
Model: Ridge
MAE: 4.21
MSE: 29.06
RMSE: 5.39
R2 Score: 0.88
..............................
...................................
Model: K-Neighbours Regression
MAE: 5.64
MSE: 52.68
RMSE: 7.26
R2 Score: 0.78
..............................
...................................
Model: DecisionTreeRegressor
MAE: 6.09
MSE: 58.62
RMSE: 7.66
R2 Score: 0.76
..............................
...................................
Model: K-Neighbours Regression
MAE: 5.64
MSE: 52.68
RMSE: 7.26
R2 Score: 0.78
..............................
...................................
Model: DecisionTreeRegressor
MAE: 6.09
MSE: 58.62
RMSE: 7.66
R2 Score: 0.76
..............................
...................................
Model: RandomForestRegressor
MAE

In [18]:
# Build a results DataFrame from the trained models and their R2 scores
import pandas as pd

# Convert model objects to their class names for readability
model_names = [m.__class__.__name__ for m in model_list]

results_df = pd.DataFrame({
    'model': model_names,
    'r2': r2_list
})

# Ensure r2 column is float and round to 2 decimals (e.g., 0.85)
results_df['r2'] = results_df['r2'].astype(float).round(2)

# Sort by R2 descending and reset index
results_df = results_df.sort_values(by='r2', ascending=False).reset_index(drop=True)

# Display the DataFrame
results_df

Unnamed: 0,model,r2
0,LinearRegression,0.88
1,Ridge,0.88
2,RandomForestRegressor,0.85
3,CatBoostRegressor,0.85
4,AdaBoostRegressor,0.84
5,Lasso,0.83
6,XGBRegressor,0.83
7,KNeighborsRegressor,0.78
8,DecisionTreeRegressor,0.76


In [19]:
model_list

[LinearRegression(),
 Lasso(),
 Ridge(),
 KNeighborsRegressor(),
 DecisionTreeRegressor(),
 RandomForestRegressor(),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...),
 <catboost.core.CatBoostRegressor at 0x2160f8f2da0>,
 AdaBoostRegressor()]

In [None]:
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

from src.exception import CustomException
from src.logger import logging
import os

from src.utils import save_object

@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path=os.path.join('artifacts',"proprocessor.pkl")

class DataTransformation:
    def __init__(self):
        self.data_transformation_config=DataTransformationConfig()

    def get_data_transformer_object(self):
        '''
        This function si responsible for data trnasformation
        
        '''
        try:
            numerical_columns = ["writing_score", "reading_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            num_pipeline= Pipeline(
                steps=[
                ("imputer",SimpleImputer(strategy="median")),
                ("scaler",StandardScaler())

                ]
            )

            cat_pipeline=Pipeline(

                steps=[
                ("imputer",SimpleImputer(strategy="most_frequent")),
                ("one_hot_encoder",OneHotEncoder()),
                ("scaler",StandardScaler(with_mean=False))
                ]

            )

            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            preprocessor=ColumnTransformer(
                [
                ("num_pipeline",num_pipeline,numerical_columns),
                ("cat_pipelines",cat_pipeline,categorical_columns)

                ]


            )

            return preprocessor
        
        except Exception as e:
            raise CustomException(e,sys)
        
    def initiate_data_transformation(self,train_path,test_path):

        try:
            train_df=pd.read_csv(train_path)
            test_df=pd.read_csv(test_path)

            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")

            preprocessing_obj=self.get_data_transformer_object()

            target_column_name="math_score"
            numerical_columns = ["writing_score", "reading_score"]

            input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
            target_feature_train_df=train_df[target_column_name]

            input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
            target_feature_test_df=test_df[target_column_name]

            logging.info(
                f"Applying preprocessing object on training dataframe and testing dataframe."
            )

            input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            logging.info(f"Saved preprocessing object.")

            save_object(

                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj

            )

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e,sys)

Unnamed: 0,model,r2
0,LinearRegression,0.88
1,Ridge,0.88
2,RandomForestRegressor,0.85
3,CatBoostRegressor,0.85
4,AdaBoostRegressor,0.84
5,Lasso,0.83
6,XGBRegressor,0.83
7,KNeighborsRegressor,0.78
8,DecisionTreeRegressor,0.76


Saved results: artifacts\models\results.csv artifacts\models\results_df.pkl
Saved models dict with joblib: artifacts\models\all_models.joblib
Individual model files saved: ['artifacts\\models\\LinearRegression.pkl', 'artifacts\\models\\Lasso.pkl', 'artifacts\\models\\Ridge.pkl', 'artifacts\\models\\K-Neighbours_Regression.pkl', 'artifacts\\models\\DecisionTreeRegressor.pkl', 'artifacts\\models\\RandomForestRegressor.pkl', 'artifacts\\models\\XGBRegressor.pkl', 'artifacts\\models\\CatBoostRegressor.pkl', 'artifacts\\models\\AdaBoostRegressor.pkl']

Directory listing for artifacts\models
- AdaBoostRegressor.pkl
- CatBoostRegressor.pkl
- DecisionTreeRegressor.pkl
- K-Neighbours_Regression.pkl
- Lasso.pkl
- LinearRegression.pkl
- RandomForestRegressor.pkl
- Ridge.pkl
- XGBRegressor.pkl
- all_models.joblib
- results.csv
- results_df.pkl

Attempting to load: artifacts\models\LinearRegression.pkl
Loaded OK (pickle). Type: <class 'sklearn.linear_model._base.LinearRegression'>

All save attempt

linear_regression is working well here