#### Logging Module

In [56]:
import logging
import os
import sys
from datetime import datetime

# Timestamped log file name (month_day_year_hour_minute_second).
LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# Directory that holds the log files. It must not contain LOG_FILE itself,
# otherwise makedirs creates a *directory* named "<timestamp>.log" and the
# real log file ends up nested inside it.
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)

LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

# Root logger writes INFO and above to the timestamped file.
logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


#### Exception Module

In [57]:
def error_message_details(error, error_detail:sys):
    """Build an error message that names the script and line of the active exception.

    Args:
        error: Exception or message to report; rendered with ``str()``.
        error_detail: Object exposing ``exc_info()`` (callers pass the ``sys`` module).

    Returns:
        str: Message containing the script name, line number and error text.
        When no exception is currently being handled, the script name and line
        number are reported as ``<unknown>``.
    """
    _, _, exc_tb = error_detail.exc_info()

    if exc_tb is None:
        # exc_info() yields (None, None, None) outside an ``except`` block;
        # dereferencing the traceback would raise AttributeError.
        file_name = "<unknown>"
        line_number = "<unknown>"
    else:
        file_name = exc_tb.tb_frame.f_code.co_filename
        line_number = exc_tb.tb_lineno

    error_message = "Error Occured in Python Script name [{0}] line number [{1}] error message [{2}]".format(
        file_name, line_number, str(error)
    )

    return error_message

class CustomException(Exception):
    """Exception whose string form is the detailed message built by ``error_message_details``."""

    def __init__(self, error_message, error_detail:sys):
        # The base class keeps the raw message in ``args``.
        super().__init__(error_message)
        # The enriched text is what ``str(exc)`` returns.
        detailed_text = error_message_details(error_message, error_detail=error_detail)
        self.error_message = detailed_text

    def __str__(self):
        """Return the detailed message computed at construction time."""
        return self.error_message

#### Utils Module

In [58]:
import os
import sys

import numpy as np 
import pandas as pd
import dill
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

def save_object(file_path,obj):
    """Serialize ``obj`` to ``file_path`` with dill, creating the parent directory if needed.

    Args:
        file_path: Destination file path.
        obj: Object to serialize.

    Raises:
        CustomException: Wraps any error raised while creating the directory
            or writing the file.
    """
    try:
        dir_path = os.path.dirname(file_path)

        # dirname() is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        with open(file_path, "wb") as file_obj:
            dill.dump(obj, file_obj)

    except Exception as e:
        raise CustomException(e, sys)
    
def evaluate_models(X_train,y_train,X_test,y_test,models,param):
    """Tune, refit and score every model, returning the test R2 score per model.

    For each entry of ``models`` a 3-fold ``GridSearchCV`` is run over
    ``param[model_name]``; the best parameters are then set on the original
    estimator, which is refit on the full training data.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        models: Mapping of model name to estimator instance. The estimators
            are modified in place (parameters set and fitted).
        param: Mapping of model name to the parameter grid for that model.

    Returns:
        dict: Model name to R2 score on the test data.

    Raises:
        CustomException: Wraps any error raised during search, fit or scoring.
    """
    try:
        report = {}
        # Iterate over the mapping directly instead of rebuilding
        # list(models.keys()) / list(models.values()) on every access.
        for model_name, model in models.items():
            para = param[model_name]

            gs = GridSearchCV(model, para, cv=3)
            gs.fit(X_train, y_train)

            # Refit the caller's own estimator (not gs.best_estimator_) so the
            # objects stored in `models` are the fitted ones.
            model.set_params(**gs.best_params_)
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            train_model_score = r2_score(y_train, y_train_pred)
            test_model_score = r2_score(y_test, y_test_pred)

            logging.info(f"{model_name},Train model score :{train_model_score}, Test model score: {test_model_score}")

            report[model_name] = test_model_score

        return report
    except Exception as e:
        raise CustomException(e,sys)
def load_object(file_path):
    """Read and return the dill-serialized object stored at ``file_path``.

    Any failure is re-raised as ``CustomException``.
    """
    # NOTE(review): dill.load can execute arbitrary code from the file;
    # only load artifacts from a trusted source.
    try:
        with open(file_path, "rb") as source:
            restored = dill.load(source)
        return restored
    except Exception as e:
        raise CustomException(e, sys)

#### Import Data and Split

In [59]:
import os
import sys
import pandas as pd 

from sklearn.model_selection import train_test_split
from dataclasses import dataclass  # to directly define class variable without __init__ using decorator @dataclass

@dataclass
class DataIngestionConfig:
    """File locations written by the data-ingestion step.

    Declared as a dataclass so the defaults below can be overridden per
    instance, e.g. ``DataIngestionConfig(raw_data_path=...)``.
    """
    # if only defining variables then its okay to use dataclass, if have methods prefer init approach
    train_data_path: str = os.path.join('artifacts', 'train.csv')
    test_data_path: str = os.path.join('artifacts', 'test.csv')
    raw_data_path: str = os.path.join('artifacts', 'data.csv')


ingestion_config = DataIngestionConfig()

In [60]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../Notebook/data/stud.csv')
logging.info('Read the dataset as dataframe')
# Last expression of the cell: displays the first rows as the cell output.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [61]:
# Create the output directory up front; train, test and raw paths all share
# the same 'artifacts' parent, so the train path's directory covers all three.
os.makedirs(os.path.dirname(ingestion_config.train_data_path),exist_ok=True)

In [62]:
# Persist an unmodified copy of the loaded data before splitting.
df.to_csv(ingestion_config.raw_data_path,index=False, header=True)

In [63]:
logging.info("Train test split initiated")
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train_set,test_set = train_test_split(df,test_size=0.2,random_state=42)

In [64]:
# Resolve the output locations once, then write each split to its file.
train_path = ingestion_config.train_data_path
test_path = ingestion_config.test_data_path

train_set.to_csv(train_path, index=False, header=True)
test_set.to_csv(test_path, index=False, header=True)

logging.info("Ingestion of data is completed")

#### Transform and Split Data

In [65]:
class DataTransformationConfig:
    """Configuration for the data-transformation step."""
    # File the preprocessing object is saved to.
    preprocessor_obj_file_path = os.path.join('artifacts', 'preprocessor.pkl')

In [66]:
# Config instance read later when the preprocessing object is saved.
data_transformation_config = DataTransformationConfig()

In [67]:
from sklearn.compose import ColumnTransformer  # create pipeline for ohc or standardscaling, if want to use in form of pipeline
from sklearn.impute import SimpleImputer # for missing data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def get_data_transformer_object():
    '''
    Build the (unfitted) preprocessing object used for data transformation.

    Numerical columns: median imputation followed by standard scaling.
    Categorical columns: most-frequent imputation, one-hot encoding, then
    scaling without centering (with_mean=False).

    Returns:
        ColumnTransformer applying the numerical pipeline to the numerical
        columns and the categorical pipeline to the categorical columns.

    Raises:
        CustomException: wraps any error raised while building the pipelines.
    '''
    try:
        numerical_columns = ["writing_score", "reading_score"]
        categorical_columns = [
            "gender",
            "race_ethnicity",
            "parental_level_of_education",
            "lunch",
            "test_preparation_course",
        ]

        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),  # replacing with mode
                # handle_unknown="ignore": a category that was not seen during
                # fit (e.g. present only in the test split) is encoded as all
                # zeros instead of raising an error at transform time.
                ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                ("scaler", StandardScaler(with_mean=False)),
            ]
        )

        logging.info(f"Categorical Columns: {categorical_columns}")
        logging.info(f"Numerical Columns: {numerical_columns}")

        preprocessor = ColumnTransformer(
            [
                ("num_pipeline", num_pipeline, numerical_columns),
                ("cat_pipeline", cat_pipeline, categorical_columns),
            ]
        )
        return preprocessor

    except Exception as e:
        raise CustomException(e,sys)

In [68]:
# Reload the splits from the CSV files written by the ingestion step.
train_df=pd.read_csv(train_path)
test_df=pd.read_csv(test_path)

logging.info("Read train and test data completed")
logging.info("Obtaining preprocessing object")

In [69]:
# Unfitted preprocessing object; it is fitted in a later cell.
preprocessing_obj = get_data_transformer_object()

In [70]:
target_column_name = "math_score"
numerical_columns = ["writing_score", "reading_score"]

# Training split: target column on one side, every other column on the other.
target_feature_train_df = train_df[target_column_name]
input_feature_train_df = train_df.drop(columns=[target_column_name])

# Test split: same separation.
target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df.drop(columns=[target_column_name])

logging.info("Applying preprocesing object on training and testing dataframe")

In [71]:
# Fit the preprocessing object on the training features only, then apply the
# already-fitted object to the test features.
input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

In [76]:
# Append the target as the last column of each transformed feature array.
train_array = np.c_[
    input_feature_train_arr, np.array(target_feature_train_df)
]

test_array = np.c_[
    input_feature_test_arr, np.array(target_feature_test_df)
]

save_object(
    file_path=data_transformation_config.preprocessor_obj_file_path,
    obj=preprocessing_obj,
)

# Log only after the save has succeeded, so the log never claims a save that failed.
logging.info("Saved preprocessing object")

#### Model Training

In [77]:
from catboost import CatBoostRegressor
from sklearn.ensemble import (AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [78]:
class ModelTrainerConfig:
    """Configuration for the model-training step."""
    # File the selected model is saved to.
    trained_model_file_path = os.path.join("artifacts", "model.pkl")

In [88]:
# Config instance read when the best model is saved.
model_trainer_config = ModelTrainerConfig()

In [79]:
logging.info("Split training and test input data")

# The last column of each array is the target; everything before it is features.
X_train, y_train = train_array[:, :-1], train_array[:, -1]
X_test, y_test = test_array[:, :-1], test_array[:, -1]

# Candidate regressors, keyed by the display name used in the report and the params grid.
models = {
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Linear Regression": LinearRegression(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}

In [80]:
# Best way is to use additional config file, yaml file and from that can read hyperparameters
# Hyperparameter grids, keyed by the same names as `models`. An empty dict
# means no parameters are searched for that model. Commented-out entries are
# candidate options that are currently not searched.
params={
    "Decision Tree": {
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'splitter':['best','random'],
        # 'max_features':['sqrt','log2'],
    },
    "Random Forest":{
        # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        
        # 'max_features':['sqrt','log2',None],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Gradient Boosting":{
        # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate':[.1,.01,.05,.001],
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        # 'criterion':['squared_error', 'friedman_mse'],
        # 'max_features':['auto','sqrt','log2'],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Linear Regression":{},
    "XGBRegressor":{
        'learning_rate':[.1,.01,.05,.001],
        'n_estimators': [8,16,32,64,128,256]
    },
    "CatBoosting Regressor":{
        'depth': [6,8,10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor":{
        # NOTE(review): 0.5 here, while the other learning_rate grids use .05 - confirm this is intended.
        'learning_rate':[.1,.01,0.5,.001],
        # 'loss':['linear','square','exponential'],
        'n_estimators': [8,16,32,64,128,256]
    }
    
}        

In [81]:
# Tune, refit and score every candidate; the result maps model name -> test R2 score.
# The estimators in `models` are fitted in place by this call.
model_report:dict=evaluate_models(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,
                                             models=models,param=params)

In [82]:
# Highest test score in the report (max() needs no prior sorting).
best_model_score = max(model_report.values())

TypeError: can only concatenate str (not "numpy.float64") to str

In [83]:
# str() is required: the score is a numpy float, which cannot be concatenated to a str directly.
print("Best score: " + str(best_model_score))

Best score: 0.8795158595242263


In [84]:
# Name of the model with the highest test score. On a tie the first such entry
# in the report wins, the same result the previous index-based lookup gave.
best_model_name = max(model_report, key=model_report.get)

best_model = models[best_model_name]

print(best_model_name)

Linear Regression


In [85]:
if best_model_score<0.6:
    # CustomException requires the sys module as its second argument and reads
    # the active traceback from it, so it has to be raised while an exception
    # is being handled.
    try:
        raise ValueError("No best model found")
    except ValueError as e:
        raise CustomException(e, sys)
logging.info("Best found model on training and testing dataset")

In [89]:
# Persist the selected (already fitted) model to the configured path.
save_object(
    file_path=model_trainer_config.trained_model_file_path,
    obj=best_model
)

In [90]:
# Predictions of the selected model on the held-out test features.
predicted = best_model.predict(X_test)

In [91]:
# R2 of the selected model on the test data; printed as the final result.
r2_square = r2_score(y_test,predicted)
print(r2_square)

0.8795158595242263
