In [1]:
import numpy as np
import pandas as pd
import warnings

from sklearn.metrics import root_mean_squared_error , r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from src.utils import load_object
from src.utils import save_object
import pickle



In [6]:
df = pd.read_csv("data/student_data.csv")

In [7]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
df = df.rename(columns={'race/ethnicity':"race_ethnicity",'parental level of education':'parental_level_of_education',
                        'test preparation course':'test_preparation_course','math score':'math_score','reading score':'reading_score','writing score':'writing_score'})

In [4]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
df.to_csv("data/student_data.csv",index=False)

In [5]:
X = df.drop(columns='math score',axis=1)
Y = df['math score']

In [7]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,cat_features),
        ("StandardScaler",numeric_transformer,num_features)
    ]
)

In [8]:
X = preprocessor.fit_transform(X)

In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20,random_state=7)

X_train.shape , X_test.shape

((800, 19), (200, 19))

In [10]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    rmse = root_mean_squared_error(true,predicted)
    r2_square = r2_score(true,predicted)
    return mae , rmse , r2_square

In [11]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'K-NeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'CatBoostRegressor': CatBoostRegressor(verbose=False),
    'AdaBoostRegressor': AdaBoostRegressor()
}

model_list = []
r2_list = []

In [None]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,Y_train)

    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(Y_train,Y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(Y_test,Y_test_pred)

    print(list(models.keys())[i])
    model_list.append(lmodels = {
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest": RandomForestRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }ist(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:4f}".format(model_train_rmse))
    print("- Mean Absolute error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print("-------------------------------------------------")

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:4f}".format(model_test_rmse))
    print("- Mean Absolute error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')



LinearRegression
Model performance for Training set
- Root Mean Squared Error: 5.353941
- Mean Absolute error: 4.2625
- R2 Score: 0.8788
-------------------------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.548254
- Mean Absolute error: 4.4925
- R2 Score: 0.8454


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.523650
- Mean Absolute error: 5.1625
- R2 Score: 0.8201
-------------------------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.754414
- Mean Absolute error: 5.2640
- R2 Score: 0.7708


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.292758
- Mean Absolute error: 4.2070
- R2 Score: 0.8816
-------------------------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.478869
- Mean Absolute error: 4.4190
- R2 Score: 0.8492


K-NeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 5.894357
- Mean Absolute

In [14]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model Name','R2_Score']).sort_values(by=['R2_Score'],ascending=False)

Unnamed: 0,Model Name,R2_Score
11,Ridge,0.849202
2,Ridge,0.849202
0,LinearRegression,0.845359
9,LinearRegression,0.845359
7,CatBoostRegressor,0.8154
16,CatBoostRegressor,0.8154
5,RandomForestRegressor,0.792782
14,RandomForestRegressor,0.791055
17,AdaBoostRegressor,0.7883
8,AdaBoostRegressor,0.782345


In [11]:
model = load_object("/home/kubuntu/Desktop/my_pc/Generic_ML_Project/artifacts/model.pkl")

In [12]:
model.fit(X_train,Y_train)

In [13]:
pred = model.predict(X_test)

In [14]:
evaluate_model(Y_test,pred)

(4.4925, 5.548254229935755, 0.8453587984824799)

In [20]:
file_path = "/home/kubuntu/Desktop/my_pc/Generic_ML_Project/notebook/model.pkl"

In [21]:
with open(file_path, "wb") as file_obj:
            pickle.dump(model, file_obj)

In [50]:
model = load_object("/home/kubuntu/Desktop/my_pc/Generic_ML_Project/notebook/model.pkl")

In [38]:
model1 = load_object("/home/kubuntu/Desktop/my_pc/Generic_ML_Project/artifacts/model.pkl")

In [51]:
pred = model.predict(X_test)

In [39]:
pred1 = model1.predict(X_test)

In [None]:
pred1

In [52]:
pred

array([69.  , 85.25, 51.25, 90.  , 77.5 , 43.  , 53.  , 83.75, 64.25,
       75.75, 76.5 , 46.75, 70.5 , 72.  , 88.25, 90.25, 81.75, 56.5 ,
       55.5 , 51.  , 69.5 , 64.5 , 50.75, 61.75, 70.25, 59.75, 73.  ,
       47.5 , 69.  , 53.25, 70.5 , 72.75, 66.  , 65.  , 74.25, 81.  ,
       47.5 , 95.  , 69.75, 76.  , 86.25, 77.25, 76.25, 79.25, 61.  ,
       50.  , 45.25, 51.5 , 85.75, 54.  , 54.25, 84.  , 57.5 , 93.25,
       81.75, 53.  , 89.  , 59.  , 71.25, 83.5 , 64.75, 52.  , 51.25,
       53.75, 84.5 , 54.75, 81.5 , 78.5 , 59.25, 91.75, 51.5 , 83.  ,
       82.  , 92.5 , 62.75, 65.5 , 59.25, 67.25, 54.75, 73.25, 75.75,
       41.  , 42.75, 64.25, 80.25, 69.25, 49.5 , 54.  , 70.75, 67.25,
       53.25, 69.5 , 63.5 , 57.  , 50.75, 53.25, 73.5 , 61.25, 76.75,
       43.5 , 80.5 , 63.25, 60.75, 72.5 , 48.5 , 65.  , 61.5 , 65.5 ,
       67.75, 61.75, 45.25, 79.25, 75.25, 89.5 , 76.5 , 85.  , 82.  ,
       47.5 , 67.75, 67.25, 55.  , 87.5 , 81.  , 90.75, 69.75, 78.25,
       63.5 , 44.75,

In [43]:
Y_test

778    72
334    83
271    58
802    87
216    83
       ..
371    45
411    84
644    70
981    81
365    49
Name: math score, Length: 200, dtype: int64

In [40]:
evaluate_model(Y_test,pred1)

(412038567212638.8, 519064691545665.5, -1.3534913514680723e+27)

In [40]:
models = {
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest": RandomForestRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(),
                "AdaBoost Regressor": AdaBoostRegressor()
            }

In [41]:
params={
                "Decision Tree": {
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'splitter':['best','random'],
                    # 'max_features':['sqrt','log2'],
                },
                "Random Forest":{
                    # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                 
                    # 'max_features':['sqrt','log2',None],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Gradient Boosting":{
                    # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    # 'criterion':['squared_error', 'friedman_mse'],
                    # 'max_features':['auto','sqrt','log2'],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Linear Regression":{},
                "XGBRegressor":{
                    'learning_rate':[.1,.01,.05,.001],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "CatBoosting Regressor":{
                    'depth': [6,8,10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor":{
                    'learning_rate':[.1,.01,0.5,.001],
                    # 'loss':['linear','square','exponential'],
                    'n_estimators': [8,16,32,64,128,256]
                }
                
            }


In [None]:
report = {}
best_params = {}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model_name = list(models.keys())[i]
    para=params[list(models.keys())[i]]

    gs = GridSearchCV(model,para,cv=3)
    gs.fit(X_train,Y_train)

    best_params[model_name] = gs.best_params_
    #print(gs.best_params_)

    model.set_params(**gs.best_params_)
    model.fit(X_train,Y_train)

    #model.fit(X_train, y_train)  # Train model

    Y_train_pred = model.predict(X_train)

    Y_test_pred = model.predict(X_test)

    train_model_score = r2_score(Y_train, Y_train_pred)

    test_model_score = r2_score(Y_test, Y_test_pred)

    report[list(models.keys())[i]] = test_model_score

In [46]:

            ## To get best model name from dict
best_model_score = max(sorted(report.values()))
best_model_name = list(report.keys())[
list(report.values()).index(best_model_score)
]
best_model = models[best_model_name]
best_model_params = best_params[best_model_name]

best_model.set_params(**best_model_params)
best_model.fit(X_train,Y_train)


In [45]:
best_params

{'Decision Tree': {'criterion': 'absolute_error'},
 'Random Forest': {'n_estimators': 64},
 'Gradient Boosting': {'learning_rate': 0.05,
  'n_estimators': 128,
  'subsample': 0.75},
 'Linear Regression': {},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 32},
 'CatBoosting Regressor': {'depth': 6,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 256}}