In [18]:

import os,sys

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error
import numpy as np




class ModelSelctor:
    def __init__(self, x_train, y_train, x_test, y_test,base_accuracy):
        
        try:
            self.x_train = x_train,
            self.y_train = y_train,
            self.x_test = x_test,
            self.y_test = y_test,
            self.base_accuracy = base_accuracy

            print("Model Selction Started")
        except Exception as e:
            raise SalesException(e,sys) from e




    def get_best_param_xgb(self):
        try:
            best_params={}

            param1={'eta' : [i/100 for i in range(1,20)]}
            param2={'max_depth' : range(3,10,1)}
            param3={'gamma' : [i/10 for i in range(1,10)]}
            param4={'subsample':[i/100.0 for i in range(70,100,5)]}
            param5={'colsample_bytree':[i/100.0 for i in range(70,100,3)]}
            param6={'alpha' : np.arange(0.1, 10, 0.1)}
            param7={'n_estimators':range(10,100,5)}

            parameters=[param1, param2, param3, param4, param5, param6, param7]

            for param in parameters:
                grid=GridSearchCV(XGBRegressor(objective='reg:squarederror'), param, cv=5, n_jobs=-1)
                grid.fit(self.x_train, self.y_train)
                best_params.update(grid.best_params_)

            eta=best_params['eta']
            max_depth=best_params['max_depth']
            gamma=best_params['gamma']
            subsample=best_params['subsample']
            colsample_bytree=best_params['colsample_bytree']
            alpha=best_params['eta']
            n_estimators=best_params['n_estimators']

            model=XGBRegressor(objective='reg:squarederror', eta=eta, max_depth=max_depth,  gamma=gamma, subsample=subsample, colsample_bytree=colsample_bytree, alpha=alpha, n_estimators=n_estimators)
            model.fit(self.x_train, self.y_train)
            y_pred=model.predict(self.x_test)

            r2=r2_score(self.y_test, self.y_pred)
            return model, r2
        except Exception as e:
            print(e)


    def get_best_param_rf(self):
        try:
            best_params={}

            param1={'criterion': ['squared_error', 'absolute_error']}
            param2={'max_depth' : range(3,10,1)}
            param3={'max_features' : [i/100.0 for i in range(70,100,3)]}
            param4={'max_samples' : [i/100.0 for i in range(70,100,5)]}
            param5={'n_estimators':range(10,100,5)}


            parameters=[param1, param2, param3, param4, param5]

            for param in parameters:
                grid =GridSearchCV(RandomForestRegressor(), param, cv=5, n_jobs=-1)
                grid.fit(self.x_train, self.y_train)
                best_params.update(grid.best_params_)

            criterion=best_params['criterion']
            max_depth=best_params['max_depth']
            max_features=best_params['max_features']
            max_samples=best_params['max_samples']
            n_estimators=best_params['n_estimators']



            model=RandomForestRegressor(criterion=criterion, max_depth = max_depth, max_features = max_features, max_samples = max_samples, n_estimators = n_estimators)
            model.fit(self.x_train, self.y_train)
            y_pred=model.predict(self.x_test)

            r2=r2_score(self.y_test, y_pred)
            return model, r2
        except Exception as e:
            print(e)

    def get_best_model(self):
        xgb_model , xgb_r2 = self.get_best_param_xgb()
        rf_model , rf_r2 = self.get_best_param_rf()

        if xgb_r2 > self.base_accuracy and xgb_r2>rf_r2:
            best_model,model_name = xgb_model ,'XGB'
            print(f'best Model is {model_name} with parameters {best_model} ')
        elif rf_r2 > self.base_accuracy and rf_r2>xgb_r2:
            best_model,model_name = rf_model,'RandomForest'
            print(f'best Model is {model_name} with parameters {best_model} ')
        else:
            print(f"None of model has base accuracy more than {self.base_accuracy}") 

        return best_model,model_name



In [20]:

import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv(r"D:\Projects_new\Stores_Sales_Prediction\sales\artifact\data_transformation\2022-07-25-15-23-29\preprocessed_files\train_transformed\train_array_df.csv")

x = train_df.drop(columns=['Item_Outlet_Sales'])
y = train_df['Item_Outlet_Sales']
x_train,  x_test,y_train, y_test = train_test_split(x,y,test_size=0.20)
base_accuracy = 0.5 


model = ModelSelctor(x_train=x_train,x_test=x_test,y_train=y_train,y_test=y_test,base_accuracy=0.4)

model , model_name = model.get_best_model()

print(f'model { model}')
print(f'model_name {model_name}')


Model Selction Started
Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=1.


TypeError: cannot unpack non-iterable NoneType object

In [17]:
y_test

4273    1640.5312
1298    2406.2012
5133     790.9704
3638    1733.7432
4453    3259.7568
          ...    
3260    1448.7808
3248    2659.8710
5284    1537.9980
3820    2631.2416
810     2972.7970
Name: Item_Outlet_Sales, Length: 1364, dtype: float64

In [21]:
def get_best_param_rf(x_train, y_train, x_test, y_test):
        try:
            best_params={}

            param1={'criterion': ['squared_error', 'absolute_error']}
            param2={'max_depth' : range(3,10,1)}
            param3={'max_features' : [i/100.0 for i in range(70,100,3)]}
            param4={'max_samples' : [i/100.0 for i in range(70,100,5)]}
            param5={'n_estimators':range(10,100,5)}


            parameters=[param1, param2, param3, param4, param5]

            for param in parameters:
                grid =GridSearchCV(RandomForestRegressor(), param, cv=5, n_jobs=-1)
                grid.fit(x_train, y_train)
                best_params.update(grid.best_params_)

            criterion=best_params['criterion']
            max_depth=best_params['max_depth']
            max_features=best_params['max_features']
            max_samples=best_params['max_samples']
            n_estimators=best_params['n_estimators']



            model=RandomForestRegressor(criterion=criterion, max_depth = max_depth, max_features = max_features, max_samples = max_samples, n_estimators = n_estimators)
            model.fit(x_train, y_train)
            y_pred=model.predict(x_test)

            r2=r2_score(y_test, y_pred)
            return model, r2
        except Exception as e:
            print(e)

In [22]:
 model , r2 = get_best_param_rf(x_train, y_train, x_test, y_test)

name 'self' is not defined


TypeError: cannot unpack non-iterable NoneType object

In [2]:
import pandas as pd 
train = pd.read_csv('train.csv')

In [6]:
limit = 5
limit = -1 * int(limit)
train[limit:]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.38,Regular,0.046982,Baking Goods,108.157,OUT045,2002,,Tier 2,Supermarket Type1,549.285
8520,NCJ29,10.6,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976
8522,DRG01,14.8,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Small,Tier 1,Supermarket Type1,765.67


In [7]:
train


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976
