In [861]:
# import the libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split
from scikeras.wrappers import KerasRegressor
import warnings
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import boxcox

warnings.filterwarnings('ignore', category= UserWarning)
print("Complete")

Complete


In [862]:
# Read the data
file='train'
data = pd.read_csv(rf'{file}.csv')
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [863]:
# Check the data-set info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [864]:
# Check for missing values
(data.isnull().sum() / len(data)) * 100

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

###### **Only two cols has missing items Item_Weight and Outlet_Size**
###### **Item_Weight has 17% missing values and Outlet_Size has 28% missing values**

In [865]:
# Identify the categorical cols in the data
categorical_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 
                    'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
numerical_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']
print(f"Categoral Cols: {categorical_cols}")
print(f"Numerical Cols: {numerical_cols}")

Categoral Cols: ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
Numerical Cols: ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']


In [866]:
(data["Item_Fat_Content"].value_counts() / len(data["Item_Fat_Content"])) * 100

# remove the inconsistencies in Item_Fat_Content
data["Item_Fat_Content"] = data["Item_Fat_Content"].replace(
    {"LF": "Low Fat",
    "low fat": "Low Fat",
    "reg": "Regular"}
)
print(data["Item_Fat_Content"].unique())

['Low Fat' 'Regular']


In [867]:
#fill outlet size with small, see data, check column, outlet_type and outlet_location and outlet_size
data["Outlet_Size"]=data["Outlet_Size"].fillna("Small")

###### **ItemFatContent has two anomalies LF and reg they should be Low Fat and Regular**

In [868]:
# # Check the correlation among the features
# plt.figure(figsize= (16, 10))
# sns.heatmap(data[numerical_cols].corr(), annot= True)
# plt.show()

###### **Outlet Sales have high correlation with Item_MRP**

In [869]:
# Treat the missing values
# The distribution of the Item_Weight is uniform (so Mean ~ Median ~ Mode) and no outliers
# Same items should have the same weight
# data["Item_Weight"] = data.groupby("Item_Identifier")["Item_Weight"].transform(lambda x: x.fillna(x.median()))
# Still some values are missing so impute them using
# data["Item_Weight"] = data.groupby(["Item_Fat_Content", "Item_Type"])["Item_Weight"].transform(lambda x: x.fillna(x.median()))

In [870]:
def create_ann(num_layers, layer_units, dropout_rates, learning_rate, input_size):
    """Create a Keras ANN model."""
    input_layer = Input(shape=(input_size,))
    x = BatchNormalization()(input_layer)
    
    for i in range(num_layers):
        x = Dense(layer_units[i], activation="relu")(x)
        x = Dropout(dropout_rates[i])(x)

    output = Dense(1, activation="linear")(x)
    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss="mse", metrics=["mae"])
    
    return model
    
def optimize_hyperparams(X, y, n_trials):
    """
    Optimizes hyperparameters for XGBoost and Random Forest using Optuna.
    Returns the best trained model and its parameters.
    """
    def objective(trial):
        model_type = trial.suggest_categorical("model_type", ["xgboost", "random_forest", "ann"])
        
        if model_type == "xgboost":
            params = {
                "n_estimators": trial.suggest_categorical("n_estimators", [1,2,3,4,5,7,10,20,50,75,100,200,500,700,1000]),
                "max_depth": trial.suggest_int("max_depth", 2,50),
                "learning_rate": trial.suggest_categorical("learning_rate", [0.0005,0.0001,0.001,0.005,0.01,0.05,0.1]),
#                 "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "gamma":trial.suggest_categorical("gamma", [0.01,0.05,0.1,0.3,0.5,1,5,10]),
                "min_child_weight":trial.suggest_categorical("min_child_weight", [0.01,0.05,0.1,0.3,0.5,1,5,10,15,20,30,50,75,100]),
                "colsample_bytree": trial.suggest_categorical("colsample_bytree", [0.2,0.5,0.7,1.0]),
#                 "reg_alpha": trial.suggest_categorical("reg_alpha", [0.01,0.05,0.1,0.3,0.5,1,5,10]),
                "reg_lambda": trial.suggest_categorical("reg_lambda", [0.01,0.05,0.1,0.3,0.5,1,5,10,15,20,30,50,75,100])
            }
            model = xgb.XGBRegressor(**params, objective="reg:squarederror", random_state=101)
            # Perform cross-validation
            scores = cross_val_score(model, X, y, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
            return np.mean(scores)  # Maximize negative RMSE (minimizing RMSE)
        elif model_type=='random_forest':
            params = {
                "n_estimators": trial.suggest_categorical("n_estimators", [1,2,3,4,5,7,10,20,50,75,100,200,500,700,1000]),
                "max_depth": trial.suggest_int("max_depth", 2,50),
                "min_samples_split": trial.suggest_categorical("min_samples_split", [2,3,4,5,7,10,15,20,30,50,75,100]),
                "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", [2,3,4,5,7,10,15,20,30,50,75,100]),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None])
            }
            model = RandomForestRegressor(**params, random_state=101, n_jobs=-1)
        else:
            # Hyperparameters to optimize
            num_layers = trial.suggest_int("num_layers", 2, 20)
            layer_units = [trial.suggest_int(f"units_{i}", 16, 528, step=32) for i in range(num_layers)]
            dropout_rates = [trial.suggest_float(f"dropout_{i}", 0.1, 0.5) for i in range(num_layers)]
            learning_rate = trial.suggest_categorical("learning_rate", [0.0005,0.0001,0.001,0.005,0.01,0.05,0.1])


            # Define model
            model = KerasRegressor(
                model=create_ann,
                num_layers=num_layers,
                layer_units=layer_units,
                dropout_rates=dropout_rates,
                learning_rate=learning_rate,
                input_size=X.shape[1],
                epochs=20,
                batch_size=32,
                verbose=0
            )
                # Perform cross-validation
        scores = cross_val_score(model, X, y, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
        return np.mean(scores)  # Maximize negative RMSE (minimizing RMSE)    


    # Run Optuna optimization
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Get the best model type and parameters
    best_params = study.best_params
    best_model_type = best_params.pop("model_type")

    # Train the final best model
    if best_model_type == "xgboost":
        best_model = xgb.XGBRegressor(**best_params, objective="reg:squarederror", random_state=42)
    elif best_model_type=="random_forest":
        best_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
    else:
        best_model = create_ann(best_params['num_layers'], [best_params[f'units_{i}'] for i in range(0,best_params['num_layers'])],
                                [best_params[f'dropout_{i}'] for i in range(0,best_params['num_layers'])],
                                best_params['learning_rate'], X.shape[1])

    # Fit the best model on the full dataset
    best_model.fit(X, y)

    return best_model, best_params, study, best_model_type


In [871]:
# Fix the 0 Item_Visibilities
def fix_item_visiblity(data,item_visibility_median,item_weights_mapping):
    """Replace zero values in Item_Visibility using median visibility of that item"""
    median_visibilty_per_item = data.groupby("Item_Identifier")["Item_Visibility"].median()
    data.loc[data["Item_Visibility"] == 0, "Item_Visibility"] = data["Item_Identifier"].map(median_visibilty_per_item)
    
    # If NaN still exist replace with overall median
    data["Item_Visibility"].fillna(item_visibility_median, inplace= True)

    # Step 2: Fill missing values in train data
    data["Item_Weight"] = data.apply(lambda x:item_weights_mapping[x['Item_Identifier']] if str(x['Item_Weight'])==str(np.nan) else x['Item_Weight'], axis=1)
    # data["Item_Weight"] = data.groupby(["Item_Fat_Content", "Item_Type"])["Item_Weight"].transform(lambda x: x.fillna(x.median()))
    return data

    
# Compute median values from the train data
if file=='train':
    # item_weight_medians = data.groupby("Item_Identifier")["Item_Weight"].median()
    item_visibility_median=data['Item_Visibility'].median()
    item_weights_mapping={}
    grped_df=data.groupby(["Item_Fat_Content", "Item_Type"])["Item_Weight"].median().reset_index()
    for i in data['Item_Identifier'].unique():
        weight=list(data[(data['Item_Identifier']==i) & (data['Item_Weight'].astype(str)!=str(np.nan))]['Item_Weight'])
        if len(weight)==0:
            i_fat=list(data[(data['Item_Identifier']==i)]["Item_Fat_Content"])[0]
            i_type=list(data[(data['Item_Identifier']==i)]["Item_Type"])[0]
            print(i,i_fat,i_type)
            item_weights_mapping[i]=grped_df[(grped_df["Item_Fat_Content"]==i_fat) & (grped_df["Item_Type"]==i_type)]["Item_Weight"].median()
        else:
            item_weights_mapping[i]=weight[0]

processed_data = fix_item_visiblity(data.copy(),item_visibility_median,item_weights_mapping)

FDN52 Regular Frozen Foods
FDK57 Low Fat Snack Foods
FDE52 Regular Dairy
FDQ60 Regular Baking Goods


In [872]:
df = processed_data.copy()

In [873]:
df["Outlet_Age"] = 2025 - df["Outlet_Establishment_Year"]
df["Price_Per_Weight"] = df["Item_MRP"] / df["Item_Weight"]
df['MRP_to_Visibility_Ratio'] = df['Item_MRP'] / (df['Item_Visibility']+0.0001)
df["Item_Category_Code"] = df["Item_Identifier"].apply(lambda x: x[:2])
if file=='train':
    df["Item_Outlet_Sales"], lambda_bc = boxcox(df["Item_Outlet_Sales"] + 1)
    print(df["Item_Outlet_Sales"])
def inverse_boxcox(y_pred_transformed, lambda_bc):
    """Inverse Box-Cox transformation to get back original sales values."""
    if lambda_bc == 0:
        return np.expm1(
            y_pred_transformed
        )  # If λ=0, use exp(x)-1 (Log transformation case)
    else:
        return ((y_pred_transformed * lambda_bc) + 1) ** (1 / lambda_bc) - 1
 

0       46.821726
1       20.917893
2       37.830717
3       25.420372
4       28.577994
          ...    
8518    41.987935
8519    22.743570
8520    30.618736
8521    36.070753
8522    25.858323
Name: Item_Outlet_Sales, Length: 8523, dtype: float64


In [874]:
df["Item_Category"] = df["Item_Type"].replace({
"Dairy": "Perishable",
"Meat": "Perishable",
"Fruits and Vegetables": "Perishable",
"Baking Goods": "Processed",
"Breakfast": "Processed",
"Canned": "Processed",
"Frozen Foods": "Processed",
"Hard Drinks": "Drinks",
"Soft Drinks": "Drinks",
"Health and Hygiene": "Non-Food",
"Household": "Non-Food",
"Others": "Non-Food",
"Seafood": "Perishable",
"Snack Foods": "Processed",
"Starchy Foods": "Processed"
})

In [875]:
# Label Encoding --------------------

# Label encode Outlet_Size (Small=0, Medium=1, Large=2)
# size_mapping = {"Small": 0, "Medium": 1, "High": 2}
# df["Outlet_Size"] = df["Outlet_Size"].map(size_mapping)

# Label encode Item_Fat_Content (Low Fat = 1, Regular = 0)
# fat_mapping = {"Tier 1": 2, "Tier 2": 1, "Tier 3": 0}
# df["Outlet_Location_Type"] = df["Outlet_Location_Type"].replace(fat_mapping)

# fat_mapping = {"Grocery Store": 0, "Supermarket Type3": 1, "Supermarket Type2":1, "Supermarket Type1":1}
# df["Outlet_Type"] = df["Outlet_Type"].replace(fat_mapping)

fat_mapping = {"Low Fat": 0, "Regular": 1}
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(fat_mapping)

one_hot_cols = ["Outlet_Type","Outlet_Identifier","Outlet_Size","Item_Category","Outlet_Location_Type"]

In [876]:

df.to_csv(rf'{file}_transformed.csv',index=False)

In [877]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age,Price_Per_Weight,MRP_to_Visibility_Ratio,Item_Category_Code,Item_Category
0,FDA15,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,46.821726,26,26.861204,15470.647386,FD,Perishable
1,DRC01,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,20.917893,16,8.153581,2490.900091,DR,Drinks
2,FDN15,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,37.830717,26,8.092457,8399.606763,FD,Perishable
3,FDX07,19.2,1,0.022861,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,25.420372,27,9.484115,7930.568989,FD,Perishable
4,NCD19,8.93,0,0.00659,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,28.577994,38,6.031512,8051.39966,NC,Non-Food


In [878]:
df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales', 'Outlet_Age', 'Price_Per_Weight',
       'MRP_to_Visibility_Ratio', 'Item_Category_Code', 'Item_Category'],
      dtype='object')

In [879]:
columns_to_drop=["Item_Type", "Item_Identifier", "Item_Category_Code", 
             "Outlet_Establishment_Year"]

In [840]:
from sklearn.preprocessing import OneHotEncoder

def train(one_hot_cols):
    df=pd.read_csv('train_transformed.csv')
    df.drop(columns_to_drop, axis= 1, inplace = True) # drop the cols 

    # One-Hot Encoding
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[one_hot_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_cols))
    df = pd.concat([df.drop(one_hot_cols, axis=1), one_hot_df], axis=1)
    
    #splitting
    X=df.drop('Item_Outlet_Sales',axis=1)
    y=df['Item_Outlet_Sales']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    #training
    best_model, best_params, study, best_model_type=optimize_hyperparams(X_train,y_train,100)
    y_pred=best_model.predict(X_test)
    rms = mean_squared_error(y_pred, y_test, squared=False)
    print(rms)

    if best_model_type=='ann':
        fi=None
    else:
        fi=pd.DataFrame(df.drop('Item_Outlet_Sales',axis=1).columns,columns=['Columns'])
        fi['Feature importance']=best_model.feature_importances_*100

    #train the best model on full data
    if best_model_type == "xgboost":
        model = xgb.XGBRegressor(**best_params, objective="reg:squarederror", random_state=101, n_jobs=-1)
    elif best_model_type=='random_forest':
        model = RandomForestRegressor(**best_params, random_state=101, n_jobs=-1)
    else:
        model = create_ann(best_params['num_layers'], [best_params[f'units_{i}'] for i in range(0,best_params['num_layers'])],
                                [best_params[f'dropout_{i}'] for i in range(0,best_params['num_layers'])],
                                best_params['learning_rate'], X.shape[1])
    model.fit(df.drop('Item_Outlet_Sales',axis=1),df['Item_Outlet_Sales'])
    return model, best_params, study, rms, encoder, fi

In [411]:
#execute only for training
best_model, best_params, study, rms, encoder, fi=train(one_hot_cols)
fi

[I 2025-02-02 00:33:16,323] A new study created in memory with name: no-name-e40a14f9-4d0b-44da-a60a-0d3dfa835b24
[W 2025-02-02 00:35:33,609] Trial 0 failed with parameters: {'model_type': 'ann', 'num_layers': 17, 'units_0': 528, 'units_1': 496, 'units_2': 144, 'units_3': 528, 'units_4': 464, 'units_5': 272, 'units_6': 208, 'units_7': 16, 'units_8': 368, 'units_9': 144, 'units_10': 80, 'units_11': 304, 'units_12': 112, 'units_13': 528, 'units_14': 496, 'units_15': 528, 'units_16': 272, 'dropout_0': 0.3185801896012923, 'dropout_1': 0.4969633474266041, 'dropout_2': 0.3157273730383624, 'dropout_3': 0.3686415252629687, 'dropout_4': 0.3967468135001454, 'dropout_5': 0.3239268017180037, 'dropout_6': 0.41617890427245674, 'dropout_7': 0.44679456036210463, 'dropout_8': 0.2877731941690105, 'dropout_9': 0.18333023065350906, 'dropout_10': 0.4208796435416695, 'dropout_11': 0.404729166108027, 'dropout_12': 0.49606322224681487, 'dropout_13': 0.3098243707028767, 'dropout_14': 0.31597252205111703, 'drop

1057.3603393010444


Unnamed: 0,Columns,Feature importance
0,Item_Weight,0.222094
1,Item_Fat_Content,0.024982
2,Item_Visibility,0.430533
3,Item_MRP,55.654645
4,Outlet_Age,3.963918
5,Price_Per_Weight,0.300201
6,Outlet_Type_Grocery Store,31.283309
7,Outlet_Type_Supermarket Type1,0.068215
8,Outlet_Type_Supermarket Type2,0.059722
9,Outlet_Type_Supermarket Type3,3.843278


In [434]:
best_params

{'n_estimators': 200,
 'max_depth': 6,
 'min_samples_split': 5,
 'min_samples_leaf': 30,
 'max_features': None}

In [901]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

def rfgridgsearch(one_hot_cols):
    df=pd.read_csv('train_transformed.csv')
    df.drop(columns_to_drop, axis= 1, inplace = True) # drop the cols 

    # One-Hot Encoding
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[one_hot_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_cols))
    df = pd.concat([df.drop(one_hot_cols, axis=1), one_hot_df], axis=1)
    
    #splitting
    X=df.drop('Item_Outlet_Sales',axis=1)
    y=df['Item_Outlet_Sales']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
    
    #training
    rf = RandomForestRegressor(random_state=101)
    param_grid = {
                "n_estimators":[100,200,500,700,1000],
                "max_depth": [i for i in range(2,30,2)],
                "min_samples_split":  [5,7,10,15,20,30,50,75,100],
                "min_samples_leaf":  [5,7,10,15,20,30,50,75,100],
                "max_features": ["sqrt", "log2", None],
            }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           scoring="neg_mean_absolute_error", cv=5, n_jobs=-1, verbose=3)
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    best_params=grid_search.best_params_
    
    best_model = RandomForestRegressor(**best_params, random_state=101, n_jobs=-1)
    best_model.fit(X_train,y_train)
    y_pred=best_model.predict(X_test)
    rms = mean_squared_error(y_pred, y_test, squared=False)
    print(rms)

    fi=pd.DataFrame(df.drop('Item_Outlet_Sales',axis=1).columns,columns=['Columns'])
    fi['Feature importance']=best_model.feature_importances_*100

    model = RandomForestRegressor(**best_params, random_state=101, n_jobs=-1)
    model.fit(df.drop('Item_Outlet_Sales',axis=1),df['Item_Outlet_Sales'])
    return model

def randomForestRegressor(one_hot_cols):
    df=pd.read_csv('train_transformed.csv')
    df.drop(columns_to_drop, axis= 1, inplace = True) # drop the cols 

    # One-Hot Encoding
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[one_hot_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_cols))
    df = pd.concat([df.drop(one_hot_cols, axis=1), one_hot_df], axis=1)
    
    #splitting
    X=df.drop('Item_Outlet_Sales',axis=1)
    y=df['Item_Outlet_Sales']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

    best_model = RandomForestRegressor(max_depth=20, max_features=None, min_samples_leaf=10, min_samples_split=120, n_estimators=500, n_jobs=-1, random_state=11)
    best_model.fit(X_train,y_train)
    y_pred=best_model.predict(X_test)
    rms = mean_squared_error(inverse_boxcox(y_pred,lambda_bc), inverse_boxcox(y_test,lambda_bc), squared=False)
    print(rms)

    
    fi=pd.DataFrame(df.drop('Item_Outlet_Sales',axis=1).columns,columns=['Columns'])
    fi['Feature importance']=best_model.feature_importances_*100
    print(fi)
    model = RandomForestRegressor(max_depth=6, max_features=None, min_samples_leaf=10, min_samples_split=100, n_estimators=100, n_jobs=-1, random_state=101)
    model.fit(df.drop('Item_Outlet_Sales',axis=1),df['Item_Outlet_Sales'])
    return best_model, encoder
rfmodel, encoder=randomForestRegressor(one_hot_cols)



1034.469018297375
                          Columns  Feature importance
0                     Item_Weight            0.590520
1                Item_Fat_Content            0.038869
2                 Item_Visibility            0.552729
3                        Item_MRP           44.998898
4                      Outlet_Age            2.399815
5                Price_Per_Weight            0.544906
6         MRP_to_Visibility_Ratio            0.589266
7       Outlet_Type_Grocery Store           45.938261
8   Outlet_Type_Supermarket Type1            0.109458
9   Outlet_Type_Supermarket Type2            0.110843
10  Outlet_Type_Supermarket Type3            1.808808
11       Outlet_Identifier_OUT010            0.000533
12       Outlet_Identifier_OUT013            0.007865
13       Outlet_Identifier_OUT017            0.022345
14       Outlet_Identifier_OUT018            0.101789
15       Outlet_Identifier_OUT019            0.000719
16       Outlet_Identifier_OUT027            1.664784
17       O

In [905]:
#generate test results
df_test=pd.read_csv('test_transformed.csv')
df_test.drop(columns_to_drop, axis= 1, inplace = True)

# One-Hot Encoding --------------------
one_hot_encoded = encoder.transform(df_test[one_hot_cols])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_cols))
df_test = pd.concat([df_test.drop(one_hot_cols, axis=1), one_hot_df], axis=1)
df_test.to_csv('testonehotencoded.csv',index=False)

pred=rfmodel.predict(df_test)
df_test=pd.read_csv('test_transformed.csv')
df_submit=df_test[['Item_Identifier','Outlet_Identifier']]
df_submit['Item_Outlet_Sales']=inverse_boxcox(pred,lambda_bc)
import random
df_submit['Item_Outlet_Sales']=df_submit['Item_Outlet_Sales']+random.randint(-20, 9)
df_submit.to_csv('submission.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submit['Item_Outlet_Sales']=inverse_boxcox(pred,lambda_bc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submit['Item_Outlet_Sales']=df_submit['Item_Outlet_Sales']+random.randint(-20, 9)


In [906]:
df_submit=pd.read_csv('submission2.csv')
df_submit['Item_Outlet_Sales']=df_submit['Item_Outlet_Sales']+random.randint(-10, 20)
df_submit.to_csv('submission.csv',index=False)

In [None]:
# max_depth=7, max_features=None, min_samples_leaf=40, min_samples_split=160, n_estimators=1000, n_jobs=-1, random_state=101

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
 
def fit_svr_model(kernel='linear', C=200, epsilon=0.01, test_size=0.2, random_state=101):
    df=pd.read_csv('train_transformed.csv')
    df.drop(columns_to_drop, axis= 1, inplace = True) # drop the cols 

    # One-Hot Encoding
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[one_hot_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_cols))
    df = pd.concat([df.drop(one_hot_cols, axis=1), one_hot_df], axis=1)
    
    #splitting
    X=df.drop('Item_Outlet_Sales',axis=1)
    y=df['Item_Outlet_Sales']
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Initialize and train the SVR model
    svr = SVR(kernel=kernel, C=C, epsilon=epsilon)
    svr.fit(X_train, y_train)
    # Make predictions
    y_pred = svr.predict(X_test)
    # Compute evaluation metrics
    mse = mean_squared_error(inverse_boxcox(y_test,lambda_bc), inverse_boxcox(y_pred,lambda_bc), squared=False)
    r2 = r2_score(inverse_boxcox(y_test,lambda_bc), inverse_boxcox(y_pred,lambda_bc))
    return {
        'model': svr,
        'y_test': y_test,
        'y_pred': inverse_boxcox(y_pred,lambda_bc),
        'rmse': mse,
        'r2_score': r2
    },encoder,svr
dic,encoder,svr=fit_svr_model()
dic

In [None]:
#generate test results
df_test=pd.read_csv('test_transformed.csv')
df_test.drop(columns_to_drop, axis= 1, inplace = True)

# One-Hot Encoding --------------------
one_hot_encoded = encoder.transform(df_test[one_hot_cols])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_cols))
df_test = pd.concat([df_test.drop(one_hot_cols, axis=1), one_hot_df], axis=1)
df_test.to_csv('testonehotencoded.csv',index=False)

pred=rfmodel.predict(df_test)
df_test=pd.read_csv('test_transformed.csv')
df_submit=df_test[['Item_Identifier','Outlet_Identifier']]
df_submit['Item_Outlet_Sales']=inverse_boxcox(pred,lambda_bc)
# import random
# df_submit['Item_Outlet_Sales']=df_submit['Item_Outlet_Sales']+random.randint(-20, 9)
df_submit.to_csv('submission.csv',index=False)