In [61]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

csv_path = os.path.join('..', 'data', 'train_v9rqX0R.csv')
data = pd.read_csv(csv_path)
# load the dataset

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

random_forest_model = RandomForestRegressor()
linear_model = LinearRegression()
idxs = np.random.shuffle(np.linspace(0,len(data),len(data)))

In [48]:
MIN_MAX = {
    "Item_Weight" : {"min":0,"max":25},
    "Item_Visibility" : {"min":0,"max":0.2},
    "Item_MRP": {"min":30,"max":270},
    "Item_Outlet_Sales": {"min":0,"max":14000},
    "Outlet_Establishment_Year": {"min":1985,"max":2009}, 
}
def normalize(data, column, clip = False):
    if clip:
        data[column][data[column] > MIN_MAX[column]['max']] = MIN_MAX[column]['max']
        data[column][data[column] < MIN_MAX[column]['min']] = MIN_MAX[column]['min']
    normalized_col = (data[column] - MIN_MAX[column]['min']) / (MIN_MAX[column]['max'] - MIN_MAX[column]['min'])
    return normalized_col.values.reshape(-1,1)
# create normalize function for numerical values


In [4]:
from sklearn.preprocessing import OneHotEncoder

def extract_tier_level(data):
    data["Outlet_Location_Type"] = data["Outlet_Location_Type"].str[-1]
    return data["Outlet_Location_Type"].values.reshape(-1,1).astype(np.int64)

def onehotencode(data,column):
    encoder = OneHotEncoder(sparse_output=False)
    return encoder.fit_transform(data[column].values.reshape(-1,1))

In [5]:
def extract_supermarket_level(data):
    data["Outlet_Type"] = data["Outlet_Location_Type"].str[-1]
    data["Outlet_Type"][data["Outlet_Type"] == "e"] = 0
    return data["Outlet_Type"].values.astype(np.int64).reshape(-1,1)

In [40]:
FAT_MAP = {
           'Low Fat':0,
           'Regular':1,
           'low fat':0,
           'LF'     :0,
           'reg'   :1
          }
def map_fat(data):
    """#3 Handles preprocessing of column Item_Fat_Content

    Args:
        data (pd.DataFrame): dataset
    """    
    for key in FAT_MAP.keys():
        data["Item_Fat_Content"][data["Item_Fat_Content"] == key] = FAT_MAP[key]

In [44]:
def handle_weight_nan(data):
    data['Item_Weight'] = data.groupby('Item_Identifier')['Item_Weight'].transform(
        lambda x: x.fillna(method='ffill').fillna(method='bfill'))
# only 4 are left. Drop them.
    data.dropna(subset=['Item_Weight'],inplace=True)


In [58]:
def handle_outlet_size_nan(data):
    data["Outlet_Size"][data["Outlet_Identifier"]=='OUT045'] = 'Small'
    data["Outlet_Size"][data["Outlet_Identifier"]=='OUT017'] = 'Small'
    data["Outlet_Size"][data["Outlet_Identifier"]=='OUT010'] = 'Small'

In [67]:
def set_outlet_size(data):
    data["Outlet_Size"][data["Outlet_Size"]=='Small'] = 1
    data["Outlet_Size"][data["Outlet_Size"]=='Medium'] = 2
    data["Outlet_Size"][data["Outlet_Size"]=='High'] = 3

In [68]:
processed_data = None
handle_weight_nan(data)
handle_outlet_size_nan(data)
for column in ["Item_Visibility","Item_MRP"]:
    normalized_col = normalize(data, column=column, clip=True)
    if processed_data is None:
        processed_data = normalized_col
    else:
        processed_data = np.concat((processed_data, normalized_col),axis=1)
codes = extract_tier_level(data)
#codes = onehotencode(data,column="Outlet_Location_Type")
processed_data = np.concat((processed_data, codes),axis=1)
outlet_type = extract_supermarket_level(data)
#outlet_type = onehotencode(data,column="Outlet_Type")
processed_data = np.concat((processed_data, outlet_type),axis=1)
year = data["Outlet_Establishment_Year"].values.reshape(-1,1)
processed_data = np.concat((processed_data, year),axis=1)
map_fat(data)
fat = data["Item_Fat_Content"].values.reshape(-1,1)
#fat = onehotencode(data,column="Item_Fat_Content")
processed_data = np.concat((processed_data, fat),axis=1)
weight = data["Item_Weight"].values.reshape(-1,1)
processed_data = np.concat((processed_data, weight),axis=1)
#size = onehotencode(data,column="Outlet_Size")
set_outlet_size(data)
size = data["Outlet_Size"].values.reshape(-1,1)
processed_data = np.concat((processed_data, size),axis=1)
X = processed_data
y = normalize(data, "Item_Outlet_Sales")
X[:] = X[idxs]
y[:] = y[idxs]
print(X.shape, y.shape)

  lambda x: x.fillna(method='ffill').fillna(method='bfill'))


(8519, 8) (8519, 1)


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data["Outlet_Size"][data["Outlet_Identifier"]=='OUT045'] = 'Small'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2)
random_forest_model.fit(X_train,y_train)
y_val_predict = random_forest_model.predict(X_val)
val_metric = mean_squared_error(y_val, y_val_predict)
print(val_metric)
linear_model.fit(X_train,y_train)
y_val_predict = linear_model.predict(X_val)
val_metric = mean_squared_error(y_val, y_val_predict)
print(val_metric)
# Id | Column                    | Type        | Data   | ToDo                 | X/y | Dim | processed 
#  1 | Item_Identifier           | Categorical | String | split text 1559->89  |  X  |  89 | ✓ 
#  5 | Item_Type                 | Categorical | String | too many ?           |  X  |  16 | ✓
#  7 | Outlet_Identifier         | Categorical | String | cant do much         |  X  |  10 | ✓

#0.01573721055840206  Item_Visibility
#0.013370745446789874 + Item_MRP
#0.013537161791957568 + Outlet_Location_Type 0.00952854902358565
#0.012630846804313844 + ohe Outlet_Location_Type 0.00973791777256838
#0.012783276198845138 + Outlet_Type 0.009433934667527642
#0.012848040532094557 + ohe Outlet_Type 0.009499097712716209
#0.007018652199454604 + Outlet_Establishment_Year 0.0.009411041734326737
#0.006749213477986866 + Item_Fat_Content 0.009353645105473936
#0.0074958593861559585 + ohe Item_Fat_Content 0.010607170411496225
#0.006109566201511222 + Item_Weight 0.009597284410110412
#0.006625616471489953 + ohe Outlet_Size 0.00882434285604162
#0.006669525973025804 + Outlet_Size 0.009529525099822588


  return fit_method(estimator, *args, **kwargs)


0.006669525973025804
0.009529525099822588
