In [177]:
import pandas as pd
import pickle
import math

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from pandas.plotting import scatter_matrix
from sklearn.feature_selection import SelectFromModel

In [178]:
train = pd.read_csv('files//train.csv')

In [179]:
test = pd.read_csv('files//test.csv')

In [180]:
test.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type'],
      dtype='object')

In [181]:
common_values = set(test['Item_Identifier']).intersection(train['Item_Identifier'])
print("Common values:", len(common_values))

Common values: 1543


In [182]:
print(test['Item_Identifier'].equals(train['Item_Identifier'])) 

False


In [183]:
diff1 = set(train['Item_Identifier']) - set(test['Item_Identifier'])
diff2 = set(test['Item_Identifier']) - set(train['Item_Identifier'])

print("Values in col1 but not in col2:", diff1)
print("Values in col2 but not in col1:", diff2)

Values in col1 but not in col2: {'FDT07', 'FDG33', 'FDG24', 'FDL34', 'NCQ06', 'FDO19', 'FDA04', 'NCL31', 'FDO52', 'NCY18', 'DRE49', 'FDU19', 'FDL10', 'FDW13', 'FDX04', 'FDX20'}
Values in col2 but not in col1: set()


In [184]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB


In [185]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [186]:
# Categorize outlets
def categorize_age(age):
    if age <= 5:
        return 'New'
    elif age <= 15:
        return 'Mid'
    else:
        return 'Old'

In [187]:
def preprocess_data(dataframe):

    df = dataframe.copy()
    # Fill missing values
    df['Item_Weight'].fillna(df['Item_Weight'].median(), inplace=True)
    df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)
    df['Item_Fat_Content'].replace('LF','Low Fat',inplace=True)
    df['Item_Fat_Content'].replace('low fat','Low Fat',inplace=True)
    df['Item_Fat_Content'].replace('reg','Regular',inplace=True)
        
    reference_year = 2013  # This dataset is from 2013

    # Calculate outlet age
    df['Outlet_Age'] = reference_year - df['Outlet_Establishment_Year']
    #df.drop('Outlet_Establishment_Year', axis=1, inplace=True)

    #df['Outlet_Age_Category'] = df['Outlet_Age'].apply(categorize_age)
    categorical_cols = df.select_dtypes(include=['object','category']).columns
    #categorical_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier','Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type','Outlet_Age_Category']
    le = LabelEncoder()
    df[categorical_cols]= df[categorical_cols].apply(lambda col: le.fit_transform(col))
    
    
    return df

In [188]:
test_df = preprocess_data(test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item_Weight'].fillna(df['Item_Weight'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi

In [189]:
test_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age
0,1103,20.750,0,0.007565,13,107.8622,9,1999,1,0,1,14
1,1067,8.300,1,0.038428,4,87.3198,2,2007,1,1,1,6
2,1406,14.600,0,0.099575,11,241.7538,0,1998,1,2,0,15
3,809,7.315,0,0.015388,13,155.0340,2,2007,1,1,1,6
4,1184,12.500,1,0.118599,4,234.2300,5,1985,1,2,3,28
...,...,...,...,...,...,...,...,...,...,...,...,...
5676,231,10.500,1,0.013496,13,141.3154,8,1997,2,0,1,16
5677,306,7.600,1,0.142991,15,169.1448,3,2009,1,2,2,4
5678,1412,10.000,0,0.073529,8,118.7440,7,2002,1,1,1,11
5679,517,15.300,1,0.000000,3,214.6218,2,2007,1,1,1,6


In [190]:
#test_df.to_csv('files//test_df.csv', index=False)

In [191]:
def model_prediction(identifier, test_df, test):
    modelfile = f'model_{identifier}.pkl'
    with open(modelfile, "rb") as f:
        model = pickle.load(f)
    
   
    predictions = model.predict(test_df)
    print("Predictions from loaded model:", predictions)

    test['Item_Outlet_Sales'] = predictions
    selected_cols = ['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']
    test = test[selected_cols]
    test['Item_Outlet_Sales'] = test['Item_Outlet_Sales'].round()
    test['Item_Outlet_Sales'] = test['Item_Outlet_Sales'].clip(lower=0)
    test.to_csv(f'files//test_predictions1_{identifier}.csv', index=False)
    return predictions

In [192]:
model_prediction('xgb', test_df, test)

Predictions from loaded model: [1477.816338 1313.443634  571.835646 ... 2155.154652 4807.994804
 1430.398062]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Item_Outlet_Sales'] = test['Item_Outlet_Sales'].round()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Item_Outlet_Sales'] = test['Item_Outlet_Sales'].clip(lower=0)


array([1477.816338, 1313.443634,  571.835646, ..., 2155.154652,
       4807.994804, 1430.398062])