In [None]:
# Import required libraries
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Deserialize the trained price-prediction model from disk.
# joblib artifacts are pickle-based: only load files from a trusted source.
model_path = 'models/price_prediction_model.pkl'
best_model = joblib.load(model_path)

# Sample inference batch: three grocery items, written one record per row.
# Column order matches the tuple positions below.
sample_columns = [
    'item name',
    'store name',
    'stock status',
    'expiration date',
    'quantity_g_ml',
    'category',
    'is_discounted',
    'base_price',
    'days_to_expire',
]
sample_rows = [
    ('Tomato 100g', 'D-Mart', 'In Stock', '2025-05-04', 100, 'Vegetables', 0, 299.86, 20),
    ('Orange 100g', 'Big Bazaar', 'In Stock', '2025-05-16', 100, 'Fruits', 0, 212.04, 32),
    ('Banana 1000g', "Spencer's", 'In Stock', '2025-06-13', 1000, 'Fruits', 0, 243.92, 60),
]
test_data = pd.DataFrame(sample_rows, columns=sample_columns)

# Clean column names (strip stray whitespace so later lookups by name succeed)
test_data.columns = test_data.columns.str.strip()

# Fill gaps in the categorical columns with each column's most frequent value.
# NOTE(review): the imputer is re-fitted on this inference batch, so the fill
# value comes from the batch itself, not from the training data. Confirm that
# is acceptable, or persist the training-time imputer alongside the model.
imputer = SimpleImputer(strategy='most_frequent')
for column in ('store name', 'category', 'stock status'):
    # fit_transform returns an (n_rows, 1) array; ravel() flattens it to 1-D
    test_data[column] = imputer.fit_transform(test_data[[column]]).ravel()

# Feature engineering: whole calendar days from today until expiry.
# The reference timestamp is normalized to midnight. Without that, the current
# time of day is subtracted as well and `.dt.days` floors the result, giving
# one day fewer than the calendar difference.
# This overwrites any 'days_to_expire' values already present in test_data.
today = pd.to_datetime('today').normalize()
expiry_dates = pd.to_datetime(test_data['expiration date'])
test_data['days_to_expire'] = (expiry_dates - today).dt.days

# Prepare data for prediction: drop the target and the raw date column.
# errors='ignore' is required because unseen data has no 'price' column;
# without it, drop() raises KeyError before any prediction happens.
X_test = test_data.drop(columns=['price', 'expiration date'], errors='ignore')

# Feature columns the model is expected to consume
# (presumably the same lists used during training; verify against the training code)
categorical_features = ['store name', 'category', 'stock status']
numeric_features = ['quantity_g_ml', 'base_price', 'days_to_expire']

# Fail early with a readable message if a required feature column is absent
missing_features = [
    name for name in numeric_features + categorical_features
    if name not in X_test.columns
]
if missing_features:
    raise KeyError(f"test_data is missing required feature columns: {missing_features}")

# Scaling and one-hot encoding must come from the transformers fitted on the
# training data. A StandardScaler/OneHotEncoder constructed here is unfitted,
# so predict() would raise NotFittedError, and fitting it on this small batch
# would produce a different feature space than the model was trained on.
# The persisted pipeline is therefore used as-is.
if isinstance(best_model, Pipeline):
    pipeline = best_model
else:
    raise TypeError(
        "The loaded model is not a scikit-learn Pipeline, so no fitted "
        "preprocessor is available. Persist the fitted preprocessing steps "
        "together with the estimator and load that artifact instead."
    )

# Predict prices using the trained model
predicted_prices = pipeline.predict(X_test)

# Add the predicted prices to the original test data
test_data['predicted_price'] = predicted_prices

# Display the predicted prices
print(test_data[['item name', 'predicted_price']])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   item name        3000 non-null   object 
 1   store name       3000 non-null   object 
 2   price            3000 non-null   float64
 3   stock status     3000 non-null   object 
 4   expiration date  3000 non-null   object 
 5   quantity_g_ml    3000 non-null   int64  
 6   category         3000 non-null   object 
 7   is_discounted    3000 non-null   int64  
 8   base_price       3000 non-null   float64
 9   days_to_expire   3000 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 234.5+ KB
None
item name          0
store name         0
price              0
stock status       0
expiration date    0
quantity_g_ml      0
category           0
is_discounted      0
base_price         0
days_to_expire     0
dtype: int64
Model: Random Forest
MAE: 12.4941
RMSE: 14.6143
R