In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from feature_engine.creation import CyclicalFeatures
from feature_engine.encoding import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

In [2]:
data = pd.read_csv('Dataset.csv',index_col='date',parse_dates=True)
data.head()

Unnamed: 0_level_0,market,commodity,price,unit
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-05-20,Wholesale,Rice,53.562,price in rupees per quintal
2024-05-20,Wholesale,Vanaspati Packed,106.828,price in rupees per hundred litres
2024-05-20,Wholesale,Gram Dal,80.65,price in rupees per quintal
2024-05-20,Wholesale,Tur Arhar Dal,151.984,price in rupees per quintal
2024-05-20,Wholesale,Urad Dal,127.156,price in rupees per quintal


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 127248 entries, 2024-05-20 to 2014-01-02
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   market     127248 non-null  object 
 1   commodity  127248 non-null  object 
 2   price      126079 non-null  float64
 3   unit       127248 non-null  object 
dtypes: float64(1), object(3)
memory usage: 4.9+ MB


In [4]:
#Removing unit column and assigning unit measurements for each category of Commodity
data.groupby(by=['commodity','unit'])['price'].mean()

commodity             unit                              
Atta Wheat            price in rupees per kilogram           37.356470
                      price in rupees per quintal            30.809908
Gram Dal              price in rupees per kilogram           78.108603
                      price in rupees per quintal            69.372876
Groundnut Oil Packed  price in rupees per hundred litres    137.424513
                      price in rupees per litre             149.233083
Gur                   price in rupees per kilogram           49.198762
                      price in rupees per quintal            42.117251
Masoor Dal            price in rupees per kilogram           90.268071
                      price in rupees per quintal            78.209935
Milk                  price in rupees per hundred litres     37.510172
                      price in rupees per litre              39.608280
Moong Dal             price in rupees per kilogram          100.474466
                    

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 127248 entries, 2024-05-20 to 2014-01-02
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   market     127248 non-null  object 
 1   commodity  127248 non-null  object 
 2   price      126079 non-null  float64
 3   unit       127248 non-null  object 
dtypes: float64(1), object(3)
memory usage: 4.9+ MB


In [6]:
def create_features(df):
    """
    Create time series features based on time series index.
    """
    df = df.copy()
    df['dayofweek'] = df.index.dayofweek
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['dayofmonth'] = df.index.day
    df['weekofyear'] = df.index.isocalendar().week
    return df

df = create_features(data)

In [7]:
df

Unnamed: 0_level_0,market,commodity,price,unit,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-05-20,Wholesale,Rice,53.562,price in rupees per quintal,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Vanaspati Packed,106.828,price in rupees per hundred litres,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Gram Dal,80.650,price in rupees per quintal,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Tur Arhar Dal,151.984,price in rupees per quintal,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Urad Dal,127.156,price in rupees per quintal,0,2,5,2024,141,20,21
...,...,...,...,...,...,...,...,...,...,...,...
2014-01-02,Wholesale,Soya Oil Packed,73.500,price in rupees per hundred litres,3,1,1,2014,2,2,1
2014-01-02,Retail,Vanaspati Packed,67.500,price in rupees per litre,3,1,1,2014,2,2,1
2014-01-02,Wholesale,Vanaspati Packed,66.000,price in rupees per hundred litres,3,1,1,2014,2,2,1
2014-01-02,Wholesale,Rice,27.750,price in rupees per quintal,3,1,1,2014,2,2,1


In [8]:
#Getting Units for each commodity
Units = pd.DataFrame(df[df['market'] == 'Retail'].groupby(by=['commodity','unit'])['price'].agg('mean'))
Units.drop(columns=['price'],inplace = True)

Units = dict(Units.index)
df.drop(columns=['unit'],inplace = True)
df.head()

Unnamed: 0_level_0,market,commodity,price,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-05-20,Wholesale,Rice,53.562,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Vanaspati Packed,106.828,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Gram Dal,80.65,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Tur Arhar Dal,151.984,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Urad Dal,127.156,0,2,5,2024,141,20,21


In [9]:
imputer = KNNImputer(n_neighbors= 5)
df ['price'] = imputer.fit_transform(df[['price']])

In [10]:
df.isna().sum()

market        0
commodity     0
price         0
dayofweek     0
quarter       0
month         0
year          0
dayofyear     0
dayofmonth    0
weekofyear    0
dtype: int64

In [14]:
#Encoding Cyclic Features i.e (Day,Month,dayofweek,dayofmonth,dayofyear,dayofmonth)
cyclic_features = ['dayofweek','quarter','month','dayofyear','dayofmonth','weekofyear']
cat_feature = ['market','commodity']
cyclic_encoder = CyclicalFeatures(variables=cyclic_features,drop_original=True)
cat_encoder = OneHotEncoder(variables = cat_feature)
preprocessor = Pipeline(steps=[
    ('Cyclic_features',cyclic_encoder),
    ('cat_encoder',cat_encoder)
])


In [17]:
df.index.max()

Timestamp('2024-05-20 00:00:00')

In [21]:
split_date = '2024-03-01'
df_train = df.loc[df.index < split_date].copy()
df_test = df.loc[df.index >= split_date].copy()


In [30]:
train_cols = df.drop(columns=['price']).columns
test_col = ['price']
print("Train Columns : ",train_cols)
print("Test Columns : ",test_col)

Train Columns :  Index(['market', 'commodity', 'dayofweek', 'quarter', 'month', 'year',
       'dayofyear', 'dayofmonth', 'weekofyear'],
      dtype='object')
Test Columns :  ['price']


In [37]:
preprocessor.fit(df_train[train_cols])

In [39]:
X_train = preprocessor.transform(df_train[train_cols])
X_test = preprocessor.transform(df_test[train_cols])


In [51]:
df_test[train_cols]

Unnamed: 0_level_0,market,commodity,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-05-20,Wholesale,Rice,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Vanaspati Packed,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Gram Dal,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Tur Arhar Dal,0,2,5,2024,141,20,21
2024-05-20,Wholesale,Urad Dal,0,2,5,2024,141,20,21
...,...,...,...,...,...,...,...,...,...
2024-03-01,Retail,Sugar,4,1,3,2024,61,1,9
2024-03-01,Wholesale,Wheat,4,1,3,2024,61,1,9
2024-03-01,Retail,Masoor Dal,4,1,3,2024,61,1,9
2024-03-01,Wholesale,Masoor Dal,4,1,3,2024,61,1,9


In [65]:
from xgboost import XGBRegressor
xg = XGBRegressor(device="cuda",eval_metric = 'rmse',n_estimators = 500)
xg.fit(X_train,df_train[test_col],verbose = 2)

In [66]:
train_predicts = xg.predict(X_train)
test_predicts = xg.predict(X_test)

In [67]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(df_train[test_col],train_predicts))
print(mean_squared_error(df_test[test_col],test_predicts))

6.183085202058439
12.058731842928411


In [71]:
data = {'Feature_names':xg.feature_names_in_,'Importance':xg.feature_importances_}

In [77]:
pd.DataFrame(data)

Unnamed: 0,Feature_names,Importance
0,year,0.004361
1,dayofweek_sin,7.5e-05
2,dayofweek_cos,6.2e-05
3,quarter_sin,0.000666
4,quarter_cos,0.000909
5,month_sin,0.000464
6,month_cos,0.000638
7,dayofyear_sin,0.000273
8,dayofyear_cos,0.000218
9,dayofmonth_sin,4.1e-05


In [87]:
with open('Preprocessor.pkl',mode = 'wb') as file:
    pickle.dump(preprocessor,file)

In [88]:
with open('Preprocessor.pkl',mode = 'rb') as file:
    pre = pickle.load(file)

In [92]:
pre.feature_names_in_


['market',
 'commodity',
 'dayofweek',
 'quarter',
 'month',
 'year',
 'dayofyear',
 'dayofmonth',
 'weekofyear']

In [93]:
with open('Model.pkl',mode = 'wb') as file:
    pickle.dump(xg,file)

In [94]:
xg