# This code was written by

* **[Ahmad Almouse](https://www.linkedin.com/in/ahmadalmousa2000)**
* **[Maha Ismail](https://www.linkedin.com/in/maha-ismail-35a553213)**
* **[Shaker Abu Rassa'](https://www.linkedin.com/in/shaker-abu-rassa)**

project presentation [slides](https://www.canva.com/design/DAFcLA_uZsk/TM_dyLG8LRTAhjv2w0RZWw/view?utm_content=DAFcLA_uZsk&utm_campaign=designshare&utm_medium=link&utm_source=publishsharelink)

# import the needed libraries 

In [20]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt

# load the data


In [21]:
# Load the raw food-price table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='food_prices_jor.csv')


# data prep

## df prep

In [22]:
# Parse the date column, derive year/month features from it, keep only rows
# after 2016, and drop the columns this notebook does not use afterwards.
df = (
    df.assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .loc[lambda frame: frame.year > 2016]
    .drop(columns=['H-date', 'category', 'unit'])
)

## A_df prep

In [23]:
# Mean price per date and commodity, computed over de-duplicated rows.
# month and year are included in the grouping keys so they remain available
# as ordinary columns after reset_index().
A_df = (
    df.drop_duplicates()
    .groupby(['date', 'commodity', 'month', 'year'])['price']
    .mean()
    .reset_index()
)
A_df

Unnamed: 0,date,commodity,month,year,price
0,2017-01-15,Apples (red),1,2017,1.335000
1,2017-01-15,Bananas,1,2017,0.802727
2,2017-01-15,"Beans (fava, dry)",1,2017,1.482500
3,2017-01-15,Bread (pita),1,2017,0.167500
4,2017-01-15,Bulgur,1,2017,0.917273
...,...,...,...,...,...
2432,2022-12-15,Salt (iodised),12,2022,0.282500
2433,2022-12-15,Spinach,12,2022,0.431667
2434,2022-12-15,Sugar,12,2022,0.734167
2435,2022-12-15,Tomatoes,12,2022,0.378333


## df2 prep

In [24]:
# Attach the averaged price (left side -> price_x) to every individual price
# record (right side -> price_y) for the same year, month and commodity,
# discard rows with missing values, and drop the two date columns the merge
# produced.
df2 = (
    A_df.merge(df, on=['year', 'month', 'commodity'], how='left')
    .dropna()
    .drop(columns=['date_x', 'date_y'])
)

# Integer-encode the two categorical columns. pd.factorize numbers the labels
# in order of first appearance, which is the same code that
# unique().tolist().index(label) yields, without a linear list search per row.
commodity_codes, commodity_labels = pd.factorize(df2['commodity'])
market_codes, market_labels = pd.factorize(df2['market'])

# Keep the label lists as plain Python lists: a label's position in the list
# is its integer code, and later cells reuse these lists.
commoditys = commodity_labels.tolist()
markets = market_labels.tolist()

df2 = df2.assign(commodity=commodity_codes, market=market_codes)

df2

Unnamed: 0,commodity,month,year,price_x,market,price_y
0,0,1,2017,1.335000,0,1.36
1,0,1,2017,1.335000,1,1.41
2,0,1,2017,1.335000,2,1.52
3,0,1,2017,1.335000,3,1.07
4,0,1,2017,1.335000,4,1.28
...,...,...,...,...,...,...
28803,33,12,2022,0.969167,7,0.89
28804,33,12,2022,0.969167,8,1.39
28805,33,12,2022,0.969167,9,0.95
28806,33,12,2022,0.969167,10,1.12


# data splitting

In [25]:
from sklearn.model_selection import train_test_split


In [26]:
# Features are every column except the target price_y.
# NOTE(review): price_x is an average computed from the same price records
# that supply price_y, so the reported scores may be optimistic - confirm
# this is intended.
X = df2.drop(columns=['price_y'])
y = df2['price_y']

# shuffle=False keeps the rows in their existing order, so the last 30% of
# the rows become the test set.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    shuffle=False,
)


# modeling 

In [27]:
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE

In [28]:
# Gradient-boosted regressor with 10 trees.
# 'reg:squarederror' is the current name of the squared-error objective; the
# previous 'reg:linear' spelling is a deprecated alias for it.
# random_state is the scikit-learn wrapper's seed argument.
modle = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10,
    random_state=123,
)
modle.fit(train_X, train_y)



## model testing 

In [29]:
# Root-mean-squared error on the training split, then on the held-out split.
for rmse_template, features, target in (
    ("RMSE train : % f", train_X, train_y),
    ("RMSE test: % f", test_X, test_y),
):
    pred = modle.predict(features)
    rmse = np.sqrt(MSE(target, pred))
    print(rmse_template % rmse)

# The estimator's own score() on the same two splits.
for score_template, features, target in (
    ("score train: % f", train_X, train_y),
    ("score test: % f", test_X, test_y),
):
    print(score_template % modle.score(features, target))


RMSE train :  0.182326
RMSE test:  0.218960
score train:  0.994235
score test:  0.991851


## model testing with a different df

### data prep

#### Mdf prep

In [30]:
# Per-record prices from the second CSV: parse the date, derive year/month,
# and keep only rows dated after 2022.
Mdf = (
    pd.read_csv('2023.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .loc[lambda frame: frame.year > 2022]
)
Mdf

Unnamed: 0,date,market,category,commodity,unit,price,year,month
30897,2023-01-15,Ajloun,cereals and tubers,Bread (pita),KG,0.32,2023,1
30898,2023-01-15,Ajloun,cereals and tubers,Bulgur,KG,0.86,2023,1
30899,2023-01-15,Ajloun,cereals and tubers,Pasta,300 G,0.35,2023,1
30900,2023-01-15,Ajloun,cereals and tubers,Potatoes,KG,0.65,2023,1
30901,2023-01-15,Ajloun,cereals and tubers,Rice (imported),KG,1.37,2023,1
...,...,...,...,...,...,...,...,...
31303,2023-01-15,Zarqa,vegetables and fruits,Cucumbers,KG,0.46,2023,1
31304,2023-01-15,Zarqa,vegetables and fruits,Garlic,KG,3.51,2023,1
31305,2023-01-15,Zarqa,vegetables and fruits,Onions,KG,0.59,2023,1
31306,2023-01-15,Zarqa,vegetables and fruits,Spinach,KG,0.30,2023,1


#### Adf

In [31]:
# Mean price per date and commodity for rows dated after 2022, built from the
# same CSV after removing duplicate rows.
Adf = (
    pd.read_csv('2023.csv')
    .drop_duplicates()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .loc[lambda frame: frame.year > 2022]
    .groupby(['date', 'commodity', 'month', 'year'])['price']
    .mean()
    .reset_index()
)
Adf

Unnamed: 0,date,commodity,month,year,price
0,2023-01-15,Apples (red),1,2023,1.3075
1,2023-01-15,Bananas,1,2023,0.902727
2,2023-01-15,"Beans (fava, dry)",1,2023,1.53
3,2023-01-15,Bread (pita),1,2023,0.324167
4,2023-01-15,Bulgur,1,2023,0.9125
5,2023-01-15,Cheese (picon),1,2023,0.814167
6,2023-01-15,"Cheese (white, boiled)",1,2023,5.799167
7,2023-01-15,Chickpeas,1,2023,1.884167
8,2023-01-15,Cucumbers,1,2023,0.475
9,2023-01-15,Eggs,1,2023,3.043333


#### df3 prep

In [32]:
# Attach the averaged price (price_x) to each individual record (price_y)
# for the same year, month and commodity, then drop rows with missing values
# and the columns that are not model inputs here.
df3 = (
    Adf.merge(Mdf, on=['year', 'month', 'commodity'], how='left')
    .dropna()
    .drop(columns=['date_x', 'date_y', 'category'])
)

# Reuse the training encodings: a label's position in the list is its code.
# Dict lookups replace a linear list search per row.
commodity_to_code = {label: code for code, label in enumerate(commoditys)}
market_to_code = {label: code for code, label in enumerate(markets)}

# Fail with a message naming the offending labels instead of the bare
# "x is not in list" that list.index would raise.
for column, known in (('commodity', commodity_to_code), ('market', market_to_code)):
    unseen = sorted(set(df3[column]) - known.keys())
    if unseen:
        raise ValueError(f"{column} values not present in the training encoding: {unseen}")

df3 = df3.assign(
    commodity=df3['commodity'].map(commodity_to_code),
    market=df3['market'].map(market_to_code),
)
df3

Unnamed: 0,commodity,month,year,price_x,market,unit,price_y
0,0,1,2023,1.3075,0,KG,1.31
1,0,1,2023,1.3075,1,KG,0.96
2,0,1,2023,1.3075,2,KG,1.20
3,0,1,2023,1.3075,3,KG,1.32
4,0,1,2023,1.3075,4,KG,1.37
...,...,...,...,...,...,...,...
406,33,1,2023,0.9725,7,1.5 KG,0.90
407,33,1,2023,0.9725,8,1.5 KG,1.39
408,33,1,2023,0.9725,9,1.5 KG,0.95
409,33,1,2023,0.9725,10,1.5 KG,1.12


### testing 

In [33]:
# Features: everything except the target and the textual unit column.
tX = df3.drop(columns=['price_y', 'unit'])
# Target: the individual record price.
ty = df3['price_y']

pred = modle.predict(tX)
rmse = np.sqrt(MSE(ty, pred))
print("RMSE : % f" % rmse)
print("score : % f" % modle.score(tX, ty))

RMSE :  0.242865
score :  0.989887


# exporting the model

In [34]:
import os
import pickle

# Create the output directory first so the writes below do not fail with
# FileNotFoundError when 'dep' does not exist yet.
os.makedirs('dep', exist_ok=True)

# Persist the fitted regressor.
with open('dep/market model.pkl', 'wb') as model_file:
    pickle.dump(modle, model_file)

# Persist the market label list; a label's position in the list is the
# integer code used for the 'market' feature.
with open('dep/market_encoded.pkl', 'wb') as markets_file:
    pickle.dump(markets, markets_file)

# NOTE(review): the commodity label list (`commoditys`) is encoded the same
# way but is not exported here - confirm whether consumers of the model need it.