In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [117]:
# Load the raw price records from the CSV file in the local Data folder.
dataSet = pd.read_csv("./Data/Kalimati.csv")

In [101]:
# Preview the first rows to see which columns the raw file provides.
dataSet.head()

Unnamed: 0,SN,Commodity,Date,Unit,Minimum,Maximum,Average
0,0,Tomato Big(Nepali),2013-06-16,Kg,35.0,40.0,37.5
1,1,Tomato Small(Local),2013-06-16,Kg,26.0,32.0,29.0
2,2,Potato Red,2013-06-16,Kg,20.0,21.0,20.5
3,3,Potato White,2013-06-16,Kg,15.0,16.0,15.5
4,4,Onion Dry (Indian),2013-06-16,Kg,28.0,30.0,29.0


In [118]:
# SN and Unit are not used as model inputs later in the notebook, so remove them.
columns_to_drop = ["SN", "Unit"]
dataSet = dataSet.drop(columns=columns_to_drop)

In [111]:
# Confirm that SN and Unit are gone.
dataSet.head()

Unnamed: 0,Commodity,Date,Minimum,Maximum,Average
0,Tomato Big(Nepali),2013-06-16,35.0,40.0,37.5
1,Tomato Small(Local),2013-06-16,26.0,32.0,29.0
2,Potato Red,2013-06-16,20.0,21.0,20.5
3,Potato White,2013-06-16,15.0,16.0,15.5
4,Onion Dry (Indian),2013-06-16,28.0,30.0,29.0


In [119]:
# Convert the Date strings to datetime64 so the .dt accessor can be used on them.
parsed_dates = pd.to_datetime(dataSet["Date"])
dataSet = dataSet.assign(Date=parsed_dates)
dataSet["Date"]

0        2013-06-16
1        2013-06-16
2        2013-06-16
3        2013-06-16
4        2013-06-16
            ...    
197156   2021-05-13
197157   2021-05-13
197158   2021-05-13
197159   2021-05-13
197160   2021-05-13
Name: Date, Length: 197161, dtype: datetime64[ns]

In [120]:
# Replace the full timestamp with two integer calendar features (year and month).
dataSet = dataSet.assign(
    Year=dataSet["Date"].dt.year,
    month=dataSet["Date"].dt.month,
).drop(columns="Date")

In [121]:
# Keep the commodity label, the two calendar features and the Average price;
# Average is last so that it can be split off positionally later.
new_dataSet = dataSet.loc[:, ["Commodity", "Year", "month", "Average"]]

In [122]:
# Show the reduced frame (pandas truncates the printed rows automatically).
print(new_dataSet)

                  Commodity  Year  month  Average
0        Tomato Big(Nepali)  2013      6     37.5
1       Tomato Small(Local)  2013      6     29.0
2                Potato Red  2013      6     20.5
3              Potato White  2013      6     15.5
4        Onion Dry (Indian)  2013      6     29.0
...                     ...   ...    ...      ...
197156    Garlic Dry Nepali  2021      5    110.0
197157     Fish Fresh(Rahu)  2021      5    275.0
197158  Fish Fresh(Bachuwa)  2021      5    230.0
197159   Fish Fresh(Chhadi)  2021      5    225.0
197160  Fish Fresh(Mungari)  2021      5    245.0

[197161 rows x 4 columns]


In [126]:
# Per-column check: True would mean the column contains at least one missing value.
print(new_dataSet.isna().any())

Commodity    False
Year         False
month        False
Average      False
dtype: bool


In [127]:
# Count how many price records each commodity has.
value_counts = new_dataSet['Commodity'].value_counts()
print(value_counts)

Ginger                  2751
Cauli Local             2750
Cabbage(Local)          2749
Chilli Dry              2748
Raddish White(Local)    2747
                        ... 
Maize                     24
Mango(Calcutte)           23
Sweet Lime                14
Musk Melon                10
Mango(Chousa)              2
Name: Commodity, Length: 132, dtype: int64


In [324]:
# Look up, for every row, the number of records of that row's commodity.
records_per_row = value_counts[new_dataSet["Commodity"]]
# True marks rows whose commodity has fewer than 2750 records (rows to discard).
mask = records_per_row.values < 2750
print(mask)

[ True  True  True ... False  True  True]


In [325]:
# Keep only the rows that were NOT flagged, i.e. commodities with at least 2750 records.
# NOTE: mask was built against the unfiltered frame, so this cell cannot be re-run
# on its own; rebuild new_dataSet and mask first.
new_dataSet = new_dataSet.loc[~mask]

In [326]:
# Number of distinct commodities that survived the filter.
new_dataSet['Commodity'].nunique()

2

In [327]:
# Recompute the per-commodity record counts on the filtered frame.
value_counts = new_dataSet['Commodity'].value_counts()
print(value_counts)

Ginger         2751
Cauli Local    2750
Name: Commodity, dtype: int64


In [328]:
# Preview the filtered frame; the index labels are those of the original rows.
new_dataSet.head()

Unnamed: 0,Commodity,Year,month,Average
7,Cauli Local,2013,6,32.5
63,Ginger,2013,6,145.0
81,Cauli Local,2013,6,27.5
136,Ginger,2013,6,145.0
154,Cauli Local,2013,6,27.5


In [329]:
# Split positionally: every column except the last is a feature, the last is the target.
feature_frame = new_dataSet.iloc[:, :-1]
target_series = new_dataSet.iloc[:, -1]
x = feature_frame.values
y = target_series.values

In [330]:
# Show the feature matrix followed by its number of rows.
print(x, len(x), sep="\n")

[['Cauli Local' 2013 6]
 ['Ginger' 2013 6]
 ['Cauli Local' 2013 6]
 ...
 ['Ginger' 2021 5]
 ['Cauli Local' 2021 5]
 ['Ginger' 2021 5]]
5501


In [331]:
# The target vector must have as many entries as the feature matrix has rows.
print(y.shape[0])

5501


In [345]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the commodity label); the remaining columns
# (year and month) are passed through unchanged after the encoded columns.
commodity_step = ("encoder", OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[commodity_step], remainder="passthrough")
x_t = np.array(ct.fit_transform(x))

In [368]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(x_t, y, test_size=0.1, random_state=40)
x_train, x_test, y_train, y_test = split_parts

In [369]:
# Inspect the encoded matrix: two one-hot columns followed by year and month.
print(x_t)

[[1.0 0.0 2013 6]
 [0.0 1.0 2013 6]
 [1.0 0.0 2013 6]
 ...
 [0.0 1.0 2021 5]
 [1.0 0.0 2021 5]
 [0.0 1.0 2021 5]]


In [370]:
from sklearn.linear_model import LinearRegression

# Baseline model: plain linear regression on the encoded features.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

In [371]:
# Predict the held-out prices and print them side by side with the true values
# (column 0 = prediction, column 1 = actual).
y_pred = regressor.predict(x_test)
print(np.column_stack((y_pred, y_test)))

[[ 56.875     57.5     ]
 [102.953125 195.      ]
 [ 55.375     62.5     ]
 ...
 [ 82.140625  65.      ]
 [ 45.75      47.5     ]
 [ 42.015625  47.5     ]]


In [378]:
from sklearn.metrics import r2_score

# R² of the linear baseline on the held-out rows.
linear_r2 = r2_score(y_test, y_pred)
print(linear_r2)

0.3665560725448318


In [376]:
from sklearn.tree import DecisionTreeRegressor

# Second model: a single decision tree, seeded so the fit is repeatable.
d_regressor = DecisionTreeRegressor(random_state=0)
d_regressor.fit(X=x_train, y=y_train)

In [379]:
from sklearn.metrics import r2_score

# R² of the decision tree on the held-out rows.
y_pred_1 = d_regressor.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred_1)

0.9313112450633353

In [386]:
from sklearn.ensemble import RandomForestRegressor

# Third model: a random forest of 15 trees, seeded so the fit is repeatable.
r_regressor = RandomForestRegressor(n_estimators=15, random_state=40)
r_regressor.fit(X=x_train, y=y_train)

In [388]:
from sklearn.metrics import r2_score

# R² of the random forest on the held-out rows.
y_pred_2 = r_regressor.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred_2)

0.9311875877217155

In [389]:
from sklearn.preprocessing import PolynomialFeatures

# Fourth model: polynomial regression. Expand the training features into all
# polynomial terms up to degree 4, then fit a linear model on the expanded matrix.
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(X=x_train)
pol_regressor = LinearRegression()
pol_regressor.fit(X=x_poly, y=y_train)

In [390]:
from sklearn.metrics import r2_score

# Expand the held-out features with the transformer that was already fitted on
# x_train. Use transform(), not fit_transform(): a preprocessing step must never
# be re-fitted on test data, otherwise the evaluation can leak test information.
x_poly_test = poly_reg.transform(x_test)
y_pred_3 = pol_regressor.predict(x_poly_test)
# R² of the polynomial model on the held-out rows.
r2_score(y_test, y_pred_3)

0.6187230081442702

**CONCLUSION:**
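*Decision tree regression and random forest regression are preferred because they achieve the highest R² scores on the test set (about 0.93 each, compared with about 0.62 for polynomial regression and about 0.37 for linear regression).*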