In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand colour codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Print the full path of each file under the Kaggle input directory, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        # Guard on non-empty so a directory without files prints nothing at all.
        print(*(os.path.join(dirname, filename) for filename in filenames), sep='\n')

In [None]:
### Importing dataset

In [None]:
# Location of the cardamom auction price history CSV inside the Kaggle input directory.
csv_path = "/kaggle/input/indian-small-cardamom-price-history-csv/Indian Small Cardamom Price History.csv"
df = pd.read_csv(csv_path)

In [None]:
### Reading Dataset and performing EDA

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (rows, columns) of the raw data, before any cleaning.
df.shape

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
df.describe(include="all")

In [None]:
# Column dtypes and non-null counts for the raw data.
df.info()

In [None]:
### Checking for null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
### Checking for duplicates

In [None]:
# Flag rows that are exact repeats of an earlier row, then report how many there are.
dups = df.duplicated()
duplicate_count = dups.sum()
print('Number of duplicate rows = %d' % duplicate_count)

In [None]:
# Number of rows recorded for each auctioneer.
df['Auctioneer'].value_counts()

In [None]:
### Several auctioneers can report different prices on the same day, so we keep only one price row per auction date

In [None]:
# Keep a single row per auction date. With keep="first", the row retained for
# each date is whichever one appears first in the frame; rows from the other
# auctioneers on that date are discarded rather than averaged.
df = df.drop_duplicates(subset=["Date of Auction"], keep="first")

In [None]:
# (rows, columns) after keeping one row per auction date.
df.shape

In [None]:
### Now we have a clean dataset with a single price row for each auction date, from November 2014 to 22 Apr 2021

In [None]:
### Checking for outliers

In [None]:
# Numeric columns to inspect for outliers.
cols = ['No.of Lots','Total Qty Arrived (Kgs)','Qty Sold (Kgs)','MaxPrice (Rs./Kg)','Avg.Price (Rs./Kg)']
for i in cols:
    # Pass the values by keyword. seaborn 0.11 warns when they are passed
    # positionally, and seaborn >= 0.12 treats the first positional argument
    # as `data`, which changes the plot. `x=` is unambiguous on both and
    # labels the axis with the column name.
    sns.boxplot(x=df[i], whis=1.5)
    plt.grid()
    plt.title('With Outliers', fontsize=16)
    plt.show()

In [None]:
### Treating outliers, since leaving them as they are may create havoc in regression models

In [None]:
def remove_outlier(col):
    """Return the IQR-based bounds used to cap outliers in ``col``.

    Despite its name, this function removes nothing: it only computes the
    lower and upper fences, and the caller decides how to apply them.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse. NaN entries are ignored when computing quartiles.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where ``lower_range = Q1 - 1.5 * IQR``
        and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # nanpercentile skips missing values; plain np.percentile returns NaN
    # bounds as soon as one NaN is present, which would silently disable the
    # capping done by the caller. No explicit sort is needed beforehand: the
    # percentile routine does not require ordered input.
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
# Cap (rather than drop) extreme values: every non-object column is clipped
# to the bounds returned by remove_outlier, first from above, then from below.
numeric_columns = [name for name in df.columns if df[name].dtype != 'object']
for column in numeric_columns:
    lr, ur = remove_outlier(df[column])
    capped_above = np.where(df[column] > ur, ur, df[column])
    df[column] = np.where(capped_above < lr, lr, capped_above)

In [None]:
# Re-plot the same numeric columns to confirm the capping took effect.
cols = ['No.of Lots','Total Qty Arrived (Kgs)','Qty Sold (Kgs)','MaxPrice (Rs./Kg)','Avg.Price (Rs./Kg)']
for i in cols:
    # Pass the values by keyword. seaborn 0.11 warns when they are passed
    # positionally, and seaborn >= 0.12 treats the first positional argument
    # as `data`, which changes the plot. `x=` is unambiguous on both and
    # labels the axis with the column name.
    sns.boxplot(x=df[i], whis=1.5)
    plt.grid()
    plt.title('After Outlier Removal', fontsize=16)
    plt.show()

In [None]:
### Checking the cleaned dataset
df.info()

In [None]:
# Preview the first rows after outlier capping.
df.head()

In [None]:
# Drop the 'Sno' column. Reassigning (instead of inplace=True) with
# errors='ignore' makes the cell safe to re-run: the original raised KeyError
# on a second execution because 'Sno' was already gone.
df = df.drop(columns=['Sno'], errors='ignore')
df

In [None]:
### The date is in string format and hence needs to be converted to datetime format to help with model creation

In [None]:
from datetime import datetime

# Try the format string on one sample value before applying it to the whole
# column: day, abbreviated month name, two-digit year.
AUCTION_DATE_FORMAT = '%d-%b-%y'
date_str1 = '22-Apr-21'
date_dt1 = datetime.strptime(date_str1, AUCTION_DATE_FORMAT)

# Show the parsed datetime
print(date_dt1)

In [None]:
# Parse the auction dates (strings such as '22-Apr-21') into datetime values.
df['Date of Auction'] = pd.to_datetime(df['Date of Auction'], format='%d-%b-%y')

# Confirm the date column is in datetime format.
# df.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

In [None]:
# Preview the first rows after the date conversion.
df.head()

In [None]:
### Sorting the dataset by date

In [None]:
# Order the rows chronologically, oldest auction first (ascending is the default).
sorted_df = df.sort_values("Date of Auction")
sorted_df

In [None]:
# Build a DatetimeIndex from the auction dates and use it as the row index.
auction_dates = pd.DatetimeIndex(sorted_df["Date of Auction"].values)
df = sorted_df.set_index(auction_dates)
# Display the re-indexed frame
df

In [None]:
# Keep only the average price column. The explicit .copy() makes the result
# an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[['Avg.Price (Rs./Kg)']].copy()
df

In [None]:
# Forecast horizon: how many rows (auction days) ahead the target lies.
prediction_days = 1
# Target column: the average price shifted up by the horizon, so each row
# holds the price observed `prediction_days` rows later. The last
# `prediction_days` rows have no future value and become NaN.
future_price = df['Avg.Price (Rs./Kg)'].shift(-prediction_days)
df['prediction'] = future_price
df

In [None]:
# Feature matrix: every column except the target, as a 2-D array.
feature_frame = df.drop(columns='prediction')
X = np.array(feature_frame)
# Trim the last prediction_days + 1 rows: the final prediction_days rows have
# a NaN target, and the row just before them is the one read back from
# temp_df.tail(1) in the final check cells.
rows_to_keep = len(df) - prediction_days - 1
X = X[:rows_to_keep]
X

In [None]:
# Target vector: the shifted price column as a 1-D array.
target_column = df['prediction']
y = np.array(target_column)
# Drop the same trailing prediction_days + 1 rows that are removed from X.
y = y[:-(prediction_days + 1)]
y

In [None]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle.
# NOTE(review): train_test_split shuffles rows by default, so the test set is
# not the most recent period. Consider a chronological split for time series.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a small random forest (two trees, fixed seed) on the training split.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=1, n_estimators=2).fit(X_train, y_train)
# Score on the training data (for a regressor, score() is R^2).
model.score(X_train, y_train)

In [None]:
# Score on the held-out test split (for a regressor, score() is R^2, not accuracy).
model.score(X_test, y_test)

In [None]:
### As can be seen, the model performs very well, with an R² score of 0.9851 on the test set (for a regressor, `model.score` reports R², not classification accuracy)

In [None]:
# Compare predicted and actual values on the test split:
# predictions first, then a blank line, then the true targets.
prediction = model.predict(X_test)
print(prediction, "", y_test, sep="\n")

In [None]:
# Get the validation data for the model.
# All rows except the last prediction_days (those have no known target).
temp_df = df[:-prediction_days]
# Price on the last remaining row, taken by position with .iloc. The previous
# `[...][0]` form relied on integer keys falling back to positions on a
# DatetimeIndex, which pandas has deprecated.
X_val = temp_df['Avg.Price (Rs./Kg)'].iloc[-1]
# Show the value
print(X_val)

In [None]:
# Predict from the single held-out price; predict() expects a 2-D input.
prediction = model.predict([[X_val]])
# Print the price of cardamom for the next n days
print('The price of cardamom in', prediction_days, 'day(s) is predicted to be', prediction)
# Print the actual value for comparison. .iloc[-1] selects the last row by
# position; the previous `[0]` lookup on a DatetimeIndex relied on a
# deprecated positional fallback.
print('The actual price of cardamom was', temp_df['prediction'].iloc[-1])

In [None]:
### The above model can predict one day into the future with an R² score of approximately 0.985.
### However, we need to predict at least one month into the future.