In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

In [None]:
# Load the onion price CSV into the working frame.
# The path is relative, so it is resolved against the current working directory.
csv_path = 'onion_price_2012_to_2024_cleaned.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print column names, dtypes and non-null counts to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543922 entries, 0 to 543921
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sl no.                     543922 non-null  float64
 1   District Name              543922 non-null  object 
 2   Market Name                543922 non-null  object 
 3   Commodity                  543922 non-null  object 
 4   Variety                    543922 non-null  object 
 5   Grade                      543921 non-null  object 
 6   Min Price (Rs./Quintal)    543921 non-null  float64
 7   Max Price (Rs./Quintal)    543921 non-null  float64
 8   Modal Price (Rs./Quintal)  543921 non-null  float64
 9   Price Date                 543921 non-null  object 
dtypes: float64(4), object(6)
memory usage: 41.5+ MB


In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,1.0,Fazilka,Abohar,Onion,Onion,FAQ,450.0,650.0,500.0,02 Jan 2012
1,2.0,Fazilka,Abohar,Onion,Onion,FAQ,450.0,650.0,500.0,07 May 2012
2,3.0,Fazilka,Abohar,Onion,Onion,FAQ,450.0,650.0,500.0,11 May 2012
3,4.0,Fazilka,Abohar,Onion,Onion,FAQ,500.0,800.0,700.0,07 Aug 2012
4,5.0,Fazilka,Abohar,Onion,Onion,FAQ,500.0,800.0,700.0,26 Jul 2012


In [None]:
# Count missing entries per column before any cleaning.
df.isna().sum()

Unnamed: 0,0
Sl no.,0
District Name,0
Market Name,0
Commodity,0
Variety,0
Grade,1
Min Price (Rs./Quintal),1
Max Price (Rs./Quintal),1
Modal Price (Rs./Quintal),1
Price Date,1


In [None]:
# Impute missing values in each price column with that column's own mean.
price_cols = [
    'Min Price (Rs./Quintal)',
    'Max Price (Rs./Quintal)',
    'Modal Price (Rs./Quintal)',
]
for col in price_cols:
    df[col] = df[col].fillna(df[col].mean())


In [None]:
# Parse the date strings up front; anything unparseable becomes NaT.
parsed_dates = pd.to_datetime(df['Price Date'], errors='coerce')

# Drop the unused columns, keep only rows with a valid date, then derive the
# calendar parts (Year, Month, Day) from the parsed date.
df = (
    df.drop(columns=['Variety', 'Grade', 'Sl no.'])
      .assign(**{'Price Date': parsed_dates})
      .dropna(subset=['Price Date'])
      .assign(
          Year=lambda frame: frame['Price Date'].dt.year,
          Month=lambda frame: frame['Price Date'].dt.month,
          Day=lambda frame: frame['Price Date'].dt.day,
      )
)
df.head()

Unnamed: 0,District Name,Market Name,Commodity,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date,Year,Month,Day
0,Fazilka,Abohar,Onion,450.0,650.0,500.0,2012-01-02,2012,1,2
1,Fazilka,Abohar,Onion,450.0,650.0,500.0,2012-05-07,2012,5,7
2,Fazilka,Abohar,Onion,450.0,650.0,500.0,2012-05-11,2012,5,11
3,Fazilka,Abohar,Onion,500.0,800.0,700.0,2012-08-07,2012,8,7
4,Fazilka,Abohar,Onion,500.0,800.0,700.0,2012-07-26,2012,7,26


In [None]:
# Re-check missing entries per column after imputation and date filtering.
df.isna().sum()

Unnamed: 0,0
District Name,0
Market Name,0
Commodity,0
Min Price (Rs./Quintal),0
Max Price (Rs./Quintal),0
Modal Price (Rs./Quintal),0
Price Date,0
Year,0
Month,0
Day,0


In [None]:
# Calculate average price as the plain mean of the min, max and modal quotes
df['Average Price'] = (
    df['Min Price (Rs./Quintal)']
    + df['Max Price (Rs./Quintal)']
    + df['Modal Price (Rs./Quintal)']
) / 3

# Categorize the average price into 'Low' (<= 1000), 'Mid' (<= 2000), 'High' (> 2000).
# pd.cut bins are right-closed by default, so without include_lowest=True an
# average of exactly 0 falls outside every bin and is silently labelled NaN.
# Negative averages still map to NaN.
df['Price Category'] = pd.cut(
    df['Average Price'],
    bins=[0, 1000, 2000, float('inf')],
    labels=['Low', 'Mid', 'High'],
    include_lowest=True,
)

In [None]:
# Preview the frame with the new 'Average Price' and 'Price Category' columns.
df.head(n=5)

Unnamed: 0,District Name,Market Name,Commodity,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date,Year,Month,Day,Average Price,Price Category
0,Fazilka,Abohar,Onion,450.0,650.0,500.0,2012-01-02,2012,1,2,533.333333,Low
1,Fazilka,Abohar,Onion,450.0,650.0,500.0,2012-05-07,2012,5,7,533.333333,Low
2,Fazilka,Abohar,Onion,450.0,650.0,500.0,2012-05-11,2012,5,11,533.333333,Low
3,Fazilka,Abohar,Onion,500.0,800.0,700.0,2012-08-07,2012,8,7,666.666667,Low
4,Fazilka,Abohar,Onion,500.0,800.0,700.0,2012-07-26,2012,7,26,666.666667,Low


In [None]:
# Model inputs are the calendar parts of the date; the label is the price bucket.
feature_columns = ['Year', 'Month', 'Day']
X = df[feature_columns]
y = df['Price Category']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Candidate classifiers, keyed by the display name used in the evaluation report.
# The estimators whose training involves randomness (decision tree, MLP) are
# seeded with the same value as the train/test split so that a fresh
# "Restart & Run All" reproduces the reported scores.
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=500, random_state=42)
}

In [None]:
# Fit every candidate on the training split and report its test-set metrics.
for name, estimator in models.items():
    # scikit-learn's fit() returns the estimator itself, so the calls can be chained.
    y_pred = estimator.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2%}")
    print(classification_report(y_test, y_pred))

KNN Accuracy: 60.96%
              precision    recall  f1-score   support

        High       0.62      0.65      0.64     40741
         Low       0.64      0.66      0.65     54796
         Mid       0.58      0.54      0.56     67640

    accuracy                           0.61    163177
   macro avg       0.61      0.62      0.61    163177
weighted avg       0.61      0.61      0.61    163177

Naive Bayes Accuracy: 53.60%
              precision    recall  f1-score   support

        High       0.51      0.43      0.46     40741
         Low       0.58      0.68      0.63     54796
         Mid       0.51      0.48      0.50     67640

    accuracy                           0.54    163177
   macro avg       0.53      0.53      0.53    163177
weighted avg       0.53      0.54      0.53    163177

Decision Tree Accuracy: 67.55%
              precision    recall  f1-score   support

        High       0.76      0.62      0.68     40741
         Low       0.76      0.61      0.68     

In [None]:
# Extra: LSTM Model
def create_lstm_model(input_shape):
    """Build and compile a stacked two-layer LSTM regressor.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, forwarded to the first LSTM layer.

    Returns
    -------
    Sequential
        Model with two 50-unit LSTM layers, a 25-unit dense layer and a
        single linear output, compiled with the Adam optimizer and MSE loss.
    """
    network = Sequential([
        # The first LSTM emits the full sequence so the second LSTM can consume it.
        LSTM(50, return_sequences=True, input_shape=input_shape),
        LSTM(50, return_sequences=False),
        Dense(25),
        Dense(1, activation='linear'),
    ])
    network.compile(optimizer='adam', loss='mse')
    return network

# Reshape data for LSTM: a single-feature column of average prices, in row order.
# NOTE(review): the rows do not appear to be sorted chronologically (the preview
# shows 07 Aug 2012 before 26 Jul 2012) — confirm whether the series should be
# ordered by date/market before windowing.
avg_price = df[['Average Price']].values

# Fit the scaler on the leading 70% of rows only. The windows built from this
# series are later split positionally 70/30, so fitting on the full series
# would leak test-set statistics into training.
n_fit_rows = max(1, int(len(avg_price) * 0.7))
scaler_lstm = StandardScaler()
scaler_lstm.fit(avg_price[:n_fit_rows])
X_lstm = scaler_lstm.transform(avg_price)

# Create dataset for LSTM
def create_dataset(X, time_step=1):
    """Slice a series into sliding input windows and next-step targets.

    Parameters
    ----------
    X : array-like of shape (n_rows, n_features)
        Source series; only column 0 is used.
    time_step : int, default 1
        Number of consecutive values in each input window.

    Returns
    -------
    Xs : numpy.ndarray of shape (n_samples, time_step)
        Input windows, where ``n_samples = max(n_rows - time_step, 0)``.
    ys : numpy.ndarray of shape (n_samples,)
        The value immediately following each window.
    """
    series = np.asarray(X)[:, 0]
    # Every start index whose target (start + time_step) is still in range is
    # valid; the previous bound of `len(X) - time_step - 1` dropped the last one.
    n_samples = max(len(series) - time_step, 0)
    windows = [series[start:start + time_step] for start in range(n_samples)]
    # The explicit reshape keeps a 2-D result even when there are no windows,
    # so callers can still read `Xs.shape[1]`.
    Xs = np.array(windows, dtype=series.dtype).reshape(n_samples, time_step)
    ys = series[time_step:time_step + n_samples].copy()
    return Xs, ys

# Turn the scaled series into 10-step windows and add a trailing feature axis,
# giving the (samples, time steps, features) layout fed to the LSTM.
time_step = 10
X_lstm, y_lstm = create_dataset(X_lstm, time_step)
X_lstm = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], 1))

# Positional 70/30 split: the leading windows train, the remainder test.
train_size = int(0.7 * len(X_lstm))
X_train_lstm = X_lstm[:train_size]
X_test_lstm = X_lstm[train_size:]
y_train_lstm = y_lstm[:train_size]
y_test_lstm = y_lstm[train_size:]


In [None]:
# Build the network for the training window length with one feature per step,
# fit it on the training windows, then predict the held-out windows.
# NOTE(review): batch_size=1 performs one optimizer step per sample, which is
# likely to be very slow on a long series — consider a larger batch size.
window_shape = (X_train_lstm.shape[1], 1)
lstm_model = create_lstm_model(window_shape)
lstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=10,
    batch_size=1,
    verbose=1,
)
lstm_predictions = lstm_model.predict(X_test_lstm)

NameError: name 'create_lstm_model' is not defined