In [17]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np

In [18]:
# Step 2: Load the Dataset
data = pd.read_csv('combined_data.csv')


In [19]:
# Display the first few rows
print("Dataset Preview:")
print(data.head())

print("\nDataset Information:")
data.info()

Dataset Preview:
         id  district_id district_name  market_id  \
0  64888490            1    Ahmednagar        153   
1  64888491            2         Akola        154   
2  64888492            3     Amarawati       3111   
3  64888493            7       Jalgaon        159   
4  64888494           10        Mumbai       3108   

                    market_name  commodity_id commodity_name variety  \
0                    Ahmednagar            24         Potato   Other   
1                         Akola            24         Potato   Other   
2  Amrawati(Frui & Veg. Market)            24         Potato   Local   
3                       Jalgaon            24         Potato   Other   
4              Vashi New Mumbai            24         Potato   Other   

   min_price  max_price  modal_price        date  
0        800       1800       1300.0  01-01-2024  
1       1000       1400       1300.0  01-01-2024  
2        500       1600       1050.0  01-01-2024  
3        700       1500    

In [20]:
# Check for missing values
print("\nMissing Values in Each Column:")
print(data.isnull().sum())


Missing Values in Each Column:
id                0
district_id       0
district_name     0
market_id         0
market_name       0
commodity_id      0
commodity_name    0
variety           0
min_price         0
max_price         0
modal_price       0
date              0
dtype: int64


In [21]:
# Step 4: Convert Date Column to Datetime
data['date'] = pd.to_datetime(data['date'], format='%d-%m-%Y', errors='coerce')

 #Step 5: Validate the Conversion
print("\nData Types After Conversion:")
print(data.dtypes)


Data Types After Conversion:
id                         int64
district_id                int64
district_name             object
market_id                  int64
market_name               object
commodity_id               int64
commodity_name            object
variety                   object
min_price                  int64
max_price                  int64
modal_price              float64
date              datetime64[ns]
dtype: object


In [22]:
# Step 6: Basic Summary Statistics
print("\nSummary Statistics for Numerical Columns:")
print(data.describe())


Summary Statistics for Numerical Columns:
                 id    district_id      market_id   commodity_id  \
count  1.495010e+05  149501.000000  149501.000000  149501.000000   
mean   6.377761e+07      13.272874    1790.479181      66.685628   
min    5.600188e+07       1.000000     153.000000      23.000000   
25%    5.996066e+07      10.000000     177.000000      23.000000   
50%    6.352752e+07      13.000000    1464.000000      24.000000   
75%    6.745467e+07      15.000000    2494.000000      78.000000   
max    7.237475e+07      36.000000   10121.000000     362.000000   
std    4.541550e+06       7.634545    1539.240558      54.561742   

           min_price      max_price    modal_price                        date  
count  149501.000000  149501.000000  149501.000000                      149501  
mean     1331.936469    2361.277336    1921.694020  2023-07-31 15:36:57.994528  
min         1.000000       1.000000       1.000000         2022-01-01 00:00:00  
25%       500.000000

In [23]:
# Step 1: Extract Month and Year from Date
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year

# Step 2: Create a Price Range Feature
data['price_range'] = data['max_price'] - data['min_price']

# Step 3: One-Hot Encode Categorical Features
# Encoding 'district_name', 'market_name', 'commodity_name', 'variety'
encoded_data = pd.get_dummies(data, columns=['district_name', 'market_name', 'commodity_name', 'variety'], drop_first=True)

# Step 4: Verify Feature Engineering
print("\nFeature Engineering Complete. Dataset Preview:")
print(encoded_data.head())

# Optional: Save the Processed Dataset
# encoded_data.to_csv('processed_combined_data.csv', index=False)



Feature Engineering Complete. Dataset Preview:
         id  district_id  market_id  commodity_id  min_price  max_price  \
0  64888490            1        153            24        800       1800   
1  64888491            2        154            24       1000       1400   
2  64888492            3       3111            24        500       1600   
3  64888493            7        159            24        700       1500   
4  64888494           10       3108            24       1000       1600   

   modal_price       date  month  year  ...  commodity_name_Tomato  \
0       1300.0 2024-01-01      1  2024  ...                  False   
1       1300.0 2024-01-01      1  2024  ...                  False   
2       1050.0 2024-01-01      1  2024  ...                  False   
3       1200.0 2024-01-01      1  2024  ...                  False   
4       1300.0 2024-01-01      1  2024  ...                  False   

   variety_2nd Sort  variety_Green Gram Dal  variety_Hybrid  variety_Local  \
0 

In [24]:
from sklearn.model_selection import train_test_split

# Step 1: Define Features (X) and Target (y)
# Drop unnecessary columns
X = encoded_data.drop(columns=['id', 'modal_price', 'date'])  # Drop ID, modal_price (target), and date
y = encoded_data['modal_price']

# Step 2: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Verify the Split
print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")


Training Set Size: (119600, 191)
Testing Set Size: (29901, 191)


In [25]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=15, min_samples_split=10)
model.fit(X_train_scaled, y_train)

# Step 3: Make Predictions
y_pred = model.predict(X_test_scaled)

# Step 4: Evaluate the Model
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:\nMAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR²: {r2:.2f}")

# Feature Importance
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)

print("\nTop Features by Importance:")
print(feature_importance_df.head(10))


Model Performance:
MAE: 58.53
RMSE: 23211.50
R²: 0.99

Top Features by Importance:
                         Feature  Importance
4                      max_price    0.885968
3                      min_price    0.096148
7                    price_range    0.004667
26          district_name_Nashik    0.002063
1                      market_id    0.001730
2                   commodity_id    0.001256
5                          month    0.001044
78             market_name_Karad    0.000761
70  market_name_Junnar(Alephata)    0.000562
6                           year    0.000536


In [27]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Train an XGBoost Regressor
xgb_model = xgb.XGBRegressor(
    n_estimators=200,      # Number of trees
    max_depth=15,          # Maximum depth of trees
    learning_rate=0.1,     # Step size shrinkage
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    subsample=0.8,         # Subsample ratio of the training instance
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)

# Step 2: Predict on the Test Set
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Step 3: Evaluate XGBoost Model Performance
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost Model Performance:\nMAE: {mae_xgb:.2f}\nRMSE: {rmse_xgb:.2f}\nR²: {r2_xgb:.2f}")

# Compare Feature Importance (Optional)
xgb_importance = xgb_model.feature_importances_
xgb_feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': xgb_importance}).sort_values(by='Importance', ascending=False)

print("\nTop Features by Importance (XGBoost):")
print(xgb_feature_importance_df.head(10))


XGBoost Model Performance:
MAE: 49.05
RMSE: 19125.10
R²: 0.99

Top Features by Importance (XGBoost):
                                       Feature  Importance
4                                    max_price    0.285684
70                market_name_Junnar(Alephata)    0.242435
176                  commodity_name_Green Peas    0.087485
175  commodity_name_Green Gram Dal (Moong Dal)    0.035918
3                                    min_price    0.032482
26                        district_name_Nashik    0.021560
61                           market_name_Ghoti    0.013802
2                                 commodity_id    0.008939
84                       market_name_Kopargaon    0.006893
161                        market_name_Solapur    0.006574


In [28]:
import joblib

joblib.dump(model, 'agri.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [29]:
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load the Testing Dataset
test_data = pd.read_csv('Test.csv')
print("Testing Dataset Loaded Successfully!")

# Step 2: Separate Features and Target (assuming 'modal_price' is the target column)
X_test = test_data.drop(columns=['modal_price'], errors='ignore')  # Drop target column if it exists
y_test = test_data['modal_price'] if 'modal_price' in test_data.columns else None  # Extract target if available

# Step 3: One-Hot Encode Categorical Variables
X_test_encoded = pd.get_dummies(X_test, columns=['district_name', 'market_name', 'commodity_name', 'variety'])

# Step 4: Align with Training Data Columns
missing_cols = set(X.columns) - set(X_test_encoded.columns)
for col in missing_cols:
    X_test_encoded[col] = 0
X_test_encoded = X_test_encoded[X.columns]  # Reorder columns to match the training data

# Step 5: Load the Trained Model and Scaler
model = joblib.load('agri.pkl')
scaler = joblib.load('scaler.pkl')

# Step 6: Scale the Testing Data
X_test_scaled = scaler.transform(X_test_encoded)

# Step 7: Predict on the Testing Data
y_pred = model.predict(X_test_scaled)

# Step 8: Evaluate the Model (if Ground Truth Exists)
if y_test is not None:
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Testing Dataset Performance:\nMAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR²: {r2:.2f}")
else:
    print("Ground truth (modal_price) not available in the testing dataset.")

# Step 9: Save Predictions
test_data['Predicted_Modal_Price'] = y_pred
test_data.to_csv('Test_with_Predictions.csv', index=False)
print("Predictions saved to 'Test_with_Predictions.csv'")


Testing Dataset Loaded Successfully!
Testing Dataset Performance:
MAE: 98.23
RMSE: 65581.30
R²: 0.98
Predictions saved to 'Test_with_Predictions.csv'


In [30]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import joblib  # Assuming the model is saved as a .pkl file

# Step 1: Load the trained model
model_path = 'agri.pkl'  # Replace with your actual model file name
model = joblib.load(model_path)

# Step 2: Prepare the input data
def prepare_input(commodity_name, market_name, prediction_date):
    # Create a DataFrame with placeholders for all features
    input_data = pd.DataFrame(
        columns=X.columns,  # X is the DataFrame used for training
        data=[[0] * len(X.columns)]  # Initialize with zeros
    )
    # Set values for commodity_name, market_name, date features
    input_data.loc[0, f"commodity_name_{commodity_name}"] = 1
    input_data.loc[0, f"market_name_{market_name}"] = 1
    # Assuming prediction_date is in 'YYYY-MM-DD' format
    # input_data['date'] = prediction_date  # Include date if your model uses it
    # ... (any other feature encoding logic) ...
    return input_data

# Step 3: Predict the price
commodity_name = "Potato"
market_name = "Akola"
prediction_date = (datetime.now() + timedelta(days=1)).strftime('%Y-%m-%d')  # Tomorrow's date

input_features = prepare_input(commodity_name, market_name, prediction_date)
predicted_price = model.predict(input_features)

# Display the result
print(f"The predicted price for {commodity_name} in {market_name} on {prediction_date} is: {predicted_price[0]:.2f}")

The predicted price for Potato in Akola on 2025-02-17 is: 1874.23




In [31]:
pip install supabase


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
from supabase import create_client, Client
import pandas as pd

# Supabase credentials
SUPABASE_URL = "https://xvvpvlblgkesguqbmblf.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Inh2dnB2bGJsZ2tlc2d1cWJtYmxmIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MzU5NzI5MDQsImV4cCI6MjA1MTU0ODkwNH0.OexNIyW2DkHt3e_UZLYl3Xc9BToGsOFGKyU43hhYBXM"


from datetime import datetime, timedelta




# Initialize Supabase client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Function to insert predictions into Supabase
def insert_predictions_to_supabase(predictions):
    for prediction in predictions:
        response = supabase.table("predicted_prices").insert(prediction).execute()
        print(f"Inserted: {response.data}")

# Get tomorrow's date
tomorrow_date = (datetime.now() + timedelta(days=1)).strftime('%Y-%m-%d')

# Example dataset of commodities and markets (replace with your actual dataset)
commodities = ["Tomato", "Potato", "Onion", "Carrot", "Cabbage"]  # List of commodities
markets = ["Market1", "Market2", "Market3"]  # List of market names

# Example prediction logic for all commodities in all markets
predictions = []  # List to hold all predictions

for market_name in markets:
    for commodity in commodities:
        # Replace this with your actual prediction logic
        predicted_price = 100 + len(commodity) * 10  # Example predicted price

        # Create a dictionary for the prediction
        prediction = {
            "market_name": market_name,
            "commodity": commodity,
            "predicted_price": predicted_price,
            "prediction_date": tomorrow_date
        }

        # Add the prediction to the list
        predictions.append(prediction)

# Insert all predictions into Supabase
insert_predictions_to_supabase(predictions)



ConnectError: [Errno 11001] getaddrinfo failed