In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from IPython.display import display
import os
print("All libraries imported successfully!")

In [None]:
# Load the three input CSV files into the `sales`, `prices` and `state`
# DataFrames. All status and troubleshooting messages are derived from
# `base_path` and `data_files`, so they always describe the location that was
# actually read.
base_path = "dataset/"
data_files = {
    "sales": "K_class_sales_clean.csv",
    "prices": "Price_Agriculture_commodities_Week.csv",
    "state": "state.csv",
}

try:
    sales = pd.read_csv(os.path.join(base_path, data_files["sales"]))
    prices = pd.read_csv(os.path.join(base_path, data_files["prices"]))
    state = pd.read_csv(os.path.join(base_path, data_files["state"]))

    print(f" Data loaded successfully from '{base_path}'!")
    print(f"Sales data shape: {sales.shape}")
    print(f"Prices data shape: {prices.shape}")
    print(f"State data shape: {state.shape}")

except FileNotFoundError as e:
    # Report the resolved directory so a wrong working directory is easy to spot.
    print(f" Error: {e}")
    print("Please verify:")
    print(f"1. The directory '{os.path.abspath(base_path)}' exists")
    print("2. It contains these exact files:")
    for file_name in data_files.values():
        print(f"   - {file_name}")
    print(f"Current working directory: {os.getcwd()}")
except Exception as e:
    # Best-effort: keep the notebook running, but say what kind of error it was.
    # Later cells check for the DataFrames before using them.
    print(f" Unexpected error: {type(e).__name__}: {e}")

In [None]:
# Preview the first two rows of each loaded table and report its total count
# of missing cells. Skipped entirely when any of the tables failed to load.
if {'sales', 'prices', 'state'}.issubset(globals()):
    loaded_tables = (("Sales", sales), ("Prices", prices), ("State", state))
    for table_label, table in loaded_tables:
        print(f"\n {table_label} Data Preview:")
        display(table.head(2))
        print("\nMissing values:", table.isna().sum().sum())
else:
    print("Skipping exploration - data not loaded properly")

In [None]:
# Build `final_df`: sales rows enriched with price and state information plus
# engineered features. Runs only when all three input tables are loaded.
if all(df in globals() for df in ['sales', 'prices', 'state']):
    try:
        # Convert dates; unparseable values become NaT instead of raising.
        sales['date'] = pd.to_datetime(sales['date'], errors='coerce')
        prices['date'] = pd.to_datetime(prices['date'], errors='coerce')

        # Extract temporal features
        sales['month'] = sales['date'].dt.month
        sales['year'] = sales['date'].dt.year
        prices['week'] = prices['date'].dt.isocalendar().week

        # Left merges keep every sales row even when no price/state row matches.
        # NOTE(review): if `prices` has several rows per (commodity, date) the
        # merge duplicates sales rows - confirm the key is unique.
        merged_df = pd.merge(
            sales,
            prices,
            on=['commodity', 'date'],
            how='left'
        )

        final_df = pd.merge(
            merged_df,
            state,
            on='state',
            how='left'
        )

        # Derive `week` from the merge-key date rather than relying on the
        # column carried over from `prices`: sales rows without a price match
        # would otherwise have a missing week. For matched rows the dates are
        # equal, so the value is identical. Cast to float so missing weeks
        # (from NaT dates) are plain NaN.
        final_df['week'] = final_df['date'].dt.isocalendar().week.astype('float64')

        # Handle missing values
        final_df['price'] = final_df['price'].fillna(final_df['price'].median())
        final_df['sales'] = final_df['sales'].fillna(final_df['sales'].median())

        # Feature engineering. A zero quantity would produce +/-inf ratios,
        # so treat it as missing and let the ratio become NaN instead.
        nonzero_quantity = final_df['quantity'].replace(0, np.nan)
        final_df['price_per_unit'] = final_df['price'] / nonzero_quantity
        final_df['sales_velocity'] = final_df['sales'] / nonzero_quantity

        print("\n✅ Final merged dataset shape:", final_df.shape)
        display(final_df.head(2))

    except Exception as e:
        print(f" Error during data processing: {e}")
else:
    print("Skipping data processing - required data not loaded")

In [None]:
# Try a single-row prediction with whichever pipelines already exist.
# Each stage reports why it was skipped instead of raising.
if 'final_df' not in globals():
    print("Cannot make predictions - processed data not available")
else:
    try:
        # Commodity and state values must match the names used in the data.
        sample_data = dict(
            commodity=['Tomato'],
            state=['Gujarat'],
            month=[6],
            year=[2023],
            quantity=[100],
            week=[25],
        )
        sample_df = pd.DataFrame(sample_data)

        required_columns = ['commodity', 'state', 'month', 'year', 'quantity', 'week']
        if not set(required_columns).issubset(sample_df.columns):
            print("Sample data missing required columns")
        else:
            print("\n🔮 Sample Prediction for:")
            display(sample_df)

            if 'price_pipeline' not in globals():
                print("Price model not trained yet")
            else:
                price_pred = price_pipeline.predict(sample_df)
                print(f"Predicted Price per Unit: ₹{price_pred[0]:.2f}")

            if 'demand_pipeline' not in globals():
                print("Demand model not trained yet")
            else:
                demand_pred = demand_pipeline.predict(sample_df)
                print(f"Predicted Demand: {demand_pred[0]:.0f} units")
    except Exception as e:
        print(f" Prediction error: {e}")

In [None]:
# Define the model inputs (X) and the two regression targets, then create one
# shared train/test split so both models are evaluated on the same rows.
feature_columns = ['commodity', 'state', 'month', 'year', 'quantity', 'week']
target_columns = ['price_per_unit', 'sales']

# scikit-learn regressors reject NaN and +/-inf targets (a ratio target can be
# inf when its divisor is 0), so keep only rows where both targets are finite.
finite_targets = final_df[target_columns].replace([np.inf, -np.inf], np.nan)
valid_rows = finite_targets.notna().all(axis=1)
dropped_rows = int((~valid_rows).sum())
if dropped_rows:
    print(f"Dropping {dropped_rows} rows with missing or infinite targets")

X = final_df.loc[valid_rows, feature_columns]
y_price = finite_targets.loc[valid_rows, 'price_per_unit']
y_demand = finite_targets.loc[valid_rows, 'sales']

# Split data (fixed seed for reproducibility)
X_train, X_test, y_price_train, y_price_test, y_demand_train, y_demand_test = train_test_split(
    X, y_price, y_demand, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

In [None]:
# Column groups used by the preprocessing step of both pipelines.
numeric_features = ['month', 'year', 'quantity', 'week']
categorical_features = ['commodity', 'state']

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at prediction time.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

print("Preprocessor created successfully!")


In [None]:
# Train a random-forest model for price per unit, report hold-out metrics and
# plot the forest's feature importances.
price_regressor = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
price_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', price_regressor),
])
price_pipeline.fit(X_train, y_price_train)

# Evaluate on the held-out split
price_pred = price_pipeline.predict(X_test)
price_rmse = np.sqrt(mean_squared_error(y_price_test, price_pred))
price_r2 = r2_score(y_price_test, price_pred)

print("Price Prediction Results:")
print(f"RMSE: {price_rmse:.2f}")
print(f"R2 Score: {price_r2:.2f}")

# Importances follow the preprocessor's output order: the numeric columns
# first, then the one-hot columns produced by the 'cat' transformer.
fitted_preprocessor = price_pipeline.named_steps['preprocessor']
onehot_encoder = fitted_preprocessor.named_transformers_['cat']
feature_names = numeric_features + list(onehot_encoder.get_feature_names_out())
feature_importances = price_pipeline.named_steps['regressor'].feature_importances_

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=feature_importances, y=feature_names, ax=ax)
ax.set_title('Feature Importance for Price Prediction')
plt.show()


In [None]:
# Train a gradient-boosting model for demand and report hold-out metrics.
demand_regressor = GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
demand_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', demand_regressor),
])
demand_pipeline.fit(X_train, y_demand_train)

# Evaluate on the held-out split
demand_pred = demand_pipeline.predict(X_test)
demand_mse = mean_squared_error(y_demand_test, demand_pred)
demand_rmse = np.sqrt(demand_mse)
demand_r2 = r2_score(y_demand_test, demand_pred)

print("Demand Prediction Results:")
print(f"RMSE: {demand_rmse:.2f}")
print(f"R2 Score: {demand_r2:.2f}")

In [None]:

# Persist both pipelines and the shared preprocessor to the working directory.
# One mapping drives both the dump and the report, so they cannot drift apart.
saved_artifacts = {
    'price_model.pkl': price_pipeline,
    'demand_model.pkl': demand_pipeline,
    'preprocessor.pkl': preprocessor,
}
for artifact_path, artifact in saved_artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Models saved successfully:")
for artifact_path in saved_artifacts:
    print(f"- {artifact_path}")

In [None]:
# Run both trained pipelines on a single hand-written example row.
sample_data = dict(
    commodity=['Tomato'],
    state=['Gujarat'],
    month=[6],
    year=[2023],
    quantity=[100],
    week=[25],
)
sample_df = pd.DataFrame(sample_data)

# Predict before printing so nothing is shown if either model fails.
price_pred = price_pipeline.predict(sample_df)
demand_pred = demand_pipeline.predict(sample_df)

print("\nSample Prediction for:")
display(sample_df)
print("Predicted Price per Unit: ₹{:.2f}".format(price_pred[0]))
print("Predicted Demand: {:.0f} units".format(demand_pred[0]))