<a href="https://colab.research.google.com/github/springboardmentor789r/AgriYield/blob/Intern_LikhitaKoppuravuri/ProphetModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Important: some Kaggle data sources are private.
# Run this cell before the dataset download so the Kaggle data sources can be
# imported.
# NOTE(review): `kagglehub.login()` presumably asks for Kaggle credentials
# interactively — confirm before running this notebook unattended.
import kagglehub
kagglehub.login()


In [None]:
# Important: run this cell to import the Kaggle data source
# 'koppuravurilikhita/cropyield'; feel free to delete the cell afterwards.
# The value returned by `kagglehub.dataset_download` is kept in
# `koppuravurilikhita_cropyield_path` (presumably the local location of the
# downloaded files — verify against the kagglehub documentation).
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.

koppuravurilikhita_cropyield_path = kagglehub.dataset_download('koppuravurilikhita/cropyield')

print('Data source import complete.')


In [None]:
!pip install prophet
from prophet import Prophet



In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns

import os

# Load the data.
# Prefer the directory that kagglehub reported for the download (if that cell
# was run in this session) and fall back to the fixed Kaggle mount point, so
# the notebook also works where '/kaggle/input' does not exist.
csv_name = 'crpDataSet_avg0.csv'
dataset_dir = globals().get('koppuravurilikhita_cropyield_path')
csv_path = os.path.join(dataset_dir, csv_name) if dataset_dir else ''
if not os.path.isfile(csv_path):
    csv_path = '/kaggle/input/cropyield/' + csv_name
df = pd.read_csv(csv_path)

# Parse the dates BEFORE asking for min/max: the raw column holds
# 'dd-mm-YYYY' strings, and comparing those is lexicographic (day first),
# not chronological.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Display basic info about the dataset
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
# Printed in the file's own dd-mm-YYYY format so the report reads like the raw data.
print(
    "\nDate range:",
    df['Date'].min().strftime('%d-%m-%Y'),
    "to",
    df['Date'].max().strftime('%d-%m-%Y'),
)
print("\nMissing values:")
print(df.isnull().sum())

# Chronological order is required by the positional train/test split used later.
df = df.sort_values('Date').reset_index(drop=True)

# Note: this is the number of rows, which can be smaller than the number of
# calendar days in the range when some dates are absent from the file.
print(f"\nTotal days: {len(df)}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

Dataset shape: (3624, 10)

Columns: ['Date', 'Soil_pH', 'Temperature', 'Humidity', 'Wind_Speed', 'N', 'P', 'K', 'Crop_Yield', 'Soil_Quality']

Date range: 01-01-2014 to 31-12-2023

Missing values:
Date            0
Soil_pH         0
Temperature     0
Humidity        0
Wind_Speed      0
N               0
P               0
K               0
Crop_Yield      0
Soil_Quality    0
dtype: int64

Total days: 3624
Date range: 2014-01-01 00:00:00 to 2023-12-31 00:00:00


In [None]:
# Columns to forecast, one model per column: each name is passed to
# prophet_forecast() as `target_col`, so it must be a column of `df`.
target_variables = ['Crop_Yield', 'Soil_pH', 'Temperature', 'Humidity', 'Soil_Quality']

# Function to create Prophet forecast and calculate metrics
def prophet_forecast(df, target_col, periods=365):
    """Fit Prophet on the first 80% of one column and score it on the last 20%.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column and ``target_col``.
    target_col : str
        Name of the column to forecast.
    periods : int, default 365
        Accepted for backward compatibility only; it is not used. The
        evaluation horizon is always the hold-out part of the data.

    Returns
    -------
    model : Prophet
        The model fitted on the training part.
    forecast : pandas.DataFrame
        Output of ``model.predict`` for every observed date (train + test).
    results : pandas.DataFrame
        'ds', 'yhat', 'yhat_lower', 'yhat_upper' joined with the actual 'y'.
    metrics : dict
        Keys 'MAE', 'RMSE', 'R²' and 'Test Points', computed on the hold-out
        rows; the three scores are NaN when no hold-out row could be scored.
    """

    # Prepare data for Prophet, which expects the columns 'ds' (date) and 'y'.
    prophet_df = df[['Date', target_col]].copy()
    prophet_df.columns = ['ds', 'y']

    # Remove any missing values
    prophet_df = prophet_df.dropna()

    # The split below is positional, so make sure the rows really are in
    # chronological order (stable sort keeps the order of equal dates).
    prophet_df = prophet_df.sort_values('ds', kind='stable').reset_index(drop=True)

    # Split into train and test (last 20% as test)
    split_idx = int(len(prophet_df) * 0.8)
    train = prophet_df.iloc[:split_idx]
    test = prophet_df.iloc[split_idx:]

    # Create and fit Prophet model
    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=0.05
    )

    model.fit(train)

    # Predict on the dates that were actually observed (history + hold-out).
    # Generating `len(test)` consecutive daily dates after the last training
    # day only matches the hold-out rows when the series has no gaps; with
    # missing days the grid ends too early and some test rows are never scored.
    future = prophet_df[['ds']].copy()
    forecast = model.predict(future)

    # Merge with actual values for metrics calculation
    results = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].merge(
        prophet_df, on='ds', how='left'
    )

    # Calculate metrics on test set
    test_results = results[results['ds'].isin(test['ds'])]
    test_results = test_results.dropna()

    if len(test_results) > 0:
        mae = mean_absolute_error(test_results['y'], test_results['yhat'])
        rmse = np.sqrt(mean_squared_error(test_results['y'], test_results['yhat']))
        r2 = r2_score(test_results['y'], test_results['yhat'])

        metrics = {
            'MAE': mae,
            'RMSE': rmse,
            'R²': r2,
            'Test Points': len(test_results)
        }
    else:
        # Nothing to score (e.g. too few rows): report NaN instead of raising.
        metrics = {'MAE': np.nan, 'RMSE': np.nan, 'R²': np.nan, 'Test Points': 0}

    return model, forecast, results, metrics

# Run prophet_forecast() once per entry of target_variables and keep the
# fitted model, forecast frame, merged results and metrics under that name.
# A target whose run raises is reported and stored as None so the remaining
# targets are still processed.
forecast_results = {}

print("PROPHET TIME SERIES FORECASTING RESULTS")
print("=" * 50)

for target in target_variables:
    print(f"\nForecasting: {target}")
    print("-" * 30)

    try:
        model, forecast, results, metrics = prophet_forecast(df, target)
        forecast_results[target] = dict(
            model=model,
            forecast=forecast,
            results=results,
            metrics=metrics,
        )

        print(f"MAE: {metrics['MAE']:.4f}")
        print(f"RMSE: {metrics['RMSE']:.4f}")
        print(f"R²: {metrics['R²']:.4f}")
        print(f"Test points: {metrics['Test Points']}")

    except Exception as e:
        print(f"Error forecasting {target}: {str(e)}")
        forecast_results[target] = None

# Collect one summary row per target that produced a forecast.
metrics_summary = []
for target in target_variables:
    outcome = forecast_results[target]
    if outcome is None:
        continue
    metrics = outcome['metrics']
    row = {'Variable': target}
    row.update({name: metrics[name] for name in ('MAE', 'RMSE', 'R²', 'Test Points')})
    metrics_summary.append(row)

metrics_df = pd.DataFrame(metrics_summary)
rule = "=" * 60
print("\n" + rule)
print("SUMMARY METRICS TABLE")
print(rule)
print(metrics_df.round(4))

PROPHET TIME SERIES FORECASTING RESULTS

Forecasting: Crop_Yield
------------------------------


15:26:39 - cmdstanpy - INFO - Chain [1] start processing
15:26:40 - cmdstanpy - INFO - Chain [1] done processing
15:26:40 - cmdstanpy - INFO - Chain [1] start processing


MAE: 7.2218
RMSE: 9.6504
R²: 0.3728
Test points: 720

Forecasting: Soil_pH
------------------------------


15:26:41 - cmdstanpy - INFO - Chain [1] done processing
15:26:41 - cmdstanpy - INFO - Chain [1] start processing


MAE: 0.2695
RMSE: 0.3576
R²: -0.0072
Test points: 720

Forecasting: Temperature
------------------------------


15:26:42 - cmdstanpy - INFO - Chain [1] done processing
15:26:42 - cmdstanpy - INFO - Chain [1] start processing


MAE: 1.4625
RMSE: 1.8734
R²: 0.8541
Test points: 720

Forecasting: Humidity
------------------------------


15:26:43 - cmdstanpy - INFO - Chain [1] done processing
15:26:43 - cmdstanpy - INFO - Chain [1] start processing


MAE: 1.0867
RMSE: 1.4958
R²: 0.8344
Test points: 720

Forecasting: Soil_Quality
------------------------------


15:26:43 - cmdstanpy - INFO - Chain [1] done processing


MAE: 6.1715
RMSE: 8.0592
R²: -0.0045
Test points: 720

SUMMARY METRICS TABLE
       Variable     MAE    RMSE      R²  Test Points
0    Crop_Yield  7.2218  9.6504  0.3728          720
1       Soil_pH  0.2695  0.3576 -0.0072          720
2   Temperature  1.4625  1.8734  0.8541          720
3      Humidity  1.0867  1.4958  0.8344          720
4  Soil_Quality  6.1715  8.0592 -0.0045          720
