In [1]:
import pandas as pd
import numpy as np
import os

In [8]:
# ==============================================================================
# PART 1: PROCESS ALL 30 DISTRICT WEATHER FILES
# ==============================================================================
district_filenames = {
    'ANUGUL':'anugul.csv',
    'BALANGIR':'balangir.csv',
    'BALESHWAR':'baleshwar.csv',
    'BARGARH':'bargarh.csv',
    'BHADRAK':'bhadrak.csv',
    'BOUDH':'boudh.csv',
    'CUTTACK':'cuttack.csv',
    'DEOGARH':'deogarh.csv',
    'DHENKANAL':'dhenkanal.csv',
    'GAJAPATI':'gajapati.csv',
    'GANJAM':'ganjam.csv',
    'JAGATSINGHPUR':'jagatsinghpur.csv',
    'JAJAPUR':'jajapur.csv',
    'JHARSUGUDA':'jharsuguda.csv',
    'KALAHANDI':'kalahandi.csv',
    'KANDHAMAL':'kandhamal.csv',
    'KENDRAPARA':'kendrapara.csv',
    'KENDUJHAR':'kendujhar.csv',
    'KHORDHA':'khordha.csv',
    'KORAPUT':'koraput.csv',
    'MALKANGIRI':'malkangiri.csv',
    'MAYURBHANJ':'mayurbhanj.csv',
    'NABARANGPUR':'nabarangpur.csv',
    'NAYAGARH':'nayagarh.csv',
    'NUAPADA':'nuapada.csv',
    'PURI': 'puri.csv',
    'RAYAGADA':'rayagada.csv',
    'SAMBALPUR':'sambalpur.csv',
    'SONEPUR':'sonepur.csv',
    'SUNDARGARH':'sundargarh.csv'
}

# An empty list to store the processed data for each district
all_weather_data = []

def get_season(month):
    if 6 <= month <= 10: return 'Kharif'
    elif month in [11, 12, 1, 2, 3]: return 'Rabi'
    else: return 'Summer'

print("Starting to process weather files...")
for district_name, file_name in district_filenames.items():
    try:
        temp_df = pd.read_csv(file_name, skiprows=13)

        # Create a proper date from YEAR and Day of Year (DOY)
        temp_df['DATE'] = pd.to_datetime(temp_df['YEAR'].astype(str) + temp_df['DOY'].astype(str), format='%Y%j')
        # Extract the month from our new DATE column
        temp_df['Month'] = temp_df['DATE'].dt.month

        temp_df['Season'] = temp_df['Month'].apply(get_season)
        
        seasonal_data = temp_df.groupby(['YEAR', 'Season']).agg(
            avg_temp=('T2M', 'mean'),
            max_temp=('T2M_MAX', 'max'),
            min_temp=('T2M_MIN', 'min'),
            total_rainfall=('PRECTOTCORR', 'sum')
        ).reset_index()
        
        seasonal_data.rename(columns={'YEAR': 'Year'}, inplace=True)
        seasonal_data['District_Name'] = district_name
        all_weather_data.append(seasonal_data)
        print(f"  ✅ Successfully processed {file_name} for {district_name}")

    except FileNotFoundError:
        print(f"  ❌ ERROR: File not found for {district_name}: '{file_name}'. Please check the filename. Skipping.")
    except Exception as e:
        print(f"  ❌ ERROR: Could not process {file_name}. Reason: {e}")

final_weather_df = pd.concat(all_weather_data, ignore_index=True)
print("\n✅ All weather files have been processed and combined!")


Starting to process weather files...
  ✅ Successfully processed anugul.csv for ANUGUL
  ✅ Successfully processed balangir.csv for BALANGIR
  ✅ Successfully processed baleshwar.csv for BALESHWAR
  ✅ Successfully processed bargarh.csv for BARGARH
  ✅ Successfully processed bhadrak.csv for BHADRAK
  ✅ Successfully processed boudh.csv for BOUDH
  ✅ Successfully processed cuttack.csv for CUTTACK
  ✅ Successfully processed deogarh.csv for DEOGARH
  ✅ Successfully processed dhenkanal.csv for DHENKANAL
  ✅ Successfully processed gajapati.csv for GAJAPATI
  ✅ Successfully processed ganjam.csv for GANJAM
  ✅ Successfully processed jagatsinghpur.csv for JAGATSINGHPUR
  ✅ Successfully processed jajapur.csv for JAJAPUR
  ✅ Successfully processed jharsuguda.csv for JHARSUGUDA
  ✅ Successfully processed kalahandi.csv for KALAHANDI
  ✅ Successfully processed kandhamal.csv for KANDHAMAL
  ✅ Successfully processed kendrapara.csv for KENDRAPARA
  ✅ Successfully processed kendujhar.csv for KENDUJHAR
  ✅ S

In [9]:
# ==============================================================================
# PART 2 & 3: LOAD CROP DATA AND MERGE
# ==============================================================================

print("\nLoading and cleaning the main crop production data (APY.csv)...")
df_crop = pd.read_csv('APY.csv')
df_crop.columns = df_crop.columns.str.strip()
df_odisha = df_crop[df_crop['State'] == 'Odisha'].copy()
df_odisha.rename(columns={'Crop_Year': 'Year', 'District': 'District_Name'}, inplace=True)
print("✅ Crop data loaded and cleaned.")

print("\nMerging crop data with weather data...")
master_df = pd.merge(df_odisha, final_weather_df, on=['District_Name', 'Year', 'Season'], how='left')
master_df.dropna(subset=['avg_temp'], inplace=True)
print("✅ Merge complete! Your master dataset is ready.")

print("\n--- MASTER DATASET READY FOR MODEL TRAINING ---")
print(master_df.head())
print(f"\nYour final dataset has {len(master_df)} rows.")


Loading and cleaning the main crop production data (APY.csv)...
✅ Crop data loaded and cleaned.

Merging crop data with weather data...
✅ Merge complete! Your master dataset is ready.

--- MASTER DATASET READY FOR MODEL TRAINING ---
Empty DataFrame
Columns: [State, District_Name, Crop, Year, Season, Area, Production, Yield, avg_temp, max_temp, min_temp, total_rainfall]
Index: []

Your final dataset has 0 rows.


In [10]:
import pandas as pd
import numpy as np

# ==============================================================================
# PART 1: PROCESS ALL 30 DISTRICT WEATHER FILES
# ==============================================================================

# Using the dictionary you provided
district_filenames = {
    'ANUGUL':'anugul.csv',
    'BALANGIR':'balangir.csv',
    'BALESHWAR':'baleshwar.csv',
    'BARGARH':'bargarh.csv',
    'BHADRAK':'bhadrak.csv',
    'BOUDH':'boudh.csv',
    'CUTTACK':'cuttack.csv',
    'DEOGARH':'deogarh.csv',
    'DHENKANAL':'dhenkanal.csv',
    'GAJAPATI':'gajapati.csv',
    'GANJAM':'ganjam.csv',
    'JAGATSINGHPUR':'jagatsinghpur.csv',
    'JAJAPUR':'jajapur.csv',
    'JHARSUGUDA':'jharsuguda.csv',
    'KALAHANDI':'kalahandi.csv',
    'KANDHAMAL':'kandhamal.csv',
    'KENDRAPARA':'kendrapara.csv',
    'KENDUJHAR':'kendujhar.csv',
    'KHORDHA':'khordha.csv',
    'KORAPUT':'koraput.csv',
    'MALKANGIRI':'malkangiri.csv',
    'MAYURBHANJ':'mayurbhanj.csv',
    'NABARANGPUR':'nabarangpur.csv',
    'NAYAGARH':'nayagarh.csv',
    'NUAPADA':'nuapada.csv',
    'PURI': 'puri.csv',
    'RAYAGADA':'rayagada.csv',
    'SAMBALPUR':'sambalpur.csv',
    'SONEPUR':'sonepur.csv',
    'SUNDARGARH':'sundargarh.csv'
}

all_weather_data = []

def get_season(month):
    if 6 <= month <= 10: return 'Kharif'
    elif month in [11, 12, 1, 2, 3]: return 'Rabi'
    else: return 'Summer'

print("Starting to process weather files...")
for district_name, file_name in district_filenames.items():
    try:
        temp_df = pd.read_csv(file_name, skiprows=13)
        temp_df['DATE'] = pd.to_datetime(temp_df['YEAR'].astype(str) + temp_df['DOY'].astype(str), format='%Y%j')
        temp_df['Month'] = temp_df['DATE'].dt.month
        temp_df['Season'] = temp_df['Month'].apply(get_season)
        seasonal_data = temp_df.groupby(['YEAR', 'Season']).agg(avg_temp=('T2M', 'mean'),max_temp=('T2M_MAX', 'max'),min_temp=('T2M_MIN', 'min'),total_rainfall=('PRECTOTCORR', 'sum')).reset_index()
        seasonal_data.rename(columns={'YEAR': 'Year'}, inplace=True)
        seasonal_data['District_Name'] = district_name
        all_weather_data.append(seasonal_data)
    except FileNotFoundError:
        print(f"  ❌ ERROR: File not found for {district_name}: '{file_name}'.")
final_weather_df = pd.concat(all_weather_data, ignore_index=True)
print("\n✅ All weather files have been processed and combined!")

Starting to process weather files...

✅ All weather files have been processed and combined!


In [11]:
# ==============================================================================
# PART 2: LOAD AND PREPARE CROP DATA
# ==============================================================================

print("\nLoading and cleaning the main crop production data (APY.csv)...")
df_crop = pd.read_csv('APY.csv')
df_crop.columns = df_crop.columns.str.strip()
df_odisha = df_crop[df_crop['State'] == 'Odisha'].copy()
df_odisha.rename(columns={'Crop_Year': 'Year', 'District': 'District_Name'}, inplace=True)
print("✅ Crop data loaded.")


Loading and cleaning the main crop production data (APY.csv)...
✅ Crop data loaded.


In [12]:
# ==============================================================================
# PART 3: DEBUG AND MERGE
# ==============================================================================

print("\n--- DEBUGGING: CHECKING KEYS BEFORE MERGE ---")

# Standardize the keys
df_odisha['Season_Clean'] = df_odisha['Season'].str.strip()
df_odisha['District_Name_Clean'] = df_odisha['District_Name'].str.upper()

# Print the unique values from both DataFrames
print("\n1. Unique districts in Crop Data:")
print(sorted(df_odisha['District_Name_Clean'].unique()))

print("\n2. Unique districts in Weather Data:")
print(sorted(final_weather_df['District_Name'].unique()))

print("\n3. Unique seasons in Crop Data (after cleaning spaces):")
print(df_odisha['Season_Clean'].unique())

print("\n4. Unique seasons in Weather Data:")
print(final_weather_df['Season'].unique())

print("\n--- END DEBUGGING ---")


# Perform the merge using the CLEANED columns
master_df = pd.merge(df_odisha, final_weather_df,
                     left_on=['District_Name_Clean', 'Year', 'Season_Clean'],
                     right_on=['District_Name', 'Year', 'Season'],
                     how='left')

# Drop helper columns
master_df = master_df.drop(columns=['District_Name_Clean', 'Season_Clean'])

master_df.dropna(subset=['avg_temp'], inplace=True)
print("\n✅ Merge complete!")
print("\n--- MASTER DATASET READY FOR MODEL TRAINING ---")
print(master_df.head())
print(f"\nYour final dataset has {len(master_df)} rows.")


--- DEBUGGING: CHECKING KEYS BEFORE MERGE ---

1. Unique districts in Crop Data:
['ANUGUL', 'BALANGIR', 'BALESHWAR', 'BARGARH', 'BHADRAK', 'BOUDH', 'CUTTACK', 'DEOGARH', 'DHENKANAL', 'GAJAPATI', 'GANJAM', 'JAGATSINGHAPUR', 'JAJAPUR', 'JHARSUGUDA', 'KALAHANDI', 'KANDHAMAL', 'KENDRAPARA', 'KENDUJHAR', 'KHORDHA', 'KORAPUT', 'MALKANGIRI', 'MAYURBHANJ', 'NABARANGPUR', 'NAYAGARH', 'NUAPADA', 'PURI', 'RAYAGADA', 'SAMBALPUR', 'SONEPUR', 'SUNDARGARH']

2. Unique districts in Weather Data:
['ANUGUL', 'BALANGIR', 'BALESHWAR', 'BARGARH', 'BHADRAK', 'BOUDH', 'CUTTACK', 'DEOGARH', 'DHENKANAL', 'GAJAPATI', 'GANJAM', 'JAGATSINGHPUR', 'JAJAPUR', 'JHARSUGUDA', 'KALAHANDI', 'KANDHAMAL', 'KENDRAPARA', 'KENDUJHAR', 'KHORDHA', 'KORAPUT', 'MALKANGIRI', 'MAYURBHANJ', 'NABARANGPUR', 'NAYAGARH', 'NUAPADA', 'PURI', 'RAYAGADA', 'SAMBALPUR', 'SONEPUR', 'SUNDARGARH']

3. Unique seasons in Crop Data (after cleaning spaces):
['Autumn' 'Summer' 'Winter' 'Kharif' 'Rabi' 'Whole Year']

4. Unique seasons in Weather Data