In [1]:
import pandas as pd
import numpy as np

In [2]:
# 1. Load the dataset
# Read the raw crop table from the CSV stored under the relative data/ folder.
# Headers are taken as-is here; they are inspected and cleaned in later cells.
df = pd.read_csv(filepath_or_buffer='data/APY.csv')

In [3]:
# Show the header names exactly as parsed, so stray whitespace is visible.
raw_column_names = list(df.columns)
print("Columns as read from the file:", raw_column_names)

Columns as read from the file: ['State', 'District ', 'Crop', 'Crop_Year', 'Season', 'Area ', 'Production', 'Yield']


In [4]:
# Clean all column names to remove leading/trailing spaces
# (the raw header contains padded names such as 'District ' and 'Area ').
df.columns = [column_name.strip() for column_name in df.columns]

In [5]:
# Print the columns again to confirm they are cleaned
cleaned_column_names = list(df.columns)
print("Columns after cleaning:", cleaned_column_names)

Columns after cleaning: ['State', 'District', 'Crop', 'Crop_Year', 'Season', 'Area', 'Production', 'Yield']


In [6]:
# Filter for the state of 'Odisha'
# The raw file pads some of its text with spaces (the header row contains
# 'District ' and 'Area '), so State labels are stripped before comparing;
# a padded entry such as 'Odisha ' would otherwise be silently excluded.
# Only the comparison mask is stripped - the stored State values are unchanged.
# Rows with a missing State compare as False and are left out.
is_odisha = df['State'].str.strip() == 'Odisha'

# .copy() yields an independent frame, so later column assignments on
# df_odisha neither modify `df` nor raise SettingWithCopyWarning.
df_odisha = df[is_odisha].copy()

In [7]:
# Handle unusable values in 'Area'
# 'Area' is the denominator of the yield calculation, so every kept row needs
# a usable, non-zero number there. Coerce to numeric first (mirroring how
# 'Production' is treated) so any non-numeric entry becomes NaN instead of
# breaking the division later, then drop rows whose Area is missing or zero.
area_numeric = pd.to_numeric(df_odisha['Area'], errors='coerce')
has_usable_area = area_numeric.notna() & area_numeric.ne(0)

# Rebind rather than mutate in place; .copy() keeps the result an independent
# frame so subsequent column assignments do not raise SettingWithCopyWarning.
df_odisha = df_odisha.assign(Area=area_numeric).loc[has_usable_area].copy()

In [8]:
# Ensure 'Production' is numeric
# Values that cannot be parsed as numbers become NaN (errors='coerce') ...
df_odisha['Production'] = pd.to_numeric(df_odisha['Production'], errors='coerce')

# ... and rows left without a Production figure are then discarded.
has_production = df_odisha['Production'].notna()
df_odisha = df_odisha[has_production].copy()

In [9]:
# Calculate the 'Yield'
# Element-wise Production / Area, stored next to the file's own 'Yield'
# column so the two can be compared row by row.
production = df_odisha['Production']
area = df_odisha['Area']
df_odisha['Calculated_Yield'] = production / area

In [10]:
# Announce completion and show the first five rows as a quick sanity check.
# NOTE(review): in the printed preview the Season values appear padded with
# trailing spaces - confirm, and strip them before grouping/filtering on Season.
odisha_preview = df_odisha.head(5)
print("\n✅ Success! Your Odisha DataFrame is ready.", odisha_preview, sep="\n")


✅ Success! Your Odisha DataFrame is ready.
         State District       Crop  Crop_Year       Season    Area  \
212582  Odisha   ANUGUL  Arhar/Tur       1997  Autumn       3555.0   
212583  Odisha   ANUGUL  Arhar/Tur       1997  Summer        469.0   
212584  Odisha   ANUGUL  Arhar/Tur       1997  Winter        660.0   
212585  Odisha   ANUGUL  Arhar/Tur       1999  Kharif       7960.0   
212586  Odisha   ANUGUL  Arhar/Tur       2000  Kharif       8930.0   

        Production  Yield  Calculated_Yield  
212582       739.0   0.21          0.207876  
212583       115.0   0.25          0.245203  
212584       114.0   0.17          0.172727  
212585      5010.0   0.63          0.629397  
212586      6430.0   0.72          0.720045  
