# Climate-Smart Agriculture: Data Exploration

This notebook demonstrates exploratory data analysis for the crop yield prediction project.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
import sys
sys.path.append('../src')

from preprocess import load_data, engineer_features

# Visualization settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

## 1. Load and Inspect Data

In [None]:
# Read the merged agriculture CSV through the project's loader, report its
# (rows, columns) shape, and preview the first rows as the cell output.
df = load_data("../data/merged_agri.csv")
print(f"Dataset shape: {df.shape}")
df.head(5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the loaded frame.
df.describe()

In [None]:
# Count the null entries in every column, then show only the columns
# that actually contain at least one.
missing = df.isna().sum()
missing.loc[missing.gt(0)]

## 2. Target Variable Analysis

In [None]:
# Yield distribution: histogram and boxplot side by side, followed by
# headline statistics.
#
# Missing yields are dropped once up front. matplotlib's boxplot does not
# skip NaN values (a single NaN turns the quartiles into NaN and the box is
# drawn empty), and using one cleaned series keeps both panels and the
# printed statistics based on exactly the same observations.
yield_values = df['yield_t_ha'].dropna()

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

ax_hist.hist(yield_values, bins=50, edgecolor='black')
ax_hist.set_xlabel('Yield (t/ha)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Yield Distribution')

ax_box.boxplot(yield_values)
ax_box.set_ylabel('Yield (t/ha)')
ax_box.set_title('Yield Boxplot')

fig.tight_layout()
plt.show()

# pandas reductions already ignore NaN, so these values match what the
# unfiltered column would report.
print(f"Mean yield: {yield_values.mean():.2f} t/ha")
print(f"Median yield: {yield_values.median():.2f} t/ha")
print(f"Std dev: {yield_values.std():.2f} t/ha")

## 3. Crop Analysis

In [None]:
# Yield by crop: per-crop mean / std / sample count, highest mean first.
crop_yields = (
    df.groupby('crop')['yield_t_ha']
    .agg(['mean', 'std', 'count'])
    .sort_values('mean', ascending=False)
)
print(crop_yields)

# Visualize.
# Create the figure/axes explicitly and hand the axes to pandas. Without
# `ax=`, DataFrame.boxplot opens its own figure, so a preceding
# plt.figure(...) call would be left behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='yield_t_ha', by='crop', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('Yield (t/ha)')
ax.set_title('Yield Distribution by Crop')
fig.suptitle('')  # Remove the "Boxplot grouped by ..." title pandas adds
fig.tight_layout()
plt.show()

## 4. Weather Impact Analysis

In [None]:
# Relationship between yield and each weather variable: one scatter panel
# per variable with a least-squares linear trend line on top.
weather_vars = ['rainfall_monsoon_mm', 'rainy_days', 'avg_temp_season_c', 'dry_spell_days']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for i, var in enumerate(weather_vars):
    ax = axes[i]

    # Keep only rows where BOTH the weather variable and the yield are
    # present. Filtering on the weather column alone lets NaN yields reach
    # np.polyfit, which then returns NaN coefficients (or fails to converge).
    paired = df[[var, 'yield_t_ha']].dropna()
    x_vals = paired[var]
    y_vals = paired['yield_t_ha']

    ax.scatter(x_vals, y_vals, alpha=0.5)
    ax.set_xlabel(var)
    ax.set_ylabel('Yield (t/ha)')
    ax.set_title(f'Yield vs {var}')

    # Add trend line. A degree-1 fit needs at least two distinct x values;
    # skip the line (but keep the scatter) when the data cannot support it.
    if x_vals.nunique() >= 2:
        z = np.polyfit(x_vals, y_vals, 1)
        p = np.poly1d(z)
        x_sorted = x_vals.sort_values()
        ax.plot(x_sorted, p(x_sorted), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

## 5. Soil and Nutrient Analysis

In [None]:
# Soil / nutrient columns whose relationship with yield is examined.
nutrient_vars = ['N', 'P', 'K', 'soil_pH', 'org_carbon']

# Pairwise correlation matrix over the nutrient columns plus the target.
corr_columns = [*nutrient_vars, 'yield_t_ha']
corr_data = df[corr_columns].corr()

# Annotated heatmap; the diverging colormap is centred on zero so positive
# and negative correlations are visually symmetric.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_data, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation: Soil/Nutrients vs Yield')
fig.tight_layout()
plt.show()

## 6. Temporal Trends

In [None]:
# Per-year mean and standard deviation of yield.
yearly_yield = df.groupby('year')['yield_t_ha'].agg(['mean', 'std'])

# Mean yield per year, with the within-year standard deviation drawn as
# error bars.
fig, ax = plt.subplots(figsize=(12, 6))
ax.errorbar(
    yearly_yield.index,
    yearly_yield['mean'],
    yerr=yearly_yield['std'],
    marker='o',
    capsize=5,
    capthick=2,
)
ax.set_xlabel('Year')
ax.set_ylabel('Average Yield (t/ha)')
ax.set_title('Average Yield Trends Over Years')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Feature Engineering Preview

In [None]:
# Run the project's feature-engineering step on the loaded frame.
# NOTE(review): whether engineer_features copies or mutates `df` is not
# visible from this notebook — confirm in src/preprocess.py.
df_eng = engineer_features(df)

# Columns expected to be produced by the step above; preview the first
# ten rows of just those columns.
new_features = [
    'rainfall_intensity',
    'gdd',
    'moisture_stress',
    'n_ratio',
    'p_ratio',
    'k_ratio',
]
df_eng.loc[:, new_features].head(10)

In [None]:
# Correlation of each engineered feature with yield, strongest positive
# first. The yield column's self-correlation (1.0) is included in the
# output because it is part of the correlated column set.
feature_corr = (
    df_eng[[*new_features, 'yield_t_ha']]
    .corr()
    .loc[:, 'yield_t_ha']
    .sort_values(ascending=False)
)
print("\nCorrelation with Yield:")
print(feature_corr)

## 8. Key Insights

Based on the analysis above and the irrigation summary below, summarize:
1. Which crops have highest/lowest yields?
2. How does rainfall affect yield?
3. What's the optimal soil pH range?
4. Are there temporal trends?
5. Which features show strongest correlation with yield?

In [None]:
# Yield statistics (mean, median, std, sample count) for each value of
# the irrigation_access column.
yield_stats = ['mean', 'median', 'std', 'count']
irrigation_summary = (
    df['yield_t_ha']
    .groupby(df['irrigation_access'])
    .agg(yield_stats)
)
print("\nYield by Irrigation Type:")
print(irrigation_summary)