# Soil Health Dataset Exploration

This notebook explores a synthetic soil health monitoring dataset. It contains 5,000 samples, each combining comprehensive soil parameters, weather data, and vegetation indices.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot styling: apply the 'seaborn-v0_8' matplotlib style and seaborn's "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Read the soil health samples; the path is relative to the current working directory.
DATA_PATH = 'data/soil_health_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Report the table dimensions and the column names as a quick sanity check.
dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset shape: {dataset_shape}")
print(f"Columns: {column_names}")

## Dataset Overview

In [None]:
# Preview the top of the table (head() returns the first five rows by default).
first_rows = df.head()
display(first_rows)

# Print the per-column summary that DataFrame.info() writes to stdout.
print("\nDataset Info:")
df.info()

In [None]:
# Summary statistics for the numeric columns
print("\nNumerical Features Summary:")

# Select the numeric columns once; their labels are kept in `numerical_cols`,
# which the correlation cells further down read.
numeric_frame = df.select_dtypes(include=[np.number])
numerical_cols = numeric_frame.columns
display(numeric_frame.describe())

## Data Distribution Analysis

In [None]:
# Overview figure: score histogram, health category shares, and soil/crop type counts
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
score_ax, category_ax = axes[0]
soil_ax, crop_ax = axes[1]

# Top-left: histogram of the soil health score
score_ax.hist(df['soil_health_score'], bins=30, alpha=0.7, color='green')
score_ax.set_title('Soil Health Score Distribution')
score_ax.set_xlabel('Soil Health Score')
score_ax.set_ylabel('Frequency')

# Top-right: share of samples in each health category
health_counts = df['health_category'].value_counts()
category_ax.pie(health_counts.values, labels=health_counts.index, autopct='%1.1f%%')
category_ax.set_title('Health Category Distribution')

# Bottom row: sample counts per soil type (left) and per crop type (right)
soil_counts = df['soil_type'].value_counts()
crop_counts = df['crop_type'].value_counts()
bar_panels = (
    (soil_ax, soil_counts, 'brown', 'Soil Type Distribution', 'Soil Type'),
    (crop_ax, crop_counts, 'darkgreen', 'Crop Type Distribution', 'Crop Type'),
)
for panel, counts, bar_color, panel_title, x_label in bar_panels:
    panel.bar(counts.index, counts.values, color=bar_color, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')
    # Rotate the category labels on the x axis by 45 degrees
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Key Soil Parameters Analysis

In [None]:
# One histogram per key soil parameter, laid out on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# (column, bar colour, panel title, x-axis label), listed in row-major panel order
histogram_panels = [
    ('soil_moisture_percent', 'blue', 'Soil Moisture Distribution', 'Moisture (%)'),
    ('ph_level', 'red', 'pH Level Distribution', 'pH Level'),
    ('soil_temperature_celsius', 'orange', 'Soil Temperature Distribution', 'Temperature (°C)'),
    ('nitrogen_ppm', 'green', 'Nitrogen Content Distribution', 'Nitrogen (ppm)'),
    ('organic_matter_percent', 'brown', 'Organic Matter Distribution', 'Organic Matter (%)'),
    ('ndvi_vegetation_index', 'darkgreen', 'NDVI Distribution', 'NDVI Index'),
]

# axes.flat walks the grid row by row, matching the order of the table above
for panel, (column, bar_color, panel_title, x_label) in zip(axes.flat, histogram_panels):
    panel.hist(df[column], bins=25, alpha=0.7, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns; `correlation_matrix` is
# read again by the interactive heatmap cell further down.
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap with the colour map centred on zero
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={'shrink': 0.8},
)
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Soil Parameters')

# Slant the x tick labels (45 degrees, right-aligned); keep the y tick labels horizontal
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Relationships with Soil Health Score

In [None]:
# Scatter each selected parameter against the soil health score on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# (column, marker colour, x-axis label, panel title), listed in row-major panel order
scatter_panels = [
    ('ph_level', 'red', 'pH Level', 'pH vs Soil Health Score'),
    ('soil_moisture_percent', 'blue', 'Soil Moisture (%)', 'Moisture vs Soil Health Score'),
    ('organic_matter_percent', 'brown', 'Organic Matter (%)', 'Organic Matter vs Soil Health Score'),
    ('nitrogen_ppm', 'green', 'Nitrogen (ppm)', 'Nitrogen vs Soil Health Score'),
    ('ndvi_vegetation_index', 'darkgreen', 'NDVI Index', 'NDVI vs Soil Health Score'),
    ('electrical_conductivity_ds_per_m', 'purple', 'Electrical Conductivity (dS/m)', 'EC vs Soil Health Score'),
]

# Every panel shares the same y values: the soil health score
health_score = df['soil_health_score']
for panel, (column, marker_color, x_label, panel_title) in zip(axes.flat, scatter_panels):
    panel.scatter(df[column], health_score, alpha=0.5, color=marker_color)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Soil Health Score')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## Seasonal and Regional Analysis

In [None]:
# Box plots of the soil health score grouped by season, region, soil type and crop type
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (grouping column, panel title), listed in row-major panel order
grouping_panels = [
    ('season', 'Soil Health Score by Season'),
    ('region', 'Soil Health Score by Region'),
    ('soil_type', 'Soil Health Score by Soil Type'),
    ('crop_type', 'Soil Health Score by Crop Type'),
]

for panel, (group_column, panel_title) in zip(axes.flat, grouping_panels):
    sns.boxplot(data=df, x=group_column, y='soil_health_score', ax=panel)
    panel.set_title(panel_title)
    # Rotate the group labels on the x axis by 45 degrees
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Interactive Visualizations

In [None]:
# Interactive 3D scatter: pH, moisture and nitrogen on the axes, coloured by health score
scatter_3d_options = dict(
    x='ph_level',
    y='soil_moisture_percent',
    z='nitrogen_ppm',
    color='soil_health_score',
    color_continuous_scale='Viridis',
    title='3D Scatter Plot: pH, Moisture, Nitrogen vs Soil Health',
    # Extra columns passed as hover data for each point
    hover_data=['soil_type', 'crop_type', 'health_category'],
)
fig = px.scatter_3d(df, **scatter_3d_options)
fig.show()

In [None]:
# Interactive heatmap of `correlation_matrix` (computed in the correlation analysis cell)
heatmap_settings = {
    'text_auto': True,
    'aspect': "auto",
    'title': "Interactive Correlation Matrix",
    'color_continuous_scale': 'RdBu_r',
}
fig = px.imshow(correlation_matrix, **heatmap_settings)
fig.show()

## Data Quality Assessment

In [None]:
# Data quality checks: missing values, duplicate rows, and column dtypes.

# Count missing values per column, then keep only the columns that have any.
print("Missing Values:")
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

if columns_with_missing.empty:
    # Print the all-clear message instead of an empty Series repr
    # ("Series([], dtype: int64)") under the heading.
    print("No missing values found!")
else:
    print(columns_with_missing)

# Count fully duplicated rows
print(f"\nDuplicate rows: {df.duplicated().sum()}")

# Data type of every column
print("\nData Types:")
print(df.dtypes)

## Feature Importance Analysis (Correlation with Soil Health Score)

In [None]:
# Rank the numeric features by the absolute value of their correlation with the
# soil health score, strongest first.
correlations_with_target = df[numerical_cols].corr()['soil_health_score'].abs().sort_values(ascending=False)
correlations_with_target = correlations_with_target.drop('soil_health_score')  # Remove self-correlation

plt.figure(figsize=(10, 8))
ranking_ax = correlations_with_target.plot(kind='barh')
# A horizontal bar chart draws the first row at the bottom of the axis. Flip the
# y axis so the strongest correlation sits at the top, in the same order as the
# printed ranking below.
ranking_ax.invert_yaxis()
ranking_ax.set_title('Feature Correlation with Soil Health Score (Absolute Values)')
ranking_ax.set_xlabel('Absolute Correlation')
plt.tight_layout()
plt.show()

print("Top 10 features correlated with soil health score:")
print(correlations_with_target.head(10))

## Dataset Summary

This comprehensive soil health dataset includes:

- **5000 samples** from different agricultural fields
- **21 features** including soil properties, weather data, and vegetation indices
- **Multi-year data** spanning from 2023 to 2025
- **Realistic relationships** between soil parameters and health outcomes
- **Actionable recommendations** for soil management

### Key Features:
1. **Soil Properties**: pH, moisture, temperature, nutrients (NPK), organic matter
2. **Physical Properties**: Bulk density, electrical conductivity
3. **Environmental Data**: Weather conditions, rainfall, humidity
4. **Vegetation Index**: NDVI from satellite imagery
5. **Context**: Soil type, crop type, season, region
6. **Target Variables**: Soil health score and category
7. **Recommendations**: Automated suggestions for soil management

This dataset is ready to be used for developing machine learning models that predict soil health and generate management recommendations.