In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset through kagglehub and keep the value it returns
# (presumably the local directory holding the dataset files; confirm in kagglehub docs).
bhadramohit_agriculture_and_farming_dataset_path = kagglehub.dataset_download(
    'bhadramohit/agriculture-and-farming-dataset'
)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file = os.path.join(dirname, filename)
        print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 🌾 Dataset Overview

This dataset provides insights into various aspects of agriculture 🌱, specifically focusing on farm-level details, crop types, irrigation methods, soil properties, and associated metrics such as yield, water usage, fertilizer usage, and pesticide usage. The goal is to analyze and predict crop yields and identify factors that influence productivity.

### 🔑 Key Features:

- **🏷️ Farm_ID**: Unique identifier for each farm.
- **🌽 Crop_Type**: Type of crop cultivated (e.g., Cotton, Carrot, Wheat).
- **🚿 Irrigation_Type**: Method of irrigation used (e.g., Drip, Manual, Flood).
- **🌍 Soil_Type**: Type of soil present on the farm (e.g., Loamy, Sandy, Silty).
- **🗓️ Season**: The season in which the crop is grown (e.g., Kharif, Rabi, Zaid).
- **🏞️ Farm_Area (acres)**: Total area of the farm in acres.
- **🧪 Fertilizer_Used (tons)**: Quantity of fertilizer applied.
- **🐞 Pesticide_Used (kg)**: Amount of pesticide used.
- **💧 Water_Usage (cubic meters)**: Total water usage for the crop.
- **📈 Yield (tons)**: Total yield of the crop in tons.

### 🎯 Objective:

The primary objective is to analyze the relationships between these variables, identify factors that influence crop yields, and build predictive models to estimate yields based on input features.

### 📊 Data Characteristics:

- **📝 Data Type**: The dataset contains both numerical and categorical features.
- **🎯 Target Variable**: `Yield(tons)`
- **🔢 Encoded Features**: Categorical variables such as `Crop_Type`, `Irrigation_Type`, and `Soil_Type` are encoded for modeling purposes.

### 🤖 Modeling Approach:

- **Regression Models Used**:
  - 🧮 Linear Regression
  - 🌲 Random Forest Regressor
  - ⚡ XGBoost Regressor
  - 💡 LGBM Regressor
  - 🐈 CatBoost Regressor
- **🏆 Best Model**: The CatBoost Regressor provided the best performance based on the evaluation metrics (Mean Squared Error and R-squared).

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">1. Imports and Setup</h1>
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

# <span style="color:transparent;">2. Load and Explore Dataset</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">2. Load and Explore Dataset</h1>
</div>

In [None]:
# Load the dataset.
# On Kaggle the CSV lives under /kaggle/input. Elsewhere, fall back to the
# directory stored by the kagglehub download cell, if that cell was run
# (it may have been deleted, hence the tolerant globals() lookup).
DATA_FILENAME = 'agriculture_dataset.csv'
data_csv_path = os.path.join('/kaggle/input/agriculture-and-farming-dataset', DATA_FILENAME)
kagglehub_dir = globals().get('bhadramohit_agriculture_and_farming_dataset_path')
if not os.path.exists(data_csv_path) and kagglehub_dir:
    data_csv_path = os.path.join(kagglehub_dir, DATA_FILENAME)

df = pd.read_csv(data_csv_path)

In [None]:
# Display basic information about the dataset
print("Shape of the dataset:", df.shape)
display(df.head())
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" after the report).
df.info()
print("\nStatistical Summary:")
display(df.describe().T)

A summary of the dataset:

- **Shape of the dataset**: (50, 10)
- **Column Information**:
  - `Farm_ID`: Unique identifier for each farm (object)
  - `Crop_Type`: Type of crop being cultivated (object)
  - `Farm_Area(acres)`: Land area of the farm in acres (float64)
  - `Irrigation_Type`: Type of irrigation used (object)
  - `Fertilizer_Used(tons)`: Amount of fertilizer used in tons (float64)
  - `Pesticide_Used(kg)`: Amount of pesticide used in kg (float64)
  - `Yield(tons)`: Crop yield in tons (float64)
  - `Soil_Type`: Type of soil on the farm (object)
  - `Season`: Season in which the crop is cultivated (object)
  - `Water_Usage(cubic meters)`: Water used in cubic meters (float64)

- **Statistical Summary**:
  - The average farm area is around 255 acres, with a standard deviation of 139.
  - Fertilizer and pesticide usage vary, with average values of 4.9 tons and 2.4 kg, respectively.
  - The average crop yield is about 27 tons, with water usage averaging 56,724 cubic meters.


In [None]:
# Report the total count of missing cells and of fully duplicated rows.
print(
    f'\nMissing values: {df.isna().sum().sum()}',
    f'Duplicated values: {df.duplicated().sum()}',
    sep='\n',
)

The dataset has:

- **Missing values**: 0
- **Duplicated values**: 0

# <span style="color:transparent;">3. Unique Value Exploration</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">3. Unique Value Exploration</h1>
</div>

In [None]:
# Show how many distinct values each column holds.
print("\nUnique Values in Each Column:", df.nunique(), sep="\n")

In [None]:
# Split the column names by dtype: int64/float64 versus object.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
object_frame = df.select_dtypes(include=['object'])

numerical_columns = list(numeric_frame.columns)
non_numerical_columns = list(object_frame.columns)

# Show both lists
print("\nNumerical Columns:", numerical_columns)
print("Categorical Columns:", non_numerical_columns)

In [None]:
# For every object-typed column, print its name followed by its distinct values.
for col in non_numerical_columns:
    print(f"\nColumn: {col}", f"Unique Values: {df[col].unique()}", sep="\n")

The unique values and column types in the dataset:

### Unique Values in Each Column
- **Farm_ID**: 50 unique IDs
- **Crop_Type**: 10 unique crop types
- **Irrigation_Type**: 5 unique types
- **Soil_Type**: 5 unique types
- **Season**: 3 unique values
- Other columns like `Farm_Area(acres)`, `Fertilizer_Used(tons)`, `Pesticide_Used(kg)`, `Yield(tons)`, and `Water_Usage(cubic meters)` have numerous unique values due to their continuous nature.

### Column Types
- **Numerical Columns**: `['Farm_Area(acres)', 'Fertilizer_Used(tons)', 'Pesticide_Used(kg)', 'Yield(tons)', 'Water_Usage(cubic meters)']`
- **Categorical Columns**: `['Farm_ID', 'Crop_Type', 'Irrigation_Type', 'Soil_Type', 'Season']`

### Unique Values in Categorical Columns
- **Farm_ID**: Unique farm identifiers (e.g., `F001`, `F002`, ... `F050`)
- **Crop_Type**: `['Cotton', 'Carrot', 'Sugarcane', 'Tomato', 'Soybean', 'Rice', 'Maize', 'Wheat', 'Barley', 'Potato']`
- **Irrigation_Type**: `['Sprinkler', 'Manual', 'Flood', 'Rain-fed', 'Drip']`
- **Soil_Type**: `['Loamy', 'Peaty', 'Silty', 'Clay', 'Sandy']`
- **Season**: `['Kharif', 'Zaid', 'Rabi']`


# <span style="color:transparent;">4. Exploratory Data Analysis (EDA)</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">4. Exploratory Data Analysis (EDA)</h1>
</div>

In [None]:
# Function to perform univariate analysis (histogram + KDE) for numeric columns
def univariate_analysis(data, columns):
    """Plot a histogram with a KDE overlay for each requested column.

    Panels are laid out in a two-column grid on a single figure.

    Parameters
    ----------
    data : object indexable by column name (e.g. a DataFrame)
        Source of the values; ``data[column]`` is passed to ``sns.histplot``.
    columns : iterable of str
        Column names to plot, one panel each. Underscores are replaced by
        spaces in titles and x-axis labels.
    """
    columns = list(columns)
    grid_cols = 2
    # Never fewer than 4 rows so the layout for up to 8 columns is unchanged;
    # more rows are added (ceil division) instead of failing on a 9th panel.
    grid_rows = max(4, -(-len(columns) // grid_cols))
    # 3 inches of height per row reproduces the original 10x12 figure at 4 rows.
    plt.figure(figsize=(10, 3 * grid_rows))

    for i, column in enumerate(columns, 1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.histplot(data[column], kde=True, bins=10, color='darkcyan')
        plt.title(f'{column.replace("_", " ")} Distribution with KDE')
        plt.xlabel(column.replace('_', ' '))
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Numeric columns to draw histograms for.
columns_to_analyze = [
    'Farm_Area(acres)',
    'Fertilizer_Used(tons)',
    'Pesticide_Used(kg)',
    'Yield(tons)',
    'Water_Usage(cubic meters)',
]

univariate_analysis(df, columns_to_analyze)

Univariate analysis of the numerical columns:

1. **Farm Area (acres)**:
   - The distribution is relatively wide, with farm areas ranging from small to large plots. The histogram shows a slightly right-skewed pattern, suggesting that while many farms have moderate areas, there are a few larger farms as well.

2. **Fertilizer Used (tons)**:
   - The distribution of fertilizer usage shows a peak around the middle values with a few farms using significantly higher amounts. This may indicate a common range of fertilizer requirements across farms, with only a few outliers.

3. **Pesticide Used (kg)**:
   - Pesticide usage appears to be right-skewed, with many farms using relatively low quantities, while a few farms use substantially higher amounts.

4. **Yield (tons)**:
   - Yield distribution is slightly right-skewed, with a majority of farms achieving moderate yields and some achieving much higher yields. This spread might reflect differences in crop type, farm size, and input efficiency.

5. **Water Usage (cubic meters)**:
   - Water usage has a wide range and a right-skewed distribution. Some farms consume significantly more water, possibly due to crop requirements, irrigation methods, or farm size.


In [None]:
# Boxplot variant of the univariate helper.
# NOTE: this rebinds the name `univariate_analysis`, replacing the histogram
# version defined in an earlier cell, and takes a different signature.
def univariate_analysis(data, column, title):
    """Draw a horizontal boxplot of one column and print its describe() summary.

    Parameters
    ----------
    data : object indexable by column name (e.g. a DataFrame)
    column : str
        Name of the column to plot and summarise.
    title : str
        Label used in the plot title and in the printed heading.
    """
    plt.figure(figsize=(10, 2))

    values = data[column]
    sns.boxplot(x=values, color='lightsalmon')
    plt.title(f'{title} Boxplot')

    plt.tight_layout()
    plt.show()

    summary = data[column].describe()
    print(f'\nSummary Statistics for {title}:\n', summary)

# Numeric columns to draw boxplots for; underscores become spaces in the titles.
columns_to_analyze = [
    'Farm_Area(acres)',
    'Fertilizer_Used(tons)',
    'Pesticide_Used(kg)',
    'Yield(tons)',
    'Water_Usage(cubic meters)',
]

for column in columns_to_analyze:
    readable_title = column.replace('_', ' ')
    univariate_analysis(df, column, readable_title)

Insights based on the boxplots and summary statistics:

1. **Farm Area (acres)**:
   - **Range**: The farm area varies significantly, from as low as 12.5 acres to as high as 483.88 acres.
   - **Distribution**: The median farm area is around 282 acres, with a relatively wide interquartile range (IQR), suggesting variability in farm sizes.
   - **Potential Outliers**: Larger farms may act as outliers in this distribution, as indicated by the spread on the higher end.

2. **Fertilizer Used (tons)**:
   - **Range**: Fertilizer usage ranges from 0.5 to 9.96 tons, with an average of about 4.9 tons.
   - **Distribution**: The IQR shows that most farms use between approximately 2.4 and 6.9 tons. There are likely a few farms using higher quantities as outliers.

3. **Pesticide Used (kg)**:
   - **Range**: Pesticide usage ranges from 0.14 to 4.99 kg, with a mean around 2.4 kg.
   - **Distribution**: The median value is around 2.33 kg, with some farms using significantly more, likely representing outliers.

4. **Yield (tons)**:
   - **Range**: Crop yield ranges from 3.86 to 48.02 tons, with an average yield of 27.06 tons.
   - **Distribution**: The central IQR indicates yields primarily between 16.19 and 37.86 tons, suggesting variability depending on factors like crop type and farm size.

5. **Water Usage (cubic meters)**:
   - **Range**: Water usage shows a wide range, from 5,869.75 to 94,754.73 cubic meters.
   - **Distribution**: The median is around 54,097 cubic meters, with a large IQR. Some farms use exceptionally high quantities of water, which could reflect outliers due to factors like farm size or crop water needs.

In [None]:
def plot_categorical_distribution(column_name, data=None):
    """Show a count plot and a pie chart for one categorical column, side by side.

    Parameters
    ----------
    column_name : str
        Column whose category frequencies are plotted.
    data : DataFrame-like, optional
        Frame to read from. When omitted, the module-level ``df`` is looked up
        at call time. The previous version accepted this argument but ignored
        it and always plotted the global ``df``.
    """
    if data is None:
        data = df

    plt.figure(figsize=(10, 4))

    # Left panel: horizontal bars with the count written next to each bar.
    plt.subplot(1, 2, 1)
    sns.countplot(y=column_name, data=data, palette='muted')
    plt.title(f'Distribution of {column_name}')

    ax = plt.gca()
    for p in ax.patches:
        ax.annotate(f'{int(p.get_width())}', (p.get_width(), p.get_y() + p.get_height() / 2),
                    ha='center', va='center', xytext=(10, 0), textcoords='offset points')

    sns.despine(left=True, bottom=True)

    # Right panel: share of each category, every wedge slightly exploded.
    plt.subplot(1, 2, 2)
    category_counts = data[column_name].value_counts()
    category_counts.plot.pie(autopct='%1.1f%%', colors=sns.color_palette('muted'), startangle=90,
                             explode=[0.05] * len(category_counts))
    plt.title(f'Percentage Distribution of {column_name}')
    plt.ylabel('')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the distribution of each categorical feature in turn.
for categorical_column in ('Crop_Type', 'Irrigation_Type', 'Soil_Type', 'Season'):
    plot_categorical_distribution(categorical_column)

Insights based on the categorical distributions:

1. **Crop Type**:
   - **Distribution**: The dataset includes a variety of crops, with certain crops like Cotton, Carrot, and Tomato appearing more frequently. Other crops such as Potato and Barley are less common.
   - **Percentage**: The distribution of crop types is fairly diverse, indicating a range of crops cultivated across different farms. This variety could impact resource needs and yields.

2. **Irrigation Type**:
   - **Distribution**: Irrigation methods vary, with Sprinkler and Manual methods being more prevalent. Drip and Rain-fed methods are less common.
   - **Percentage**: The distribution suggests that traditional methods like Manual and Sprinkler irrigation are dominant, potentially influencing water and fertilizer usage.

3. **Soil Type**:
   - **Distribution**: There is a relatively balanced representation of soil types, with Loamy and Silty soils being the most common, followed by Peaty, Clay, and Sandy.
   - **Percentage**: This balance across soil types indicates a range of soil conditions that might affect crop selection and yield potential.

4. **Season**:
   - **Distribution**: The Kharif season appears to be the most common, followed by Zaid and Rabi seasons.
   - **Percentage**: This suggests that a significant portion of farming activities takes place during the Kharif season, potentially due to seasonal crop cycles and climate conditions.

In [None]:
# One bar plot per numeric column, grouped by 'Crop_Type' (3x2 grid, last cell unused)
columns_to_plot = [
    'Farm_Area(acres)',
    'Fertilizer_Used(tons)',
    'Pesticide_Used(kg)',
    'Water_Usage(cubic meters)',
    'Yield(tons)',
]

# Arguments shared by every panel; only the y column changes.
barplot_options = dict(data=df, x='Crop_Type', ci=None, palette='muted')

plt.figure(figsize=(16, 20))
for i, column in enumerate(columns_to_plot, 1):
    plt.subplot(3, 2, i)
    sns.barplot(y=column, **barplot_options)
    plt.title(f'Bar Plot of {column.replace("_", " ")} by Crop Type')
    plt.xlabel('Crop Type')
    plt.ylabel(column.replace('_', ' '))
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# For each metric, find the single farm record with the highest and the lowest
# value, and report that record's crop type together with the value.
# (These are per-record extremes, not per-crop averages.)
metric_columns = {
    "Yield": 'Yield(tons)',
    "Fertilizer Used": 'Fertilizer_Used(tons)',
    "Pesticide Used": 'Pesticide_Used(kg)',
    "Water Usage": 'Water_Usage(cubic meters)',
    "Farm Area": 'Farm_Area(acres)',
}

metrics_summary = {"Metric": [], "Crop Type": [], "Value": []}
for metric_label, metric_column in metric_columns.items():
    extreme_rows = (
        ("Highest", df[metric_column].idxmax()),
        ("Lowest", df[metric_column].idxmin()),
    )
    for prefix, row_label in extreme_rows:
        metrics_summary["Metric"].append(f"{prefix} {metric_label}")
        # Single .loc[row, col] lookup instead of chained df.loc[row][col].
        metrics_summary["Crop Type"].append(df.loc[row_label, 'Crop_Type'])
        metrics_summary["Value"].append(df.loc[row_label, metric_column])

# pandas is already imported as `pd` in the imports cell; no re-import needed here.
metrics_summary_df = pd.DataFrame(metrics_summary)
metrics_summary_df

Insights based on the summary of crop metrics:

1. **Yield Insights**:
   - **Highest Yield**: Tomato has the highest yield at 48.02 tons, indicating its potential as a highly productive crop under favorable conditions.
   - **Lowest Yield**: Maize has the lowest yield at 3.86 tons, which could suggest challenges in cultivation, lower productivity, or constraints due to environmental or management factors.

2. **Fertilizer Usage**:
   - **Highest Fertilizer Usage**: Cotton stands out with the highest fertilizer usage at 9.96 tons, suggesting a high nutrient demand for maximizing productivity.
   - **Lowest Fertilizer Usage**: Interestingly, Cotton also has the lowest fertilizer usage at 0.50 tons for certain instances, which could reflect variability in management practices or differing needs across different fields.

3. **Pesticide Usage**:
   - **Highest Pesticide Usage**: Rice uses the highest amount of pesticides at 4.99 kg, which may indicate higher susceptibility to pests and the need for more intensive pest management.
   - **Lowest Pesticide Usage**: Barley, on the other hand, has the lowest pesticide usage at 0.14 kg, suggesting it may be less prone to pest attacks or is managed with minimal chemical intervention.

4. **Water Usage**:
   - **Highest Water Usage**: Cotton has the highest water usage, consuming 94,754.73 cubic meters. This highlights the water-intensive nature of Cotton cultivation, which may have implications for irrigation and sustainability.
   - **Lowest Water Usage**: Rice, despite being a typically water-demanding crop, shows the lowest water usage at 5,869.75 cubic meters, potentially due to different cultivation methods, such as more water-efficient practices.

5. **Farm Area**:
   - **Highest Farm Area**: Rice is cultivated on the largest farm area, with 483.88 acres, indicating its importance or high demand in the region.
   - **Lowest Farm Area**: Sugarcane has the smallest farm area at 12.50 acres, which could reflect niche cultivation or limited demand.

The insights illustrate significant variability in resource usage, productivity, and farm area across different crop types. Cotton and Rice, for example, demonstrate contrasting needs and environmental demands, impacting their cultivation practices. Tomato's high yield makes it particularly productive, while Maize’s low yield points to potential areas for improvement or challenges to address. Such data is valuable for optimizing agricultural practices and improving crop productivity and sustainability.


In [None]:
# Build a table with one row per crop type and the list of Farm IDs growing it
farm_ids_by_crop = df.groupby('Crop_Type')['Farm_ID']
crop_farm_table = farm_ids_by_crop.apply(list).reset_index()

crop_farm_table

In [None]:
# Checking if any farms have multiple crop types:
# count distinct crop types per farm, then keep farms with more than one.
farm_crop_counts = df.groupby('Farm_ID')['Crop_Type'].nunique().reset_index()
multiple_crops_per_farm = farm_crop_counts[farm_crop_counts['Crop_Type'] > 1]

# Displaying the result or a message if no farm has multiple crops.
# Uses IPython's display (imported in the setup cell) rather than `ace_tools`,
# which is not an installable package and would raise ModuleNotFoundError here.
if not multiple_crops_per_farm.empty:
    print("Farms with Multiple Crop Types:")
    display(multiple_crops_per_farm)
else:
    print("No farms have multiple crop types.")

In [None]:
# Pie chart of how many farm records fall under each crop type
plt.figure(figsize=(8, 8))
crop_type_counts = df['Crop_Type'].value_counts()

pie_style = dict(
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('muted'),
    wedgeprops={'edgecolor': 'black'},
)
plt.pie(crop_type_counts, labels=crop_type_counts.index, **pie_style)

plt.title('Farm Distribution by Crop Type')
plt.show()


Insights based on the analysis and visualizations:

1. **Crop Type and Farm Association**:
   - Each crop type is associated with a distinct set of farms, and no single farm grows multiple crop types. This setup may imply a **specialization in crop cultivation**, where each farm is focused on a single crop, possibly to optimize resources and expertise for specific crop needs.
   - **Most Common Crops**: Certain crops like Barley, Cotton, and Tomato are associated with multiple farms, while others like Maize have fewer farms. This distribution could reflect the popularity or economic value of these crops in the dataset's region.

2. **Farm Distribution by Crop Type (Pie Chart)**:
   - The pie chart provides a visual distribution of farms across crop types. We can see that the **largest segments** represent crops with a broader farm base, such as Cotton and Barley. In contrast, **smaller segments** correspond to crops like Maize and Potato, indicating fewer farms cultivate these crops.
   - This distribution can help identify **crop popularity and farming focus** within the dataset, potentially indicating the region’s agricultural strengths or specific crop demands.

3. **Specialization of Farms**:
   - Since no farm grows multiple crop types, each farm’s focus on a single crop type could reflect specialized farming practices or crop rotations that don’t overlap within the same season. This setup might also be due to factors like soil suitability, water availability, or climate requirements specific to each crop.

In [None]:
# Sum the farm area over all records of each crop type (result is a DataFrame)
area_by_crop = df.groupby('Crop_Type')['Farm_Area(acres)']
total_area_per_crop = area_by_crop.sum().reset_index()

total_area_per_crop

In [None]:
# Pie chart of the total Farm_Area(acres) per crop type.
# NOTE: `total_area_per_crop` is rebound here to a Series indexed by crop type
# (no reset_index), replacing the DataFrame built in the previous cell.
plt.figure(figsize=(8, 8))
total_area_per_crop = df.groupby('Crop_Type')['Farm_Area(acres)'].sum()

pie_style = dict(
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('muted'),
    wedgeprops={'edgecolor': 'black'},
)
plt.pie(total_area_per_crop, labels=total_area_per_crop.index, **pie_style)

plt.title('Farm Area Distribution by Crop Type')
plt.show()


Insights based on the distribution of total farm area by crop type:

1. **Largest Farm Areas**:
   - **Cotton** (1,993.80 acres), **Rice** (1,845.24 acres), and **Barley** (1,671.22 acres) occupy the largest total farm areas. This suggests that these crops may be highly prioritized or economically significant within the dataset’s region.

2. **Moderate Farm Areas**:
   - **Tomato** (1,655.02 acres), **Sugarcane** (1,187.99 acres), and **Soybean** (1,050.68 acres) have substantial but moderate land allocation. These crops still represent a significant part of the agricultural landscape, albeit not as prominent as Cotton and Rice.

3. **Smaller Farm Areas**:
   - **Carrot** (765.90 acres), **Wheat** (872.57 acres), **Maize** (978.53 acres), and **Potato** (727.24 acres) have the smallest total areas. These crops may either be less in demand or require less land due to specific cultivation practices.

### Overall Observations:
- The distribution of farm area across crop types highlights the emphasis on certain staple crops like Cotton, Rice, and Barley, which are given more land, possibly for economic or agricultural reasons.
- The pie chart visually conveys the land allocation, with larger crops clearly standing out, offering a quick visual reference for priority crops in terms of land use.

In [None]:
# For each crop type, collect the distinct soil types it appears with
soil_by_crop = df.groupby('Crop_Type')['Soil_Type']
crop_soil_table = soil_by_crop.unique().reset_index()

crop_soil_table

In [None]:
# One pie chart per crop type showing the soil types its records fall under
unique_crops = df['Crop_Type'].unique()

pie_style = dict(
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('pastel'),
    wedgeprops={'edgecolor': 'black'},
)

# 4x3 grid of panels on a single figure
plt.figure(figsize=(15, 12))
for i, crop in enumerate(unique_crops, 1):
    plt.subplot(4, 3, i)
    is_crop = df['Crop_Type'] == crop
    soil_distribution = df.loc[is_crop, 'Soil_Type'].value_counts()
    plt.pie(soil_distribution, labels=soil_distribution.index, **pie_style)
    plt.title(f'{crop} - Soil Type Distribution')

plt.tight_layout()
plt.show()


Insights based on the soil type distribution for each crop type:

1. **Diverse Soil Types**:
   - Certain crops like **Barley**, **Tomato** and **Sugarcane** are grown in a wide variety of soil types, including Sandy, Silty, Clay, and Loamy. This indicates that these crops are versatile and adaptable to multiple soil conditions.

2. **Limited Soil Preferences**:
   - Crops like **Carrot** and **Soybean** are associated with fewer soil types. For example, Carrot is primarily found in Peaty, Loamy, and Clay soils, while Soybean predominantly grows in Sandy, Silty, and Loamy soils. This could suggest a more specific soil preference or suitability for these crops.

3. **Predominant Soil Types**:
   - **Sandy and Loamy soils** are common across multiple crops, including Cotton, Rice, and Soybean. This prevalence indicates that these soil types are likely more suitable or widely available for various crops in the dataset.

4. **Unique Soil Associations**:
   - **Peaty soil** is less commonly associated with crops, appearing mainly with Carrot, Sugarcane, Maize, and Tomato. This may indicate that Peaty soil is less prevalent or less suitable for a broad range of crops, except for a few specialized types.

5. **Adaptable Crops**:
   - **Sugarcane**, **Tomato**, and **Barley** grow in a variety of soil types, including Loamy, Silty, Clay, and Peaty soils. This adaptability makes them versatile crops, potentially less affected by soil type variability.


In [None]:
# For each crop type, collect the distinct seasons it appears in
season_by_crop = df.groupby('Crop_Type')['Season']
crop_season_table = season_by_crop.unique().reset_index()

crop_season_table

In [None]:
# One pie chart per crop type showing the seasons its records fall under (4x3 grid)
pie_style = dict(
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('pastel'),
    wedgeprops={'edgecolor': 'black'},
)

plt.figure(figsize=(15, 12))
for i, crop in enumerate(df['Crop_Type'].unique(), 1):
    plt.subplot(4, 3, i)
    is_crop = df['Crop_Type'] == crop
    season_distribution = df.loc[is_crop, 'Season'].value_counts()
    plt.pie(season_distribution, labels=season_distribution.index, **pie_style)
    plt.title(f'{crop} - Season Distribution')

plt.tight_layout()
plt.show()


Insights based on the season distribution of crop types:

1. **Multi-Season Crops**:
   - Crops like **Carrot, Rice, Soybean, and Tomato** are grown across all three seasons (Kharif, Zaid, and Rabi), showcasing their adaptability to different growing conditions and potentially offering continuous yields throughout the year.
   - **Cotton** also spans all seasons but has a higher concentration in the Kharif season, indicating a strong preference for certain climatic conditions during that period.

2. **Season-Specific Crops**:
   - **Maize and Wheat** are grown in two seasons (Rabi and Zaid) but do not appear in the Kharif season. This suggests a preference for specific climatic or soil conditions during Rabi and Zaid.
   - **Sugarcane** is predominantly grown in the Kharif and Zaid seasons, showing a notable absence from Rabi, which might be due to climatic or crop cycle requirements.

3. **Dominant Season for Certain Crops**:
   - **Potato** is primarily grown in the Zaid season (75% of the time) and less in the Kharif season. This suggests a high yield during Zaid, aligning with favorable conditions.
   - **Barley** and **Cotton** are heavily cultivated during the Kharif and Zaid seasons, indicating their importance and adaptability to these seasons' conditions.

4. **Balanced Crops Across Seasons**:
   - **Tomato** displays a balanced distribution across Kharif, Zaid, and Rabi seasons, indicating it can be cultivated year-round without significant preference toward any single season.
   - **Wheat** has a balanced 50-50 split between Zaid and Rabi, suggesting stable and predictable yields during these times.

### Overall Observations:
- **Crop Adaptability**: Certain crops like Tomato, Rice, and Carrot are highly versatile, thriving across multiple seasons, whereas others like Maize and Wheat have more focused seasonal windows.
- **Seasonal Preferences**: The distribution of crop types per season highlights the significance of specific climatic and growing conditions needed to optimize yields, which can be critical for planning planting cycles and resource allocation.



In [None]:
# Mean yield for every (crop type, soil type) pair.
# Despite its name, `highest_yield_per_crop_soil` holds these means, not maxima.
mean_yield = df.groupby(['Crop_Type', 'Soil_Type'])['Yield(tons)'].mean()
highest_yield_per_crop_soil = mean_yield.reset_index()

# Per crop type, pick the row (soil type) with the largest mean yield
yield_by_crop = highest_yield_per_crop_soil.groupby('Crop_Type')['Yield(tons)']
max_yield_per_crop = highest_yield_per_crop_soil.loc[yield_by_crop.idxmax()]

print("Soil Type with Highest Yield for Each Crop Type:")
display(max_yield_per_crop)

# Per crop type, pick the row (soil type) with the smallest mean yield
min_yield_per_crop = highest_yield_per_crop_soil.loc[yield_by_crop.idxmin()]

print("\nSoil Type with Lowest Yield for Each Crop Type:")
display(min_yield_per_crop)

### Summary Insights:

- **Loamy soil** is generally associated with higher yields for many crops (Barley, Carrot, Soybean, Sugarcane), highlighting its favorable characteristics for plant growth due to its balance of sand, silt, and clay.
- **Sandy soil** supports high yields for crops like Cotton, Maize, and Potato but leads to lower yields for crops like Barley and Soybean, reflecting its mixed suitability depending on crop needs.
- **Clay soil** provides high yields for Rice and Tomato but can negatively impact yields for Carrot, Sugarcane, and Wheat due to its dense structure and water retention characteristics.

Understanding these soil-crop relationships can help optimize crop selection and management practices, enhancing yields based on soil conditions.

In [None]:
# Mean fertilizer use for every (crop type, soil type) pair.
# Despite its name, `highest_fertilizer_per_crop_soil` holds these means, not maxima.
mean_fertilizer = df.groupby(['Crop_Type', 'Soil_Type'])['Fertilizer_Used(tons)'].mean()
highest_fertilizer_per_crop_soil = mean_fertilizer.reset_index()

# Per crop type, pick the row (soil type) with the largest mean fertilizer use
fertilizer_by_crop = highest_fertilizer_per_crop_soil.groupby('Crop_Type')['Fertilizer_Used(tons)']
max_fertilizer_per_crop = highest_fertilizer_per_crop_soil.loc[fertilizer_by_crop.idxmax()]

print("Soil Type with Highest Fertilizer Used for Each Crop Type:")
display(max_fertilizer_per_crop)

# Per crop type, pick the row (soil type) with the smallest mean fertilizer use
min_fertilizer_per_crop = highest_fertilizer_per_crop_soil.loc[fertilizer_by_crop.idxmin()]

print("\nSoil Type with Lowest Fertilizer Used for Each Crop Type:")
display(min_fertilizer_per_crop)

### Summary Observations:

- **Loamy Soil**: While it often requires high fertilizer input due to its balanced texture and nutrient-holding capacity, it also supports high yields, making the investment in fertilizers worthwhile for many crops.
- **Sandy Soil**: Consistently appears with the lowest fertilizer usage, possibly due to its poor retention capabilities, leading to limited absorption and requiring different nutrient management practices.
- **Soil-Crop Specificity**: Fertilizer usage varies significantly depending on the crop-soil combination, reflecting complex interactions between soil properties, nutrient needs, and crop characteristics.

These insights can help in optimizing fertilizer management practices to reduce costs and improve yields by targeting specific soil and crop combinations.

In [None]:
# Mean water usage for every (crop, soil) combination (averages, despite the name).
highest_water_usage_per_crop_soil = (
    df.groupby(['Crop_Type', 'Soil_Type'])['Water_Usage(cubic meters)']
    .mean()
    .reset_index()
)

# Per crop: the soil rows with the largest and the smallest mean water usage.
water_grouped_by_crop = highest_water_usage_per_crop_soil.groupby('Crop_Type')['Water_Usage(cubic meters)']
max_water_usage_per_crop = highest_water_usage_per_crop_soil.loc[water_grouped_by_crop.idxmax()]
min_water_usage_per_crop = highest_water_usage_per_crop_soil.loc[water_grouped_by_crop.idxmin()]

# Show both extremes
print("Soil Type with Highest Water Usage for Each Crop Type:")
display(max_water_usage_per_crop)

print("\nSoil Type with Lowest Water Usage for Each Crop Type:")
display(min_water_usage_per_crop)

### Summary Observations:

- **Loamy Soil** often has the highest water usage for multiple crops, reflecting its versatility and high crop yields but also necessitating significant water input.
- **Silty Soil** consistently shows the lowest water usage for several crops due to its moisture retention capabilities, reducing the need for excessive irrigation.
- **Sandy Soil** demands high water usage for certain crops like Potato and Rice due to its quick drainage properties.
- The variability in water usage by soil type emphasizes the importance of choosing the right irrigation strategy based on soil properties and crop needs for optimized water management and crop yield.

These insights can be valuable for improving water efficiency and resource management in agricultural practices.

In [None]:
# Mean water usage for every (crop, soil, irrigation) combination.
highest_water_usage_per_crop = (
    df.groupby(['Crop_Type', 'Soil_Type', 'Irrigation_Type'])['Water_Usage(cubic meters)']
    .mean()
    .reset_index()
)

# Per crop: the soil/irrigation rows with the largest and smallest mean water usage.
# NOTE: this rebinds max_water_usage_per_crop / min_water_usage_per_crop, which the
# soil-only cell above also assigns.
water_by_crop = highest_water_usage_per_crop.groupby('Crop_Type')['Water_Usage(cubic meters)']
max_water_usage_per_crop = highest_water_usage_per_crop.loc[water_by_crop.idxmax()]
min_water_usage_per_crop = highest_water_usage_per_crop.loc[water_by_crop.idxmin()]

# Show both extremes
print("Soil and Irrigation Type with Highest Water Usage for Each Crop Type:")
display(max_water_usage_per_crop)

print("\nSoil and Irrigation Type with Lowest Water Usage for Each Crop Type:")
display(min_water_usage_per_crop)


### Summary Observations:

- **Loamy Soil** with various irrigation methods often results in high water usage for many crops, highlighting its moisture retention but also its need for sustained watering.
- **Sandy Soil** tends to have both high and low water usage depending on the crop and irrigation type, reflecting its variability in water retention.
- **Drip and Sprinkler irrigation systems** show both high and low water usage, indicating that while efficient, the crop and soil properties greatly influence water demand.
- **Flood irrigation** often leads to high water usage, particularly for water-intensive crops like Cotton and Rice, emphasizing the inefficiency of this method.

These insights emphasize the critical interaction between soil, irrigation methods, and water usage efficiency, providing a guide for optimizing water management practices.

In [None]:
# Mean yield for every (crop, season, irrigation) combination.
highest_yield_per_crop_season_irrigation = (
    df.groupby(['Crop_Type', 'Season', 'Irrigation_Type'])['Yield(tons)']
    .mean()
    .reset_index()
)

# Per crop: the season/irrigation rows with the largest and smallest mean yield.
# NOTE: this rebinds max_yield_per_crop / min_yield_per_crop, which the soil-type
# yield cell earlier in the notebook also assigns.
season_yield_by_crop = highest_yield_per_crop_season_irrigation.groupby('Crop_Type')['Yield(tons)']
max_yield_per_crop = highest_yield_per_crop_season_irrigation.loc[season_yield_by_crop.idxmax()]
min_yield_per_crop = highest_yield_per_crop_season_irrigation.loc[season_yield_by_crop.idxmin()]

# Show both extremes
print("Season and Irrigation Type with Highest Crop Yield for Each Crop Type:")
display(max_yield_per_crop)

print("\nSeason and Irrigation Type with Lowest Crop Yield for Each Crop Type:")
display(min_yield_per_crop)


### Summary Observations:

- **Drip irrigation** often appears in both high and low-yield scenarios, emphasizing the importance of precise water management for maximizing yields.
- **Flood irrigation** is associated with both high and low yields, particularly for crops like Rice, Tomato, and Sugarcane, highlighting its mixed effectiveness depending on crop and season.
- **Seasonal Variation**: Many crops achieve their highest yields in the **Zaid and Rabi seasons**, but low yields can also occur during these periods depending on irrigation practices.
- **Manual and Rain-fed irrigation** methods appear effective for certain crops, reflecting the importance of natural rainfall and human intervention in optimizing yields.

These insights can help guide irrigation and crop management strategies based on season-specific needs and crop characteristics.

In [None]:
# Mean pesticide use for every (crop, season) combination (averages, despite the name).
highest_pesticide_usage_per_crop_season = (
    df.groupby(['Crop_Type', 'Season'])['Pesticide_Used(kg)']
    .mean()
    .reset_index()
)

# Per crop: the season rows with the largest and smallest mean pesticide use.
pesticide_grouped_by_crop = highest_pesticide_usage_per_crop_season.groupby('Crop_Type')['Pesticide_Used(kg)']
max_pesticide_usage_per_crop = highest_pesticide_usage_per_crop_season.loc[pesticide_grouped_by_crop.idxmax()]
min_pesticide_usage_per_crop = highest_pesticide_usage_per_crop_season.loc[pesticide_grouped_by_crop.idxmin()]

# Show both extremes
print("Season with Highest Pesticide Used for Each Crop Type:")
display(max_pesticide_usage_per_crop)

print("\nSeason with Lowest Pesticide Used for Each Crop Type:")
display(min_pesticide_usage_per_crop)

### Summary Observations:

- **Kharif and Rabi Seasons**: These seasons generally have higher pesticide usage for many crops, reflecting increased pest and disease pressures due to climatic conditions such as humidity and temperature.
- **Zaid Season**: This season often shows lower pesticide usage, indicating reduced pest activity due to drier conditions.
- **Crop-Specific Patterns**: Crops like **Tomato, Cotton, and Rice** tend to have high pesticide requirements during specific seasons, highlighting their vulnerability to pests and diseases during these periods.
- **Efficient Pest Management**: Understanding seasonal patterns of pesticide usage can help optimize pest management strategies, reducing costs and improving crop health.

In [None]:
def create_displots(data, columns, col, hue, palette='Set2'):
    """Draw one faceted histogram-with-KDE figure per entry in ``columns``.

    Parameters
    ----------
    data : dataset forwarded to ``sns.displot``.
    columns : iterable of column names; each one is plotted on the x-axis of
        its own figure.
    col : column name used to split every figure into side-by-side facets.
    hue : column name used to colour the distributions.
    palette : palette forwarded to ``sns.displot`` (default ``'Set2'``).

    Returns
    -------
    None. Each figure is shown with ``plt.show()`` as it is produced.
    """
    for metric in columns:
        facet_grid = sns.displot(
            data=data,
            x=metric,
            col=col,
            hue=hue,
            kde=True,
            palette=palette,
        )
        # y > 1 puts the figure-level title above the row of facets.
        facet_grid.fig.suptitle(f'{metric} Distribution by {col}', fontsize=14, y=1.10)
        plt.tight_layout()
        plt.show()

# Metrics whose distributions are compared across soil types
columns_to_plot = [
    "Yield(tons)",
    "Fertilizer_Used(tons)",
    "Pesticide_Used(kg)",
    "Water_Usage(cubic meters)",
]
create_displots(data=df, columns=columns_to_plot, col='Soil_Type', hue='Soil_Type')


A summary of how `Soil_Type` affects the spread and density of each metric:

### 1. **Yield (tons)**:
   - **Loamy Soil**: Generally exhibits a wide distribution with a higher density at moderate-to-high yield levels, indicating that crops grown in loamy soil often achieve higher yields, but with some variability.
   - **Sandy and Silty Soils**: Show distributions with narrower peaks, suggesting more consistent yields within a specific range, but often at slightly lower yield levels compared to loamy soil.
   - **Clay Soil**: Typically displays a moderate distribution with yields clustered around mid-range values, reflecting moderate productivity and some yield variability.
   - **Peaty Soil**: Shows a more concentrated distribution at lower yields, indicating limited productivity compared to other soil types, with fewer instances of high yields.

### 2. **Fertilizer Used (tons)**:
   - **Loamy Soil**: Has a broader distribution, indicating higher variability and generally higher fertilizer application rates. This suggests a more intensive fertilization strategy is employed, possibly to maximize crop yield potential.
   - **Sandy Soil**: Exhibits a narrower distribution with a focus on lower fertilizer usage, likely due to limited nutrient-holding capacity and faster drainage, necessitating careful fertilizer management.
   - **Clay and Silty Soils**: Tend to have moderate fertilizer usage with a concentrated distribution. Clay’s nutrient-holding capabilities might reduce the need for frequent application, whereas silty soil balances nutrient retention and water management.
   - **Peaty Soil**: Shows lower overall fertilizer usage, possibly due to its high organic matter content, which can naturally enrich the soil and reduce the need for external inputs.

### 3. **Pesticide Used (kg)**:
   - **Loamy Soil**: Displays a relatively broad distribution with higher pesticide usage, indicating a potentially higher susceptibility to pests or a strategy of proactive pest management to protect high-value crops.
   - **Sandy Soil**: Shows a narrower distribution with moderate pesticide use, reflecting fewer instances of severe pest pressures or a tailored pest control approach.
   - **Clay Soil**: Tends to have a moderate distribution but with slightly lower pesticide usage, possibly due to less susceptibility to certain pests in dense, moisture-retaining conditions.
   - **Peaty Soil**: Exhibits lower pesticide usage, which may be attributed to its moisture-rich and acidic environment that might naturally deter some pests.

### 4. **Water Usage (cubic meters)**:
   - **Loamy Soil**: Demonstrates a broad distribution with high water usage, reflecting the high productivity potential but also the water demands associated with maximizing yield in this soil type.
   - **Sandy Soil**: Shows a wide range but generally indicates high water usage due to its quick-draining properties, requiring more frequent and larger water inputs to maintain crop health.
   - **Clay Soil**: Typically has a narrower distribution with moderate-to-high water usage. The water retention properties of clay soil may reduce the need for frequent watering but lead to more concentrated irrigation events.
   - **Silty Soil**: Exhibits moderate water usage with a relatively consistent distribution, reflecting balanced moisture retention and moderate irrigation needs.
   - **Peaty Soil**: Shows lower water usage, which aligns with its natural moisture-retention capabilities, reducing the need for heavy irrigation.

### Summary Observations:
- **Loamy Soil**: Generally supports higher productivity and shows a broad range of input (fertilizer, pesticide, water) usage, indicating intensive management practices for maximizing yields.
- **Sandy Soil**: Requires more careful management of inputs, particularly water, due to quick drainage, and shows moderate pesticide and fertilizer usage.
- **Clay Soil**: Offers moderate-to-high productivity with balanced input usage, reflecting its water and nutrient retention properties.
- **Peaty Soil**: Shows lower input usage and yields, highlighting its natural fertility but potential limitations in terms of productivity.

In [None]:
def create_displots_by_season(data, columns, col, hue, palette='Set2'):
    """Draw one faceted histogram-with-KDE figure per entry in ``columns``.

    This function previously duplicated ``create_displots`` line for line; it
    now delegates to it so the two cannot drift apart. The name is kept for
    callers that already use it. Nothing here is season-specific: the facet
    and colour columns are whatever ``col`` and ``hue`` name.

    Parameters
    ----------
    data : dataset forwarded to ``sns.displot``.
    columns : iterable of column names; each one gets its own figure.
    col : column name used to split every figure into facets.
    hue : column name used to colour the distributions.
    palette : palette forwarded to ``sns.displot`` (default ``'Set2'``).

    Returns
    -------
    None. Figures are shown as they are produced.
    """
    # Requires the cell defining create_displots to have been run first.
    create_displots(data, columns, col=col, hue=hue, palette=palette)

# Metrics whose distributions are compared across seasons
columns_to_plot = [
    "Fertilizer_Used(tons)",
    "Yield(tons)",
    "Pesticide_Used(kg)",
    "Water_Usage(cubic meters)",
]
create_displots_by_season(data=df, columns=columns_to_plot, col='Season', hue='Season')


Insights based on the distribution plots for each metric (`Fertilizer_Used(tons)`, `Yield(tons)`, `Pesticide_Used(kg)`, and `Water_Usage(cubic meters)`) by `Season`:

### 1. **Fertilizer Used (tons) Distribution by Season**:
   - **Kharif Season**: The distribution shows a moderate to high density of fertilizer usage, indicating that this season often requires substantial fertilizer inputs, likely due to high crop productivity.
   - **Rabi Season**: Displays a wide range with a moderate density, reflecting varied fertilization needs across crops grown in this season.
   - **Zaid Season**: Shows a slightly narrower distribution compared to Kharif and Rabi, indicating more consistent but often lower fertilizer usage.

### 2. **Yield (tons) Distribution by Season**:
   - **Kharif Season**: The distribution reveals higher densities around moderate to high yields, suggesting this is a productive season for many crops due to favorable weather conditions (e.g., monsoons).
   - **Rabi Season**: Shows a relatively concentrated distribution with moderate yields, reflecting stable but generally lower productivity compared to Kharif.
   - **Zaid Season**: Displays more variability with some lower yields, indicating this short growing season may have less favorable conditions for many crops.

### 3. **Pesticide Used (kg) Distribution by Season**:
   - **Kharif Season**: Higher pesticide usage is apparent, likely due to increased pest activity associated with humid, rainy weather.
   - **Rabi Season**: Shows a moderate density of pesticide usage, reflecting controlled but still notable pest management needs during cooler months.
   - **Zaid Season**: Displays a narrower distribution with generally lower pesticide usage, indicating reduced pest pressure during this dry, short growing season.

### 4. **Water Usage (cubic meters) Distribution by Season**:
   - **Kharif Season**: Shows high water usage, aligning with the needs of water-demanding crops grown during monsoon months.
   - **Rabi Season**: Displays a wide range with moderate water usage, reflecting irrigation needs due to the dry winter months.
   - **Zaid Season**: Exhibits lower water usage overall, consistent with shorter crop cycles and more water-efficient practices during this season.

### Summary Observations:
- **Kharif Season** generally sees higher inputs (fertilizer, water, and pesticides) and yields due to favorable growing conditions but comes with increased pest and disease risks.
- **Rabi Season** demonstrates a more balanced but moderate level of inputs and yields, reflecting stable but lower productivity relative to Kharif.
- **Zaid Season** shows lower inputs and yields, indicating a focus on less water-intensive, short-duration crops with reduced pest pressure.

In [None]:
def create_bar_plots(data, x_column, y_columns, estimator=sum, ci=None, palette='muted'):
    """Draw one aggregated bar chart per entry in ``y_columns``.

    Parameters
    ----------
    data : dataset forwarded to ``sns.barplot``.
    x_column : column name placed on the x-axis (one bar per distinct value).
    y_columns : iterable of column names; each one gets its own figure.
    estimator : aggregation forwarded to ``sns.barplot`` (default: built-in
        ``sum``, i.e. totals).
    ci : forwarded unchanged to ``sns.barplot``.
    palette : palette forwarded to ``sns.barplot`` (default ``'muted'``).

    Returns
    -------
    None. Each figure is shown with ``plt.show()`` as it is produced.

    Notes
    -----
    The title and y-label prefix follows ``estimator``. It is "Total" for a
    sum (the default, so existing output is unchanged), the capitalised
    function name otherwise (e.g. "Mean"), and "Aggregated" when no usable
    name is available (e.g. a lambda). Previously the labels always read
    "Total", which mislabelled any non-sum estimator.
    """
    # Work out a human-readable name for the aggregation once, up front.
    if isinstance(estimator, str):
        estimator_name = estimator
    else:
        estimator_name = str(getattr(estimator, '__name__', ''))

    if estimator_name in ('sum', 'nansum'):
        label_prefix = 'Total'
    elif estimator_name.isidentifier():
        label_prefix = estimator_name.capitalize()
    else:
        # Lambdas ('<lambda>') and nameless callables land here.
        label_prefix = 'Aggregated'

    for y_column in y_columns:
        plt.figure(figsize=(8, 5))
        sns.barplot(data=data, x=x_column, y=y_column, ci=ci, estimator=estimator, palette=palette)
        plt.title(f'{label_prefix} {y_column} by {x_column}')
        plt.xlabel(x_column)
        plt.ylabel(f'{label_prefix} {y_column}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Metrics to total per irrigation type (one figure each)
y_columns = [
    'Yield(tons)',
    'Fertilizer_Used(tons)',
    'Pesticide_Used(kg)',
    'Water_Usage(cubic meters)',
]
create_bar_plots(data=df, x_column='Irrigation_Type', y_columns=y_columns)


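A summary of how the total values of each metric (`Yield(tons)`, `Fertilizer_Used(tons)`, `Pesticide_Used(kg)`, and `Water_Usage(cubic meters)`) vary by `Irrigation_Type`. Because each bar is a sum over farms, a total reflects both the per-farm values and the number of farms using that irrigation type: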

### 1. **Total Crop Yield by Irrigation Type**:
   - **Drip Irrigation**: Tends to have a high total yield, which suggests that controlled water delivery through drip systems can significantly enhance crop productivity.
   - **Flood Irrigation**: Also shows a high total yield, which is consistent with water-demanding crops like rice. However, the potential for over-irrigation and inefficient water use may be a concern.
   - **Sprinkler and Manual Irrigation**: Generally have moderate total yields compared to other irrigation types.
   - **Rain-fed**: Typically shows the lowest total yield, indicating that crops relying solely on rainfall might not perform as well due to limited water control.

### 2. **Total Fertilizer Used by Irrigation Type**:
   - **Flood Irrigation**: Has the highest total fertilizer usage, possibly because crops under flood irrigation have higher nutrient demands or suffer nutrient losses due to leaching.
   - **Drip and Sprinkler Irrigation**: Show moderate fertilizer usage. Drip irrigation, in particular, allows for more efficient fertilizer application directly to the root zone, potentially reducing overall usage.
   - **Manual and Rain-fed Irrigation**: Generally use less fertilizer, reflecting either reduced input intensity or limited need for high fertilization under such conditions.

### 3. **Total Pesticide Usage by Irrigation Type**:
   - **Sprinkler and Drip Irrigation**: Show relatively high pesticide usage, indicating that controlled irrigation might create conditions favorable for pests, necessitating more pesticide applications.
   - **Flood Irrigation**: Also exhibits a high total pesticide usage, which can be attributed to humid and waterlogged conditions that encourage pest growth.
   - **Manual and Rain-fed Irrigation**: Have lower pesticide usage, reflecting either reduced pest pressures or more traditional, less intensive pest control practices.

### 4. **Total Water Usage by Irrigation Type**:
   - **Flood Irrigation**: As expected, shows the highest total water usage, reflecting the inefficiencies of this method and its high water demand.
   - **Sprinkler and Drip Irrigation**: Show significantly lower water usage compared to flood irrigation, indicating more water-efficient methods. Drip irrigation, in particular, demonstrates precise water delivery, reducing overall water use.
   - **Manual and Rain-fed Irrigation**: Have the lowest total water usage. Manual irrigation often relies on minimal water application, while rain-fed depends on natural rainfall, leading to limited water input.

### Overall Observations:
- **Efficiency of Drip Irrigation**: Drip irrigation consistently shows strong performance in terms of yield while managing moderate levels of fertilizer and water use, highlighting its efficiency and potential for sustainable agriculture.
- **High Input Demands of Flood Irrigation**: While flood irrigation achieves high yields, it comes at the cost of high water and fertilizer usage, indicating potential inefficiencies and environmental impacts.
- **Lower Inputs for Rain-fed and Manual Irrigation**: These irrigation methods generally show lower values for all metrics, reflecting less intensive and more traditional agricultural practices, but they often result in lower yields.

In [None]:
# Total yield per crop type, one bar per season (bars are sums over farms)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Crop_Type',
    y='Yield(tons)',
    hue='Season',
    estimator=sum,
    ci=None,
    palette='muted',
    ax=ax,
)
ax.set_title('Crop Yield by Season for Each Crop Type')
ax.set_xlabel('Crop Type')
ax.set_ylabel('Total Yield (tons)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Season')
fig.tight_layout()
plt.show()


In [None]:
# Total fertilizer used per crop type, one bar per season (bars are sums over farms)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Crop_Type',
    y='Fertilizer_Used(tons)',
    hue='Season',
    estimator=sum,
    ci=None,
    palette='muted',
    ax=ax,
)
ax.set_title('Total Fertilizer Usage by Season for Each Crop Type')
ax.set_xlabel('Crop Type')
ax.set_ylabel('Total Fertilizer Used (tons)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Season')
fig.tight_layout()
plt.show()


In [None]:
# Total pesticide used per crop type, one bar per season.
# This is a bar plot of sums over farms, not a count plot.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Crop_Type',
    y='Pesticide_Used(kg)',
    hue='Season',
    estimator=sum,
    ci=None,
    palette='muted',
    ax=ax,
)
ax.set_title('Total Pesticide Usage by Season for Each Crop Type')
ax.set_xlabel('Crop Type')
ax.set_ylabel('Total Pesticide Used (kg)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Season')
fig.tight_layout()
plt.show()


In [None]:
# Total water usage per crop type, one bar per season (bars are sums over farms)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df,
    x='Crop_Type',
    y='Water_Usage(cubic meters)',
    hue='Season',
    estimator=sum,
    ci=None,
    palette='muted',
    ax=ax,
)
ax.set_title('Total Water Usage by Season for Each Crop Type')
ax.set_xlabel('Crop Type')
ax.set_ylabel('Total Water Usage (cubic meters)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Season')
fig.tight_layout()
plt.show()


A summary of how each metric (`Yield(tons)`, `Fertilizer_Used(tons)`, `Pesticide_Used(kg)`, and `Water_Usage(cubic meters)`) varies by crop and season, with a focus on highlighting seasonal trends and crop-specific resource use:

### 1. **Yield (tons) by Crop and Season**:
   - **Kharif Season**: Generally associated with high yields for crops like Rice, Cotton, and Maize, reflecting favorable monsoon conditions and abundant water availability.
   - **Rabi Season**: Moderate yields are observed for crops such as Wheat, Barley, and some vegetables (like Carrot). This season benefits from controlled irrigation and cooler temperatures.
   - **Zaid Season**: This is a shorter growing season and often results in lower yields across most crops, indicating less favorable conditions. However, some crops like Tomato and certain vegetables still perform relatively well.

### 2. **Fertilizer Used (tons) by Crop and Season**:
   - **Kharif Season**: High fertilizer usage is seen for crops like Rice and Cotton, indicating intensive nutrient management practices during this productive growing season.
   - **Rabi Season**: Moderate levels of fertilizer are used for crops such as Wheat and Barley, reflecting stable but less intensive nutrient management compared to Kharif.
   - **Zaid Season**: Generally shows lower fertilizer usage across crops, which aligns with the shorter duration and often less resource-intensive nature of this season.

### 3. **Pesticide Used (kg) by Crop and Season**:
   - **Kharif Season**: High pesticide usage is observed, especially for crops like Rice, Cotton, and Sugarcane, due to increased pest and disease pressure associated with warm, humid conditions.
   - **Rabi Season**: Moderate pesticide application is used, reflecting cooler temperatures and potentially lower pest pressures for crops like Wheat and Barley.
   - **Zaid Season**: Displays the lowest pesticide usage across crops, likely due to the dry conditions reducing pest proliferation.

### 4. **Water Usage (cubic meters) by Crop and Season**:
   - **Kharif Season**: Predictably has the highest water usage, particularly for water-intensive crops such as Rice and Sugarcane. This reflects the reliance on monsoon rains and often extensive irrigation systems to support crop growth.
   - **Rabi Season**: Water usage is moderate but significant, as irrigation is necessary due to the dry winter months. Crops like Wheat and some vegetables require consistent watering to maintain productivity.
   - **Zaid Season**: Shows the lowest overall water usage, which aligns with the shorter duration and generally more water-efficient practices during this season.

### Summary Observations:
- **Kharif Season**: Characterized by high inputs (fertilizer, water, and pesticide) and high yields, reflecting its importance as the main growing season with favorable weather conditions. However, it also brings higher resource demands and pest challenges.
- **Rabi Season**: Reflects moderate inputs and stable productivity, often with controlled irrigation and more predictable growing conditions, leading to efficient resource use.
- **Zaid Season**: Shows lower inputs and yields across most crops, highlighting its secondary importance and often more conservative use of resources.

### Crop-Specific Insights:
- **Rice and Cotton**: High yields and high resource demands, particularly during the Kharif season.
- **Wheat and Barley**: Moderate yields and resource usage during the Rabi season, reflecting their stable, predictable growing needs.
- **Vegetables (e.g., Tomato, Carrot)**: Often require targeted fertilizer and pesticide applications but can yield well in both Rabi and Zaid seasons.

In [None]:
# Numeric columns compared pairwise (scatter below the diagonal, KDE on it)
columns_to_pairplot = [
    'Farm_Area(acres)',
    'Fertilizer_Used(tons)',
    'Pesticide_Used(kg)',
    'Yield(tons)',
    'Water_Usage(cubic meters)',
]

pair_grid = sns.pairplot(
    df[columns_to_pairplot],
    diag_kind='kde',
    corner=True,
    plot_kws={'alpha': 0.6},
)
# y > 1 lifts the figure-level title above the grid.
pair_grid.fig.suptitle('Pair Plot for Selected Columns', y=1.02)
plt.show()


# <span style="color:transparent;">5. Data Preprocessing</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">5. Data Preprocessing</h1>
</div>

In [None]:
def encode_categorical_columns(df, columns, label_columns=('Farm_ID',)):
    """Return a copy of ``df`` with the given categorical columns encoded numerically.

    Columns listed in ``label_columns`` are replaced in place by integer codes
    assigned in sorted order of their unique values (0, 1, 2, ...). Every other
    column in ``columns`` is replaced by one-hot indicator columns named
    ``"<column>_<category>"``; the first category (in sorted order) is dropped
    so that k categories produce k - 1 indicator columns, which are appended at
    the end of the frame.

    The encoding is done with pandas so that the result keeps the index of
    ``df`` (the indicator columns are aligned row by row even when the index is
    not a default ``RangeIndex``) and does not depend on the scikit-learn
    version installed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of str
        Names of the categorical columns to encode.
    label_columns : collection of str, optional
        Columns from ``columns`` that are label-encoded instead of one-hot
        encoded. Defaults to ``('Farm_ID',)``.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``. Label codes are integers (missing values get
        the code -1); indicator columns are floats holding 0.0 / 1.0.
    """
    df_encoded = df.copy()

    for col in columns:
        if col in label_columns:
            # sort=True numbers the categories in sorted order of their values
            codes, _ = pd.factorize(df_encoded[col], sort=True)
            df_encoded[col] = codes
        else:
            # get_dummies keeps the original index, so the join below is an
            # exact row-by-row alignment.
            indicators = pd.get_dummies(
                df_encoded[col], prefix=col, drop_first=True, dtype=float
            )
            # Swap the original column for its indicator columns
            df_encoded = df_encoded.drop(columns=col).join(indicators)

    return df_encoded

# Columns that hold categories rather than measurements
categorical_columns = [
    'Farm_ID',
    'Crop_Type',
    'Irrigation_Type',
    'Soil_Type',
    'Season',
]
df_encoded = encode_categorical_columns(df=df, columns=categorical_columns)

# Preview the encoded frame
df_encoded.head()


# <span style="color:transparent;">6. Correlation Heatmap</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">6. Correlation Heatmap</h1>
</div>

In [None]:
# Pairwise correlations between all encoded columns
corr_matrix = df_encoded.corr()

# Annotated heatmap: each cell shows the coefficient rounded to two decimals
plt.figure(figsize=(16, 12))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

### Key Observations:

1. **Correlation with `Yield(tons)`**:
   - **`Farm_Area(acres)`**: There is a positive correlation with yield (0.15). Larger farm areas tend to show a slight increase in yield, indicating that farm size might contribute to productivity, but the relationship is weak.
   - **`Fertilizer_Used(tons)`**: The correlation is positive but weak (0.045). This suggests that while fertilizer contributes to yield improvement, it is not a dominant factor on its own.
   - **`Water_Usage(cubic meters)`**: There is a positive correlation (0.11), indicating that higher water usage is generally associated with higher yields, as water availability is crucial for crop productivity.
   - **Crop Types**: Some specific crop types like `Crop_Type_Carrot` (0.21) and `Crop_Type_Tomato` (0.18) exhibit positive correlations with yield, indicating that these crops may have higher yield potential.

2. **Relationship Among Inputs**:
   - **`Fertilizer_Used(tons)` and `Pesticide_Used(kg)`**: There is a very weak negative correlation (-0.045). This suggests that the two inputs are not directly correlated, and their application may vary based on crop-specific needs.
   - **`Fertilizer_Used(tons)` and `Water_Usage(cubic meters)`**: The correlation is essentially zero (0.012), so fertilizer and water usage show no meaningful linear relationship in this dataset, even though effective fertilization often requires adequate water to maximize nutrient uptake.

3. **`Farm_Area(acres)`**:
   - **`Farm_Area(acres)` and `Water_Usage(cubic meters)`**: A weak negative correlation (-0.04) suggests that larger farms do not necessarily use more water in total, reflecting possible variations in irrigation practices.
   - **`Farm_Area(acres)` and `Fertilizer_Used(tons)`**: A weak negative correlation (-0.13) shows that larger farms might use slightly less fertilizer in total, perhaps reflecting efficiency measures or more extensive cultivation practices.

4. **Impact of Soil Type and Season**:
   - **`Soil_Type_Loamy`**: Very weakly positively correlated with `Yield(tons)` (0.02). Loamy soil is generally considered favorable for higher yields due to its balanced moisture and nutrient retention capabilities, but a correlation this small does not by itself demonstrate that effect in this dataset.
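   - **`Season_Kharif` and `Yield(tons)`**: A positive correlation indicates that Kharif season crops generally have higher yields, possibly due to favorable weather and water availability. Note that the one-hot encoding drops the first category of each column, so if `Kharif` is the first season alphabetically it acts as the baseline and has no column of its own in the heatmap; in that case, verify this relationship against the `Season_Rabi` and `Season_Zaid` columns.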

5. **`Irrigation_Type` Effects**:
   - **`Irrigation_Type_Manual` and `Yield(tons)`**: Shows a weak positive correlation (0.19), suggesting that manually irrigated farms may focus on crops with higher yields or have more tailored irrigation practices.
   - **`Irrigation_Type_Flood` and `Yield(tons)`**: Negative correlation (-0.27) suggests that flood irrigation may not always lead to the highest yields and could relate to inefficiencies or waterlogging.

6. **Crop-Specific Observations**:
   - **`Crop_Type_Cotton`** has a negative correlation with `Yield(tons)` (-0.19), reflecting its potentially lower average yield compared to other crops.
   - **`Crop_Type_Potato`** has a positive correlation with `Fertilizer_Used(tons)` (0.34), suggesting higher input requirements.

### Summary Insights:
- **Farm Size Influence**: While larger farm areas have a weak positive correlation with yield, resource inputs do not scale linearly with size, indicating diverse management practices.
- **Resource Optimization**: Fertilizer, water, and pesticide inputs have varying degrees of correlation with yield, emphasizing the need for crop-specific management strategies.
- **Soil and Season Importance**: Loamy soil and Kharif season tend to positively influence yields, while irrigation type and specific crop types have varying impacts.

In [None]:
# One-column table: correlation of every encoded column with the target,
# strongest positive correlation first.
target_variable = 'Yield(tons)'
target_correlation_table = (
    df_encoded.corr()
    .loc[:, [target_variable]]
    .sort_values(by=target_variable, ascending=False)
)

# Show the table as the cell output
target_correlation_table

### Summary Insights:
- **High-Yielding Crops**: Carrot and Tomato have the highest positive correlations with yield, indicating strong yield potential.
- **Irrigation Practices**: Manual and rain-fed irrigation methods show positive correlations, whereas flood irrigation negatively correlates with yields.
- **Soil Impact**: Silty soil correlates positively with yield, while sandy and peaty soils exhibit negative or near-neutral relationships.
- **Input Usage**: Fertilizer and water usage have weak positive correlations with yield, while higher pesticide usage may even reduce productivity.

# <span style="color:transparent;">7. Model Training and Evaluation</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">7. Model Training and Evaluation</h1>
</div>

In [None]:
# Target is the yield column; every other encoded column is used as a feature
y = df_encoded['Yield(tons)']
X = df_encoded.drop('Yield(tons)', axis=1)

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Candidate regressors, keyed by the display name used in the printed report.
# All stochastic models share the same seed so the comparison is reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    # `use_label_encoder` is a deprecated classifier-only option; it has no
    # meaning for a regressor and only triggers an "unused parameter" warning,
    # so it is not passed here.
    'XGBoost': XGBRegressor(random_state=42, eval_metric='rmse'),
    'LGBM': LGBMRegressor(verbose=-1, random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# model name -> {'MSE': ..., 'R2': ...}, both measured on the held-out test split
results = {}

# Fit every candidate on the training split and score it on the test split
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'MSE': mse, 'R2': r2}

    print(f"{model_name} - MSE: {mse:.2f}, R2: {r2:.2f}")


In [None]:
# The winner is the model with the lowest test MSE (the first one wins a tie)
best_model = min(results.items(), key=lambda entry: entry[1]['MSE'])[0]
print(f"The best model is: {best_model} with MSE: {results[best_model]['MSE']:.2f} and R2: {results[best_model]['R2']:.2f}")


In [None]:
# Only the tree-based models expose `feature_importances_`; for any other
# winner this cell draws nothing.
if best_model in ('Random Forest', 'XGBoost', 'LGBM', 'CatBoost'):
    best_model_instance = models[best_model]
    feature_names = X.columns
    feature_importances = best_model_instance.feature_importances_

    # Ascending order, so the most important feature ends up at the top of the chart
    sorted_idx = np.argsort(feature_importances)
    bar_positions = np.arange(len(sorted_idx))

    plt.figure(figsize=(10, 8))
    plt.barh(
        bar_positions,
        feature_importances[sorted_idx],
        align='center',
        color='darkcyan',
    )
    plt.yticks(bar_positions, np.array(feature_names)[sorted_idx])
    plt.xlabel('Importance')
    plt.title(f'Feature Importance for {best_model}')
    plt.show()

In [None]:
# Reuse the already-fitted winner chosen above instead of hard-coding and
# retraining a specific model, so the plot always matches `best_model`.
best_model_instance = models[best_model]

# Predictions of the winning model on the held-out test split
y_pred = best_model_instance.predict(X_test)

# Residual = actual - predicted; points should scatter evenly around zero
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, alpha=0.7, color='darkcyan')
plt.axhline(y=0, color='red', linestyle='--')  # zero-error reference line
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title(f'Residual Plot for {best_model}')
plt.show()

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #008b8b; padding: 10px; background-color: #7fffd4; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #008b8b; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">🚀 If you found this notebook helpful, please consider giving it an upvote! 👍</h1>
    <p style="color: #008b8b; font-size: 18px; text-align: center;">Your support motivates me to create more useful content like this, and it helps others discover the notebook too! 🙌</p>
    <p style="color: #008b8b; font-size: 18px; text-align: center;">Thank you for your time, and I hope this notebook brings value to your data science journey! 💡😊</p>
</div>