## This notebook prepares the datasets needed for this project.

## Relevant Datasets

Here are the datasets that have been prepared for this project:

- [Dataset 1: Crop Production Data](http://localhost:8888/edit/APY_final_dataset.csv)
- [Dataset 2: Rainfall Data](http://localhost:8888/edit/rainfall_dataset.csv)
- [Dataset 3: Soil Erosion Data](http://localhost:8888/edit/soil_erosion_data.csv)

All of the datasets are in CSV format.

### The Python code below prepares all of the relevant datasets.

In [16]:
# This cell reads, cleans, and preprocesses the crop production data from a CSV file:
# it renames the measurement columns, normalises state names, drops rows without a
# crop, and fills missing production values with the column median.
# The initial dataset, "APY.csv", is taken from Kaggle.

import pandas as pd
csv_file = "APY.csv"
df = pd.read_csv(csv_file)
df.rename(columns={"Production": "Production(Tonnes)", "Area": "Area(Hectares)", "Yield": "Yield(Tonnes/Hectare)"}, inplace=True)
# Snapshot of the raw state names, taken before any of the normalisation below.
unique_states = df["State"].unique()
# Unify the lowercase conjunction first so the lookup keys below only need the
# ' & ' spelling. regex=False makes the literal match explicit on every pandas version.
df['State'] = df['State'].str.replace(' and ', ' & ', regex=False)
state_replacements = {
    'Andaman & Nicobar Island': 'Andaman & Nicobar',
    'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman & Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'THE DADRA AND NAGAR HAVELI': 'Dadra and Nagar Haveli and Daman and Diu',
    'Laddak': 'Ladakh',
    'CHANDIGARH' :'Chandigarh'
}

df['State'] = df['State'].replace(state_replacements)
# The ' and ' -> ' & ' pass above turns an already-merged territory name into
# "Dadra & Nagar Haveli & Daman & Diu". Map it back to the exact spelling that
# state_replacements uses, so all records of the territory share one state name
# (the previous target, "Dadra & Nagar Haveli and Daman & Diu", was a third variant).
old_state_name = "Dadra & Nagar Haveli & Daman & Diu"
new_state_name = "Dadra and Nagar Haveli and Daman and Diu"

df.loc[df["State"] == old_state_name, "State"] = new_state_name

# Rows without a crop name cannot be analysed per crop, so they are removed.
df.dropna(subset=['Crop'], inplace=True)

production_median = df['Production(Tonnes)'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form is chained assignment, which is deprecated
# and does not update df when pandas Copy-on-Write is active.
df['Production(Tonnes)'] = df['Production(Tonnes)'].fillna(production_median)

new_csv_file = "modified_data_APY.csv"

df.to_csv(new_csv_file, index=False)

In [17]:
# Attach a population figure to every crop record by matching the crop year
# against the population year.
# The initial dataset, "indian population new.csv", is taken from kaggle.

population_file = "indian population new.csv"
population_df = pd.read_csv(population_file)

# Keep the years 1997-2020 (both ends included) and only the two columns needed.
in_year_range = population_df["Year"].between(1997, 2020)
filtered_population_df = population_df[in_year_range]
selected_population_df = filtered_population_df[["Year", "Population"]]

original_file = "modified_data_APY.csv"
original_df = pd.read_csv(original_file)

# A left join keeps every crop row; the "Year" key brought in from the
# population side duplicates "Crop_Year" and is dropped straight away.
merged_df = (
    original_df
    .merge(selected_population_df, left_on="Crop_Year", right_on="Year", how="left")
    .drop(columns="Year")
)

main_csv_file = "APY_main_dataset.csv"
merged_df.to_csv(main_csv_file, index=False)


In [18]:
# This cell adds a column holding the most used irrigation practice in each state.
# The column is merged into the dataset "APY_main_dataset.csv" and the result is
# written to "APY_final_dataset.csv".

df2 = pd.read_csv("APY_main_dataset.csv")
# Two parallel lists: the n-th irrigation entry belongs to the n-th state.
data = {
    'State': [
        'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh',
        'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jharkhand',
        'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
        'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab', 'Rajasthan',
        'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh',
        'Uttarakhand', 'West Bengal', 'Andaman & Nicobar', 'Chandigarh',
        'Dadra and Nagar Haveli and Daman and Diu', 'Delhi', 'Jammu & Kashmir',
        'Ladakh', 'Puducherry'
    ],
    'Most_Used_Irrigation': [
        'Flood', 'Terrace', 'Flood', 'Flood', 'Drip', 'Flood', 'Drip', 'Flood',
        'Terrace', 'Flood', 'Drip', 'Drip', 'Flood', 'Drip', 'Flood', 'Terrace',
        'Terrace', 'Flood', 'Flood', 'Flood', 'Drip', 'Flood', 'Drip', 'Drip',
        'Flood', 'Flood', 'Flood', 'Flood', 'Flood', 'Drip', 'Drip', 'Flood',
        'Terrace', 'Drip', 'Drip'
    ]
}

irrigation_data_df = pd.DataFrame(data)

# The merge below is an inner join, so crop rows whose state is not listed in the
# lookup table are removed from the final dataset. Report those states instead of
# losing them silently; a name appearing here usually means a spelling mismatch.
unmatched_states = sorted(set(df2['State'].dropna()) - set(irrigation_data_df['State']))
if unmatched_states:
    print(f"Warning: rows dropped for states without irrigation data: {unmatched_states}")

merged_final_df = pd.merge(df2, irrigation_data_df, on='State', how='inner')
merged_final_df.to_csv("APY_final_dataset.csv", index=False)

In [1]:
# This cell loads, cleans, and prepares the rainfall data, producing a focused
# dataset (state, year, monsoon rainfall) for further use.
# The initial dataset, "rainfaLLIndia.csv", is taken from Kaggle.

rainfall_df = pd.read_csv("rainfaLLIndia.csv")
# NOTE(review): rename() ignores keys that are not present, so this relies on the
# source column being spelled exactly "subdivision" - confirm against the CSV header.
rainfall_df.rename(columns={"subdivision": "State"}, inplace=True)

# Keep the years 1997-2020, both ends included.
filtered_rainfall_df = rainfall_df[(rainfall_df["YEAR"] >= 1997) & (rainfall_df["YEAR"] <= 2020)]

# Rebind to the renamed frame instead of renaming in place: the filtered frame is a
# slice of rainfall_df, and modifying such a slice in place can raise pandas'
# SettingWithCopyWarning.
filtered_rainfall_df = filtered_rainfall_df.rename(columns={"JUN-SEP": "Monsoon_Rainfall", "YEAR": "Year"})

selected_columns = ["State", "Year", "Monsoon_Rainfall"]
new_dataset = filtered_rainfall_df.loc[:, selected_columns]

new_dataset.to_csv("rainfall_dataset.csv", index=False)

In [20]:
# Soil erosion area per state for the year 2010 (latest available data).
# The figures were collected from this website:
# "https://pib.gov.in/PressReleaseIframePage.aspx?PRID=1810912"
# NOTE(review): some labels here ('Orissa', 'Andhra Pradesh (including Telangana)')
# are spelled differently from the state names in the irrigation table - confirm
# before joining this table with the other datasets.

# One (state, area in thousand hectares) pair per row; None marks a missing figure.
_soil_erosion_records = [
    ('Andhra Pradesh (including Telangana)', 8093),
    ('Arunachal Pradesh', 666),
    ('Assam', 3248),
    ('Bihar', 851),
    ('Chhattisgarh', 3733),
    ('Delhi', 28),
    ('Goa', 1),
    ('Gujarat', 984),
    ('Haryana', 306),
    ('Himachal Pradesh', 982),
    ('Jammu & Kashmir', 1369),
    ('Jharkhand', 3219),
    ('Karnataka', 7522),
    ('Kerala', 490),
    ('Madhya Pradesh', 12262),
    ('Maharashtra', 8799),
    ('Manipur', 122),
    ('Meghalaya', 302),
    ('Mizoram', None),
    ('Nagaland', 46),
    ('Orissa', 2227),
    ('Punjab', 229),
    ('Rajasthan', 19029),
    ('Sikkim', 45),
    ('Tamil Nadu', 2308),
    ('Tripura', 109),
    ('Uttar Pradesh', 13075),
    ('Uttarakhand', 1018),
    ('West Bengal', 1332),
]

# Split the pairs back into the column-oriented dictionary the DataFrame is built from.
data = {
    'State': [state for state, _ in _soil_erosion_records],
    "Area ('000 ha)": [area for _, area in _soil_erosion_records],
}

df = pd.DataFrame(data)

output_file = "soil_erosion_data.csv"
df.to_csv(output_file, index=False)

In [1]:
# For every season, draw a bar chart of the ten crops with the largest total production.

import plotly.express as px

# Total production per (crop, season) pair, flattened back into ordinary columns.
season_production = (
    merged_final_df
    .groupby(['Crop', 'Season'])['Production(Tonnes)']
    .sum()
    .reset_index()
)
season_list = season_production['Season'].unique()
for season in season_list:
    in_season = season_production[season_production['Season'] == season]
    # Largest totals first, so the first ten rows are the top producers.
    ranked = in_season.sort_values('Production(Tonnes)', ascending=False)
    top_crops = ranked.head(10)
    fig = px.bar(
        top_crops,
        x='Crop',
        y='Production(Tonnes)',
        title=f'Highest Crop Production in {season}',
    )
    fig.show()