In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import geopandas as gpd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import json


In [None]:
# Load the dataset from mnrega_clean.csv (path is relative to the working directory).
csv_path = "mnrega_clean.csv"
df = pd.read_csv(csv_path)


In [None]:
# Display the loaded frame (pandas truncates long frames in the rendered output by default).
df

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Count the missing values in every column.
df.isnull().sum()

#### If any plot is not visible (no output), kindly restart the kernel and then run all cells.

## Descriptive Analysis

In [None]:
import matplotlib.pyplot as plt

# Total job-card applications per state, ordered from the largest total to the smallest.
applied_col = 'Households that applied for a job card'
state_data = (
    df.groupby('State')[applied_col]
    .sum()
    .reset_index()
    .sort_values(by=applied_col, ascending=False)
)

# Bar chart of the per-state totals.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(state_data['State'], state_data[applied_col])
ax.set_title('Distribution of Households that Applied for a Job Card by State')
ax.set_xlabel('State')
ax.set_ylabel(applied_col)
plt.xticks(rotation=45, ha='right')  # tilt the state names so they do not overlap
plt.show()



#### Job cards issued over the years across India

In [None]:
import matplotlib.pyplot as plt

# Nationwide job cards issued, totalled per year code.
yearly_data = df.groupby('Yearcode', as_index=False)['Job cards issued'].sum()

# Line plot of the yearly totals, one marker per year code.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(yearly_data['Yearcode'], yearly_data['Job cards issued'], marker='o')
ax.set_title('Job Cards Issued in India Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('No. of Job Cards Issued')
ax.grid(True)
plt.show()


In [None]:
import sweetviz as sv

# Run sweetviz's analysis over the whole frame and render the resulting report in the notebook.
report = sv.analyze(df)
report.show_notebook()

#### Total Job Cards Issued Over the Years in Jammu & Kashmir


In [None]:
selected_state = 'Jammu And Kashmir'

# Keep only the rows that belong to the chosen state.
state_data = df.loc[df['State'] == selected_state]

# Sum the job cards issued within each year code.
yearly_job_cards_issued = (
    state_data.groupby('Yearcode')['Job cards issued']
    .sum()
    .reset_index()
)

year_codes = yearly_job_cards_issued['Yearcode']
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(year_codes, yearly_job_cards_issued['Job cards issued'])
ax.set_title(f'Total Job Cards Issued Over the Years in {selected_state}')
ax.set_xlabel('Year')
ax.set_ylabel('Total Job Cards Issued')
# Put one tick under every bar, labelled with its own year code.
plt.xticks(year_codes, list(map(str, year_codes)))
plt.show()


#### Total Job Cards Issued Over the Years in Uttar Pradesh

In [None]:
selected_state = 'Uttar Pradesh'
is_selected_state = df['State'] == selected_state
state_data = df[is_selected_state]

# Yearly totals of job cards issued in the selected state.
yearly_job_cards_issued = state_data.groupby('Yearcode', as_index=False)['Job cards issued'].sum()

tick_positions = yearly_job_cards_issued['Yearcode']
tick_labels = [str(year) for year in tick_positions]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(tick_positions, yearly_job_cards_issued['Job cards issued'])
ax.set_title(f'Total Job Cards Issued Over the Years in {selected_state}')
ax.set_xlabel('Year')
ax.set_ylabel('Total Job Cards Issued')
plt.xticks(tick_positions, tick_labels)  # one labelled tick per year code
plt.show()


#### Total Job Cards Issued Over the Years in Bihar

In [None]:
selected_state = 'Bihar'

# Rows for the selected state only.
state_data = df.loc[df['State'].eq(selected_state)]

# Job cards issued in that state, summed per year code.
issued_per_year = state_data.groupby('Yearcode')['Job cards issued'].sum()
yearly_job_cards_issued = issued_per_year.reset_index()

years = yearly_job_cards_issued['Yearcode']
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(years, yearly_job_cards_issued['Job cards issued'])
ax.set_title(f'Total Job Cards Issued Over the Years in {selected_state}')
ax.set_xlabel('Year')
ax.set_ylabel('Total Job Cards Issued')
# Force a tick (labelled with the year code as text) under every bar.
plt.xticks(years, [str(year) for year in years])
plt.show()

#### Job cards applied for vs. job cards issued, by state

In [None]:
# Per-state totals of job-card applications and of job cards issued.
state_data = df.groupby('State')[['Households that applied for a job card', 'Job cards issued']].sum().reset_index()

# Reshape to long form (one row per state/metric pair) so seaborn can draw
# the two metrics as side-by-side bars, then order rows by size.
melted_state_data = pd.melt(state_data, id_vars='State', var_name='Metric', value_name='Count')
melted_state_data = melted_state_data.sort_values(by='Count', ascending=False)

# Colour assigned to each metric, keyed by the metric's name.
color_mapping = {'Households that applied for a job card': 'brown', 'Job cards issued': 'lightgreen'}

plt.figure(figsize=(12, 8))
# Pass the dict itself, not `.values()`: a bare colour sequence is matched to
# hue levels by position, and the sort above decides which metric appears
# first, so the colours could end up attached to the wrong metric.
sns.barplot(x='State', y='Count', hue='Metric', data=melted_state_data, palette=color_mapping)
plt.title('Households that Applied vs Job Cards Issued by State')
plt.xlabel('State')
plt.ylabel('Count')
plt.xticks(rotation=-45, ha='left')
plt.legend(title='Metric')
plt.show()

In [None]:
# Calculate the total person days worked by women for each year
woman_work_by_year = df.groupby('Yearcode')['Total person days worked by women'].sum().reset_index()

# Year-over-year change. The first year has no predecessor, so diff() leaves
# NaN there; fillna(0) sets it to 0. This replaces the chained assignment
# `woman_work_by_year['Growth'].iloc[0] = 0`, which triggers a
# SettingWithCopyWarning and does not modify the frame under copy-on-write.
woman_work_by_year['Growth'] = (
    woman_work_by_year['Total person days worked by women'].diff().fillna(0)
)

# Create a line plot to visualize the data
plt.figure(figsize=(12, 6))
plt.plot(woman_work_by_year['Yearcode'], woman_work_by_year['Total person days worked by women'], marker='o', label='Total Person Days Worked by Women')
plt.plot(woman_work_by_year['Yearcode'], woman_work_by_year['Growth'], marker='o', label='Growth')
plt.xlabel('Year')
plt.ylabel('Total Person Days Worked / Growth')
plt.title('Total Person Days Worked by Women and Growth Over the Years')
plt.legend()
plt.grid(True)

plt.show()


In [None]:
data = df

# Columns drawn as separate series in the scatter plot. The strings must match
# the frame's column names exactly (including their original spelling).
features_to_plot = [
       'Households that applied for a job card', 'Job cards issued',
       'Job cards issued for scheduled caste',
       'Job cards issued for scheduled tribes',
       'Job cards issued for non scheduled tribes or scheduled caste',
       'Households that demanded work', 'Persons who demanded work',
       'Households that were allotted work', 'Persons that were allotted work',
       'Muster rolls filled',
       'Households that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
       'Persons that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
       'Households that reached a 100 day limit', 'Persons with disability',
       'Non scheduled tribes or scheduled caste houeholds that worked',
       'Total person days worked by non scheduled tribes or scheduled caste persons.',
       'Scheduled caste houeholds that worked',
       'Total person days worked scheduled caste persons',
       'Scheduled tribe houeholds that worked',
       'Total person days worked scheduled tribe persons',
       'Households that worked on land reform or indira awas yojana',
        'Scheduled caste households that reached a 100 day limit',
       'Scheduled tribe households that reached a 100 day limit',
       'Labour expenditure that has been disbursed',
       'Material expenditure that has been disbursed',
       'Labour expenditure both disbursed and pending',

]

print(len(features_to_plot))

# Every feature is plotted against the row index. The title derives the
# feature count from the list; it previously claimed "Ten" regardless of how
# many features were actually plotted.
fig = px.scatter(data, x=data.index, y=features_to_plot, labels={'x': 'Data Points'},
                 title=f'Interactive Scatter Plot for {len(features_to_plot)} Different Features')

fig.update_layout(showlegend=True)

fig.show()

#### Checking Distribution of data

In [None]:
# Draw one 10-bin histogram per column. Each figure is shown and closed inside
# the loop so figures do not pile up in memory (matplotlib warns once more
# than 20 are open at the same time).
for column in df.columns:
    hist_fig, hist_ax = plt.subplots()
    # Missing values are dropped first; NaNs make the bin-range detection fail.
    hist_ax.hist(df[column].dropna(), bins=10, edgecolor='k')
    hist_ax.set_title(f'Histogram of {column}')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')
    plt.show()
    plt.close(hist_fig)


#### Heatmap (state-level choropleth maps)

In [None]:
# Run the cells below in order to see the map output.

# Load the GeoJSON file used as the `geojson` argument of the choropleth maps.
# The `with` block closes the file handle as soon as parsing finishes
# (the bare open() call left it open); GeoJSON is UTF-8 by specification.
with open("states_india.geojson", 'r', encoding="utf-8") as geojson_file:
    india_states = json.load(geojson_file)

In [None]:
# Inspect only the first feature's properties (the cell that builds
# state_id_map reads `state_code` and `st_nm` from them). Displaying the whole
# feature would also print its geometry, which is presumably a very long
# coordinate list.
india_states['features'][0]['properties']

In [None]:
# Copy each feature's `state_code` property to a top-level "id" key
# (the value the `locations="id"` column is matched against on the maps).
for feature in india_states["features"]:
    feature["id"] = feature["properties"]["state_code"]

# Lookup table: state name as spelled in the GeoJSON (`st_nm`) -> feature id.
state_id_map = {
    geo_feature["properties"]["st_nm"]: geo_feature["id"]
    for geo_feature in india_states["features"]
}

In [None]:
# Show the state-name -> feature-id lookup so its spellings can be compared with df['State'].
state_id_map

In [None]:
# Rename states whose spelling differs from the keys of state_id_map
# (presumably the `st_nm` spellings in the GeoJSON -- verify against the
# displayed map). Ladakh rows are folded into 'Jammu & Kashmir'.
# Series.replace with a dict matches whole values exactly, whereas
# .str.replace does substring matching and its `regex` default differs
# between pandas versions.
# NOTE: this overwrites df['State'] in place, so earlier cells that filter on
# the original names (e.g. 'Jammu And Kashmir') need a fresh kernel to re-run.
geojson_state_names = {
    'Jammu And Kashmir': 'Jammu & Kashmir',
    'The Dadra And Nagar Haveli And Daman And Diu': 'Dadara & Nagar Havelli',
    'Arunachal Pradesh': 'Arunanchal Pradesh',
    'Andaman And Nicobar Islands': 'Andaman & Nicobar Island',
    'Ladakh': 'Jammu & Kashmir',
}
df['State'] = df['State'].replace(geojson_state_names)

In [None]:
# Attach the GeoJSON feature id to every row. All state names are checked
# first so that a spelling mismatch reports every offending name at once,
# instead of failing on the first one with a bare KeyError.
unmatched_states = [name for name in df["State"].unique() if name not in state_id_map]
if unmatched_states:
    raise KeyError(f"State names missing from state_id_map: {unmatched_states}")
df["id"] = df["State"].map(state_id_map)


In [None]:
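# Optional: if the Plotly figures do not render inline, uncomment the two lines
# below to open them in the default web browser instead.
# import plotly.io as pio
# pio.renderers.default = 'browser'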


#### Job cards issued in states

In [None]:
# df can hold several rows per state (earlier cells group it by state and by
# year code), and a choropleth needs exactly one value per location.
# Total the metric per state first, the same aggregation the bar charts use;
# otherwise each state's colour comes from an arbitrary one of its rows.
job_cards_by_state = df.groupby(["id", "State"], as_index=False)["Job cards issued"].sum()

fig = px.choropleth(
    job_cards_by_state,
    locations="id",
    geojson=india_states,
    color="Job cards issued",
    hover_name="State",
    hover_data=["Job cards issued"],
    title="Job Cards Issued",
)
# Zoom to the plotted states and hide the base world map.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

#### Job cards issued for SC

In [None]:
# df can hold several rows per state (earlier cells group it by state and by
# year code), and a choropleth needs exactly one value per location.
# Total the metric per state first; otherwise each state's colour comes from
# an arbitrary one of its rows.
sc_metric = "Job cards issued for scheduled caste"
sc_job_cards_by_state = df.groupby(["id", "State"], as_index=False)[sc_metric].sum()

fig = px.choropleth(
    sc_job_cards_by_state,
    locations="id",
    geojson=india_states,
    color=sc_metric,
    hover_name="State",
    hover_data=[sc_metric],
    title="Job cards issued for scheduled caste",
)
# Zoom to the plotted states and hide the base world map.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

#### Job cards issued for ST

In [None]:
# df can hold several rows per state (earlier cells group it by state and by
# year code), and a choropleth needs exactly one value per location.
# Total the metric per state first; otherwise each state's colour comes from
# an arbitrary one of its rows.
st_metric = "Job cards issued for scheduled tribes"
st_job_cards_by_state = df.groupby(["id", "State"], as_index=False)[st_metric].sum()

fig = px.choropleth(
    st_job_cards_by_state,
    locations="id",
    geojson=india_states,
    color=st_metric,
    hover_name="State",
    hover_data=[st_metric],
    title="Job cards issued for scheduled tribes",
)
# Zoom to the plotted states and hide the base world map.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()