In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import geopandas as gpd

from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import itertools


from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Load the cleaned MNREGA dataset from the working directory.
# Later cells read from this frame and several of them mutate it in place
# (adding a 'Cluster' column, dropping 'State'/'District').
df = pd.read_csv("mnrega_clean.csv")

In [None]:
x_column = 'Job cards issued'
y_column = 'Total person days'

plt.figure(figsize=(10, 6))
plt.xlabel('Job Cards Issued')
plt.ylabel('Total Person-Days Worked')
plt.title('Scatter Plot: Job Cards Issued vs. Total Person-Days Worked')
plt.grid(True)

# One scatter series per state so every state gets its own colour and legend entry.
# Previously all rows were additionally drawn once as a single unlabeled series
# underneath (so each point was plotted twice), and the per-state labels were
# never visible because no legend was drawn.
# NOTE(review): rows whose 'State' is missing are not drawn by this loop.
states = df['State'].unique()
for state in states:
    state_data = df[df['State'] == state]
    plt.scatter(state_data[x_column], state_data[y_column], alpha=0.5, label=state)

# Put the legend outside the axes: there can be many states.
plt.legend(title='State', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize='small')
plt.show()

In [None]:
data = df

# Columns drawn together on one set of axes (x = row index, y = column value).
features_to_plot = [
    'Households that applied for a job card',
    'Job cards issued',
    'Job cards issued for scheduled caste',
    'Job cards issued for scheduled tribes',
    'Job cards issued for non scheduled tribes or scheduled caste',
    'Households that demanded work',
    'Persons who demanded work',
    'Households that were allotted work',
    'Persons that were allotted work',
    'Muster rolls filled',
    'Households that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Persons that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Households that reached a 100 day limit',
    'Persons with disability',
    'Non scheduled tribes or scheduled caste houeholds that worked',
    'Total person days worked by non scheduled tribes or scheduled caste persons.',
    'Scheduled caste houeholds that worked',
    'Total person days worked scheduled caste persons',
    'Scheduled tribe houeholds that worked',
    'Total person days worked scheduled tribe persons',
    'Households that worked on land reform or indira awas yojana',
]

fig, ax = plt.subplots(figsize=(10, 6))

# One series per column; labels are attached but no legend is drawn.
for feature in features_to_plot:
    ax.scatter(data.index, data[feature], label=feature, alpha=0.7)

ax.set_title('Scatter Plot for many Features')
ax.set_xlabel('Data Points')
ax.set_ylabel('Feature Values')
ax.grid(True)

plt.show()

In [None]:
data = df

# Same idea as the previous overview plot, with expenditure and 100-day-limit
# columns added and a legend shown.
features_to_plot = [
    'Households that applied for a job card',
    'Job cards issued',
    'Job cards issued for scheduled caste',
    'Job cards issued for scheduled tribes',
    'Job cards issued for non scheduled tribes or scheduled caste',
    'Households that demanded work',
    'Persons who demanded work',
    'Households that were allotted work',
    'Persons that were allotted work',
    'Muster rolls filled',
    'Households that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Persons that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Households that reached a 100 day limit',
    'Persons with disability',
    'Non scheduled tribes or scheduled caste houeholds that worked',
    'Total person days worked by non scheduled tribes or scheduled caste persons.',
    'Scheduled caste houeholds that worked',
    'Total person days worked scheduled caste persons',
    'Scheduled tribe houeholds that worked',
    'Total person days worked scheduled tribe persons',
    'Households that worked on land reform or indira awas yojana',
    'Scheduled caste households that reached a 100 day limit',
    'Scheduled tribe households that reached a 100 day limit',
    'Labour expenditure that has been disbursed',
    'Material expenditure that has been disbursed',
    'Labour expenditure both disbursed and pending',
]

# Single axes holding every feature as its own scatter series.
fig, axes = plt.subplots(figsize=(15, 10))
fig.suptitle('Scatter Plots for Different Features')

for column_name in features_to_plot:
    axes.scatter(data.index, data[column_name], alpha=0.5, label=column_name)

axes.set_xlabel('Data Points')
axes.set_ylabel('Feature Values')
axes.legend()
plt.show()

# print(len(features_to_plot))

# fig = px.scatter(data, x=data.index, y=features_to_plot, labels={'x': 'Data Points'},
#                  title='Interactive Scatter Plot for Ten Different Features')

# fig.update_layout(showlegend=True)

# fig.show()

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Count missing values per column.
df.isna().sum()

## Elbow method

In [None]:
## These should be the versions to run kmeans.fit.... else it will give an error

import sklearn

# show_versions() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
sklearn.show_versions()

In [None]:
# Total person-days per state, keyed two ways: by LGD code and by state name.
state_person_days = df.groupby('State lgd code')['Total person days'].sum().reset_index()
state_person_days1 = df.groupby('State')['Total person days'].sum().reset_index()

# Standardise the per-code totals in place (state_person_days1 stays unscaled).
scaler = StandardScaler()
state_person_days[['Total person days']] = scaler.fit_transform(state_person_days[['Total person days']])

# Candidate cluster counts for the elbow scan.
k_values = range(1, 11)  # You can adjust the range as needed

# Inertia (within-cluster sum of squared distances) for each candidate K.
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(state_person_days[['Total person days']])
    inertia.append(kmeans.inertia_)

# Elbow curve: inertia against K.
fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker='o')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal K')
plt.show()


In [None]:
# Display the per-LGD-code totals (standardised by the elbow cell).
state_person_days

In [None]:

# Display the per-LGD-code totals again (same frame as the previous cell).
state_person_days

In [None]:
n_clusters = 3

# Cluster the per-state totals; state_person_days is keyed by 'State lgd code'.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
state_person_days['Cluster'] = kmeans.fit_predict(state_person_days[['Total person days']])

# state_person_days is ordered by LGD code while state_person_days1 is ordered by
# state name, so copying the labels row-by-row attached them to the wrong states.
# Route each state name through its LGD code to pick up that state's own label.
# NOTE(review): assumes each state name corresponds to exactly one LGD code.
state_to_code = df.drop_duplicates('State').set_index('State')['State lgd code']
code_to_cluster = state_person_days.set_index('State lgd code')['Cluster']
state_person_days1['Cluster'] = state_person_days1['State'].map(state_to_code).map(code_to_cluster)

# Interactive view; hovering shows the state name.
# state_person_days1 holds the raw (unscaled) totals, so the axis is labelled accordingly.
fig = px.scatter(state_person_days1, x='Total person days', color='Cluster', hover_name='State', title='State Clustering by Total Person-Days')
fig.update_layout(xaxis_title='Total Person-Days')
fig.show()


# Static view: total person-days against assigned cluster.
plt.figure(figsize=(10, 6))
plt.scatter(state_person_days1['Total person days'], state_person_days1['Cluster'], c=state_person_days1['Cluster'], cmap='viridis')
plt.title('State Clustering by Total Person-Days')
plt.xlabel('Total Person-Days')
plt.ylabel('Cluster')
plt.show()

In [None]:
# Row-level K-Means on three standardised columns.
columns_for_clustering = ['Job cards issued', 'Total person days', 'Labour expenditure that has been disbursed']  # You can select other columns as well

data_for_clustering = df[columns_for_clustering]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_for_clustering)

num_clusters = 3  # You can choose the desired number of clusters

kmeans = KMeans(n_clusters=num_clusters, random_state=0)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Plot the first two clustering columns, one series per cluster label.
x_feature, y_feature = columns_for_clustering[0], columns_for_clustering[1]
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(num_clusters):
    cluster_data = df[df['Cluster'] == cluster]
    ax.scatter(cluster_data[x_feature], cluster_data[y_feature], label=f'Cluster {cluster}')

ax.set_xlabel(x_feature)
ax.set_ylabel(y_feature)
ax.set_title(f'K-Means Clustering ({num_clusters} Clusters)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# State-level K-Means: sum three columns per state, standardise, cluster.
columns_for_clustering = ['Job cards issued',
                          'Total person days',
                          'Persons that worked under mahatma gandhi national rural employment guarantee act (mgnrega)']

# One row per state holding the column sums.
state_data = df.groupby('State')[columns_for_clustering].sum().reset_index()

scaler = StandardScaler()
scaled_data = scaler.fit_transform(state_data[columns_for_clustering])

num_clusters = 3  # You can choose the desired number of clusters

kmeans = KMeans(n_clusters=num_clusters, random_state=0)
state_data['Cluster'] = kmeans.fit_predict(scaled_data)

# Plot the first two clustering columns, one series per cluster label.
x_feature, y_feature = columns_for_clustering[0], columns_for_clustering[1]
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(num_clusters):
    cluster_data = state_data[state_data['Cluster'] == cluster]
    ax.scatter(cluster_data[x_feature], cluster_data[y_feature], label=f'Cluster {cluster}')

ax.set_xlabel(x_feature)
ax.set_ylabel(y_feature)
ax.set_title(f'K-Means Clustering of States ({num_clusters} Clusters)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Expenditure / output features used for clustering.
features = ['Labour expenditure that has been disbursed', 'Material expenditure that has been disbursed', 'Total person days']

data = df[features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Elbow scan: inertia for K = 1..10.
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(scaled_data)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(1, 11), inertia, marker='o', linestyle='--')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)')
ax.set_title('Elbow Method for Optimal K')
ax.grid(True)
plt.show()

# K is fixed at 3 here (chosen by reading the elbow plot).
kmeans = KMeans(n_clusters=3, random_state=0).fit(scaled_data)

# Store the labels on the main frame.
df['Cluster'] = kmeans.labels_

# Labour vs. material expenditure, one series per cluster.
fig, ax = plt.subplots(figsize=(10, 8))
for cluster in df['Cluster'].unique():
    cluster_data = df[df['Cluster'] == cluster]
    ax.scatter(cluster_data[features[0]], cluster_data[features[1]], label=f'Cluster {cluster}')

ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_title('Clustering: Expenditure Efficiency')
ax.legend()
ax.grid(True)
plt.show()

# Cluster centroids are available as kmeans.cluster_centers_ for further analysis.


In [None]:
# Expenditure / output features used for clustering.
features = ['Labour expenditure that has been disbursed', 'Material expenditure that has been disbursed', 'Total person days']

data = df[features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit K-Means with K=3 and store the labels on the main frame.
kmeans = KMeans(n_clusters=3, random_state=0)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Labour expenditure vs. total person-days, coloured by cluster.
fig, ax = plt.subplots()
ax.scatter(df[features[0]], df[features[2]], c=df['Cluster'], cmap='viridis')
ax.set_title('Non-Interactive Clustering: Expenditure Efficiency')
ax.set_xlabel(features[0])
ax.set_ylabel(features[2])
plt.show()


In [None]:
## Please run this cell to see interactive plot

features = ['Labour expenditure that has been disbursed', 'Material expenditure that has been disbursed', 'Total person days']

data = df[features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit K-Means with K=3 and store the labels on the main frame.
kmeans = KMeans(n_clusters=3, random_state=0)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Interactive 3D scatter of the three features, coloured by cluster;
# hovering shows the row index.
x_name, y_name, z_name = features
fig = px.scatter_3d(df, x=x_name, y=y_name, z=z_name, color='Cluster', hover_name=df.index)

fig.update_layout(
    title='Interactive Clustering: Expenditure Efficiency',
    scene={'xaxis_title': x_name, 'yaxis_title': y_name, 'zaxis_title': z_name},
)

fig.show()

#### K-Means clustering

In [None]:
features = ['Labour expenditure that has been disbursed', 'Material expenditure that has been disbursed', 'Total person days']

data = df[features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Two-cluster K-Means on the standardised features.
kmeans = KMeans(n_clusters=2, random_state=0).fit(scaled_data)

# Silhouette score of the fitted labels on the standardised data.
silhouette_avg = silhouette_score(scaled_data, kmeans.labels_)
print(f'Silhouette Score: {silhouette_avg}')

df['Cluster'] = kmeans.labels_

# Interactive 3D scatter coloured by cluster; hovering shows the row index.
x_name, y_name, z_name = features
fig = px.scatter_3d(df, x=x_name, y=y_name, z=z_name, color='Cluster', hover_name=df.index)

fig.update_layout(
    title='Interactive Clustering: Expenditure Efficiency',
    scene={'xaxis_title': x_name, 'yaxis_title': y_name, 'zaxis_title': z_name},
)

fig.show()

#### Clustering with change of features

In [None]:
# Features for this run: year code, work demand, labour expenditure.
features = ['Yearcode', 'Persons who demanded work', 'Labour expenditure both disbursed and pending']  # Adjust as needed

data = df[features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

k = 3  # You can adjust this value

# Fit K-Means and store the labels on the main frame.
kmeans = KMeans(n_clusters=k, random_state=0).fit(scaled_data)
df['Cluster'] = kmeans.labels_

# Interactive 3D scatter coloured by cluster; hovering shows the year code.
x_name, y_name, z_name = features
fig = px.scatter_3d(df, x=x_name, y=y_name, z=z_name, color='Cluster',
                    hover_name='Yearcode', title='3D Scatter Plot of Year-to-Year Changes')

fig.update_layout(scene={'xaxis_title': x_name, 'yaxis_title': y_name, 'zaxis_title': z_name})

fig.show()

In [None]:
# Cluster rows on year code, work demand and labour expenditure, then draw one
# figure per cluster showing work demand for each year code in that cluster.
features = ['Yearcode', 'Persons who demanded work', 'Labour expenditure both disbursed and pending']  # Adjust as needed

data = df[features]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Choose the number of clusters (K) based on your objectives
k = 3  # You can adjust this value

# Perform K-Means clustering and store the labels on the main frame
kmeans = KMeans(n_clusters=k, random_state=0)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Visualize the results
# Example: Plot year-to-year changes within clusters
for cluster in df['Cluster'].unique():
    cluster_data = df[df['Cluster'] == cluster]
    # A new figure per cluster
    plt.figure()
    for year in cluster_data['Yearcode'].unique():
        # Every row passed to plt.plot here shares the same Yearcode value,
        # so each line is drawn at a single x position.
        year_data = cluster_data[cluster_data['Yearcode'] == year]
        plt.plot(year_data['Yearcode'], year_data['Persons who demanded work'], label=f'Cluster {cluster}, Yearcode {year}')

    plt.title(f'Year-to-Year Changes in Work Demand (Cluster {cluster})')
    plt.xlabel('Yearcode')
    plt.ylabel('Persons who demanded work')
    plt.legend()

# Called once, after all per-cluster figures have been created
plt.show()

In [None]:
features = ['Yearcode', 'Persons who demanded work', 'Labour expenditure both disbursed and pending']  # Adjust as needed

data = df[features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

k = 3  # You can adjust this value

# Fit K-Means and store the labels on the main frame.
kmeans = KMeans(n_clusters=k, random_state=0).fit(scaled_data)
df['Cluster'] = kmeans.labels_

# Year code vs. work demand, one series per cluster.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in df['Cluster'].unique():
    cluster_data = df[df['Cluster'] == cluster]
    ax.scatter(cluster_data['Yearcode'], cluster_data['Persons who demanded work'], label=f'Cluster {cluster}')

ax.set_title('Clustering of Year vs. Work Demand')
ax.set_xlabel('Yearcode')
ax.set_ylabel('Work Demand')
ax.legend()
ax.grid(True)

plt.show()


# # Create an interactive scatter plot using Plotly
# fig = px.scatter(df, x='Yearcode', y='Persons who demanded work', color='Cluster', hover_name=df.index)

# # Customize the plot layout
# fig.update_layout(
#     title='Interactive Clustering of Year vs. Work Demand',
#     xaxis_title='Yearcode',
#     yaxis_title='Persons who demanded work'
# )

# # Show the interactive plot
# fig.show()

In [None]:
x_column = 'Total person days'
y_column = 'Labour expenditure that has been disbursed'

# Plain scatter of the two columns on a fresh figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df[x_column], df[y_column], alpha=0.5)
ax.set_xlabel(x_column)
ax.set_ylabel(y_column)
ax.set_title(f'Scatter Plot: {x_column} vs. {y_column}')
ax.grid(True)

plt.show()


In [None]:
# Keep a reference to the current column labels and display them.
column_names = df.columns
column_names

In [None]:
x_column = 'Total person days'
y_column = 'Labour expenditure that has been disbursed'

# Same scatter as the earlier person-days vs. labour-expenditure cell.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df[x_column], df[y_column], alpha=0.5)
ax.set_xlabel(x_column)
ax.set_ylabel(y_column)
ax.set_title(f'Scatter Plot: {x_column} vs. {y_column}')
ax.grid(True)

plt.show()

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so in-place changes made through `df` are visible through `dff` as well.
dff = df

In [None]:
# Display the column labels seen through `dff`.
dff.columns

In [None]:
# Remove the 'State' and 'District' columns from `df` in place.
columns_to_drop = ['State', 'District']
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raised KeyError because the columns were already gone.
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [None]:
# Display the column labels seen through `dff` after the in-place drop on `df`.
dff.columns

# 2d plot for nC2 [Manual Inspection]

In [None]:
## Scatter every pair of columns to look for visible structure in the data

feature_columns = df.columns

# All unordered pairs of columns.
combinations = list(itertools.combinations(feature_columns, 2))

for x_column, y_column in combinations:
    # One figure per pair.
    fig, ax = plt.subplots()
    ax.scatter(df[x_column], df[y_column], alpha=0.5)
    ax.set_xlabel(x_column)
    ax.set_ylabel(y_column)
    ax.set_title(f'Scatter Plot: {x_column} vs {y_column}')
    ax.grid(True)

    plt.show()


# 3d plots nC3 [Manual Inspection]

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import itertools

# Columns to combine.
feature_columns = dff.columns

# All unordered triples of columns; one 3D scatter is drawn for each.
combinations = list(itertools.combinations(feature_columns, 3))

for combo in combinations:
    x_column, y_column, z_column = combo

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dff[x_column], dff[y_column], dff[z_column], alpha=0.5)
    ax.set_xlabel(x_column)
    ax.set_ylabel(y_column)
    ax.set_zlabel(z_column)
    ax.set_title(f'3D Scatter Plot: {x_column} vs {y_column} vs {z_column}')

    plt.show()
    # Release the figure explicitly: the loop creates one figure per triple, and
    # figures that stay registered with pyplot accumulate in memory.
    plt.close(fig)


<!-- Households that demanded work vs person with disability
Households that demanded work vs Non scheduled tribes or scheduled caste households that worked
Households that demanded work vs Total person days worked by non scheduled tribes or scheduled caste persons
Households that demanded work vs Total person days
person who demanded work vs Households that were alloted work
person that worked under mnrega vs total person days
person with disability vs Scheduled caste households that worked

person with disability vs total person days -->

In [None]:
x_column = 'Households that demanded work'
y_column = 'Persons with disability'

# Scatter of the two columns on a fresh figure.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df[x_column], df[y_column], alpha=0.5)
ax.set_xlabel(x_column)
ax.set_ylabel(y_column)
ax.set_title(f'Scatter Plot: {x_column} vs {y_column}')
ax.grid(True)
plt.show()


In [None]:
# Pairwise correlation between the numeric columns of df.
# Selecting numeric dtypes explicitly keeps this working on pandas >= 2.0, where
# DataFrame.corr() raises on non-numeric columns instead of silently dropping them.
correlation_matrix = df.select_dtypes(include='number').corr()

# Print or visualize the correlation matrix to see which columns are least correlated
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

#### Finding least correlated columns to do clustering

In [None]:
# Collect column pairs whose absolute correlation is below the threshold.
least_correlated_columns = []

# Pairs with |r| below this value count as weakly correlated.
correlation_threshold = 0.2

# Visit each unordered pair once (second column always after the first).
# The matrix is symmetric, so iterating over every ordered pair reported each
# combination twice: once as (a, b) and again as (b, a).
corr_columns = list(correlation_matrix.columns)
for position, col1 in enumerate(corr_columns):
    for col2 in corr_columns[position + 1:]:
        correlation = correlation_matrix.loc[col1, col2]
        if abs(correlation) < correlation_threshold:
            least_correlated_columns.append((col1, col2, correlation))

# Weakest relationship first: order by |r| so values nearest zero lead.
# Sorting by signed r put the most negative correlation first instead.
least_correlated_columns.sort(key=lambda pair: abs(pair[2]))

# Print the least correlated column pairs
for col1, col2, correlation in least_correlated_columns:
    print(f"Columns: {col1} and {col2}, Correlation: {correlation}")


In [None]:
# Plot each weakly correlated column pair to inspect it for clusters.
# Numeric dtypes are selected explicitly so this keeps working on pandas >= 2.0,
# where DataFrame.corr() raises on non-numeric columns.
corr_matrix = df.select_dtypes(include='number').corr()

# Column pairs with |r| < 0.1, taken from the upper triangle so each pair appears once.
least_corr_columns = []

for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        if abs(corr_matrix.iloc[i, j]) < 0.1:  # Adjust the correlation threshold as needed
            least_corr_columns.append((corr_matrix.columns[i], corr_matrix.columns[j]))

# One scatter plot per weakly correlated pair
for pair in least_corr_columns:
    x_col, y_col = pair
    plt.figure(figsize=(8, 6))
    plt.scatter(df[x_col], df[y_col], alpha=0.5)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f'Scatter Plot: {x_col} vs {y_col}')
    plt.grid(True)
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Hand-picked (x, y) column pairs to inspect.
pairs = [
    ('Households that demanded work', 'Persons with disability'),
    ('Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked'),
    ('Households that demanded work', 'Total person days worked by non scheduled tribes or scheduled caste persons.'),
    ('Households that demanded work', 'Total person days'),
    ('Persons who demanded work', 'Households that were allotted work'),
    ('Persons that worked under mahatma gandhi national rural employment guarantee act (mgnrega)', 'Total person days'),
    ('Persons with disability', 'Scheduled caste houeholds that worked'),
    ('Persons with disability', 'Total person days'),
]

# One figure per pair.
for x_col, y_col in pairs:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df[x_col], df[y_col], alpha=0.5)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(f'Scatter Plot: {x_col} vs {y_col}')
    ax.grid(True)
    plt.show()


In [None]:
# Display the two columns used by the clustering cells that follow.
df[['Households that demanded work','Non scheduled tribes or scheduled caste houeholds that worked']]

In [None]:

data = df[['Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked']]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Elbow scan: within-cluster sum of squares for K = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(scaled_data)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(1, 11), wcss, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Within-Cluster-Sum-of-Squares (WCSS)')
ax.grid(True)
plt.show()


In [None]:
k = 3  # You can adjust this value

# Cluster the standardised two-feature data and store the labels on the main frame.
kmeans = KMeans(n_clusters=k, random_state=0).fit(scaled_data)
df['Cluster'] = kmeans.labels_

# One scatter series per cluster on the raw feature values.
x_name = 'Households that demanded work'
y_name = 'Non scheduled tribes or scheduled caste houeholds that worked'
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in df['Cluster'].unique():
    cluster_data = df[df['Cluster'] == cluster]
    ax.scatter(cluster_data[x_name], cluster_data[y_name], label=f'Cluster {cluster}')

ax.set_xlabel('Households that demanded work')
ax.set_ylabel('Non scheduled tribes or scheduled caste houeholds that worked')
ax.set_title('K-Means Clustering')
ax.legend()
plt.show()

## Kmeans with log transformation

In [None]:
# Work on an independent copy: `data` is a column selection taken from df, and
# adding columns to it through an alias triggers pandas' SettingWithCopyWarning.
data_for_log = data.copy()

In [None]:
# Add a log(1 + x) version of each column under the name '<column>_Log'.
# Columns that already carry the '_Log' suffix are skipped so that re-running
# the cell does not create '<column>_Log_Log' columns.
source_columns = [column for column in data_for_log.columns if not column.endswith('_Log')]
for column in source_columns:
    # np.log1p(x) computes log(1 + x) and stays accurate for small x.
    data_for_log[f'{column}_Log'] = np.log1p(data_for_log[column])

In [None]:
# Drop the two raw columns from data_for_log.
columns = ['Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked']
data_for_log = data_for_log.drop(columns, axis=1)

In [None]:
# Display the frame that the log-transform clustering cells operate on.
data_for_log

In [None]:
# Elbow scan on data_for_log: within-cluster sum of squares for K = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(data_for_log)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(1, 11), wcss, marker='o', linestyle='--')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Within-Cluster-Sum-of-Squares (WCSS)')
ax.grid(True)
plt.show()


In [None]:
k = 2  # You can adjust this value

# Cluster on the two log-transformed columns only. Fitting on the whole frame
# meant that re-running this cell fed the previously assigned 'Cluster' column
# back in as a feature.
log_columns = [
    'Households that demanded work_Log',
    'Non scheduled tribes or scheduled caste houeholds that worked_Log',
]
kmeans = KMeans(n_clusters=k, random_state=0)
data_for_log['Cluster'] = kmeans.fit_predict(data_for_log[log_columns])

# One scatter series per cluster.
plt.figure(figsize=(10, 6))
for cluster in data_for_log['Cluster'].unique():
    cluster_data = data_for_log[data_for_log['Cluster'] == cluster]
    plt.scatter(
        cluster_data[log_columns[0]],
        cluster_data[log_columns[1]],
        label=f'Cluster {cluster}'
    )

# The plotted values are the '_Log' columns, so the axis labels say so.
plt.xlabel('log(Households that demanded work + 1)')
plt.ylabel('log(Non scheduled tribes or scheduled caste houeholds that worked + 1)')
plt.title('K-Means Clustering')
plt.legend()
plt.show()

#### Spectral Clustering

In [None]:
# Spectral clustering (nearest-neighbours affinity) on two standardised features.
features = ['Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked']  # Adjust as needed

data = df[features]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Choose the number of clusters (K) based on your objectives
k = 3  # You can adjust this value

# random_state is fixed so repeated runs give the same labels, in line with the
# seeded K-Means cells elsewhere in this notebook.
spectral = SpectralClustering(n_clusters=k, affinity='nearest_neighbors', random_state=0)
df['Cluster'] = spectral.fit_predict(scaled_data)

# Scatter of the raw feature values, coloured by cluster
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('Spectral Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()

# Silhouette score of the labels on the standardised data
silhouette_avg = silhouette_score(scaled_data, df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")



In [None]:
# Spectral clustering (RBF affinity) on two standardised features.
features = ['Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked']  # Adjust as needed

data = df[features]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Choose the number of clusters (K) based on your objectives
k = 3  # You can adjust this value

# random_state is fixed so repeated runs give the same labels, in line with the
# seeded K-Means cells elsewhere in this notebook.
spectral = SpectralClustering(n_clusters=k, affinity='rbf', random_state=0)
df['Cluster'] = spectral.fit_predict(scaled_data)

# Scatter of the raw feature values, coloured by cluster
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('Spectral Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()

# Silhouette score of the labels on the standardised data
silhouette_avg = silhouette_score(scaled_data, df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")


#### Agglomerative Clustering

In [None]:
# Agglomerative (Ward linkage) clustering on two standardised features.
features = ['Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked']  # Adjust as needed

data = df[features]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Choose the number of clusters (n_clusters) based on your objectives
n_clusters =3  # You can adjust this value

# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `metric`). Ward linkage only supports Euclidean distance, which is
# also the default, so omitting the argument works on old and new versions alike.
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
df['Cluster'] = agg_clustering.fit_predict(scaled_data)

# Scatter of the raw feature values, coloured by cluster
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('Agglomerative Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()

# Silhouette score of the labels on the standardised data
silhouette_avg = silhouette_score(scaled_data, df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

In [None]:
# Ward-linkage hierarchy over `scaled_data` (left in scope by the preceding clustering cell).
linkage_matrix = linkage(scaled_data, method='ward')  # Adjust the linkage method as needed

# Plot the dendrogram.
# The labels are passed as a plain list: some SciPy versions evaluate `labels` in
# a boolean context, which raises "truth value ... is ambiguous" for a pandas Index.
plt.figure(figsize=(12, 8))
dendrogram(linkage_matrix, labels=df.index.tolist(), orientation='top', leaf_rotation=0, leaf_font_size=10)
plt.title('Dendrogram for Agglomerative Clustering')
plt.xlabel('Data Points')
plt.ylabel('Distance')
plt.show()

In [None]:
# DBSCAN on two standardised features.
features = ['Households that demanded work', 'Non scheduled tribes or scheduled caste houeholds that worked']  # Adjust as needed

data = df[features]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=1, min_samples=7)  # Adjust parameters (eps and min_samples) as needed
df['Cluster'] = dbscan.fit_predict(scaled_data)

# Scatter of the raw feature values, coloured by label (noise points are labelled -1)
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('DBSCAN Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()

# DBSCAN marks noise with the label -1. Scoring that label as if it were a
# cluster distorts the silhouette, and silhouette_score raises ValueError when
# fewer than two distinct labels exist, so score only the clustered points and
# only when at least two clusters were found.
clustered_mask = (df['Cluster'] != -1).to_numpy()
n_found = df.loc[clustered_mask, 'Cluster'].nunique()
if n_found >= 2:
    silhouette_avg = silhouette_score(scaled_data[clustered_mask], df.loc[clustered_mask, 'Cluster'])
    print(f"Silhouette Score: {silhouette_avg}")
else:
    print(f"Silhouette Score: undefined (DBSCAN found {n_found} cluster(s) excluding noise)")


In [None]:
from sklearn.mixture import GaussianMixture

# Gaussian-mixture clustering on `scaled_data` / `features` left in scope by the
# preceding clustering cell.

# Define the number of components (clusters) for GMM
n_components =  3 # Adjust the number of components as needed

# random_state is fixed so repeated runs give the same labels, in line with the
# seeded K-Means cells elsewhere in this notebook.
gmm = GaussianMixture(n_components=n_components, random_state=0)

# Fit the model to the scaled data
gmm.fit(scaled_data)

# Predict the cluster labels
df['GMM_Cluster'] = gmm.predict(scaled_data)

# Scatter of the raw feature values, coloured by mixture component
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['GMM_Cluster'], cmap='viridis')
plt.title('GMM Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()

# Compute the Silhouette Score for GMM
gmm_silhouette_avg = silhouette_score(scaled_data, df['GMM_Cluster'])
print(f"GMM Silhouette Score: {gmm_silhouette_avg}")


In [None]:
# Display the current column labels of df.
df.columns

## Checking whether grouping by state helps to form clusters

In [None]:

# Columns to plot against the state LGD code; edit this list to change what is shown.
selected_columns = [
    'Households that applied for a job card',
    'Job cards issued',
    'Job cards issued for scheduled caste',
    'Job cards issued for scheduled tribes',
    'Job cards issued for non scheduled tribes or scheduled caste',
    'Households that demanded work',
    'Persons who demanded work',
    'Households that were allotted work',
    'Persons that were allotted work',
    'Muster rolls filled',
    'Households that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Persons that worked under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Households that reached a 100 day limit',
    'Persons with disability',
    'Non scheduled tribes or scheduled caste houeholds that worked',
    'Total person days worked by non scheduled tribes or scheduled caste persons.',
    'Scheduled caste houeholds that worked',
    'Total person days worked scheduled caste persons',
    'Scheduled tribe houeholds that worked',
    'Total person days worked scheduled tribe persons',
    'Households that worked on land reform or indira awas yojana',
    'Total person days worked by women',
    'Total person days',
    'Scheduled caste households that reached a 100 day limit',
    'Scheduled tribe households that reached a 100 day limit',
    'Labour expenditure that has been disbursed',
    'Material expenditure that has been disbursed',
    'Labour expenditure both disbursed and pending',
    'Material expenditure both disbursed and pending',
    'Amount sanctioned',
    'Works under mahatma gandhi national rural employment guarantee act (mgnrega)',
    'Total bank accounts',
    'Individual bank accounts',
    'Joint bank accounts',
    'Amount disbursed to bank accounts',
    'Post office accounts',
    'Individual post office accounts',
    'Joint post office accounts',
    'Amount disbursed to post office accounts',
]

# One figure per column: state LGD code on x, the column's values on y.
for column in selected_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df['State lgd code'], df[column], alpha=0.5)
    ax.set_xlabel('State lgd code')
    ax.set_ylabel(column)
    ax.set_title(f'Scatter Plot: State vs {column}')
    plt.xticks(rotation=90)  # Rotate x-axis labels for better visibility
    ax.grid(True)
    plt.show()

## K-Means on state vs. amount disbursed to post office accounts

In [None]:
# Baseline K-Means on a single feature: the amount disbursed to post office accounts.
# KMeans and pandas are already imported in the notebook's first cell, so they are
# not re-imported here.
columns = ['Amount disbursed to post office accounts']
data = df[columns]

# init='random' selects classic randomly-seeded K-Means. scikit-learn's default is
# init='k-means++', which would make this baseline the same algorithm as the later
# K-Means++ cell. n_init is pinned because its default differs between
# scikit-learn versions, and with it the resulting labels.
kmeans = KMeans(n_clusters=3, init='random', n_init=10, random_state=0)
kmeans.fit(data)

# Store each row's cluster label on the main frame (overwrites any earlier 'Cluster').
df['Cluster'] = kmeans.labels_

# Show the state-code / cluster pairs as the cell's rich output rather than print().
clustered_data = df[['State lgd code', 'Cluster']]
clustered_data


In [None]:
# Scatter of state LGD code vs. post-office disbursement, one colour per cluster
# currently stored in df['Cluster'].
plt.figure(figsize=(12, 8))
# sort=False yields the clusters in order of first appearance, which keeps the
# legend order the same as iterating over df['Cluster'].unique().
for cluster, cluster_data in df.groupby('Cluster', sort=False):
    plt.scatter(
        cluster_data['State lgd code'],
        cluster_data['Amount disbursed to post office accounts'],
        label=f'Cluster {cluster}',
        alpha=0.5,
    )

plt.gca().set(
    title='K-Means Clustering: State vs Amount Disbursed to Post Office Accounts',
    xlabel='State',
    ylabel='Amount Disbursed to Post Office Accounts',
)
plt.legend()
plt.xticks(rotation=90)  # vertical tick labels for readability
plt.show()


## Using K-Means++

In [None]:
# K-Means with explicit k-means++ seeding on two standardised columns of `df`:
# the state LGD code and the amount disbursed to post office accounts.
# NOTE(review): the state LGD code looks like an identifier rather than a
# quantity; confirm that distances along this axis are meaningful.
data = df[['State lgd code', 'Amount disbursed to post office accounts']]

# Standardise both columns so neither dominates the Euclidean distance.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

# Fit on the scaled values and keep the per-row labels on the main frame.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=0).fit(scaled_data)
df['Cluster'] = kmeans.labels_

# Plot in the original (unscaled) units, one colour per cluster; sort=False keeps
# the clusters in order of first appearance.
plt.figure(figsize=(12, 8))
for cluster, cluster_data in df.groupby('Cluster', sort=False):
    plt.scatter(
        cluster_data['State lgd code'],
        cluster_data['Amount disbursed to post office accounts'],
        label=f'Cluster {cluster}',
        alpha=0.5,
    )

plt.gca().set(
    title='K-Means Clustering (K-Means++) - State vs Amount Disbursed to Post Office Accounts',
    xlabel='State',
    ylabel='Amount Disbursed to Post Office Accounts',
)
plt.legend()
plt.xticks(rotation=90)
plt.show()


## Using Spectral

In [None]:

# Spectral clustering (k-nearest-neighbour affinity) on the standardised
# state LGD code and post-office disbursement columns.
features = ['State lgd code', 'Amount disbursed to post office accounts']
data = df[features]

# Standardise the data so both columns contribute on the same scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Number of clusters to extract.
k = 3  # You can adjust this value

# random_state is fixed so the labels are reproducible on Restart & Run All;
# without it the label-assignment step can give different labels on every run.
spectral = SpectralClustering(n_clusters=k, affinity='nearest_neighbors', random_state=0)
df['Cluster'] = spectral.fit_predict(scaled_data)

# Scatter in the original units, coloured by cluster label.
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('Spectral Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()


## Spectral with rbf

In [None]:
# Spectral clustering with an RBF (Gaussian kernel) affinity on the standardised
# state LGD code and post-office disbursement columns.
features = ['State lgd code', 'Amount disbursed to post office accounts']
data = df[features]

# Standardise the data so both columns contribute on the same scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Number of clusters to extract.
k = 3  # You can adjust this value

# random_state is fixed so the labels are reproducible on Restart & Run All;
# without it the label-assignment step can give different labels on every run.
spectral = SpectralClustering(n_clusters=k, affinity='rbf', random_state=0)
df['Cluster'] = spectral.fit_predict(scaled_data)

# Scatter in the original units, coloured by cluster label.
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('Spectral Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()


## Agglomerative clustering

In [None]:
# Agglomerative (bottom-up hierarchical) clustering on the standardised
# state LGD code and post-office disbursement columns.
features = ['State lgd code', 'Amount disbursed to post office accounts']
data = df[features]

# Standardise first so both columns are on a comparable scale.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

# Number of clusters at which the merge tree is cut.
n_clusters = 2  # You can adjust this value

# Fit, then copy the per-row labels onto the main frame.
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(scaled_data)
df['Cluster'] = agg_clustering.labels_

# Scatter in the original units, coloured by cluster label.
x_name, y_name = features
plt.figure(figsize=(10, 6))
plt.scatter(df[x_name], df[y_name], c=df['Cluster'], cmap='viridis')
plt.gca().set(title='Agglomerative Clustering', xlabel=x_name, ylabel=y_name)
plt.show()



## DBSCAN Clustering

In [None]:
# DBSCAN (density-based clustering) on the standardised state LGD code and
# post-office disbursement columns, followed by a silhouette score.
features = ['State lgd code', 'Amount disbursed to post office accounts']

data = df[features]

# Standardise the data so `eps` means the same distance along both axes.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Perform DBSCAN clustering; points that belong to no dense region get label -1.
dbscan = DBSCAN(eps=0.7, min_samples=5)  # Adjust parameters (eps and min_samples) as needed
labels = dbscan.fit_predict(scaled_data)
df['Cluster'] = labels

# Scatter in the original units, coloured by label (noise points included).
plt.figure(figsize=(10, 6))
plt.scatter(df[features[0]], df[features[1]], c=df['Cluster'], cmap='viridis')
plt.title('DBSCAN Clustering')
plt.xlabel(features[0])
plt.ylabel(features[1])
plt.show()

# Score only the points DBSCAN actually clustered: the noise label (-1) is not a
# cluster, and counting it as one distorts the silhouette. silhouette_score also
# raises ValueError when fewer than two distinct labels are present, so that case
# is reported instead of crashing the notebook.
clustered = labels != -1
n_found = len(set(labels[clustered]))
if n_found >= 2:
    silhouette_avg = silhouette_score(scaled_data[clustered], labels[clustered])
    print(f"Silhouette Score: {silhouette_avg}")
else:
    silhouette_avg = float('nan')
    print(f"Silhouette Score: undefined (DBSCAN found {n_found} cluster(s))")


### Working on mean_tsne data which has 2 dimensions based on mean values corresponding to each state 

In [None]:
# Load the precomputed 2-D mean t-SNE table; the path is relative to the
# notebook's working directory.
mean_tsne = pd.read_csv("mean_tsne.csv")

In [None]:
# Display the loaded mean t-SNE table as the cell output.
mean_tsne

In [None]:
# Elbow analysis on the mean t-SNE components: fit K-Means for K = 1..10 and
# record each model's inertia.
features = ['Component_1', 'Component_2']
data_for_clustering = mean_tsne[features]

k_values = range(1, 11)  # candidate numbers of clusters
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_for_clustering)
    inertia.append(kmeans.inertia_)

# Inertia vs. K; the "elbow" is where adding clusters stops paying off.
ax = plt.figure(figsize=(8, 6)).add_subplot()
ax.plot(k_values, inertia, marker='o')
ax.set(
    xlabel='Number of Clusters (K)',
    ylabel='Inertia',
    title='Elbow Method for Optimal K',
)
plt.xticks(k_values)  # one tick per tested K
ax.grid(True)
plt.show()





In [None]:
# K-Means on the mean t-SNE components, reusing `data_for_clustering` from the
# elbow cell above.
n_clusters = 4  # You can adjust the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(data_for_clustering)
mean_tsne['Cluster'] = kmeans.labels_

# Mean silhouette coefficient of the assignment (ranges from -1 to 1).
silhouette_avg = silhouette_score(data_for_clustering, mean_tsne['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Interactive scatter of the clustered points.
axis_labels = {'Component_1': 'Mean Component 1', 'Component_2': 'Mean Component 2'}
fig = px.scatter(
    mean_tsne,
    x='Component_1',
    y='Component_2',
    color='Cluster',
    title=f'K-Means Clustering of Mean t-SNE Values (K={n_clusters})',
    labels=axis_labels,
)
fig.show()

### Working on mean_tsne_district data which has 2 dimensions based on mean values corresponding to each district

In [None]:
# Load the precomputed district-level mean t-SNE table; the path is relative to
# the notebook's working directory.
mean_tsne_district = pd.read_csv("mean_tsne_district.csv")

In [None]:
# Display the loaded district-level table as the cell output.
mean_tsne_district

In [None]:
# List the column names available in `df`.
df.columns

In [None]:
# Attach each district's state to the district-level t-SNE means.
# The lookup is de-duplicated first: if `df` holds more than one row for a
# district, a left merge against it would repeat that district's t-SNE row once
# per match and give it extra weight in the clustering below.
# Assumes a 'District' column exists in both DataFrames.
# NOTE(review): a district name that occurs in more than one state would still
# produce one row per state here — verify against the data.
district_to_state = df[['District', 'State']].drop_duplicates()
result_df = pd.merge(mean_tsne_district, district_to_state, on='District', how='left')


In [None]:
# Display the merged district table (t-SNE components plus 'State').
result_df

In [None]:
# Elbow analysis on the district-level components: fit K-Means for K = 1..10 and
# record each model's inertia.
features = ['Component_1', 'Component_2']
data_for_clustering = result_df[features]

k_values = range(1, 11)  # candidate numbers of clusters
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_for_clustering)
    inertia.append(kmeans.inertia_)

# Inertia vs. K; the "elbow" is where adding clusters stops paying off.
ax = plt.figure(figsize=(8, 6)).add_subplot()
ax.plot(k_values, inertia, marker='o')
ax.set(
    xlabel='Number of Clusters (K)',
    ylabel='Inertia',
    title='Elbow Method for Optimal K',
)
plt.xticks(k_values)  # one tick per tested K
ax.grid(True)
plt.show()



In [None]:
# Interactive scatter of the district-level components: one colour per district,
# with the state shown as the hover title.
axis_labels = {'Component_1': 'Mean Component 1', 'Component_2': 'Mean Component 2'}
fig = px.scatter(
    result_df,
    x='Component_1',
    y='Component_2',
    color='District',
    hover_name='State',  # column of result_df
    title='Mean t-SNE Values by State (2D)',
    labels=axis_labels,
)

fig.show()

### K-Means

In [None]:
# Baseline K-Means on the district-level components (`data_for_clustering` comes
# from the elbow cell above).
n_clusters = 4  # You can adjust the number of clusters

# init='random' selects classic randomly-seeded K-Means. scikit-learn's default is
# init='k-means++', so without it this cell would differ from the K-Means++ cell
# below only by its seed. n_init is pinned because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, init='random', n_init=10, random_state=0)
result_df['Cluster'] = kmeans.fit_predict(data_for_clustering)

# Plotting the clustered mean t-SNE values
fig = px.scatter(
    result_df,
    x='Component_1',
    y='Component_2',
    color='Cluster',
    title=f'K-Means Clustering of Mean t-SNE Values (K={n_clusters})',
    labels={'Component_1': 'Mean Component 1', 'Component_2': 'Mean Component 2'}
)

# Mean silhouette coefficient of the assignment (ranges from -1 to 1).
silhouette_avg = silhouette_score(data_for_clustering, result_df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Show the plot
fig.show()

### K-Means++

In [None]:
# K-Means with explicit k-means++ seeding on the district-level components.
n_clusters = 4  # You can adjust the number of clusters
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(data_for_clustering)
result_df['Cluster'] = kmeans.labels_

# Mean silhouette coefficient of the assignment (ranges from -1 to 1).
silhouette_avg = silhouette_score(data_for_clustering, result_df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Interactive scatter of the clustered points.
axis_labels = {'Component_1': 'Mean Component 1', 'Component_2': 'Mean Component 2'}
fig = px.scatter(
    result_df,
    x='Component_1',
    y='Component_2',
    color='Cluster',
    title=f'K-Means Clustering of Mean t-SNE Values (K={n_clusters})',
    labels=axis_labels,
)
fig.show()

### GMM

In [None]:
from sklearn.mixture import GaussianMixture

# Gaussian mixture model on the district-level components; each point is labelled
# with the mixture component predicted for it.
n_components = 4 # Number of clusters
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm_labels = gmm.fit_predict(data_for_clustering)
result_df['Cluster'] = gmm_labels

# Mean silhouette coefficient of the assignment (ranges from -1 to 1).
silhouette_avg = silhouette_score(data_for_clustering, result_df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Interactive scatter of the clustered points.
axis_labels = {'Component_1': 'Mean Component 1', 'Component_2': 'Mean Component 2'}
fig = px.scatter(
    result_df,
    x='Component_1',
    y='Component_2',
    color='Cluster',
    title=f'Gaussian Mixture Model Clustering of Mean t-SNE Values (n_components={n_components})',
    labels=axis_labels,
)
fig.show()


## DBSCAN

In [None]:
# DBSCAN (density-based clustering) on the district-level components, followed by
# a silhouette score.
eps = 2  # Adjust epsilon (neighbourhood radius) as needed
min_samples = 10  # Adjust min_samples as needed
dbscan = DBSCAN(eps=eps, min_samples=min_samples, algorithm='kd_tree')

# Fit DBSCAN; points that belong to no dense region get label -1.
labels = dbscan.fit_predict(data_for_clustering)
result_df['Cluster'] = labels

# Plotting the clustered data points (noise points included)
fig = px.scatter(
    result_df,
    x='Component_1',
    y='Component_2',
    color='Cluster',
    title='DBSCAN Clustering',
    labels={'Component_1': 'Component 1', 'Component_2': 'Component 2'}
)

# Score only the points DBSCAN actually clustered: the noise label (-1) is not a
# cluster, and counting it as one distorts the silhouette. silhouette_score also
# raises ValueError when fewer than two distinct labels are present, so that case
# is reported instead of crashing the notebook.
clustered = labels != -1
n_found = len(set(labels[clustered]))
if n_found >= 2:
    silhouette_avg = silhouette_score(data_for_clustering.loc[clustered], labels[clustered])
    print(f"Silhouette Score: {silhouette_avg}")
else:
    silhouette_avg = float('nan')
    print(f"Silhouette Score: undefined (DBSCAN found {n_found} cluster(s))")

# Show the plot
fig.show()

## Agglomerative Clustering

In [None]:
# Agglomerative (bottom-up hierarchical) clustering on the district-level
# components. AgglomerativeClustering is already imported in the notebook's first
# cell, so it is not re-imported here; `data_for_clustering` comes from the elbow
# cell above.
n_clusters = 4  # Adjust the number of clusters as needed
agglomerative = AgglomerativeClustering(n_clusters=n_clusters)

# Fit and store the per-row labels (overwrites any earlier 'Cluster' column).
result_df['Cluster'] = agglomerative.fit_predict(data_for_clustering)

# Plotting the clustered data points
fig = px.scatter(
    result_df,
    x='Component_1',
    y='Component_2',
    color='Cluster',
    title='Agglomerative Clustering',
    labels={'Component_1': 'Component 1', 'Component_2': 'Component 2'}
)

# Mean silhouette coefficient of the assignment (ranges from -1 to 1).
silhouette_avg = silhouette_score(data_for_clustering, result_df['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# Show the plot
fig.show()
