In [46]:
import pandas as pd
from scipy.stats import chi2_contingency
import statsmodels.api as sm
import numpy as np

# Read the 2001 and 2011 census extracts into DataFrames.
# The file names are resolved relative to the working directory; replace them
# with the actual locations of the two CSV files before running.
CENSUS_2001_CSV = "pc01_vd_clean_shrid.csv"
CENSUS_2011_CSV = "pc11_vd_clean_shrid.csv"

census_2001 = pd.read_csv(CENSUS_2001_CSV)
census_2011 = pd.read_csv(CENSUS_2011_CSV)


In [47]:
def fix_key(x):
    """Zero-pad the fourth "-"-separated component of a key to five digits.

    The key is split on "-", the component at index 3 is parsed as an integer
    (which discards any existing leading zeros) and re-rendered with at least
    five digits; all other components are left untouched.

    Args:
        x: Key string with at least four "-"-separated components, the fourth
            of which is numeric.

    Returns:
        The key re-joined with "-" and the fourth component zero-padded.

    Raises:
        IndexError: If the key has fewer than four components.
        ValueError: If the fourth component is not an integer literal.
    """
    parts = x.split("-")
    # ":05d" left-pads with zeros to width 5; longer numbers are kept in full.
    parts[3] = f"{int(parts[3]):05d}"
    return "-".join(parts)

In [48]:
# Normalise every 2001 key so that its fourth component is 5-digit zero-padded
# (presumably to match the key format of the 2011 file before merging — confirm).
census_2001["shrid2"] = census_2001["shrid2"].apply(fix_key)

In [49]:
# Drop every row in which the SC or the ST population count is missing;
# how="any" (the default) removes a row as soon as one listed column is null.
census_2001 = census_2001.dropna(
    subset=["pc01_vd_sc_p", "pc01_vd_st_p"],
    how="any",
)
census_2011 = census_2011.dropna(
    subset=["pc11_vd_sc_p", "pc11_vd_st_p"],
    how="any",
)

In [50]:
# Report (rows, columns) of each census frame after dropping incomplete rows.
for census_frame in (census_2001, census_2011):
    print(census_frame.shape)

(527918, 110)
(588973, 284)


In [38]:
# Quick look at the 2011 SC population column.
census_2011.loc[:, "pc11_vd_sc_p"]

0            0.0
1            0.0
2            0.0
3            0.0
4            0.0
           ...  
588968    1438.0
588969    2281.0
588970    2726.0
588971    1381.0
588972    4518.0
Name: pc11_vd_sc_p, Length: 588973, dtype: float64

In [39]:
# Inner join on the shared key: only keys present in both census rounds survive.
merged_data = census_2001.merge(census_2011, how="inner", on="shrid2")


In [40]:
# (rows, columns) of the merged frame.
n_rows, n_cols = merged_data.shape
(n_rows, n_cols)

(519860, 393)

In [52]:
# Percentage change in SC and ST populations between 2001 and 2011:
#     (2011 count - 2001 count) / 2001 count * 100
# A 2001 count of zero makes the ratio undefined: 0 -> 0 yields NaN and
# 0 -> positive yields +inf (pandas does not raise on float division by zero).
for group in ("sc", "st"):
    baseline = merged_data[f"pc01_vd_{group}_p"]
    latest = merged_data[f"pc11_vd_{group}_p"]
    merged_data[f"{group}_change_pct"] = (latest - baseline) / baseline * 100


In [54]:
# Show the raw counts side by side with the derived percentage changes.
population_change_cols = [
    "pc01_vd_sc_p",
    "pc11_vd_sc_p",
    "pc01_vd_st_p",
    "pc11_vd_st_p",
    "sc_change_pct",
    "st_change_pct",
]
merged_data[population_change_cols]

Unnamed: 0,pc01_vd_sc_p,pc11_vd_sc_p,pc01_vd_st_p,pc11_vd_st_p,sc_change_pct,st_change_pct
0,1.0,0.0,1.0,0.0,-100.000000,-100.000000
1,0.0,0.0,0.0,0.0,,
2,0.0,0.0,188.0,349.0,,85.638298
3,0.0,0.0,253.0,302.0,,19.367589
4,0.0,0.0,10.0,0.0,,-100.000000
...,...,...,...,...,...,...
519855,1267.0,1438.0,7.0,0.0,13.496448,-100.000000
519856,2243.0,2281.0,0.0,0.0,1.694160,
519857,2188.0,2726.0,0.0,0.0,24.588665,
519858,1322.0,1381.0,0.0,0.0,4.462935,


In [None]:

# Categorize villages based on the number of educational facilities in 2011
def categorize_facilities(count):
    """Bucket a facility count into "Low", "Medium" or "High".

    Args:
        count: Number of educational facilities (int or float); may be missing.

    Returns:
        "Low" for counts below 2, "Medium" for counts from 2 to 5 inclusive,
        "High" for counts above 5, and NaN when the count itself is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing count
    # would fall through both tests and be reported as "High".
    if pd.isna(count):
        return np.nan
    if count < 2:
        return "Low"
    if count <= 5:
        return "Medium"
    return "High"

# NOTE(review): "num_educational_facilities_2011" is not created in any cell
# shown above — confirm it exists in merged_data (or derive it) before running.
facility_counts = merged_data["num_educational_facilities_2011"]
merged_data["education_facility_category"] = facility_counts.apply(categorize_facilities)

# Categorize population changes into bins (e.g., decrease, no change, increase)
def categorize_change(value):
    """Label a percentage change as "Decrease", "No Change" or "Increase".

    Args:
        value: Percentage change; NaN when the change is undefined (for example
            a population that is zero in both years, giving 0 / 0).

    Returns:
        "Decrease" for negative values, "No Change" for exactly zero,
        "Increase" for positive values (including +inf), and NaN when the
        input is missing so that undefined changes stay out of the counts.
    """
    # NaN compares False against everything, so without this guard an undefined
    # change would skip both tests and be mislabelled "Increase".
    if pd.isna(value):
        return np.nan
    if value < 0:
        return "Decrease"
    if value == 0:
        return "No Change"
    return "Increase"

# Derive the change category for each group from its percentage change.
for group in ("sc", "st"):
    merged_data[f"{group}_change_category"] = merged_data[f"{group}_change_pct"].apply(
        categorize_change
    )

# Test whether the 2011 education-facility category is associated with the
# direction of SC / ST population change (chi-squared test of independence),
# then model P(increase) with a logistic regression on the facility category.
#
# Rows with a missing facility category, or with an undefined percentage change
# (NaN, e.g. a 0 -> 0 population), are excluded from each test so that an
# undefined change is never counted as an "Increase".

FACILITY_COL = "education_facility_category"


def _flag_increase(group):
    """Return a 0/1 Series: 1 where the group's change is defined and labelled "Increase"."""
    is_increase = merged_data[f"{group}_change_category"] == "Increase"
    return (is_increase & merged_data[f"{group}_change_pct"].notna()).astype(int)


def _usable_rows(group):
    """Return the rows of merged_data usable for the `group` ("sc" / "st") analysis.

    A row is usable when its facility category, percentage change and change
    category are all non-missing.
    """
    required = [FACILITY_COL, f"{group}_change_pct", f"{group}_change_category"]
    return merged_data.dropna(subset=required)


def _facility_design_matrix(frame):
    """Build the logit design matrix: intercept plus facility-category dummies.

    dtype=float keeps the dummies numeric. pandas >= 2.0 otherwise returns bool
    columns, which together with the float constant are cast to an object array
    that statsmodels refuses to fit.
    """
    dummies = pd.get_dummies(frame[FACILITY_COL], drop_first=True, dtype=float)
    return sm.add_constant(dummies)  # Add a constant for the intercept


# ---- SC population change ----
merged_data["sc_increase"] = _flag_increase("sc")
sc_rows = _usable_rows("sc")

# Create a contingency table for chi-squared test
contingency_table = pd.crosstab(sc_rows[FACILITY_COL], sc_rows["sc_change_category"])

# Perform chi-squared test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("Chi-squared Test Results for SC Population Changes:")
print(f"Chi-squared Statistic: {chi2}")
print(f"P-value: {p_value}")

# Logistic Regression for SC Population Change (Increase vs Others)
X = _facility_design_matrix(sc_rows)
y = sc_rows["sc_increase"]

logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary())

# ---- ST population change ----
merged_data["st_increase"] = _flag_increase("st")
st_rows = _usable_rows("st")

contingency_table_st = pd.crosstab(st_rows[FACILITY_COL], st_rows["st_change_category"])

chi2_st, p_value_st, dof_st, expected_st = chi2_contingency(contingency_table_st)
print("\nChi-squared Test Results for ST Population Changes:")
print(f"Chi-squared Statistic: {chi2_st}")
print(f"P-value: {p_value_st}")

# The ST model needs its own design matrix because its usable rows differ from SC's.
X_st = _facility_design_matrix(st_rows)
y_st = st_rows["st_increase"]

logit_model_st = sm.Logit(y_st, X_st)
result_st = logit_model_st.fit()
print(result_st.summary())
