In [2]:
import pandas as pd
from scipy.stats import chi2_contingency
import statsmodels.api as sm
import numpy as np

# Input CSVs for the two census rounds, kept in one place so they are easy to swap.
# NOTE(review): both constants currently hold the same path, so `census_2011`
# is loaded from the very same file as `census_2001` until the second path is
# replaced with the actual 2011 data file.
CENSUS_2001_CSV = "pc01_vd_clean_pc01subdist.csv"
CENSUS_2011_CSV = "pc01_vd_clean_pc01subdist.csv"

census_2001 = pd.read_csv(CENSUS_2001_CSV)
census_2011 = pd.read_csv(CENSUS_2011_CSV)


In [None]:
# Show every column name of the 2001 frame as a plain Python list.
[column_name for column_name in census_2001.columns]

In [None]:
# Join the two census frames on the subdistrict identifier they share.
# Columns that exist under the same name in both frames are disambiguated with
# a year suffix rather than pandas' default "_x"/"_y", which lines up with the
# "*_2001" / "*_2011" column names read by the calculations below.
# Columns whose names do not collide are left untouched by `suffixes`.
merged_data = pd.merge(
    census_2001,
    census_2011,
    on="pc01_subdistrict_id",
    how="inner",
    suffixes=("_2001", "_2011"),
)

# Percentage change of the SC and ST populations between the two rounds:
# (2011 - 2001) / 2001 * 100.
# NOTE: a 2001 value of 0 makes the division yield inf (or NaN for 0 -> 0);
# those values are passed on unchanged to the categorisation step.
sc_pop_2011 = merged_data["sc_population_2011"]
sc_pop_2001 = merged_data["sc_population_2001"]
merged_data["sc_change_pct"] = sc_pop_2011.sub(sc_pop_2001).div(sc_pop_2001).mul(100)

st_pop_2011 = merged_data["st_population_2011"]
st_pop_2001 = merged_data["st_population_2001"]
merged_data["st_change_pct"] = st_pop_2011.sub(st_pop_2001).div(st_pop_2001).mul(100)

# Categorize each observation by its number of educational facilities in 2011
def categorize_facilities(count, low_below=2, medium_max=5):
    """Bucket a facility count into "Low", "Medium" or "High".

    Args:
        count: Number of educational facilities for one observation. May be
            missing (None / NaN).
        low_below: Counts strictly below this value are labelled "Low".
        medium_max: Counts from ``low_below`` up to and including this value
            are labelled "Medium"; larger counts are labelled "High".

    Returns:
        The bucket label as a string, or NaN when ``count`` is missing.
    """
    # A missing count fails every comparison below, so without this guard it
    # would fall through to "High" (and None would raise TypeError).
    if pd.isna(count):
        return np.nan
    if count < low_below:
        return "Low"
    if count <= medium_max:
        return "Medium"
    return "High"

# Label every row with its facility bucket, derived from the 2011 facility count.
facility_counts_2011 = merged_data["num_educational_facilities_2011"]
merged_data["education_facility_category"] = facility_counts_2011.map(categorize_facilities)

# Categorize population changes into bins (decrease, no change, increase)
def categorize_change(value):
    """Label a percentage change by its sign.

    Args:
        value: Percentage change for one observation. May be missing
            (None / NaN), e.g. when the change could not be computed.

    Returns:
        "Decrease" for negative values, "No Change" for exactly zero,
        "Increase" for positive values (including +inf), or NaN when
        ``value`` is missing.
    """
    # NaN compares False against everything, so without this guard an
    # undefined change would be reported as "Increase".
    if pd.isna(value):
        return np.nan
    if value < 0:
        return "Decrease"
    if value == 0:
        return "No Change"
    return "Increase"

# Turn each percentage-change column into its Decrease / No Change / Increase label.
for pct_column, category_column in (
    ("sc_change_pct", "sc_change_category"),
    ("st_change_pct", "st_change_category"),
):
    merged_data[category_column] = merged_data[pct_column].map(categorize_change)

# Cross-tabulate facility bucket (rows) against SC change label (columns).
contingency_table = pd.crosstab(
    index=merged_data["education_facility_category"],
    columns=merged_data["sc_change_category"],
)

# Chi-squared test of independence on that table; report statistic and p-value.
chi2, p_value, dof, expected = chi2_contingency(observed=contingency_table)
print(
    "Chi-squared Test Results for SC Population Changes:",
    f"Chi-squared Statistic: {chi2}",
    f"P-value: {p_value}",
    sep="\n",
)

# Logistic regression for SC population change (Increase vs. everything else)
merged_data["sc_increase"] = (merged_data["sc_change_category"] == "Increase").astype(int)

# One-hot encode the facility bucket. drop_first removes the first level in
# sorted order, which becomes the reference category of the model.
# dtype=float is required: pandas 2.x emits boolean dummies by default, and a
# frame mixing bool dummies with the float constant is converted to an object
# array that statsmodels refuses to fit.
X = pd.get_dummies(merged_data["education_facility_category"], drop_first=True, dtype=float)
X = sm.add_constant(X)  # Add a constant for the intercept
y = merged_data["sc_increase"]

# Fit only on rows where both the predictor and the outcome label are known.
# A missing label would otherwise be encoded as all-zero dummies / outcome 0
# and silently enter the model. X and y themselves stay full-length.
sc_known = (
    merged_data["education_facility_category"].notna()
    & merged_data["sc_change_category"].notna()
)
logit_model = sm.Logit(y.loc[sc_known], X.loc[sc_known])
result = logit_model.fit()
print(result.summary())

# Same chi-squared test of independence, this time for the ST change labels.
contingency_table_st = pd.crosstab(
    index=merged_data["education_facility_category"],
    columns=merged_data["st_change_category"],
)

chi2_st, p_value_st, dof_st, expected_st = chi2_contingency(observed=contingency_table_st)
print(
    "\nChi-squared Test Results for ST Population Changes:",
    f"Chi-squared Statistic: {chi2_st}",
    f"P-value: {p_value_st}",
    sep="\n",
)

# Logistic regression for ST population change (Increase vs. everything else),
# reusing the facility-bucket design matrix X built for the SC model.
merged_data["st_increase"] = (merged_data["st_change_category"] == "Increase").astype(int)
y_st = merged_data["st_increase"]

# Fit only on rows where both the predictor and the outcome label are known.
st_known = (
    merged_data["education_facility_category"].notna()
    & merged_data["st_change_category"].notna()
)
# Cast the design matrix to float at the call site: boolean dummy columns
# (the pandas 2.x default) next to the float constant would otherwise reach
# statsmodels as an object array, which it refuses to fit.
logit_model_st = sm.Logit(y_st.loc[st_known], X.loc[st_known].astype(float))
result_st = logit_model_st.fit()
print(result_st.summary())
