In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
import matplotlib.pyplot as plt
import seaborn as sns

# Select relevant variables
variables = [
    'pc11_vd_p_sch_gov',   # Government primary schools
    'pc11_vd_m_sch_gov',   # Government middle schools
    'pc11_vd_s_sch_gov',   # Government secondary schools
    'pc11_vd_ph_cntr',     # Primary health centres
    'pc11_vd_all_hosp',    # Allopathic hospitals
    'pc11_vd_comm_bank',   # Commercial banks
    'pc11_vd_power_dom',   # Power supply for domestic use
    'pc11_vd_wat_tap_trt', # Treated tap water
    'pc11_vd_mobl_cov',    # Mobile phone coverage
    'pc11_vd_bus_gov',     # Public bus service
    'pc11_vd_t_p'          # Total population
]

# Columns that are cast to int below.
# NOTE(review): presumably 0/1 indicator flags - confirm against the codebook.
binary_vars = ['pc11_vd_power_dom', 'pc11_vd_wat_tap_trt', 'pc11_vd_mobl_cov', 'pc11_vd_bus_gov']

# Load the dataset
# Only the columns used by the analysis are parsed, so the remaining columns
# of the CSV are never materialised in memory.
data = pd.read_csv('pc11_vd_clean_shrid.csv', usecols=variables)

# Prepare the data
# Re-select in `variables` order (read_csv keeps the file's column order),
# drop every row with a missing value in any selected column, and cast the
# flag columns to int in the same expression. Doing the cast through
# `astype` with a column->dtype mapping avoids assigning into a frame that
# was itself derived from a selection.
data = data[variables].dropna().astype(dict.fromkeys(binary_vars, int))

# Create the infrastructural development index (IDI)
# Each component is min-max scaled to [0, 1] and the scaled components are
# summed row-wise, so every component carries equal weight in the index.
index_vars = variables[:-1]  # Exclude total population (last entry of `variables`)
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[index_vars])
data['infra_index'] = data_scaled.sum(axis=1)

# Classify villages
# 1 = index at or above the median, 0 = below the median.
# NOTE(review): the label is a deterministic function of the `index_vars`
# columns, which are also used as model features further down, and the
# median is computed over all rows before any train/test split. Reported
# classifier scores should be interpreted with that in mind.
median_index = data['infra_index'].median()
# Vectorised comparison instead of a per-row Python lambda.
data['infra_level'] = (data['infra_index'] >= median_index).astype('int64')

# Prepare features and target variable
# Predictors: every index component followed by total population.
# Target: the 0/1 infrastructure level.
feature_columns = index_vars + ['pc11_vd_t_p']
X = data[feature_columns]
y = data['infra_level']

# Split the data
# 70/30 hold-out with a fixed seed, stratified on the target.
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Feature scaling
# The scaler is fitted on the training partition only and then applied to
# both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
# All three estimators share the same seed for reproducibility.
logreg = LogisticRegression(random_state=42, max_iter=1000)
rf = RandomForestClassifier(random_state=42, n_estimators=100)
# probability=True so the SVC exposes predict_proba, which the evaluation
# step calls.
svc = SVC(random_state=42, probability=True, kernel='rbf')

# Display name -> estimator, in the order they are trained and reported.
named_estimators = [
    ('Logistic Regression', logreg),
    ('Random Forest', rf),
    ('Support Vector Machine', svc),
]

# Train models
# Every estimator is fitted on the same standardised training matrix.
for _, estimator in named_estimators:
    estimator.fit(X_train_scaled, y_train)

# Predictions
# Each entry pairs a fitted estimator with the test matrix it is scored on.
models = {
    label: (estimator, X_test_scaled)
    for label, estimator in named_estimators
}

# Evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    """Print classification metrics for a fitted model and return its scores.

    Args:
        model: A fitted classifier with ``predict`` and either
            ``predict_proba`` or ``decision_function``.
        X_test: Feature matrix to score.
        y_test: True labels for ``X_test``.
        model_name: Label printed at the top of the report.

    Returns:
        The per-sample scores passed to ``roc_auc_score``: column 1 of
        ``predict_proba`` when the model provides it, otherwise the output
        of ``decision_function``.
    """
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        # Classifiers without probability estimates (for example an SVC
        # built with probability=False) can still be ranked for ROC/AUC by
        # their decision scores.
        y_proba = model.decision_function(X_test)

    print(f"Model: {model_name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("-"*50)
    return y_proba

# Store predicted probabilities for ROC curves
# Evaluating a model prints its report and yields the scores kept here,
# keyed by the model's display name.
probas = {}
for name in models:
    model, X_tst = models[name]
    probas[name] = y_proba = evaluate_model(model, X_tst, y_test, name)

# Plot ROC curves
# One curve per evaluated model, with its AUC shown in the legend entry.
plt.figure(figsize=(10, 6))
for model_name in probas:
    y_proba = probas[model_name]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    curve_label = f'{model_name} (AUC = {roc_auc_score(y_test, y_proba):.2f})'
    plt.plot(fpr, tpr, label=curve_label)

# Dashed diagonal: reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')

roc_axes = plt.gca()
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('ROC Curve Comparison')
roc_axes.legend()
plt.show()

# Feature importance from Random Forest
# Pair each importance value with its feature column name and draw the
# values as a bar chart, largest first.
feature_names = X.columns
importances = rf.feature_importances_
forest_importances = pd.Series(importances, index=feature_names)

ranked_importances = forest_importances.sort_values(ascending=False)
importance_axes = ranked_importances.plot.bar(figsize=(10, 6))
importance_axes.set_title('Feature Importances from Random Forest')
importance_axes.set_xlabel('Features')
importance_axes.set_ylabel('Importance')
plt.show()


  from pandas.core import (
