In [11]:
import pandas as pd
import numpy as np

In [12]:
df = pd.read_csv('analysis_ready_dataset.csv')

In [13]:
df = df.rename(columns={
    'Dropout_Rate_Sec': 'dropout_sec',                 # Secondary dropout rate
    'Dropout_Rate_HSec': 'dropout_hsec',               # Higher secondary dropout rate
    'Avg_PTR_district_23_24': 'ptr',                   # Avg Pupil-Teacher Ratio
    '%_Schools_Func_Toilets_23_24': 'pct_toilets',     # % schools with functional toilets
    '%_Schools_Library_23_24': 'pct_library',          # % schools with library
    '%_Schools_Electricity_23_24': 'pct_school_elec',  # % schools with electricity
    'NFHS_Women_10_plus_School': 'women_10plus',       # % women with 10+ yrs schooling
    'NFHS_Women_Literate': 'women_literate',           # % literate women
    'NFHS_Female_Ever_Attended_School': 'female_ever_school', # % females ever attended school
    'NFHS_Improved_Water': 'nfhs_water',               # % HHs with improved drinking water
    'NFHS_Improved_Sanitation': 'nfhs_sanitation',     # % HHs with improved sanitation
    'NFHS_Electricity': 'nfhs_electricity',            # % HHs with electricity
    'NFHS_Health_Insurance': 'nfhs_insurance',         # % HHs with any health insurance
    'NFHS_Early_Marriage': 'early_marriage',           # % women married before 18 yrs
    'NFHS_Teen_Pregnancy': 'teen_pregnancy',           # % women (15‚Äì19) who are mothers
    'NFHS_Family_Planning_Modern': 'fp_modern'         # % using modern family planning
})

# ensure columns exist
print("Columns available:", df.columns.tolist())

Columns available: ['district', 'dropout_sec', 'dropout_hsec', 'ptr', 'pct_toilets', 'pct_library', 'pct_school_elec', 'women_10plus', 'women_literate', 'female_ever_school', 'nfhs_water', 'nfhs_sanitation', 'nfhs_electricity', 'nfhs_insurance', 'early_marriage', 'teen_pregnancy', 'fp_modern']


In [15]:
# ==========================================================
# üìä BEAUTIFIED DASHBOARD VISUALS ‚Äî CORRELATION + RF IMPORTANCE
# ==========================================================

import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ==========================================================
# 1Ô∏è‚É£ COMBINE DROP RATES AND DEFINE FEATURES
# ==========================================================
df['dropout_overall'] = df[['dropout_sec', 'dropout_hsec']].mean(axis=1)

feature_cols = [
    'women_10plus', 'women_literate', 'female_ever_school',
    'nfhs_electricity', 'nfhs_sanitation', 'nfhs_water',
    'nfhs_insurance', 'early_marriage', 'teen_pregnancy',
    'fp_modern', 'ptr', 'pct_toilets', 'pct_library', 'pct_school_elec'
]

for col in feature_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df_model = df[feature_cols + ['dropout_overall']].dropna(subset=['dropout_overall'])
X = df_model[feature_cols]
y = df_model['dropout_overall']

# ==========================================================
# 2Ô∏è‚É£ RANDOM FOREST MODEL TRAINING
# ==========================================================
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

X_p = preprocessor.fit_transform(X)

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_p, y)
y_pred = rf.predict(X_p)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f"‚úÖ Random Forest Trained ‚Äî MAE: {mae:.3f}, R¬≤: {r2:.3f}")

# ==========================================================
# 3Ô∏è‚É£ USER-FRIENDLY LABELS
# ==========================================================
pretty_labels = {
    'ptr': 'Pupil‚ÄìTeacher Ratio',
    'early_marriage': 'Women Married Before 18 (%)',
    'teen_pregnancy': 'Teen Pregnancy Rate (%)',
    'pct_toilets': 'Schools with Functional Toilets (%)',
    'nfhs_water': 'Households with Safe Drinking Water (%)',
    'nfhs_insurance': 'Households with Health Insurance (%)',
    'fp_modern': 'Use of Modern Family Planning (%)',
    'pct_library': 'Schools with Library (%)',
    'female_ever_school': 'Women (6+) Who Ever Attended School (%)',
    'nfhs_electricity': 'Households with Electricity (%)',
    'nfhs_sanitation': 'Households with Improved Sanitation (%)',
    'pct_school_elec': 'Schools with Electricity (%)',
    'women_literate': 'Female Literacy Rate (%)',
    'women_10plus': 'Women with 10+ Years of Schooling (%)'
}

# ==========================================================
# 4Ô∏è‚É£ CORRELATION VISUALIZATION
# ==========================================================
corr_with_dropout = (
    df_model.corr()['dropout_overall']
    .drop('dropout_overall')
    .sort_values()
)

corr_df = corr_with_dropout.reset_index()
corr_df.columns = ['Feature', 'Correlation']
corr_df['Readable Name'] = corr_df['Feature'].map(pretty_labels)
corr_df['Effect'] = np.where(corr_df['Correlation'] > 0, 'High Dropout Factors', 'Dropout Reducing Factors')

fig_corr = px.bar(
    corr_df,
    x='Correlation',
    y='Readable Name',
    color='Effect',
    text='Correlation',
    orientation='h',
    color_discrete_map={
        'High Dropout Factors': '#e76f51',
        'Dropout Reducing Factors': '#2a9d8f'
    },
    title='Correlation of Socioeconomic and Educational Factors with Overall Dropout Rate',
    hover_data={'Feature': True, 'Correlation': ':.2f'}
)

fig_corr.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig_corr.update_layout(
    xaxis_title='Correlation Coefficient',
    yaxis_title='',
    title_font_size=18,
    xaxis=dict(showgrid=True, zeroline=True, zerolinecolor='gray'),
    template='simple_white',
    height=600,
    legend_title_text='Effect on Dropout Rate'
)
fig_corr.show()

# ==========================================================
# 5Ô∏è‚É£ RANDOM FOREST FEATURE IMPORTANCE (INTERACTIVE)
# ==========================================================
rf_importance = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=True)
rf_df = rf_importance.reset_index()
rf_df.columns = ['Feature', 'Importance']
rf_df['Readable Name'] = rf_df['Feature'].map(pretty_labels)

fig_rf = px.bar(
    rf_df,
    x='Importance',
    y='Readable Name',
    orientation='h',
    text='Importance',
    color='Importance',
    color_continuous_scale='Blues',
    title='Feature Importance ‚Äî Random Forest Model',
    hover_data={'Feature': True, 'Importance': ':.3f'}
)

fig_rf.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig_rf.update_layout(
    xaxis_title='Importance Score',
    yaxis_title='',
    template='simple_white',
    title_font_size=18,
    height=600,
    coloraxis_showscale=False
)
fig_rf.show()

# ==========================================================
# 6Ô∏è‚É£ SUMMARY INSIGHT
# ==========================================================
print("\nüß† Insight Summary:")
print("""
‚û°Ô∏è Higher Pupil‚ÄìTeacher Ratio (PTR), Early Marriage, and Teen Pregnancy rates show strong POSITIVE correlation.
   ‚Üí These are the leading contributors to increased dropout rates.

‚û°Ô∏è Female Education (10+ years schooling, literacy), and better School Infrastructure (Electricity, Library, Toilets)
   show strong NEGATIVE correlation.
   ‚Üí Districts with better educational environments have significantly lower dropout rates.

‚û°Ô∏è Random Forest confirms the same ‚Äî Women's Education and PTR emerge as the two dominant factors
   explaining variance in dropout rates across districts.
""")


‚úÖ Random Forest Trained ‚Äî MAE: 0.021, R¬≤: 0.942



üß† Insight Summary:

‚û°Ô∏è Higher Pupil‚ÄìTeacher Ratio (PTR), Early Marriage, and Teen Pregnancy rates show strong POSITIVE correlation.
   ‚Üí These are the leading contributors to increased dropout rates.

‚û°Ô∏è Female Education (10+ years schooling, literacy), and better School Infrastructure (Electricity, Library, Toilets)
   show strong NEGATIVE correlation.
   ‚Üí Districts with better educational environments have significantly lower dropout rates.

‚û°Ô∏è Random Forest confirms the same ‚Äî Women's Education and PTR emerge as the two dominant factors
   explaining variance in dropout rates across districts.

