In [55]:
import pandas as pd
import numpy as np

df = pd.read_csv('sorted_Data_2.csv')

# 2. Calculate the gap (dd_4 minus dd_5)
df['dd_gap'] = df['dd_mobile_own_5'] - df['dd_mobile_own_4']

# 3. Apply the condition
# Logic: > 0.08 is treated, <= 0.08 is not treated
df['treatment_status'] = np.where(df['dd_gap'] < 8, 'treated', 'not_treated')

# 4. Select only the required columns
# "save this in a new file with district treated and non treated and dd_gap"
df.to_csv('nya.csv', index=False)

print(f"Processing complete. Saved results to nya.csv")
print("\nPreview of saved data:")
print(df.head())

Processing complete. Saved results to nya.csv

Preview of saved data:
             state   district  internet_5  mobile_5  gender_5  \
0  Jammu & Kashmir   baramula         0.0  0.936000       0.0   
1  Jammu & Kashmir      samba         0.0  0.900000       1.0   
2  Jammu & Kashmir  bandipore         0.0  0.614525       1.0   
3  Jammu & Kashmir  ganderbal         0.0  0.976000       0.0   
4  Jammu & Kashmir   srinagar         0.0  0.985507       0.0   

   internet_use_m_5  mobile_own_m_5  internet_use_f_5  mobile_own_f_5  \
0               0.0        0.936000               0.0        0.679348   
1               0.0        0.990826               0.0        0.900000   
2               0.0        0.953846               0.0        0.614525   
3               0.0        0.976000               0.0        0.612565   
4               0.0        0.985507               0.0        0.786667   

   dd_internet_use_5  ...    pct_obc  pct_general  pct_hindu  pct_muslim  \
0                0.0  ..

In [56]:
import pandas as pd

# 1. Load the dataset
input_file = 'nya.csv'
output_file = 'nya_final.csv'

df = pd.read_csv(input_file)

# 2. Optimize for Data Quality
# The dataset contains duplicates because it is stratified by gender (rows for men and women).
# However, columns like 'mobile_own_m_5' (men) and 'mobile_own_f_5' (women) capture 
# both genders' data in a single row.
# We sort by 'n_men' (number of men observed) descending. This ensures that if there
# are multiple rows for a district, we keep the one with the most data recorded.
if 'n_men' in df.columns:
    df = df.sort_values(by='n_men', ascending=False)

# 3. Remove Duplicates
# We drop duplicates based on 'state' and 'district', keeping the first occurrence 
# (which is now the one with the most observations due to step 2).
df_clean = df.drop_duplicates(subset=['state', 'district'], keep='first')

# 4. Save the cleaned data
df_clean.to_csv(output_file, index=False)

print(f"Process complete.")
print(f"Original rows: {len(df)}")
print(f"Cleaned rows: {len(df_clean)}")
print(f"Saved to: {output_file}")

Process complete.
Original rows: 591
Cleaned rows: 591
Saved to: nya_final.csv


In [57]:
import pandas as pd

# 1. Load your messy file
df = pd.read_csv('nya_final.csv')

# 2. Clean the text (remove spaces, make title case)
df['state'] = df['state'].str.strip().str.title()

# 3. Define the Custom Order list
north_to_south = [
    "Jammu & Kashmir", "Ladakh", "Himachal Pradesh", "Punjab", "Chandigarh",
    "Uttarakhand", "Haryana", "Delhi", "Rajasthan", "Uttar Pradesh", "Bihar",
    "Sikkim", "Gujarat", "Madhya Pradesh", "Jharkhand", "West Bengal",
    "Maharashtra", "Chhattisgarh", "Odisha", "Telangana", "Andhra Pradesh",
    "Goa", "Karnataka", "Tamil Nadu", "Kerala"
]

# 4. Create a Categorical type with this specific order
df['state'] = pd.Categorical(
    df['state'], 
    categories=north_to_south, 
    ordered=True
)

# 5. Sort and save
df_sorted = df.sort_values('state')
df_sorted.to_csv('sorted_data_2.csv', index=False)

In [58]:
import pandas as pd
from scipy import stats

df = pd.read_csv('sorted_data_2.csv') 

treated_group = df[df['treatment_status'] == 'treated']['mean_age']
control_group = df[df['treatment_status'] == 'not_treated']['mean_age']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Age (Treated): {median_treated}")
print(f"mean Age (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Age (Treated): 30.237749291366907
mean Age (Not Treated): 30.151477342857145
Difference in Medians: 0.08627194850976139
Nt: 556
Nc: 35
t-statistic: 0.5834604203690528
p-value: 0.562572839075771


In [59]:
treated_group = df[df['treatment_status'] == 'treated']['mean_wealth_index']
control_group = df[df['treatment_status'] == 'not_treated']['mean_wealth_index']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Wealth Index (Treated): {median_treated}")
print(f"mean Wealth Index (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Wealth Index (Treated): 15220.26361109712
mean Wealth Index (Not Treated): 56525.862814285705
Difference in Medians: -41305.59920318858
Nt: 556
Nc: 35
t-statistic: -2.3544303300986913
p-value: 0.023567931046358268


In [60]:
treated_group = df[df['treatment_status'] == 'treated']['mean_hh_size']
control_group = df[df['treatment_status'] == 'not_treated']['mean_hh_size']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Household Size (Treated): {median_treated}")
print(f"mean Household Size (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Household Size (Treated): 5.478564567266187
mean Household Size (Not Treated): 5.316074391428571
Difference in Medians: 0.1624901758376165
Nt: 556
Nc: 35
t-statistic: 1.7490369525153076
p-value: 0.08729898921062246


In [61]:
treated_group = df[df['treatment_status'] == 'treated']['mean_education']
control_group = df[df['treatment_status'] == 'not_treated']['mean_education']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Education (Treated): {median_treated}")
print(f"mean Education (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Education (Treated): 6.988468662050359
mean Education (Not Treated): 7.918296205714286
Difference in Medians: -0.9298275436639276
Nt: 556
Nc: 35
t-statistic: -4.332109196802621
p-value: 8.4440085868019e-05


In [62]:
treated_group = df[df['treatment_status'] == 'treated']['literacy_gender_gap']
control_group = df[df['treatment_status'] == 'not_treated']['literacy_gender_gap']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Literacy Gender Gap (Treated): {median_treated}")
print(f"mean Literacy Gender Gap (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Literacy Gender Gap (Treated): 23.584090979293165
mean Literacy Gender Gap (Not Treated): 20.040168377142855
Difference in Medians: 3.5439226021503103
Nt: 556
Nc: 35
t-statistic: 1.1124068574365125
p-value: 0.27264113031433085


In [63]:
treated_group = df[df['treatment_status'] == 'treated']['male_pct_married']
control_group = df[df['treatment_status'] == 'not_treated']['male_pct_married']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Male % Married (Treated): {median_treated}")
print(f"mean Male % Married (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Male % Married (Treated): 61.98893418705036
mean Male % Married (Not Treated): 59.98625151428571
Difference in Medians: 2.0026826727646494
Nt: 556
Nc: 35
t-statistic: 2.239892058521878
p-value: 0.03065569621113542


In [64]:
treated_group = df[df['treatment_status'] == 'treated']['female_pct_married']
control_group = df[df['treatment_status'] == 'not_treated']['female_pct_married']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Female % Married (Treated): {median_treated}")
print(f"mean Female % Married (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Female % Married (Treated): 71.9852269676259
mean Female % Married (Not Treated): 72.11147451428572
Difference in Medians: -0.12624754665981186
Nt: 556
Nc: 35
t-statistic: -0.1487958526723865
p-value: 0.8824595458763147


In [65]:
treated_group = df[df['treatment_status'] == 'treated']['urban_share']
control_group = df[df['treatment_status'] == 'not_treated']['urban_share']

# 2. Calculate mean Age for both groups
median_treated = treated_group.mean()
median_not_treated = control_group.mean()

# 3. Calculate the difference in medians
diff_medians = median_treated - median_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Urban Share (Treated): {median_treated}")
print(f"mean Urban Share (Not Treated): {median_not_treated}")
print(f"Difference in Medians: {diff_medians}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Urban Share (Treated): 27.050144783291362
mean Urban Share (Not Treated): 25.367746508571425
Difference in Medians: 1.6823982747199366
Nt: 556
Nc: 35
t-statistic: 0.5992752364265364
p-value: 0.552307870179611


In [66]:
#list all treated districts
treated_districts = df[df['treatment_status'] == 'treated'][['state', 'district', 'dd_gap']]
print("Treated Districts:")

Treated Districts:


In [67]:
import pandas as pd

# --- CONFIGURATION ---
input_file = 'sorted_data_2.csv' # Or 'dd_mobile_gap_final.csv'
target_column = 'treatment_status'
target_value = 'treated' # The value that indicates a district is treated

# --- PROCESSING ---

try:
    df = pd.read_csv(input_file)
except FileNotFoundError:
    print(f"Error: Could not find {input_file}")
    exit()

# Check if column exists
if target_column not in df.columns:
    print(f"Error: Column '{target_column}' not found. Available columns:")
    print(df.columns.tolist())
    exit()

# Normalize text to avoid case sensitivity issues (optional but recommended)
df[target_column] = df[target_column].astype(str).str.strip().str.lower()

# Filter for the target value
treated_df = df[df[target_column] == target_value.lower()]

# Get unique districts
treated_districts = treated_df['district'].unique()

print(f"--- Found {len(treated_districts)} Treated Districts ---")
if len(treated_districts) > 0:
    for district in sorted(treated_districts):
        dd_gap = treated_df[treated_df['district'] == district]['dd_gap'].values[0]
        print(district, dd_gap)
else:
    print(f"No districts found with {target_column} = '{target_value}'.")
    print("Unique values found in this column were:")
    print(df[target_column].unique())

--- Found 551 Treated Districts ---
agra -5.127136
ahmadnagar 2.974399999999996
aizawl -3.635109
ajmer -4.749244999999995
akola -6.023479000000009
alappuzha 1.4532089999999975
aligarh -13.640903000000002
alirajpur -8.068725
allahabad 0.5887719999999987
almora -17.5535887
alwar -3.512065999999997
ambedkar nagar -8.991954999999997
amravati -21.38017600000001
amreli -0.260902999999999
amritsar -5.460106
anand 5.707421000000004
anantapur -1.3847310000000022
anantnag -28.604366
anjaw -27.542206
anugul -20.710453
anuppur -7.383217000000002
araria -6.78595
ariyalur -10.790454
arwal -5.362209
ashoknagar 3.294121000000004
auraiya -5.845031999999996
aurangabad -7.152206
azamgarh 0.5255590000000012
badgam -15.025237999999998
bagalkot -28.66386
bageshwar -9.42627
baghpat -9.963864
bahraich -13.044360999999997
baksa -15.144484999999996
balaghat -28.24165
balangir -18.203529000000003
baleshwar -14.143498
ballia -5.905613000000002
balrampur -20.02214
banas kantha 2.9572829999999968
bandipore -18.2248