In [1]:
import pandas as pd
import numpy as np

# 1. Load the district-level input file.
df = pd.read_csv('dd_mobile_gap_final.csv')

# 2. Gap between the two dd_mobile_own columns, computed as round 5 minus round 4.
#    NOTE(review): an earlier comment described this as "dd_4 minus dd_5"; the
#    code computes 5 - 4. Confirm which sign is intended.
df['dd_gap'] = df['dd_mobile_own_5'].sub(df['dd_mobile_own_4'])

# 3. Label every row by comparing the gap with the cutoff.
#    Rule: gap strictly greater than 8 -> 'treated', otherwise 'not_treated'.
#    NOTE(review): an earlier comment quoted the cutoff as 0.08; the code uses 8,
#    presumably because the dd_* columns are in percentage points - confirm.
#    NOTE(review): a missing gap compares as False, so rows with NaN in either
#    dd_mobile_own column are labelled 'not_treated'.
above_cutoff = df['dd_gap'].gt(8)
df['treatment_status'] = np.where(above_cutoff, 'treated', 'not_treated')

# 4. Save the result.
#    Every column is written (no column subset is taken); the two new columns
#    'dd_gap' and 'treatment_status' are appended to the original ones.
df.to_csv('nya.csv', index=False)

print("Processing complete. Saved results to nya.csv")
print("\nPreview of saved data:")
print(df.head())

Processing complete. Saved results to nya.csv

Preview of saved data:
                       state                 district  internet_5  mobile_5  \
0  andaman & nicobar islands                 nicobars         0.0  0.887097   
1  andaman & nicobar islands                 nicobars         0.0  0.640000   
2  andaman & nicobar islands  north  & middle andaman         0.0  0.942857   
3  andaman & nicobar islands  north  & middle andaman         0.0  0.846154   
4  andaman & nicobar islands            south andaman         0.0  0.962121   

   gender_5  internet_use_m_5  mobile_own_m_5  internet_use_f_5  \
0       0.0               0.0        0.887097               0.0   
1       1.0               0.0        0.887097               0.0   
2       0.0               0.0        0.942857               0.0   
3       1.0               0.0        0.942857               0.0   
4       0.0               0.0        0.962121               0.0   

   mobile_own_f_5  dd_internet_use_5  ...    pct_obc

In [2]:
import pandas as pd

# 1. Load the dataset
input_file = 'nya.csv'
output_file = 'nya_final.csv'

df = pd.read_csv(input_file)

# 2. Order rows so that the preferred duplicate comes first
# The dataset contains duplicates because it is stratified by gender (rows for men and women).
# However, columns like 'mobile_own_m_5' (men) and 'mobile_own_f_5' (women) capture
# both genders' data in a single row.
# We sort by 'n_men' (number of men observed) descending, so that when a district
# has several rows the one with the largest 'n_men' is kept in step 3.
# A stable sort (mergesort) is used so that rows with equal 'n_men' keep their
# original file order; with the default algorithm the surviving row among ties
# would be arbitrary.
if 'n_men' in df.columns:
    df = df.sort_values(by='n_men', ascending=False, kind='mergesort')
else:
    # Without 'n_men' there is nothing to rank on: say so instead of silently
    # falling back to "first row in file order".
    print("Warning: column 'n_men' not found; keeping the first row per district in file order.")

# 3. Remove duplicates
# One row per (state, district); keep='first' retains the row placed first by step 2.
df_clean = df.drop_duplicates(subset=['state', 'district'], keep='first')

# 4. Save the cleaned data
df_clean.to_csv(output_file, index=False)

print("Process complete.")
print(f"Original rows: {len(df)}")
print(f"Cleaned rows: {len(df_clean)}")
print(f"Saved to: {output_file}")

Process complete.
Original rows: 1212
Cleaned rows: 591
Saved to: nya_final.csv


In [5]:
import pandas as pd

# 1. Load the de-duplicated file
df = pd.read_csv('nya_final.csv')

# 2. Clean the text (remove surrounding spaces, make title case)
df['state'] = df['state'].str.strip().str.title()

# 3. Define the custom order list (north to south)
north_to_south = [
    "Jammu & Kashmir", "Ladakh", "Himachal Pradesh", "Punjab", "Chandigarh",
    "Uttarakhand", "Haryana", "Delhi", "Rajasthan", "Uttar Pradesh", "Bihar",
    "Sikkim", "Gujarat", "Madhya Pradesh", "Jharkhand", "West Bengal",
    "Maharashtra", "Chhattisgarh", "Odisha", "Telangana", "Andhra Pradesh",
    "Goa", "Karnataka", "Tamil Nadu", "Kerala"
]

# pd.Categorical turns every value that is not in `categories` into NaN, which
# would blank out the state name of those rows in the saved file. Any state
# present in the data but missing from the list above is therefore appended
# (alphabetically) after the curated order instead of being dropped.
unlisted_states = sorted(set(df['state'].dropna().unique()) - set(north_to_south))
if unlisted_states:
    print(f"Warning: states not in north_to_south, placed last: {unlisted_states}")

# 4. Create a Categorical type with this specific order
df['state'] = pd.Categorical(
    df['state'],
    categories=north_to_south + unlisted_states,
    ordered=True
)

# 5. Sort and save
# Stable sort: rows within the same state keep the order they had in the input.
df_sorted = df.sort_values('state', kind='mergesort')
df_sorted.to_csv('sorted_data.csv', index=False)

In [6]:
import pandas as pd
from scipy import stats

# Balance check on 'mean_age': treated vs. not-treated districts.
df = pd.read_csv('sorted_data.csv')

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'mean_age'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'mean_age'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Age (Treated): {mean_treated}")
print(f"mean Age (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Age (Treated): 30.35866461904762
mean Age (Not Treated): 30.22799711578947
Difference in Medians: 0.1306675032581488
Nt: 21
Nc: 570
t-statistic: 0.6600072087144648
p-value: 0.5158517203594245


In [7]:
# Balance check on 'mean_wealth_index': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'mean_wealth_index'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'mean_wealth_index'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Wealth Index (Treated): {mean_treated}")
print(f"mean Wealth Index (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Wealth Index (Treated): 62391.68024761905
mean Wealth Index (Not Treated): 16018.678036964913
Difference in Medians: 46373.00221065414
Nt: 21
Nc: 570
t-statistic: 1.718286490381767
p-value: 0.10024939646733991


In [8]:
# Balance check on 'mean_hh_size': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'mean_hh_size'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'mean_hh_size'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Household Size (Treated): {mean_treated}")
print(f"mean Household Size (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Household Size (Treated): 5.377864776190476
mean Household Size (Not Treated): 5.472297092631579
Difference in Medians: -0.09443231644110295
Nt: 21
Nc: 570
t-statistic: -0.7489549581497641
p-value: 0.46156479926689975


In [9]:
# Balance check on 'mean_education': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'mean_education'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'mean_education'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Education (Treated): {mean_treated}")
print(f"mean Education (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Education (Treated): 7.649374009523809
mean Education (Not Treated): 7.021214191403508
Difference in Medians: 0.6281598181203014
Nt: 21
Nc: 570
t-statistic: 2.15215919218303
p-value: 0.04221236339120589


In [10]:
# Balance check on 'literacy_gender_gap': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'literacy_gender_gap'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'literacy_gender_gap'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Literacy Gender Gap (Treated): {mean_treated}")
print(f"mean Literacy Gender Gap (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Literacy Gender Gap (Treated): 15.195720571428575
mean Literacy Gender Gap (Not Treated): 23.675526922257895
Difference in Medians: -8.47980635082932
Nt: 21
Nc: 570
t-statistic: -2.006034026228538
p-value: 0.05743993537558259


In [11]:
# Balance check on 'male_pct_married': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'male_pct_married'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'male_pct_married'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Male % Married (Treated): {mean_treated}")
print(f"mean Male % Married (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Male % Married (Treated): 61.29518380952382
mean Male % Married (Not Treated): 61.891521668421056
Difference in Medians: -0.5963378588972361
Nt: 21
Nc: 570
t-statistic: -0.4551893202282546
p-value: 0.653517013983594


In [12]:
# Balance check on 'female_pct_married': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'female_pct_married'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'female_pct_married'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Female % Married (Treated): {mean_treated}")
print(f"mean Female % Married (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Female % Married (Treated): 74.10736685714285
mean Female % Married (Not Treated): 71.91479490877194
Difference in Medians: 2.1925719483709116
Nt: 21
Nc: 570
t-statistic: 2.203785042089728
p-value: 0.03807886953505138


In [13]:
# Balance check on 'urban_share': treated vs. not-treated districts.
# Uses `df` and `stats` as already defined by an earlier cell.

# 1. Split the covariate by treatment status.
#    Missing values are dropped up front so that the means, the sample sizes and
#    the t-test all use the same rows (Series.mean() skips NaN, whereas len()
#    counts it and ttest_ind propagates it by default).
treated_group = df.loc[df['treatment_status'] == 'treated', 'urban_share'].dropna()
control_group = df.loc[df['treatment_status'] == 'not_treated', 'urban_share'].dropna()

# 2. Calculate the mean for both groups
mean_treated = treated_group.mean()
mean_not_treated = control_group.mean()

# 3. Calculate the difference in means (treated minus not treated)
diff_means = mean_treated - mean_not_treated

# 4. Calculate Nt and Nc (sample sizes)
Nt = len(treated_group)
Nc = len(control_group)

# 5. Calculate the t-statistic (equal_var=False -> Welch's t-test, which does
#    not assume equal variances in the two groups)
t_stat, p_val = stats.ttest_ind(treated_group, control_group, equal_var=False)

# Display the results
print(f"mean Urban Share (Treated): {mean_treated}")
print(f"mean Urban Share (Not Treated): {mean_not_treated}")
print(f"Difference in Means: {diff_means}")
print(f"Nt: {Nt}")
print(f"Nc: {Nc}")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")

mean Urban Share (Treated): 27.410235090476196
mean Urban Share (Not Treated): 26.933573141070173
Difference in Medians: 0.47666194940602225
Nt: 21
Nc: 570
t-statistic: 0.12814903315938725
p-value: 0.8991926151355726
