In [2]:
import numpy as np 
import pandas as pd 
import pyreadstat

In [3]:
# Load the individual-level metro-cities survey file (SPSS .sav format; the file
# name indicates LASI Wave 1).
# read_sav returns two objects: the data (df_fp3) and a metadata object (meta_fp3).
# The metadata's column_names_to_labels mapping is what get_varname_by_label
# searches to resolve a column by its human-readable label.
fp3 = "./metro cities/3_LASI_W1_Individual_metrocities_28-03-22.sav"
df_fp3, meta_fp3 = pyreadstat.read_sav(fp3)

In [4]:
def get_varname_by_label(meta, target_label):
    """Resolve a variable name from its column label.

    The comparison ignores case and leading/trailing whitespace on both sides.

    Args:
        meta: Object exposing ``column_names_to_labels``, a mapping of
            variable name -> label (labels may be ``None``).
        target_label: Label text to search for.

    Returns:
        The first variable name (in mapping order) whose label matches,
        or ``None`` when no label matches.
    """
    wanted = target_label.strip().lower()
    candidates = (
        name
        for name, text in meta.column_names_to_labels.items()
        # Unlabelled columns carry None and can never match.
        if text is not None and text.strip().lower() == wanted
    )
    return next(candidates, None)

def recode_yes(x):
    """Collapse a yes/no survey response into a 0/1 indicator.

    Assumes the survey coding 1 = Yes, 2 = No. Only a value equal to 1
    yields 1; every other value (2, NaN, None, other codes) yields 0.
    """
    if x == 1:
        return 1
    return 0

##### Definition 2:
Hard threshold: Individuals who have lived in their current city for more than 20 years will not be considered migrants (Binary Variable).

##### Definition 3:
Duration-based categories: Migration status is classified into ordinal categories by years lived in the current location, e.g. up to 2 years, 3-5 years, 6-10 years (Ordinal Variable).

Create migration_3 with ordinal groups:
1. 0-2 years -> 1
2. 3-5 years -> 2
3. 6-10 years -> 3
4. 11-20 years -> 4
5. 21-40 years -> 5
6. More than 40 years -> 6
7. NaN values -> 0


In [5]:
# params for definition 2
# Cut-off, in years of continuous residence, used to build the binary migration_2 flag.
THRESHOLD = 20

# params for definition 3
# Bin edges (years) passed to pd.cut with right=True, i.e. the intervals are
# (-inf, 2], (2, 5], (5, 10], (10, 20], (20, 40], (40, inf].
BINS = [-np.inf, 2, 5, 10, 20, 40, np.inf]
# Ordinal code assigned to each interval above, in order; there must be exactly
# len(BINS) - 1 labels. Rows with missing years are coded 0 when migration_3 is built.
LABELS = [1, 2, 3, 4, 5, 6]

# params for outcome - mental health (CIDI SF score method)
# Define the column names directly (based on the lowercase version of LASI dataset codes)
# Items summed into CIDI_1 (dysphoria score); a response equal to 1 counts as endorsed.
cols_cidi1 = ["mh204", "mh205", "mh206", "mh207", "mh208", "mh209", "mh210", "mh211"]
# Items summed into CIDI_2 (anhedonia score); same 1 = endorsed coding.
cols_cidi2 = ["mh217", "mh218", "mh219", "mh220", "mh221", "mh222"]

# Screening questions
# A screening answer other than 1 forces the corresponding score to 0.
col_MH201 = "mh201"  # Screening for dysphoria
col_MH214 = "mh214"  # Screening for anhedonia

CIDI_1_THRESHOLD = 3 # Depression: if CIDI_1 score >=3, then 1 else 0 (CIDI_2 is not used for the flag)

In [6]:
# Build df_input: migration exposures (definitions 2 and 3) and the CIDI-SF outcome.
#
# Reads : df_fp3, meta_fp3, THRESHOLD, BINS, LABELS, cols_cidi1, cols_cidi2,
#         col_MH201, col_MH214, CIDI_1_THRESHOLD
# Writes: col_name_years, years_living, df_input, and the helper columns
#         df_fp3["MH201_binary"] / df_fp3["MH214_binary"]

# Resolve the column by its label instead of hard-coding the variable code.
col_name_years = get_varname_by_label(meta_fp3, "Since how many years living continuously in this area")
if col_name_years is None:
    raise ValueError("Column label 'Since how many years living continuously in this area' not found in fp3.sav metadata.")

# Coerce to numeric; anything unparseable becomes NaN and is handled explicitly below.
years_living = pd.to_numeric(df_fp3[col_name_years], errors='coerce')

# Start from the raw frame's index so every Series assigned below aligns row-for-row
# (an empty DataFrame would otherwise take an implicit RangeIndex from the first array).
df_input = pd.DataFrame(index=df_fp3.index)

# migration_2 (Definition 2): 0 = non-migrant if years lived is strictly MORE than
# THRESHOLD, else 1. Strict '>' matches the definition text and keeps exactly
# THRESHOLD years on the same side as the migration_3 bin that ends at 20.
# NOTE(review): rows with missing years compare False and are therefore coded 1
# (migrant), as before; they are identifiable via migration_3 == 0 - confirm intended.
df_input["migration_2"] = np.where(years_living > THRESHOLD, 0, 1)

# migration_3 (Definition 3): ordinal duration group from BINS/LABELS
# (right-closed intervals); missing years get the extra category 0.
df_input["migration_3"] = (
    pd.cut(years_living, bins=BINS, labels=LABELS, right=True)
    .cat.add_categories(0)
    .fillna(0)
    .astype(int)
)

# CIDI_1 (dysphoria score): number of symptom items answered 1 (= Yes),
# forced to 0 when the screening question was not answered Yes.
# Vectorised .eq(1) replaces the deprecated DataFrame.applymap; NaN != 1, so
# missing answers count as 0 exactly like recode_yes.
df_fp3["MH201_binary"] = df_fp3[col_MH201].eq(1).astype("int64")
df_input["CIDI_1"] = (
    df_fp3[cols_cidi1].eq(1).sum(axis=1)
    .where(df_fp3["MH201_binary"].eq(1), 0)
)

# CIDI_2 (anhedonia score): same construction with its own screener.
# It is stored for reference only; the Depression flag below uses CIDI_1 alone.
df_fp3["MH214_binary"] = df_fp3[col_MH214].eq(1).astype("int64")
df_input["CIDI_2"] = (
    df_fp3[cols_cidi2].eq(1).sum(axis=1)
    .where(df_fp3["MH214_binary"].eq(1), 0)
)

# Binary depression outcome: 1 if CIDI_1 >= CIDI_1_THRESHOLD, else 0.
df_input["Depression"] = df_input["CIDI_1"].ge(CIDI_1_THRESHOLD).astype("int64")

# Display first few rows of the new DataFrame
print(df_input.head())


   migration_2  migration_3  CIDI_1  CIDI_2  Depression
0            1            3       0       0           0
1            1            4       0       0           0
2            1            4       0       0           0
3            0            5       0       0           0
4            0            5       0       0           0


In [7]:
# Class balance of the binary outcome: number of respondents with
# Depression == 0 versus Depression == 1 (value_counts sorts by frequency).
counts = df_input["Depression"].value_counts()
print(counts)

Depression
0    4321
1     152
Name: count, dtype: int64
