## Import libraries and warnings

In [1]:
import warnings
import os
import re
import pandas as pd
import numpy as np
from scipy.stats import zscore # For calculating Z-scores


# Option 1: Ignore all FutureWarnings globally
warnings.simplefilter(action='ignore', category=FutureWarning)

print("FutureWarnings are now ignored.")



## Load and read data

In [2]:
df = pd.read_csv('../data/balanced_data.csv')

df.head()

Unnamed: 0,EnumID,State,Age,Gender,Marital Status,Family Setting,Num of Children,Educational Status,Employment Status,Monthly Income,...,Meds_Explained_SideFX,Encourage_Questions,Respond_Q_Concerns,Showed_Personal_Concern,Involved_In_Decisions,Discuss_NextSteps,Checked_Understanding,Time_Spent_Adequate,Visit_Satisfaction,Source
0,SYNTH-NEU-284,Jigawa,25-34,Female,Married,Monogamy,1-2,"Tertiary education (e.g., University, college)",Self-employed,"Less than 20,000 Naira",...,Agree,Agree,Neither Agree or Disagree,Strongly Agree,Agree,Agree,Agree,Agree,Neutral,Synthetic
1,BC023,Bauchi,25-34,Female,Widowed,Polygamy,Greater than 4,No formal education,Self-employed,"Less than 20,000 Naira",...,Strongly Agree,Strongly Agree,Agree,Disagree,Agree,Agree,Disagree,Agree,Very satisfied,Original
2,SYNTH-SAT-140,Jigawa,55 years and above,Female,Widowed,Monogamy,1-2,Secondary education,Unemployed,"51,000–100,000 Naira",...,Agree,Agree,Agree,Agree,Agree,Agree,Agree,Disagree,Satisfied,Synthetic
3,JG14,Jigawa,35–44,Female,Divorced,Monogamy,1-2,Secondary education,Employed part-time,"Less than 20,000 Naira",...,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Very satisfied,Original
4,SYNTH-VER-519,Bauchi,35–44,Female,Single,Monogamy,Greater than 4,Secondary education,Self-employed,"Less than 20,000 Naira",...,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Agree,Strongly Agree,Strongly Agree,Strongly Agree,Very dissatisfied,Synthetic


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   EnumID                    1046 non-null   object
 1   State                     1046 non-null   object
 2   Age                       1046 non-null   object
 3   Gender                    1046 non-null   object
 4   Marital Status            1046 non-null   object
 5   Family Setting            1046 non-null   object
 6   Num of Children           1046 non-null   object
 7   Educational Status        1046 non-null   object
 8   Employment Status         1046 non-null   object
 9   Monthly Income            1046 non-null   object
 10  Treatment Regimen         1046 non-null   object
 11  HIV_Duration_Years        1046 non-null   object
 12  Care_Duration_Years       1046 non-null   object
 13  Facility_Care_Dur_Years   1046 non-null   object
 14  HIV_Diag_Type           

## Data Preprocessing

### Combine Target Labels

### Transformation of Continuous Variables (Currently Categorical Ranges)

#### 1. Transform 'Age'

In [4]:
#@title 1. Transform 'Age'
age_mapping = {
    '18–24': 21,
    '25-34': 29.5,
    '35–44': 39.5,
    '45–54': 49.5,
    '55 years and above': 65 # Estimating 65 as a reasonable midpoint for an open-ended "above 55"
}
df['Age'] = df['Age'].replace(age_mapping).astype(float) # Convert to float

print("--- Transformed 'Age' ---")
print(df['Age'].unique())
#print(df[['Age']].head())
#print("-" * 30)

--- Transformed 'Age' ---
[29.5 65.  39.5 21.  49.5]


#### 2. Transform 'Num of Children'

In [5]:
#@title 2. Transform 'Num of Children'

children_mapping = {
    '1-2': 1.5,
    '3-4': 3.5,
    'Greater than 4': 5 # Estimating 5 as a reasonable value for "Greater than 4"
}
df['Num of Children'] = df['Num of Children'].replace(children_mapping).astype(float)

print("--- Transformed 'Num of Children' ---")
print(df['Num of Children'].unique())
#print(df[['Num of Children']].head())
#print("-" * 30)

--- Transformed 'Num of Children' ---
[1.5 5.  3.5]


#### 3. Transform 'Monthly Income'

In [6]:
#@title 3. Transform 'Monthly Income'
income_mapping = {
    'Less than 20,000 Naira': 10000,
    '20,000–50,000 Naira': 35000,
    '51,000–100,000 Naira': 75500,
    '101,000–200,000 Naira': 150500,
    'More than 200,000 Naira': 250000, # Estimating 250,000 as a reasonable value
    'Prefer not to say': np.nan # Map 'Prefer not to say' to NaN
}
df['Monthly Income'] = df['Monthly Income'].replace(income_mapping).astype(float)

print("--- Transformed 'Monthly Income' ---")
print(df['Monthly Income'].unique())
#print(df[['Monthly Income']].head())
#print("-" * 30)

--- Transformed 'Monthly Income' ---
[ 10000.  75500.  35000.     nan 150500. 250000.]


#### 4. Transform Duration Columns (HIV_Duration_Years, Care_Duration_Years, Facility_Care_Dur_Years)

In [7]:
#@title 4. Transform Duration Columns (HIV_Duration_Years, Care_Duration_Years, Facility_Care_Dur_Years)
# Note: Pay attention to different hyphens (en-dash vs. hyphen)
duration_mapping = {
    'Less than 1 year': 0.5,
    '1-3 years': 2,
    '1–3 years': 2, # Handle en-dash variation
    '4-7 years': 5.5,
    '4–7 years': 5.5, # Handle en-dash variation
    '8-10 years': 9,
    '8–10 years': 9, # Handle en-dash variation
    'More than 10 years': 12.5 # Estimating 12.5 as a reasonable value
}

for col in ['HIV_Duration_Years', 'Care_Duration_Years', 'Facility_Care_Dur_Years']:
    df[col] = df[col].replace(duration_mapping).astype(float)
    print(f"--- Transformed '{col}' ---")
    print(df[col].unique())
    #print(df[[col]].head())
    #print("-" * 30)

--- Transformed 'HIV_Duration_Years' ---
[ 2.  12.5  9.   0.5  5.5]
--- Transformed 'Care_Duration_Years' ---
[12.5  0.5  2.   9.   5.5]
--- Transformed 'Facility_Care_Dur_Years' ---
[ 2.   0.5  9.  12.5  5.5]


#### Fill the missing data points in Monthly Income with Imputation with a Missing Indicator (using Median)

In [8]:
#@title Fill the missing data points in Monthly Income with Imputation with a Missing Indicator (using Median)
# Create the missing indicator column
df['Monthly_Income_Missing'] = df['Monthly Income'].isna().astype(int)

# Then, fill the NaNs in the original column using median imputation
median_income = df['Monthly Income'].median()
df['Monthly Income'].fillna(median_income, inplace=True)

print("Monthly Income after imputation and adding missing indicator:")
print(df[['Monthly Income', 'Monthly_Income_Missing']].sample())
print("\nValue counts for Monthly_Income_Missing:")
print(df['Monthly_Income_Missing'].value_counts())
print("\nValue counts for Monthly_Income:")
print(df['Monthly Income'].value_counts())

Monthly Income after imputation and adding missing indicator:
    Monthly Income  Monthly_Income_Missing
38         35000.0                       1

Value counts for Monthly_Income_Missing:
Monthly_Income_Missing
0    893
1    153
Name: count, dtype: int64

Value counts for Monthly_Income:
Monthly Income
10000.0     432
35000.0     334
75500.0     205
150500.0     66
250000.0      9
Name: count, dtype: int64


### Identify outlier with Z-score and treat the identified outlier with Log Transformation

#### A. Identify outlier with Z-score

In [9]:
#@title Identify outlier with Z-score

# List of continuous numerical columns to process
continuous_cols = [
    'Age',
    'Num of Children',
    'Monthly Income',
    'HIV_Duration_Years',
    'Care_Duration_Years',
    'Facility_Care_Dur_Years'
]

# --- Outlier Identification using Z-score ---
# This loop both IDENTIFIES and POPULATES the list of outlier columns.
# It MUST be run before the log transformation loop.
cols_with_outliers_for_transform = []
print("--- Outlier Identification (Z-score) ---")
for col in continuous_cols:
    df[f'{col}_zscore'] = np.abs(zscore(df[col]))
    outliers = df[df[f'{col}_zscore'] > 3]
    print(f"Column: '{col}'")
    if not outliers.empty:
        print(f"  {len(outliers)} potential outlier(s) with |Z-score| > 3:")
        print(outliers[[col, f'{col}_zscore']])
        cols_with_outliers_for_transform.append(col)
    else:
        print(f"  No potential outliers found with |Z-score| > 3 for '{col}'.")
    print("-" * 80)

print(f"\nColumns identified for log transformation (with outliers): {cols_with_outliers_for_transform}")
print("\n" + "="*80 + "\n")

--- Outlier Identification (Z-score) ---
Column: 'Age'
  No potential outliers found with |Z-score| > 3 for 'Age'.
--------------------------------------------------------------------------------
Column: 'Num of Children'
  No potential outliers found with |Z-score| > 3 for 'Num of Children'.
--------------------------------------------------------------------------------
Column: 'Monthly Income'
  9 potential outlier(s) with |Z-score| > 3:
     Monthly Income  Monthly Income_zscore
77         250000.0               4.945044
112        250000.0               4.945044
253        250000.0               4.945044
305        250000.0               4.945044
396        250000.0               4.945044
554        250000.0               4.945044
730        250000.0               4.945044
909        250000.0               4.945044
968        250000.0               4.945044
--------------------------------------------------------------------------------
Column: 'HIV_Duration_Years'
  No potential 

#### B. Treat the identified outlier with Log Transformation

In [10]:
# --- Outlier Treatment with Log Transformation (Conditional) ---
# This loop depends on the list created above.
print("--- Applying Log Transformation (np.log1p) to identified outlier columns ---")
transformed_cols_display = []
for col in continuous_cols:
    if col in cols_with_outliers_for_transform: # The variable must be defined by now
        df[f'{col}_log'] = np.log1p(df[col])
        print(f"'{col}' transformed to '{col}_log' (due to outliers).")
        transformed_cols_display.append(col)
    else:
        print(f"'{col}' not transformed (no significant outliers detected).")
print("\n" + "="*80 + "\n")

--- Applying Log Transformation (np.log1p) to identified outlier columns ---
'Age' not transformed (no significant outliers detected).
'Num of Children' not transformed (no significant outliers detected).
'Monthly Income' transformed to 'Monthly Income_log' (due to outliers).
'HIV_Duration_Years' not transformed (no significant outliers detected).
'Care_Duration_Years' not transformed (no significant outliers detected).
'Facility_Care_Dur_Years' not transformed (no significant outliers detected).




In [11]:
# Final display
display_cols_final = []
for col in continuous_cols:
    display_cols_final.append(col)
    display_cols_final.append(f'{col}_zscore')
    if col in transformed_cols_display:
        display_cols_final.append(f'{col}_log')

print("\n--- Final DataFrame head with original, Z-score, and conditionally log-transformed columns ---")
print(df[display_cols_final].head())

print("\nDataFrame Info (check dtypes and new columns):")
df.info()


--- Final DataFrame head with original, Z-score, and conditionally log-transformed columns ---
    Age  Age_zscore  Num of Children  Num of Children_zscore  Monthly Income  \
0  29.5    0.871247              1.5                1.445493         10000.0   
1  29.5    0.871247              5.0                0.992070         10000.0   
2  65.0    2.224935              1.5                1.445493         75500.0   
3  39.5    0.000917              1.5                1.445493         10000.0   
4  39.5    0.000917              5.0                0.992070         10000.0   

   Monthly Income_zscore  Monthly Income_log  HIV_Duration_Years  \
0               0.753926            9.210440                 2.0   
1               0.753926            9.210440                12.5   
2               0.801418           11.231901                 2.0   
3               0.753926            9.210440                12.5   
4               0.753926            9.210440                 9.0   

   HIV_Duratio

### Define Likert columns.

In [12]:
# Define Likert columns (can adjust if needed)
likert_columns = [
       'Greet_Comfort', 'Discuss_VisitReason', 'Encourage_Thoughts',
       'Listen_Careful', 'Understood_You', 'Exam_Explained',
       'LabTests_Explained', 'Discuss_TreatOptions', 'Info_AsDesired',
       'Plan_Acceptability_Check', 'Meds_Explained_SideFX',
       'Encourage_Questions', 'Respond_Q_Concerns', 'Showed_Personal_Concern',
       'Involved_In_Decisions', 'Discuss_NextSteps', 'Checked_Understanding',
       'Time_Spent_Adequate', 'Visit_Satisfaction'
]

# Strip whitespace and standardize casing
for col in likert_columns:
    df[col] = df[col].astype(str).str.strip().str.title()

# Check unique values post-cleanup with dotted lines
for col in likert_columns:
    print(f"{col}: {df[col].unique()}")
    print("-" * 80) # Prints a dotted line

Greet_Comfort: ['Strongly Disagree' 'Agree' 'Strongly Agree' 'Disagree'
 'Neither Agree Or Disagree']
--------------------------------------------------------------------------------
Discuss_VisitReason: ['Agree' 'Strongly Agree' 'Strongly Disagree' 'Disagree'
 'Neither Agree Or Disagree']
--------------------------------------------------------------------------------
Encourage_Thoughts: ['Strongly Agree' 'Agree' 'Neither Agree Or Disagree' 'Disagree']
--------------------------------------------------------------------------------
Listen_Careful: ['Agree' 'Strongly Agree' 'Disagree' 'Neither Agree Or Disagree']
--------------------------------------------------------------------------------
Understood_You: ['Strongly Agree' 'Agree' 'Neither Agree Or Disagree' 'Disagree'
 'Strongly Disagree']
--------------------------------------------------------------------------------
Exam_Explained: ['Agree' 'Strongly Agree' 'Disagree' 'Neither Agree Or Disagree']
--------------------------------

### Check for Misspellings and Unexpected Categories

#### Quick Frequency Check

In [13]:
#@title Quick Frequency Check
for col in likert_columns:
    print(f"\n{col} value counts:")
    print(df[col].value_counts(dropna=False))
    print("-" * 40)


Greet_Comfort value counts:
Greet_Comfort
Strongly Agree               532
Agree                        440
Strongly Disagree             53
Disagree                      13
Neither Agree Or Disagree      8
Name: count, dtype: int64
----------------------------------------

Discuss_VisitReason value counts:
Discuss_VisitReason
Agree                        496
Strongly Agree               473
Strongly Disagree             28
Neither Agree Or Disagree     27
Disagree                      22
Name: count, dtype: int64
----------------------------------------

Encourage_Thoughts value counts:
Encourage_Thoughts
Strongly Agree               560
Agree                        421
Neither Agree Or Disagree     51
Disagree                      14
Name: count, dtype: int64
----------------------------------------

Listen_Careful value counts:
Listen_Careful
Agree                        553
Strongly Agree               452
Neither Agree Or Disagree     24
Disagree                      17
Name: cou

## Feature Engineering

### 1. Grouped Categorical Variables

In [14]:
#@title Educational Level Grouping
education_map = {
    'No formal education': 'No formal',
    'Primary education': 'Primary/Secondary',
    'Secondary education': 'Primary/Secondary',
    'Tertiary education (e.g., University, college)': 'Tertiary+',
    'Postgraduate': 'Tertiary+', # If this value exists in your raw data but not unique(), keep it.
    'Islamic education': 'No formal', # Or group as appropriate
    'Diploma': 'Tertiary+', # Or group as appropriate
    'Adult and non formal education': 'No formal' # Or group as appropriate
}

df['Education_Grouped'] = df['Educational Status'].map(education_map)

# Now, this should show your grouped categories and potentially NaN if any new unmapped values appear
print(df['Education_Grouped'].unique())

['Tertiary+' 'No formal' 'Primary/Secondary']


In [15]:
#@title Employment Grouping
employment_map = {
    'Unemployed': 'Unemployed',
    'Self-employed': 'Informal',
    'Employed part-time': 'Formal',  # Assuming part-time employment is formal
    'Employed full-time': 'Formal',  # Assuming full-time employment is formal
    'Other (please specify)': 'Other', # Grouping 'Other (please specify)' into 'Other'
    'Retired': 'Other',
    # If 'Informal', 'Government', 'Private', 'Student' don't exist in your raw 'Employment Status' column,
    # you can remove them from the map, or keep them if they might appear in other data.
    # For now, let's include only the ones found in your unique list.
}

df['Employment_Grouped'] = df['Employment Status'].map(employment_map)

# Now, this should show your grouped categories without NaN (unless there are new unmapped values)
print(df['Employment_Grouped'].unique())

['Informal' 'Unemployed' 'Formal' 'Other']


In [16]:
#@title Marital Grouping
marital_map = {
    'Single': 'Single',
    'Married': 'Married',
    'Divorced': 'Separated/Widowed',
    'Widowed': 'Separated/Widowed',
    'Seperated': 'Separated/Widowed' # Corrected spelling and mapping for your unique value
}

df['Marital_Grouped'] = df['Marital Status'].map(marital_map)

# This should now show your grouped categories without NaN (unless new unmapped values appear)
print(df['Marital_Grouped'].unique())

['Married' 'Separated/Widowed' 'Single']


### 2. Duration Features

In [17]:
#@title Duration Features
df['HIV_Care_Duration_Ratio'] = df['HIV_Duration_Years'] / (df['Care_Duration_Years'] + 0.1)

# Bucket Care Duration
df['Care_Duration_Bucket'] = pd.cut(df['Care_Duration_Years'],
                                     bins=[-np.inf, 1, 4, np.inf],
                                     labels=['Short-term', 'Medium-term', 'Long-term'])

# Interaction Terms
df['Age_x_HIV_Duration'] = df['Age'] * df['HIV_Duration_Years']
df['Income_x_Education'] = df['Monthly Income'].fillna(0) * df['Educational Status'].astype('category').cat.codes
df['Gender_x_Employment'] = df['Gender'].astype(
    'category').cat.codes * df['Employment Status'].astype('category').cat.codes
df['Education_x_Employment'] = df['Educational Status'].astype(
    'category').cat.codes * df['Employment Status'].astype('category').cat.codes

### 3. Build subscores

In [18]:
# -----------------------------------------
#@title Step 1: Define your Likert-style columns
# -----------------------------------------

empathy_cols = [
    'Showed_Personal_Concern',
    'Greet_Comfort',
    'Respond_Q_Concerns',
    'Time_Spent_Adequate'
]

listening_cols = [
    'Encourage_Thoughts',
    'Listen_Careful',
    'Understood_You'
]

decision_share_cols = [
    'Involved_In_Decisions',
    'Checked_Understanding',
    'Encourage_Questions'
]

info_delivery_cols = [
    'Discuss_VisitReason',
    'Exam_Explained',
    'LabTests_Explained',
    'Discuss_TreatOptions',
    'Meds_Explained_SideFX',
    'Info_AsDesired',
    'Plan_Acceptability_Check',
    'Discuss_NextSteps'
]

# Combine all columns
all_subscore_columns = empathy_cols + listening_cols + decision_share_cols + info_delivery_cols
# ----------------------------------------------------------------------------------------------

# -----------------------------------------
#@title Step 2: Robust Likert Cleaner
# -----------------------------------------

def normalize_likert(val):
    if pd.isnull(val):
        return None
    val = str(val).strip().lower()
    # Corrected: Match against lowercase strings
    if "strongly agree" in val:
        return 5
    elif val == "agree":
        return 4
    elif val == "neither agree or disagree":
        return 3
    elif "strongly disagree" in val: # Corrected: match against full phrase
        return 1
    elif "disagree" in val: # This should catch "Disagree" but not "Strongly Disagree" due to order
        return 2
    else:
        return None

# -----------------------------------------
#@title Step 3: Apply cleaning to relevant columns
# -----------------------------------------

for col in all_subscore_columns:
    if col in df.columns:
        df[col] = df[col].apply(normalize_likert)

# -----------------------------------------
#@title Step 4: Build subscores
# -----------------------------------------
df['Empathy_Score'] = df[empathy_cols].mean(axis=1)
df['Listening_Score'] = df[listening_cols].mean(axis=1)
df['Decision_Share_Score'] = df[decision_share_cols].mean(axis=1)
df['Info_Delivery_Score'] = df[info_delivery_cols].mean(axis=1)

# -----------------------------------------
#@title Step 5: Check missing values in subscores
# -----------------------------------------

missing_subscores = df[[
    'Empathy_Score',
    'Listening_Score',
    'Decision_Share_Score',
    'Info_Delivery_Score'
]].isnull().sum()

print("Missing values in subscores:\n", missing_subscores)

# Optional: see how many total rows are affected
rows_missing = df[[
    'Empathy_Score',
    'Listening_Score',
    'Decision_Share_Score',
    'Info_Delivery_Score'
]].isnull().any(axis=1).sum()
print(f"\nTotal rows with at least one missing subscore: {rows_missing}")

Missing values in subscores:
 Empathy_Score           0
Listening_Score         0
Decision_Share_Score    0
Info_Delivery_Score     0
dtype: int64

Total rows with at least one missing subscore: 0


In [19]:
df['Respond_Q_Concerns'].unique()

array([3, 4, 5, 2, 1])

### Check for duplicate patients

In [20]:
# -----------------------------------------
#@title Step 6: Check for duplicate patients
# -----------------------------------------

# Check if 'EnumID' column exists
if 'EnumID' in df.columns:
    # Create a temporary DataFrame by dropping 'EnumID' for the duplicate check
    df_for_dup_check = df.drop(columns=['EnumID'])
else:
    # If 'EnumID' doesn't exist, use the entire DataFrame
    df_for_dup_check = df.copy()

## Step 1: Identify Duplicate Rows
print("## Identifying Duplicate Rows")
# 'keep=False' marks all occurrences of a duplicate set as True
dupes_found = df_for_dup_check[df_for_dup_check.duplicated(keep=False)]

if not dupes_found.empty:
    print(f"**Duplicate rows found (considering all columns EXCEPT 'EnumID'): {len(dupes_found)}**")
    print("Here are the full rows from your original DataFrame that are considered duplicates:")
    # Use the index from 'dupes_found' to select corresponding rows from the original 'df'
    print(df.loc[dupes_found.index])
else:
    print("**No duplicate rows found (excluding 'EnumID').**")

## Step 2: Drop Duplicates

# Drop duplicates based on all columns EXCEPT 'EnumID'
# 'keep='first'' will keep the first occurrence of each duplicate set
if 'EnumID' in df.columns:
    # Get columns to consider for dropping duplicates (all except 'EnumID')
    columns_to_consider = [col for col in df.columns if col != 'EnumID']
    df_cleaned = df.drop_duplicates(subset=columns_to_consider, keep='first')
else:
    # If 'EnumID' doesn't exist, drop duplicates based on all columns
    df_cleaned = df.drop_duplicates(keep='first')


print("\n## DataFrame after dropping duplicates:")
print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape: {df_cleaned.shape}")
print(df_cleaned)

## Identifying Duplicate Rows
**Duplicate rows found (considering all columns EXCEPT 'EnumID'): 2**
Here are the full rows from your original DataFrame that are considered duplicates:
    EnumID State   Age  Gender Marital Status Family Setting  Num of Children  \
292   KN17  Kano  29.5  Female        Married       Monogamy              3.5   
301  KN 17  Kano  29.5  Female        Married       Monogamy              3.5   

      Educational Status Employment Status  Monthly Income  ...  \
292  No formal education     Self-employed         35000.0  ...   
301  No formal education     Self-employed         35000.0  ...   

    HIV_Care_Duration_Ratio  Care_Duration_Bucket  Age_x_HIV_Duration  \
292                0.952381           Medium-term                59.0   
301                0.952381           Medium-term                59.0   

     Income_x_Education Gender_x_Employment  Education_x_Employment  \
292            105000.0                   0                      12   
301     

### Interaction Features

In [21]:
#@title Interaction Features
# Using .loc for explicit assignment
df_cleaned.loc[:, "Empathy_Listening_Interaction"] = df_cleaned["Empathy_Score"] * df_cleaned["Listening_Score"]
df_cleaned.loc[:, "Empathy_DecisionShare_Interaction"] = df_cleaned["Empathy_Score"] * df_cleaned["Decision_Share_Score"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, "Empathy_Listening_Interaction"] = df_cleaned["Empathy_Score"] * df_cleaned["Listening_Score"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, "Empathy_DecisionShare_Interaction"] = df_cleaned["Empathy_Score"] * df_cleaned["Decision_Share_Score"]


### Aggregate Behavioral Profiles

In [22]:
# --------------------------------------------------------------------
#@title Aggregate Behavioral Profiles
# All_High_Satisfaction: whether all Likert items were 4 or 5.
# Any_Dissatisfaction: whether any of the core variables are below 3.
# -------------------------------------------------------------------

core_likert = ['Empathy_Score', 'Listening_Score',
               'Decision_Share_Score', 'Info_Delivery_Score']
df["All_High_Satisfaction"] = (df[core_likert] >= 4).all(axis=1).astype(int)
df["Any_Low_Score"] = (df[core_likert] < 3).any(axis=1).astype(int)

In [23]:
# Rename the column
df_cleaned.rename(columns={'Visit_Satisfaction': 'Satisfaction'}, inplace=True)
df_cleaned.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.rename(columns={'Visit_Satisfaction': 'Satisfaction'}, inplace=True)


Index(['EnumID', 'State', 'Age', 'Gender', 'Marital Status', 'Family Setting',
       'Num of Children', 'Educational Status', 'Employment Status',
       'Monthly Income', 'Treatment Regimen', 'HIV_Duration_Years',
       'Care_Duration_Years', 'Facility_Care_Dur_Years', 'HIV_Diag_Type',
       'Greet_Comfort', 'Discuss_VisitReason', 'Encourage_Thoughts',
       'Listen_Careful', 'Understood_You', 'Exam_Explained',
       'LabTests_Explained', 'Discuss_TreatOptions', 'Info_AsDesired',
       'Plan_Acceptability_Check', 'Meds_Explained_SideFX',
       'Encourage_Questions', 'Respond_Q_Concerns', 'Showed_Personal_Concern',
       'Involved_In_Decisions', 'Discuss_NextSteps', 'Checked_Understanding',
       'Time_Spent_Adequate', 'Satisfaction', 'Source',
       'Monthly_Income_Missing', 'Age_zscore', 'Num of Children_zscore',
       'Monthly Income_zscore', 'HIV_Duration_Years_zscore',
       'Care_Duration_Years_zscore', 'Facility_Care_Dur_Years_zscore',
       'Monthly Income_log', 'E

### Remove redundant features to reduce noise

In [24]:
# -----------------------------------------------
#@title Remove redundant features to reduce noise
# -----------------------------------------------

# List of features to drop
features_to_drop = [
    'EnumID', 'Source', 'Visit_Satisfaction', 'Monthly_Income_Missing',
    'Age_zscore', 'Num of Children_zscore', 'Monthly Income_zscore',
    'HIV_Duration_Years_zscore', 'Care_Duration_Years_zscore', 'Facility_Care_Dur_Years_zscore',
    'Educational Status', 'Employment Status', 'Marital Status',
    'Age', 'Monthly Income', 'Num of Children', 'Care_Duration_Bucket',
    'Income_x_Education', 'Gender_x_Employment', 'Education_x_Employment',
    'HIV_Duration_Years', 'Care_Duration_Years', 'Facility_Care_Dur_Years'
]

# Drop the columns
df_engineered = df_cleaned.drop(columns=features_to_drop, errors='ignore')

# Confirm shape after drop (optional)
print(f"Remaining columns: {df_engineered.shape[1]}")

Remaining columns: 36


In [25]:
df_engineered.columns

Index(['State', 'Gender', 'Family Setting', 'Treatment Regimen',
       'HIV_Diag_Type', 'Greet_Comfort', 'Discuss_VisitReason',
       'Encourage_Thoughts', 'Listen_Careful', 'Understood_You',
       'Exam_Explained', 'LabTests_Explained', 'Discuss_TreatOptions',
       'Info_AsDesired', 'Plan_Acceptability_Check', 'Meds_Explained_SideFX',
       'Encourage_Questions', 'Respond_Q_Concerns', 'Showed_Personal_Concern',
       'Involved_In_Decisions', 'Discuss_NextSteps', 'Checked_Understanding',
       'Time_Spent_Adequate', 'Satisfaction', 'Monthly Income_log',
       'Education_Grouped', 'Employment_Grouped', 'Marital_Grouped',
       'HIV_Care_Duration_Ratio', 'Age_x_HIV_Duration', 'Empathy_Score',
       'Listening_Score', 'Decision_Share_Score', 'Info_Delivery_Score',
       'Empathy_Listening_Interaction', 'Empathy_DecisionShare_Interaction'],
      dtype='object')

In [26]:
df_engineered.head()

Unnamed: 0,State,Gender,Family Setting,Treatment Regimen,HIV_Diag_Type,Greet_Comfort,Discuss_VisitReason,Encourage_Thoughts,Listen_Careful,Understood_You,...,Employment_Grouped,Marital_Grouped,HIV_Care_Duration_Ratio,Age_x_HIV_Duration,Empathy_Score,Listening_Score,Decision_Share_Score,Info_Delivery_Score,Empathy_Listening_Interaction,Empathy_DecisionShare_Interaction
0,Jigawa,Female,Monogamy,Not sure,HIV-1,1,4,5,4,5,...,Informal,Married,0.15873,59.0,3.25,4.666667,4.0,4.125,15.166667,13.0
1,Bauchi,Female,Polygamy,Not sure,Do not know,4,4,4,4,4,...,Informal,Separated/Widowed,20.833333,368.75,3.5,4.0,3.666667,4.5,14.0,12.833333
2,Jigawa,Female,Monogamy,Not sure,Do not know,4,4,4,4,4,...,Unemployed,Separated/Widowed,0.952381,130.0,3.5,4.0,4.0,4.0,14.0,14.0
3,Jigawa,Female,Monogamy,First-line regimen,Both HIV-1 and HIV-2,5,5,5,5,5,...,Formal,Separated/Widowed,0.992063,493.75,5.0,5.0,5.0,5.0,25.0,25.0
4,Bauchi,Female,Monogamy,First-line regimen,HIV-1,5,5,5,4,5,...,Informal,Single,0.989011,355.5,5.0,4.666667,4.666667,4.5,23.333333,23.333333


In [27]:
df_engineered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1045 entries, 0 to 1045
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   State                              1045 non-null   object 
 1   Gender                             1045 non-null   object 
 2   Family Setting                     1045 non-null   object 
 3   Treatment Regimen                  1045 non-null   object 
 4   HIV_Diag_Type                      1045 non-null   object 
 5   Greet_Comfort                      1045 non-null   int64  
 6   Discuss_VisitReason                1045 non-null   int64  
 7   Encourage_Thoughts                 1045 non-null   int64  
 8   Listen_Careful                     1045 non-null   int64  
 9   Understood_You                     1045 non-null   int64  
 10  Exam_Explained                     1045 non-null   int64  
 11  LabTests_Explained                 1045 non-null   int64  
 1

### Install and import packages and libraries

In [28]:
# ------------------------------------------------------------------
#@title ✅ Import libraries
# ------------------------------------------------------------------
import shap
import requests
import json
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
import warnings

# Suppress SettingWithCopyWarning for cleaner output
warnings.simplefilter(action='ignore', category=FutureWarning)

print("FutureWarnings are now ignored.")

  from .autonotebook import tqdm as notebook_tqdm




In [29]:
# ---------------------------------------------------------------------------------
# @title ✅ Split, then Preprocess
# ---------------------------------------------------------------------------------

# Step 1: Split Features and Target
X = df_engineered.drop(columns=['Satisfaction']) # Features
y = df_engineered['Satisfaction']                 # Target

# Step 2: Split the data into Training and Test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("\n" + "="*80 + "\n")

Shape of X_train: (836, 35)
Shape of X_test: (209, 35)




In [30]:
# ---------------------------------------------------------------------------------
# @title ✅ Encoding and Preprocessing for Features (X)
# ---------------------------------------------------------------------------------

# Identify categorical columns in the TRAINING set ONLY
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

# Define the OrdinalEncoder for features
# This encoder will be fitted on X_train and then used to transform both X_train and X_test
feature_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit the encoder ON THE TRAINING DATA ONLY and then transform both sets
X_train[categorical_cols] = feature_encoder.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = feature_encoder.transform(X_test[categorical_cols])

print("X_train with encoded categorical features (first 5 rows):")
print(X_train.head())
print("\n" + "="*80 + "\n")

X_train with encoded categorical features (first 5 rows):
     State  Gender  Family Setting  Treatment Regimen  HIV_Diag_Type  \
140    1.0     0.0             0.0                0.0            1.0   
274    0.0     0.0             1.0                1.0            1.0   
531    0.0     1.0             0.0                0.0            2.0   
227    2.0     0.0             0.0                1.0            4.0   
263    2.0     0.0             0.0                0.0            1.0   

     Greet_Comfort  Discuss_VisitReason  Encourage_Thoughts  Listen_Careful  \
140              4                    4                   5               4   
274              1                    5                   5               5   
531              5                    5                   5               5   
227              4                    4                   5               5   
263              4                    5                   3               5   

     Understood_You  ...  Employme

In [31]:
# ---------------------------------------------------------------------------------
# @title ✅ Encoding for Target (y)
# ---------------------------------------------------------------------------------

# Define the explicit order of categories for the target variable
# This is crucial for consistency.
satisfaction_order = ["Very Dissatisfied", "Not Satisfied", "Neutral", "Satisfied", "Very Satisfied"]

# Create a separate encoder for the target variable
target_encoder = OrdinalEncoder(categories=[satisfaction_order])

# Fit the target encoder ON THE TRAINING DATA ONLY
target_encoder.fit(y_train.values.reshape(-1, 1))

# Transform both training and test target data using the FITTED encoder
y_train_encoded = target_encoder.transform(y_train.values.reshape(-1, 1))
y_test_encoded = target_encoder.transform(y_test.values.reshape(-1, 1))

print("Original y_train values (first 10):")
print(y_train.values.flatten()[:10])
print("\nEncoded y_train values (used for model training, first 10):")
print(y_train_encoded.flatten()[:10])

Original y_train values (first 10):
['Satisfied' 'Very Satisfied' 'Very Dissatisfied' 'Neutral' 'Neutral'
 'Neutral' 'Satisfied' 'Very Satisfied' 'Very Satisfied' 'Very Satisfied']

Encoded y_train values (used for model training, first 10):
[3. 4. 0. 2. 2. 2. 3. 4. 4. 4.]


In [32]:
# --------------------------------------------------------------------
# @title ✅ Compute Class Weights
# --------------------------------------------------------------------

# CRITICAL: Use ONLY y_train to compute class weights.
# y_train is the target variable from your training set after the split.
classes = np.unique(y_train)

# This will produce an array of np.float64
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, class_weights))

print("Class weights (as np.float64 array):")
print(class_weights)
print("Class weights (as dictionary):")
print(class_weight_dict)

# --- Optional Conversion to standard Python floats ---
class_weight_dict_scalar = {k: float(v) for k, v in class_weight_dict.items()}
print("\nClass weights (as standard Python floats):")
print(class_weight_dict_scalar)

Class weights (as np.float64 array):
[1.44337017 0.80138037 1.5929878  0.69852941]
Class weights (as dictionary):
{'Neutral': np.float64(1.4433701657458564), 'Satisfied': np.float64(0.8013803680981595), 'Very Dissatisfied': np.float64(1.5929878048780488), 'Very Satisfied': np.float64(0.6985294117647058)}

Class weights (as standard Python floats):
{'Neutral': 1.4433701657458564, 'Satisfied': 0.8013803680981595, 'Very Dissatisfied': 1.5929878048780488, 'Very Satisfied': 0.6985294117647058}


In [33]:
# --------------------------------------------
#@title 🔹 Random Forest
# -------------------------------------------

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    class_weight=class_weight_dict,
    random_state=42,
    n_estimators=200,
    max_depth=10
)
rf.fit(X_train, y_train)

In [34]:
# --------------------------------------------
#@title 🔹 XGBOOST
# -------------------------------------------

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
sample_weights = y_train.map(class_weight_dict)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_encoded, sample_weight=sample_weights)

class_mapping = dict(zip(le.transform(le.classes_), le.classes_))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [35]:
# --------------------------------------------
#@title 🔹 LightGBM
# -------------------------------------------

# Use verbose=-1 to suppress training output
lgbm = lgb.LGBMClassifier(class_weight=class_weight_dict, random_state=42, verbose=-1)

# The .fit() method now works without the verbose argument
lgbm.fit(X_train, y_train)

print("\nLightGBM model training completed. The training output was suppressed.")
print("-" * 50)


LightGBM model training completed. The training output was suppressed.
--------------------------------------------------


In [36]:
# -----------------------------------------------------------------------------
# ✅ Full Model Evaluation and Selection Based on Performance
# -----------------------------------------------------------------------------

# Evaluation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Encode target for models like XGBoost
y_encoded = le.fit_transform(y)

# Scoring dict for cross_validate
scoring = {
    'accuracy': 'accuracy',
    'f1_weighted': 'f1_weighted'
}

# Define the Preprocessing Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
    ],
    remainder='passthrough'
)

# Define models, wrapped in a Pipeline
models_to_evaluate = {
    'RandomForest': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(class_weight=class_weight_dict, random_state=42))
    ]),
    'XGBoost': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(eval_metric='mlogloss', random_state=42, use_label_encoder=False))
    ]),
    'LightGBM': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(class_weight=class_weight_dict, random_state=42, verbose=-1))
    ]),
}

results = []

for name, model in models_to_evaluate.items():
    print(f"\n🔍 Evaluating {name}...")

    if name == 'XGBoost':
        y_input = y_encoded
    else:
        y_input = y

    cv_results = cross_validate(
        model,
        X,
        y_input,
        cv=cv,
        scoring=scoring,
        return_estimator=True
    )

    acc = np.mean(cv_results['test_accuracy'])
    f1w = np.mean(cv_results['test_f1_weighted'])

    logloss_vals = []
    for estimator, (_, test_idx) in zip(cv_results['estimator'], cv.split(X, y_input)):
        # This part is correct: X is a pandas DataFrame so .iloc is the right way.
        X_test_fold = X.iloc[test_idx]

        # Use conditional logic to ensure the correct indexing method is used for y_input
        if isinstance(y_input, pd.Series):
            y_test_fold = y_input.iloc[test_idx]
        else:
            # For XGBoost, y_input is a NumPy array, so standard indexing works fine.
            y_test_fold = y_input[test_idx]
        
        try:
            y_proba = estimator.predict_proba(X_test_fold)
            logloss_vals.append(log_loss(y_test_fold, y_proba))
        except Exception as e:
            print(f"⚠️ {name} failed log_loss on one fold: {e}")
            logloss_vals.append(np.nan)

    logloss = np.nanmean(logloss_vals)

    print(f"✅ Accuracy: {acc:.4f}, F1-weighted: {f1w:.4f}, Log Loss: {logloss:.4f}")

    results.append((name, {'accuracy': acc, 'f1_weighted': f1w, 'log_loss': logloss}))
    
# Select best model
best_model = max(results, key=lambda x: x[1]['f1_weighted'])
print(f"\n🏆 Best model based on weighted F1-score: {best_model[0]}")


🔍 Evaluating RandomForest...
✅ Accuracy: 0.8019, F1-weighted: 0.8032, Log Loss: 0.5433

🔍 Evaluating XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Accuracy: 0.8201, F1-weighted: 0.8198, Log Loss: 0.4899

🔍 Evaluating LightGBM...




✅ Accuracy: 0.8278, F1-weighted: 0.8272, Log Loss: 0.4829

🏆 Best model based on weighted F1-score: LightGBM




In [41]:
# --------------------------------------------
# ✅ SHAP + EXPLANATION PIPELINE (CATBOOST ONLY)
# --------------------------------------------

# --- LLM API Key (Keep this secure!) ---
openrouter_api_key = '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'

# --- GLOBAL LOG DATAFRAME ---
# This DataFrame will store the explanation logs
logs_df = pd.DataFrame(columns=[
    'instance_idx', 'prediction', 'confidence', 'top_features',
    'reason', 'suggestions', 'genai_explanation'
])

# --- RULE DEFINITIONS ---
def rule_empathy(shap_scores):
    return shap_scores.get('Empathy_Score', 3) < 2.5

def rule_decision_sharing(shap_scores):
    return shap_scores.get('Decision_Share_Score', 3) < 2.5

def rule_listening(shap_scores):
    return shap_scores.get('Listening_Score', 3) < 3

RULES = [
    ('Empathy was low', "Enhance provider's empathetic communication", rule_empathy),
    ('Decision-sharing was low', "Improve patient engagement in decisions", rule_decision_sharing),
    ('Listening was moderate', "Train providers on active listening techniques", rule_listening),
]

# --- SHAP EXPLAINER FUNCTION ---
# For LightGBM, TreeExplainer works directly on the model
def get_shap_explainer(model):
    return shap.TreeExplainer(model)

# --- DEEPSEEK EXPLANATION FUNCTION ---
def deepseek_generate_explanation(prediction, confidence, top_features, reasons, suggestions, openrouter_api_key):
    prompt = f"""
You are an AI assistant helping a healthcare team understand why a specific HIV client was predicted to be '{prediction}' with {confidence}% confidence.

Top contributing factors:
{json.dumps(top_features, indent=2)}

Rule-based issues:
{reasons}

Suggestions for improvement:
{suggestions}

Based on:
- Respectful provider communication, use of local language
- DSD models (MMD, fast track, reduced wait)
- Higher satisfaction with complex ART regimens, moderate income
- Lower satisfaction with polygamous families, long treatment, poor decision involvement, fragmented services

Write a concise explanation for clinical quality improvement.
"""
    headers = {
        "Authorization": f"Bearer {openrouter_api_key}",
        "Content-Type": "application/json",
    }
    body = {
        "model": "tngtech/deepseek-r1t2-chimera:free",
        "messages": [{"role": "user", "content": prompt}]
    }
    try:
        response = requests.post("https://openrouter.ai/api/v1/chat/completions",
                                 headers=headers, data=json.dumps(body))
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content']
        else:
            return "LLM error: " + response.text
    except Exception as e:
        return f"Exception: {e}"

# --- MAIN PREDICTION EXPLANATION FUNCTION (ADAPTED FOR LIGHTGBM) ---
def explain_lgbm_prediction(instance_idx, model, X_test_transformed, openrouter_api_key, label_encoder, target_encoder):
    global logs_df

    instance = X_test_transformed.iloc[instance_idx:instance_idx+1]
    explainer = get_shap_explainer(model)
    shap_vals = explainer.shap_values(instance)

    preds_proba = model.predict_proba(instance)[0]
    pred_class_encoded = np.argmax(preds_proba)
    confidence_val = round(float(np.max(preds_proba)) * 100, 1)

    pred_class_text = target_encoder.inverse_transform([[pred_class_encoded]])[0][0]

    shap_vals_for_predicted_class = shap_vals[pred_class_encoded][0]
    shap_dict = dict(zip(X_test_transformed.columns, shap_vals_for_predicted_class))
    top_features = dict(sorted(shap_dict.items(), key=lambda x: abs(x[1]), reverse=True)[:3])
    top_features = {k: round(float(v), 1) for k, v in top_features.items()}

    shap_scores = {
        'Empathy_Score': instance['Empathy_Score'].iloc[0],
        'Decision_Share_Score': instance['Decision_Share_Score'].iloc[0],
        'Listening_Score': instance['Listening_Score'].iloc[0],
    }

    reasons, suggestions = [], []
    for reason_text, suggestion_text, rule_fn in RULES:
        if rule_fn(shap_scores):
            reasons.append(reason_text)
            suggestions.append(suggestion_text)

    explanation_text = deepseek_generate_explanation(
        pred_class_text, confidence_val, top_features, reasons, suggestions,
        openrouter_api_key=openrouter_api_key
    )

    log_entry = {
        'instance_idx': instance_idx,
        'prediction': pred_class_text,
        'confidence': f"{confidence_val}%",
        'top_features': top_features,
        'reason': "; ".join(reasons),
        'suggestions': "; ".join(suggestions),
        'genai_explanation': explanation_text
    }

    logs_df = pd.concat([logs_df, pd.DataFrame([log_entry])], ignore_index=True)
    return log_entry

# --- MAIN FUNCTION TO RUN EXPLANATIONS (ADAPTED FOR LIGHTGBM) ---
def run_explanations_for_lgbm(model, X_test_transformed, openrouter_api_key, label_encoder, target_encoder, max_instances=5):
    global logs_df
    logs_df = pd.DataFrame(columns=[
        'instance_idx', 'prediction', 'confidence', 'top_features',
        'reason', 'suggestions', 'genai_explanation'
    ])

    print(f"\n🔎 Explaining top {max_instances} instances with SHAP + DeepSeek for LightGBM...")

    for idx in range(min(max_instances, len(X_test_transformed))):
        log_entry = explain_lgbm_prediction(
            idx, model, X_test_transformed, openrouter_api_key, label_encoder, target_encoder
        )
        print(f"\n🧾 Instance {idx} explanation:")
        print(log_entry)
    print("\n✅ Completed explanation batch.")
    print("\n📋 Explanation Logs (Top 5):")
    print(logs_df.head())

# --------------------------------------------
# ✅ CALL FUNCTION TO GENERATE OUTPUT
# --------------------------------------------
# Assuming these variables are already defined from your main script:
# lgbm (your trained LightGBM model)
# X_test_transformed (your preprocessed and feature-selected test data)
# openrouter_api_key (your API key)
# label_encoder (your fitted LabelEncoder for y)
# target_encoder (your fitted OrdinalEncoder for y)

run_explanations_for_lgbm(
    model=lgbm,
    X_test=X_test,
    openrouter_api_key=openrouter_api_key,
    label_encoder=label_encoder,
    target_encoder=target_encoder,
    max_instances=5
)

NameError: name 'label_encoder' is not defined