In [351]:
import pandas as pd
from fuzzywuzzy import fuzz

# Load the CSV file
df = pd.read_csv('synthetic_patient_descriptions_and_ground_truth.csv')

# Display the first few rows to check the structure
df.head()


Unnamed: 0,question,intended_categories,ground_truth_labels
0,"Maria Juarez, a 55-year-old woman of Mexican d...","['country', 'state', 'race', 'region', 'disabi...","['female', 'adult', 'assistive_supports', 'lat..."
1,"Mr. James Chen, a 62-year-old Asian American, ...","['education level', 'state', 'disability statu...","['male', 'senior', 'physical_disability', 'eas..."
2,The patient is a 45-year-old woman from Ohio w...,"['household income classification', 'state', '...","['female', 'adult', 'has_no_disability', 'n/a'..."
3,"Maria González, a 42-year-old woman from Mexic...","['country', 'household income classification',...","['female', 'adult', 'has_no_disability', 'hisp..."
4,"Maria Gonzalez, a 68-year-old Hispanic woman, ...","['housing situation', 'country', 'disability s...","['female', 'senior', 'physical_disability', 'h..."


In [353]:
print(df.columns)


Index(['question', 'intended_categories', 'ground_truth_labels'], dtype='object')


In [355]:
print(df.head())


                                            question  \
0  Maria Juarez, a 55-year-old woman of Mexican d...   
1  Mr. James Chen, a 62-year-old Asian American, ...   
2  The patient is a 45-year-old woman from Ohio w...   
3  Maria González, a 42-year-old woman from Mexic...   
4  Maria Gonzalez, a 68-year-old Hispanic woman, ...   

                                 intended_categories  \
0  ['country', 'state', 'race', 'region', 'disabi...   
1  ['education level', 'state', 'disability statu...   
2  ['household income classification', 'state', '...   
3  ['country', 'household income classification',...   
4  ['housing situation', 'country', 'disability s...   

                                 ground_truth_labels  
0  ['female', 'adult', 'assistive_supports', 'lat...  
1  ['male', 'senior', 'physical_disability', 'eas...  
2  ['female', 'adult', 'has_no_disability', 'n/a'...  
3  ['female', 'adult', 'has_no_disability', 'hisp...  
4  ['female', 'senior', 'physical_disability', 'h..

In [357]:
keyword_categories = {
        'gender': {
            'male': ['he', 'him', 'father', 'brother', 'husband', 'dad', 'son','boy', 'man', 'gentleman', 'masculine', 'sir'],
            'female': ['she', 'her', 'mother', 'sister', 'wife', 'mom', 'daughter', 
                       'pregnancy','female', 'girl', 'woman', 'lady', 'feminine', 'madam'],
            'OtherGender': ['they', 'them', 'transgender', 'non-binary', 'gender identity', 'genderqueer', 
                            'genderfluid', 'agender', 'bigender', 'two-spirit', 
                            'demiboy', 'demigirl', 'androgyne', 'neutrois']
        },       
        'age': {
            'young': {
                'range': list(range(0, 18)),
                'life_stage': ['infant', 'toddler', 'child', 'teenager', 'minor']
            },
            'adult': {
                'range': list(range(18, 65)),
                'life_stage': ['young adult', 'adult', 'middle-aged']
            },
            'senior': {
                'range': list(range(65, 120)),
                'life_stage': ['elderly', 'senior citizen', 'retiree']
            }
        },
        'disability_status': {
            'has_disability': {
                'neurodevelopmental_disability': ['autism', 'Down syndrome', 'cerebral palsy', 'epilepsy', 'dyslexia', 'ADHD'],
                'mental_health_conditions': ['PTSD', 'anxiety disorder', 'mental health condition'],
                'speech_and_cognitive_disability': ['speech disorder', 'cognitive disability', 
                                                      'intellectual disability', 'developmental disability'],
                'sensory_disability': ['hearing impaired', 'visually impaired', 'deaf', 'blind', 
                                      'hearing loss', 'hearing aid', 'hearing aids', 'hearing impairment',
                                      'hard of hearing', 'moderate hearing loss'],
                'physical_disability': ['mobility aid', 'wheelchair', 'amputee', 'prosthetic', 'temporary disability',
                                        'post-surgery recovery', 'temporary mobility aid', 'chronic pain'],
                'assistive_supports': ['assistive device', 'service animal', 'braille', 'sign language',
                                        'rehabilitation', 'inclusion support'],
                'general_disability': ['disabled', 'impairment', 'handicap', 'challenged', 'accessible',
                                        'accessibility', 'special needs', 'neurodiverse', 'disability']
            },
            'does_not_have_disability': {
                'has_no_disabuility': ['able-bodied', 'no disability', 'not disabled', 'fully abled',
                                        'physically fit', 'unimpaired', 'independent mobility', 'no special needs']
            }
        },
        'race': {
            'asian': {
                'east_asian': [
                    'chinese', 'japanese', 'korean', 'mongolian', 'taiwanese', 'hongkonger', 'macanese'
                ],
                'south_asian': [
                    'indian', 'pakistani', 'bangladeshi', 'nepali', 'sri lankan', 'maldivian', 'bhutanese'
                ],
                'southeast_asian': [
                    'filipino', 'vietnamese', 'thai', 'burmese', 'cambodian', 'laotian', 'malaysian', 'indonesian', 'bruneian', 'singaporean', 'east timorese'
                ]
            },
            'black_or_african_descent': {
                'african': [
                    'nigerian', 'ethiopian', 'ghanaian', 'kenyan', 'south african', 'somali', 'congolese', 'sudanese', 'zimbabwean', 'ugandan', 'tanzanian'
                ],
                'african_american': [
                    'african-american', 'black'
                ],
                'caribbean': [
                    'jamaican', 'haitian', 'barbadian', 'trinidadian', 'afro-caribbean', 'indo-caribbean'
                ],
                'afro_latino': [
                    'afro-latino', 'afro-brazilian', 'afro-cuban', 'afro-colombian'
                ]
            },
            'white': {
                'north_american': [
                    'american', 'canadian'
                ],
                'european': [
                    'british', 'irish', 'german', 'french', 'italian', 'spanish', 'portuguese', 'dutch',
                    'greek', 'swedish', 'norwegian', 'finnish', 'danish', 'polish', 'russian', 'ukrainian',
                    'czech', 'hungarian', 'serbian', 'romanian', 'bulgarian'
                ],
                'australian_new_zealand': [
                    'australian', 'new zealander'
                ]
            },
            'latinx_or_hispanic': {
                'central_american': [
                    'mexican', 'guatemalan', 'honduran', 'nicaraguan', 'salvadoran', 'panamanian', 'costa rican'
                ],
                'caribbean_hispanic': [
                    'puerto rican', 'cuban', 'dominican'
                ],
                'south_american': [
                    'colombian', 'argentinian', 'chilean', 'peruvian', 'venezuelan', 'ecuadorian', 'paraguayan', 'bolivian', 'uruguayan'
                ],
                'hispanic': [
                    'hispanic'
                ],
                'latino': [
                    'latino', 'latina', 'latinx'
                ]
            },
            'indigenous': {
                'native_north_american': [
                    'native american', 'first nations', 'alaskan native'
                ],
                'indigenous_central_south_american': [
                    'quechua', 'aymara', 'mapuche', 'guarani'
                ],
                'indigenous_oceanian': [
                    'aboriginal australian', 'torres strait islander', 'maori'
                ],
                'indigenous_arctic': [
                    'inuit', 'sami', 'yupik', 'aleut'
                ]
            },
            'middle_eastern_or_north_african': {
                'middle_eastern': [
                    'arab', 'persian', 'kurdish', 'turkish', 'assyrian', 'armenian', 'azerbaijani'
                ],
                'north_african': [
                    'berber', 'egyptian', 'moroccan', 'algerian', 'tunisian', 'libyan'
                ]
            },
            'pacific_islander': {
                'polynesian': [
                    'hawaiian', 'samoan', 'tongan', 'tahitian'
                ],
                'melanesian': [
                    'fijian', 'papuan', 'solomon islander', 'ni-vanuatu'
                ],
                'micronesian': [
                    'guamanian', 'marshallese', 'kiribati', 'palauan'
                ],
            },
            'jewish': ['jewish', 'ashkenazi', 'sephardic', 'mizrahim'],
            'roma': ['roma', 'romani', 'gypsy'],
            'multiracial_or_other': {
                'multiracial': ['biracial', 'mixed race']
            }
        },
        'country': {
            'usa': ['united states', 'usa', 'u.s.', 'united states of america', 'american','rural america', 'urban america'],
            'canada': ['canada', 'canadian','rural canada', 'urban canada'],
            'uk': ['united kingdom', 'uk', 'britain', 'england', 'scotland', 'wales', 'northern ireland','british'],
            'germany': ['germany', 'deutschland', 'german','berlin'],
            'france': ['france', 'french','paris'],
            'india':['india', 'bharat', 'hindustan','indian','rural india', 'urban india'],
            'china': ['china','chinese','beijing'],
            'japan': ['japan','japanese','tokyo'],
            'australia': ['australia','australian', 'aussie'],
            'brazil': ['brazil','brazilian'],
            'mexico': ['mexico','mexican'],
            'italy': ['italy','italian','rome'],
            'spain': ['spain','spanish','madrid'],
            'russia': ['russia','russian','moscow'],
            'south africa': ['south africa','south african'],
            'other_country': ['nigeria', 'ethiopia', 'kenya', 'saudi arabia', 'iran', 'pakistan',
                        'bangladesh', 'philippines', 'vietnam', 'colombia', 'argentina', 'peru']
        },
        'state': {
            'Alabama': ['alabama', 'birmingham', 'montgomery', 'rural alabama', 'urban alabama'],
            'Alaska': ['alaska', 'anchorage', 'juneau', 'rural alaska', 'urban alaska'],
            'Arizona': ['arizona', 'phoenix', 'tucson', 'rural arizona', 'urban arizona'],
            'Arkansas': ['arkansas', 'little rock', 'rural arkansas', 'urban arkansas'],
            'California': ['california', 'cali', 'los angeles', 'san francisco', 
                           'sacramento', 'San Diego', 'Oakland', 'rural california', 'urban california'],
            'Colorado': ['colorado', 'denver', 'boulder', 'rural colorado', 'urban colorado'],
            'Connecticut': ['connecticut', 'hartford', 'new haven', 'rural connecticut', 'urban connecticut'],
            'Delaware': ['delaware', 'dover', 'wilmington', 'rural delaware', 'urban delaware'],
            'Florida': ['florida', 'fl', 'miami', 'orlando', 'tampa', 'Jacksonville', 'rural florida', 'urban florida'],
            'Georgia': ['georgia', 'atlanta', 'savannah', 'rural georgia', 'urban georgia'],
            'Hawaii': ['hawaii', 'honolulu', 'maui', 'rural hawaii', 'urban hawaii'],
            'Idaho': ['idaho', 'boise', 'rural idaho', 'urban idaho'],
            'Illinois': ['illinois', 'chicago', 'springfield', 'rural illinois', 'urban illinois'],
            'Indiana': ['indiana', 'indianapolis', 'rural indiana', 'urban indiana'],
            'Iowa': ['iowa', 'des moines', 'rural iowa', 'urban iowa'],
            'Kansas': ['kansas', 'topeka', 'wichita', 'rural kansas', 'urban kansas'],
            'Kentucky': ['kentucky', 'louisville', 'lexington', 'rural kentucky', 'urban kentucky'],
            'Louisiana': ['louisiana', 'new orleans', 'baton rouge', 'rural louisiana', 'urban louisiana'],
            'Maine': ['maine', 'portland', 'augusta', 'rural maine', 'urban maine'],
            'Maryland': ['maryland', 'baltimore', 'annapolis', 'rural maryland', 'urban maryland'],
            'Massachusetts': ['massachusetts', 'boston', 'cambridge', 'rural massachusetts', 'urban massachusetts'],
            'Michigan': ['michigan', 'detroit', 'lansing', 'rural michigan', 'urban michigan'],
            'Minnesota': ['minnesota', 'minneapolis', 'st. paul', 'rural minnesota', 'urban minnesota'],
            'Mississippi': ['mississippi', 'jackson', 'rural mississippi', 'urban mississippi'],
            'Missouri': ['missouri', 'st. louis', 'kansas city', 'rural missouri', 'urban missouri'],
            'Montana': ['montana', 'helena', 'billings', 'rural montana', 'urban montana'],
            'Nebraska': ['nebraska', 'lincoln', 'omaha', 'rural nebraska', 'urban nebraska'],
            'Nevada': ['nevada', 'las vegas', 'reno', 'rural nevada', 'urban nevada'],
            'New Hampshire': ['new hampshire', 'concord', 'manchester', 'rural new hampshire', 'urban new hampshire'],
            'New Jersey': ['new jersey', 'trenton', 'newark', 'rural new jersey', 'urban new jersey'],
            'New Mexico': ['new mexico', 'santa fe', 'albuquerque', 'rural new mexico', 'urban new mexico'],
            'New York': ['new york', 'nyc', 'albany', 'manhattan', 'rural new york', 'urban new york'],
            'North Carolina': ['north carolina', 'charlotte', 'raleigh', 'rural north carolina', 'urban north carolina'],
            'North Dakota': ['north dakota', 'bismarck', 'fargo', 'rural north dakota', 'urban north dakota'],
            'Ohio': ['ohio', 'columbus', 'cleveland', 'rural ohio', 'urban ohio'],
            'Oklahoma': ['oklahoma', 'oklahoma city', 'tulsa', 'rural oklahoma', 'urban oklahoma'],
            'Oregon': ['oregon', 'portland', 'salem', 'rural oregon', 'urban oregon'],
            'Pennsylvania': ['pennsylvania', 'philadelphia', 'pittsburgh', 'harrisburg', 'rural pennsylvania', 'urban pennsylvania'],
            'Rhode Island': ['rhode island', 'providence', 'rural rhode island', 'urban rhode island'],
            'South Carolina': ['south carolina', 'charleston', 'columbia', 'rural south carolina', 'urban south carolina'],
            'South Dakota': ['south dakota', 'pierre', 'sioux falls', 'rural south dakota', 'urban south dakota'],
            'Tennessee': ['tennessee', 'nashville', 'memphis', 'rural tennessee', 'urban tennessee'],
            'Texas': ['texas', 'houston', 'dallas', 'austin', 'san antonio', 'Fort Worth', 'El Paso', 'rural texas', 'urban texas'],
            'Utah': ['utah', 'salt lake city', 'rural utah', 'urban utah'],
            'Vermont': ['vermont', 'montpelier', 'burlington', 'rural vermont', 'urban vermont'],
            'Virginia': ['virginia', 'richmond', 'virginia beach', 'rural virginia', 'urban virginia'],
            'Washington': ['washington', 'seattle', 'olympia', 'rural washington', 'urban washington'],
            'West Virginia': ['west virginia', 'charleston', 'rural west virginia', 'urban west virginia'],
            'Wisconsin': ['wisconsin', 'madison', 'milwaukee', 'rural wisconsin', 'urban wisconsin'],
            'Wyoming': ['wyoming', 'cheyenne', 'rural wyoming', 'urban wyoming'],
            'Puerto Rico': ['puerto rico', 'san juan', 'hurricane-related health', 'rural puerto rico'],
            'Guam': ['guam'],
            'U.S. Virgin Islands': ['u.s. virgin islands', 'st. thomas']
        },
        'region': {
            'Northeast_usa': ['maine', 'new hampshire', 'vermont', 'massachusetts', 'rhode island', 'connecticut',
                              'new york', 'new jersey', 'pennsylvania','rural northeast', 'urban northeast'],
            'Midwest_usa': ['ohio', 'michigan', 'indiana', 'illinois', 'wisconsin', 'great lakes',
                            'minnesota', 'iowa', 'missouri', 'north dakota', 'south dakota', 'nebraska', 'kansas', 'heartland',
                            'rust belt','rural midwest', 'urban midwest'],
            'Southeast_usa': ['delaware', 'maryland', 'virginia', 'west virginia', 'north carolina', 'south carolina', 'florida',
                          'kentucky', 'tennessee', 'alabama', 'mississippi','arkansas', 'louisiana',
                          'deep south', 'appalachia', 'sun belt','rural southeast', 'urban southeast'],
            'Southwest_usa': ['oklahoma', 'texas', 'new mexico', 'arizona','border states', 'sun belt',
                          'rural southwest', 'urban southwest'],
            'West_usa': ['montana', 'idaho', 'wyoming', 'colorado', 'utah', 'nevada', 'mountain west',
                     'washington', 'oregon', 'california', 'alaska', 'hawaii', 'pacific northwest',
                     'sun belt', 'rural west', 'urban west'],
            'US_Territories': ['puerto rico', 'u.s. virgin islands','guam', 'american samoa', 'northern mariana islands']
        },
        'languages spoken': {
            'english': ['english', 'british english', 'american english'],
            'spanish': ['spanish', 'español', 'castilian'],
            'french': ['french', 'français'],
            'german': ['german', 'deutsch'],
            'chinese': ['chinese', 'mandarin', 'cantonese', 'simplified chinese', 'traditional chinese', 
                        'putonghua'],
            'portuguese': ['portuguese', 'português', 'brazilian portuguese'],
            'arabic': ['arabic', 'arab'],
            'hindi': ['hindi', 'hindustani'],
            'russian': ['russian'],
            'japanese': ['japanese'],
            'italian': ['italian', 'italiano'],
            'korean': ['korean'],
            'dutch': ['dutch', 'nederlands'],
            'turkish': ['turkish', 'türkçe'],
            'swedish': ['swedish', 'svenska'],
            'polish': ['polish', 'polski'],
            'greek': ['greek'],
            'romanian': ['romanian', 'română'],
            'hebrew': ['hebrew'],
            'thai': ['thai'],
            'vietnamese': ['vietnamese'],
            'tagalog': ['tagalog', 'filipino'],
            'persian': ['persian', 'farsi'],
            'urdu': ['urdu'],
            'bengali': ['bengali'],
            'punjabi': ['punjabi'],
            'tamil': ['tamil'],
            'telugu': ['telugu'],
            'malayalam': ['malayalam', 'malay'],
            'indonesian': ['indonesian', 'bahasa indonesia'],
            'finnish': ['finnish', 'suomi'],
            'danish': ['danish', 'dansk'],
            'norwegian': ['norwegian'],
            'indigenous': ['navajo', 'cherokee', 'ojibwe', 'inuktitut', 'hawaiian', 'sami', 'maori'],
            'african': ['swahili', 'yoruba', 'amharic', 'hausa', 'zulu'],
            'sign_languages': ['american sign language', 'ASL', 'british sign language', 'BSL', 'auslan'],
            'creole': ['haitian creole', 'jamaican patois', 'tok pisin'],
        },
        'education level': {
            'High School Degree': ['high school', 'highschool', 'secondary school', 'secondary education', 'hsc', 'matriculation',
                                   'general education diploma', 'a-levels', 'baccalaureate'],
            'Bachelor Degree': ['undergrad', 'undergraduate', 'bachelor', "bachelor's degree",
                         'bachelor of arts', 'bachelor of science','b.tech', 'bachelor of engineering', 'bachelor of technology',
                         'licenciatura', 'laurea'],
            'Masters Degree': ['master', 'masters', 'graduate', "master's degree", 'graduate degree','master of arts', 'master of science',
                        'master of engineering', 'master of technology'],
            'Doctoral Degree': ['doctorate', 'phd', 'doctoral', 'doctoral degree', 'doctoral studies', 'doctor of philosophy',
                         'doctor', 'doctor of medicine', 'juris doctor', 'doctor of pharmacy', 'doctor of nursing practice',
                         'postdoctoral'],
            'Diploma/Certificate': ['associate degree','technical diploma', 'vocational diploma', 'trade school', 'apprenticeship',
                                    'certificate', 'certification program', 'technical training', 'professional certification', 'license']
        },
        'social media usage': {
            'Has_Social_Media': ['facebook', 'twitter', 'x', 'instagram','youtube', 'snapchat', 'pinterest', 'reddit', 'tumblr', 
                                'threads', 'bereal', 'tiktok', 'clubhouse','whatsapp', 'telegram', 'wechat', 'wechat moments', 
                                'kakaotalk', 'line', 'discord','linkedin','twitch','vkontakte', 'odnoklassniki', 'sharechat',
                                 'patientslikeme', 'healthunlocked', 'myfitnesspal'],
            'No_Social_Media': ['no social media', 'off social media', 'quit social media', 'don’t use social media',
                                'not on facebook', 'not on twitter', 'no instagram', 'social media-free',
                                'avoid social media', 'deleted social media', 'deactivated account',
                                'offline', 'private person', 'not on any platform', 'anti-social media',
                                'minimal digital footprint', 'tech minimalist']
        },
        'religion': {
            'Abrahamic_Religions': {
                'Christianity': ['christian', 'christianity', 'jesus', 'jesus christ', 'bible', 'church', 'catholic', 
                                'protestant', 'evangelical', 'orthodox', 'born again', 'pastor', 'gospel', 'holy spirit', 
                                'mormon', 'jehovah’s witness', 'seventh-day adventist', 'pentecostal', 'baptist', 
                                'methodist', 'fasting'],
                'Islam': ['islam', 'muslim', 'quran', 'mosque', 'prophet muhammad', 'ramadan', 'sharia', 
                            'eid', 'hijab', 'fasting'],
                'Judaism': ['jewish', 'judaism', 'torah', 'synagogue', 'rabbi', 'kosher', 'yom kippur', 
                            'hanukkah', 'shabbat', 'talmud']
            },
            'Dharmic_Religions': {
                'Hinduism': ['hindu', 'hinduism', 'karma', 'moksha', 'yoga', 'vedas', 'bhagavad gita', 
                            'upanishads', 'puja', 'diwali', 'holi', 'shiva', 'vishnu', 'krishna', 'ayurveda'],
                'Buddhism': ['buddhist', 'buddhism', 'buddha', 'samsara', 'nirvana', 'dharma', 'vajrayana', 
                            'theravada', 'mahayana'],
                'Sikhism': ['sikh', 'sikhism', 'guru nanak', 'gurdwara', 'khalsa', 'turban', 'guru granth sahib'],
                'Jainism': ['jainism']
            },
            'East_Asian_Religions': {
                'Taoism': ['taoism', 'taoist', 'daoism', 'yin-yang', 'tao te ching'],
                'Shinto': ['shinto', 'kami', 'shrine', 'shintoism', 'torii', 'shinto priest']
            },
            'Other_Religion': {
                'Spirituality': ['spiritual', 'spirituality', 'spiritualism', 'new age', 'esoteric', 
                                'mysticism', 'meditation', 'energy healing'],
                'Agnosticism': ['agnostic', 'agnosticism', 'uncertain', 'agnosticity', 'questioning belief'],
                'Atheism': ['atheist', 'atheism', 'godless', 'secular', 'non-believer', 'freethinker', 'humanist'],
                'Indigenous & Animistic Beliefs': ['native american spirituality', 'shamanism', 'aboriginal dreamtime', 'inuit animism'],
                'Other_Religions': ['baha’i', 'zoroastrianism', 'rastafarianism', 'wicca', 'paganism']
            }
        },
        'marital status': {
            'Currently_Married_or_Partnered': {
                'Married': ['married', 'spouse', 'husband', 'wife', 'partner', 'civil union', 
                            'common law marriage', 'domestic partnership', 'same-sex marriage', 
                            'arranged marriage', 'customary marriage'],
                'Engaged': ['engaged', 'fiancé', 'fiancée', 'betrothed'],
                'Cohabiting': ['cohabiting', 'living together', 'cohabit', 'roommate partner']
            },
            'Not_Married': {
                'Single': ['single', 'unmarried', 'never married', 'bachelor', 'bachelorette'],
                'Divorced or Separated': ['divorced', 'separated', 'ex-husband', 'ex-wife', 'dissolution of marriage', 
                                            'broken up', 'divorcee', 'separated but not divorced', 'pending divorce'],
                'Widowed': ['widowed', 'widower', 'widow', 'lost spouse', 'bereaved partner'],
                'Complicated Relationship': ['it\'s complicated', 'on a break', 'uncertain relationship']
            }
        },
        'profession': {
            'Legal_&_Law_Enforcement': {
                'Law': ['lawyer', 'attorney', 'law', 'jurisprudence', 'court', 'litigation', 
                        'legal counsel', 'barrister', 'solicitor', 'paralegal', 
                        'defense attorney', 'prosecutor', 'advocate'],
                'Police_&_Security': ['police officer', 'detective', 'investigator', 'patrol officer', 
                                    'sheriff', 'security officer', 'FBI agent', 'law enforcement'],
                'Military': ['soldier', 'marine', 'airman', 'navy', 'army', 'military officer', 
                            'veteran', 'combat engineer', 'infantry']
            },
            'Technology_&_Engineering': {
                'Information_Technology': ['information technology', 'software engineer', 'developer', 
                                            'programmer', 'web developer', 'network administrator', 'system analyst', 
                                            'data scientist', 'cloud architect', 'AI engineer', 'ML engineer', 
                                            'cybersecurity specialist', 'DevOps engineer', 'full-stack developer', 
                                            'front-end developer', 'back-end developer', 'database administrator'],
                'Engineering': ['engineer', 'engineering', 'mechanical engineer', 'civil engineer', 
                                'electrical engineer', 'aerospace engineer', 'chemical engineer', 
                                'structural engineer', 'robotics engineer', 'environmental engineer', 
                                'automotive engineer', 'biomedical engineer']
            },
            'Healthcare_&_Social Services': {
                'Medical': ['doctor', 'physician', 'surgeon', 'nurse', 'medical professional', 
                            'healthcare provider', 'general practitioner', 'pediatrician', 'dentist', 
                            'orthopedic', 'radiologist', 'gynecologist', 'psychiatrist', 'anesthesiologist'],
                'Social_Work_&_Counseling': ['social worker', 'counselor', 'case manager', 'therapist', 
                                            'mental health counselor', 'advocate', 'community organizer', 
                                            'child welfare specialist']
            },
            'Education_&_Academia': {
                'Teaching': ['teacher', 'educator', 'professor', 'instructor', 'tutor', 
                            'mentor', 'principal', 'lecturer', 'academic', 
                            'trainer', 'curriculum developer', 'coach'],
                'Student_Roles': ['student', 'intern', 'trainee', 'apprentice']
            },
            'Science_&_Research': {
                'Scientist': ['scientist', 'researcher', 'chemist', 'biologist', 'physicist', 
                                'astronomer', 'laboratory', 'biotechnology', 'research scientist', 
                                'data analyst', 'environmental scientist', 'geneticist', 
                                'neuroscientist', 'pharmacologist']
            },
            'Creative_& Media': {
                'Art_&_Design': ['artist', 'painter', 'sculptor', 'designer', 'photographer', 
                                'illustrator', 'visual artist', 'graphic designer', 
                                'fashion designer', 'digital artist', 'animator', 'video editor'],
                'Writing_&_Journalism': ['writer', 'author', 'journalist', 'novelist', 'content creator', 
                                        'blogger', 'editor', 'poet', 'copywriter', 'scriptwriter', 
                                        'columnist', 'biographer', 'screenwriter']
            },
            'Business_&_Finance': {
                'Entrepreneurship': ['entrepreneur', 'business owner', 'startup', 'founder', 'CEO', 
                                    'businessman', 'businesswoman', 'small business', 
                                    'co-founder', 'investor', 'angel investor', 'startup founder'],
                'Finance': ['accountant', 'auditor', 'investment banker', 'financial analyst', 
                            'CFA', 'wealth manager', 'banker', 'consultant', 'fund manager', 
                            'stockbroker', 'bookkeeper']
            },
            'Skilled_Trades_&_Services': {
                'Construction_&_Technical': ['construction worker', 'contractor', 'builder', 'carpenter', 
                                            'plumber', 'electrician', 'welder', 'architect'],
                'Retail_&_Customer_Service': ['retail worker', 'cashier', 'store manager', 'sales associate', 
                                                'shopkeeper', 'merchandiser', 'customer service'],
                'Hospitality': ['chef', 'cook', 'waiter', 'waitress', 'bartender', 
                                'hotel manager', 'housekeeper', 'concierge']
            },
            'Other Profession': {
                'Self-Employment_&_Gig_Economy': ['self-employed', 'freelancer', 'independent contractor', 'consultant', 
                                                    'gig worker', 'rideshare driver', 'food delivery worker', 'online tutor'],
                'Unemployed': ['unemployed', 'not working', 'job seeker', 'between jobs'],
                'Other_Professions': ['secretary', 'office manager', 'receptionist', 'clerical worker', 
                                        'truck driver', 'pilot', 'bus driver', 'delivery driver', 
                                        'farmer', 'agricultural worker', 'rancher', 'librarian', 
                                        'school counselor', 'firefighter', 'paramedic', 'postal worker']
            }
        },
        'household income classification': {
            'Lower Class': {
                'label': '<= $30,000',
                'range': [1000, 30000],
                'income_terms': ['low income', 'poverty', 'minimum wage', 'below poverty line', 
                                'living paycheck to paycheck', 'low-wage job', 'part-time work', 
                                'unemployment benefits', 'disability benefits']
            },
            'Lower-Middle Class': {
                'label': '$30,001 - $58,020',
                'range': [30001, 58020],
                'income_terms': ['lower-middle class', 'working class', 'modest income', 'entry-level salary', 
                                'starting salary', 'average wage', 'lower middle income', 'blue-collar', 
                                'hourly pay', 'gig worker', 'side hustle']
            },
            'Middle Class': {
                'label': '$58,021 - $94,000',
                'range': [58021, 94000],
                'income_terms': ['middle class', 'middle income', 'average income', 'moderate salary', 
                                'moderate earnings', 'stable income', 'standard wage', 'dual-income household', 
                                'salaried employee', 'white-collar worker', 'median income', 'modest', 'comfortable', 'stable']
            },
            'Upper-Middle Class': {
                'label': '$94,001 - $153,000',
                'range': [94001, 153000],
                'income_terms': ['upper-middle class', 'upper middle income', 'higher salary', 'professional income', 
                                'upper middle wage', 'well-off', 'comfortable income', 'career growth salary', 
                                'six-figure job', 'middle management', 'senior professional']
            },
            'Upper Class': {
                'label': '> $153,000',
                'range': [153001, 999999999],  # Large upper bound for open-ended range
                'income_terms': ['upper class', 'high income', 'luxury', 'six-figure salary', 'wealthy', 'millionaire', 
                                'affluent', 'high earnings', 'top 1%', 'high net worth', 'elite income', 'executive salary', 
                                'C-suite', 'venture capitalist', 'investor', 'entrepreneur', 'business mogul', 
                                'seven-figure salary', 'financially independent', 'trust fund', 'inheritance']
            }
        },
        'housing situation': {
            'Single-Family Home': ['single-family home', 'detached house', 'household', 'private house', 
                        'bungalow', 'ranch house', 'mobile home', 'tiny house'],
            'Two-Family Home': ['two-family home', 'duplex', 'semi-detached house', 'in-law suite'],
            'Three-Family Home': ['three-family home', 'triplex', 'three-family residence', 'multi-unit residence'],
            'Apartment': ['apartment', 'condo', 'flat', 'studio', 'unit', 'rented apartment', 
                            'apartment complex', 'shared apartment', 'loft', 'penthouse', 'co-op', 
                            'high-rise', 'low-rise', 'walk-up'],
            'Shared_Housing': ['shared housing', 'housemates', 'roommates', 'co-living', 'boarding house', 
                                'group housing', 'dormitory', 'hostel', 'fraternity house', 
                                'sorority house', 'shared room', 'communal living'],
            'Homeless': ['homeless', 'no fixed address', 'shelter resident', 'living on the streets', 
                        'houseless', 'temporary housing', 'emergency shelter', 'transitional housing', 
                        'motel', 'couch surfing', 'vehicle living'],
            'Other_Housing': ['nursing home', 'assisted living', 'senior housing', 'military housing', 
                        'on-base housing', 'shanty', 'informal settlement', 'compound', 'village hut'],
            'Unspecified_Housing': ['unspecified residence', 'temporary address', 'housing status unknown']
        }

} 

In [359]:
import pandas as pd
import re 

def extract_age(question):
    """
    Extracts age only if it appears in an age-related phrase (e.g., '18 years old', 'aged 30').
    """
    age_patterns = [
        r'(\d{1,3})\s*years?\s*old',
        r'aged\s*(\d{1,3})',
        r'(\d{1,3})-year-old',
        r'(\d{1,3})\s*yrs?\s*old',
        r'(\d{1,3})\s*yo',
        r'\b(\d{1,3})\b\s*(?:years?|yrs?)?',  # Matches "36" or "36 years"
        r'\bI\'?m\s*(\d{1,3})\b'  # Matches "I’m 36" or "I am 36"
    ]

    for pattern in age_patterns:
        match = re.search(pattern, question, re.IGNORECASE)
        if match:
            age = int(match.group(1))
            return age
    return None

def classify_age(question, keyword_categories):
    """
    Determines the correct age category based on numerical age or keywords.
    """
    if 'age' not in keyword_categories:
        print("Error: 'age' category missing")
        return 'N/A'

    age = extract_age(question)
    if age is not None:
        for subcategory, data in keyword_categories['age'].items():
            if not isinstance(data.get('range'), list):
                print(f"Error: Invalid range for {subcategory}")
                continue
            if age in data['range']:
                #print(f"Age {age} mapped to {subcategory}")  # Debug
                return subcategory

    # Keyword matching with hyphen normalization
    question_words = [word.replace('-', ' ') for word in question.lower().split()]
    for subcategory, data in keyword_categories['age'].items():
        for keyword in data.get('life_stage', []):
            keyword_clean = keyword.replace('-', ' ')
            threshold = 100 if len(keyword) <= 5 else 80  # Lowered threshold
            if threshold == 100:
                if any(keyword_clean.lower() == word.lower() for word in question_words):
                    #print(f"Matched age keyword: {keyword} (exact) -> {subcategory}")  # Debug
                    return subcategory
            else:
                if fuzz and any(fuzz.ratio(keyword_clean.lower(), word.lower()) >= threshold for word in question_words):
                    score = max(fuzz.ratio(keyword_clean.lower(), word.lower()) for word in question_words)
                    #print(f"Matched age keyword: {keyword} (fuzzy, score: {score}) -> {subcategory}")  # Debug
                    return subcategory
    return 'N/A'

In [361]:
def get_region_from_state(question, keyword_categories):
    """
    Identifies a region based on the presence of a state name.
    Uses dynamic similarity threshold:
    - 100% (exact match) for keywords with 5 or fewer letters
    - 90% for keywords longer than 5 letters
    """
    question = question.lower()
    for region, states in keyword_categories['region'].items():
        for state in states:
            threshold = 100 if len(state) <= 5 else 90
            if threshold == 100:
                # Exact match for short keywords
                if any(state.lower() == word.lower() for word in question.split()):
                    return region
            else:
                # Fuzzy match for longer keywords
                if any(fuzz.ratio(state.lower(), word.lower()) >= threshold for word in question.split()):
                    return region
    return 'N/A'


def classify_income(question, keyword_categories):
    import re
    from fuzzywuzzy import fuzz

    income_data = keyword_categories.get('household income classification', {})

    # Step 1: Try numeric income extraction
    income_match = re.search(r'\$?\s*(\d{1,3}(?:,\d{3})*)\s*(?:USD)?\b', question, re.IGNORECASE)
    if income_match:
        income = int(income_match.group(1).replace(',', ''))
        for subcategory, data in income_data.items():
            if isinstance(data, dict) and 'range' in data:
                min_income, max_income = data['range']
                if min_income <= income <= max_income:
                    return subcategory.lower().replace(" ", "_")

    # Step 2: Match income-related keywords
    words = re.findall(r'\w+', question.lower())  # safer split
    for subcategory, data in income_data.items():
        if not isinstance(data, dict) or 'income_terms' not in data:
            continue
        for keyword in data['income_terms']:
            threshold = 100 if len(keyword) <= 5 else 90
            for word in words:
                if threshold == 100:
                    if keyword.lower() == word:
                        return subcategory.lower().replace(" ", "_")
                else:
                    if fuzz.ratio(keyword.lower(), word) >= threshold:
                        return subcategory.lower().replace(" ", "_")

    return 'N/A'

In [363]:
from fuzzywuzzy import fuzz
import pandas as pd
import re

def categorize_question_enhanced(question, keyword_categories, label_size=15, similarity_threshold=90):
    """
    Enhanced version with better phrase matching and debugging.
    """
    if pd.isna(question) or not isinstance(question, str):
        return ['N/A'] * label_size, 0, {category: {} for category in keyword_categories.keys()}
    
    question_original = question
    question = question.lower()
    labels = []
    matched_categories = 0
    category_hits = {category: {} for category in keyword_categories.keys()}
    
    def check_keyword_match(keyword_str, question_text, debug_category=None):
        """
        Enhanced matching function with debugging capability.
        """
        keyword_str = str(keyword_str).lower().strip()
        
        # Direct substring match for phrases
        if keyword_str in question_text:
            if debug_category:
                print(f"  ✓ Direct match: '{keyword_str}' found in text")
            return True
        
        # Normalize hyphens and underscores
        normalized_keyword = keyword_str.replace('-', ' ').replace('_', ' ')
        normalized_question = question_text.replace('-', ' ').replace('_', ' ')
        
        if normalized_keyword in normalized_question:
            if debug_category:
                print(f"  ✓ Normalized match: '{normalized_keyword}' found in normalized text")
            return True
        
        # For very short keywords, require exact word match
        if len(keyword_str) <= 3:
            pattern = r'\b' + re.escape(keyword_str) + r'\b'
            match = bool(re.search(pattern, question_text))
            if debug_category and match:
                print(f"  ✓ Exact word match: '{keyword_str}'")
            return match
        
        # Fuzzy matching for single words
        if ' ' not in normalized_keyword:
            question_words = normalized_question.split()
            for word in question_words:
                if fuzz.ratio(keyword_str, word) >= similarity_threshold:
                    if debug_category:
                        print(f"  ✓ Fuzzy match: '{keyword_str}' matches '{word}' (score: {fuzz.ratio(keyword_str, word)})")
                    return True
        
        return False
    
    for category, subcategories in keyword_categories.items():
        assigned_label = 'N/A'
        debug_this_category = category in ['disability status', 'household income classification']
        
        if debug_this_category:
            print(f"\n=== Debugging category: {category} ===")
            print(f"Question: {question_original}")
        
        if category == 'region':
            assigned_label = get_region_from_state(question, keyword_categories)
            if assigned_label != 'N/A':
                category_hits[category][assigned_label] = 1
                
        elif category == 'age':
            assigned_label = classify_age(question, keyword_categories)
            if assigned_label != 'N/A':
                category_hits[category][assigned_label] = 1
                
        elif category == 'household income classification':
            assigned_label = classify_income(question, keyword_categories)
            if debug_this_category:
                print(f"classify_income returned: {assigned_label}")
            if assigned_label != 'N/A':
                category_hits[category][assigned_label] = 1
            else:
                # If classify_income fails, try keyword matching as fallback
                if debug_this_category:
                    print("classify_income failed, trying keyword matching...")
                
                if isinstance(subcategories, dict):
                    found = False
                    for parent, child_dict in subcategories.items():
                        if found:
                            break
                        if isinstance(child_dict, dict):
                            for child, details in child_dict.items():
                                if 'income_terms' in details:
                                    for keyword in details['income_terms']:
                                        if check_keyword_match(keyword, question, debug_this_category if debug_this_category else None):
                                            assigned_label = parent
                                            category_hits[category][assigned_label] = 1
                                            found = True
                                            if debug_this_category:
                                                print(f"  → Assigned: {assigned_label}")
                                            break
                                if found:
                                    break
                
        elif category == 'languages spoken':
            # Match multiple languages
            matched_languages = set()
            for lang in subcategories:
                if check_keyword_match(lang, question):
                    matched_languages.add(lang)
            
            if matched_languages:
                assigned_label = "_".join(sorted(matched_languages))
                for lang in matched_languages:
                    category_hits[category][lang] = 1
                    
        else:
            if isinstance(subcategories, dict):
                found = False
                for parent, child_dict in subcategories.items():
                    if found:
                        break
                    if debug_this_category:
                        print(f"Checking parent: {parent}")
                    
                    if isinstance(child_dict, dict):  # Two levels deep
                        for child, keywords in child_dict.items():
                            if debug_this_category:
                                print(f"  Checking child: {child}")
                                print(f"    Keywords: {keywords}")
                            
                            for keyword in keywords:
                                if check_keyword_match(keyword, question, debug_this_category if debug_this_category else None):
                                    assigned_label = child  # Use the deepest label
                                    category_hits[category][assigned_label] = 1
                                    found = True
                                    if debug_this_category:
                                        print(f"    → Assigned: {assigned_label}")
                                    break
                            if found:
                                break
                                
                    elif isinstance(child_dict, list):  # One level deep
                        for keyword in child_dict:
                            if check_keyword_match(keyword, question, debug_this_category if debug_this_category else None):
                                assigned_label = parent
                                category_hits[category][assigned_label] = 1
                                found = True
                                if debug_this_category:
                                    print(f"  → Assigned: {assigned_label}")
                                break
                        
            elif isinstance(subcategories, list):  # Flat structure
                for keyword in subcategories:
                    if check_keyword_match(keyword, question):
                        assigned_label = keyword
                        category_hits[category][assigned_label] = 1
                        break
        
        if debug_this_category:
            print(f"Final result for {category}: {assigned_label}")
        
        labels.append(assigned_label)
        if assigned_label != 'N/A':
            matched_categories += 1

    required_size = max(len(keyword_categories), label_size)
    labels = labels[:required_size]
    labels += ['N/A'] * (required_size - len(labels))
    return labels, matched_categories, category_hits


In [365]:
import openai
from openai import OpenAI
from fuzzywuzzy import process
import re

# Categories in fixed order — index matters
categories = [
    "gender", "age", "disability status", "race", "country", "state", "region",
    "languages spoken", "education level", "social media usage", "religion", "marital status",
    "profession", "household income classification", "housing situation"
]

# GPT client
client = OpenAI(api_key="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")  # Replace with your actual key

def flatten_keyword_categories(keyword_categories):
    """Return allowed labels for each category key."""
    allowed_labels = {}
    for category, subcategories in keyword_categories.items():
        label_set = set()
        def collect_labels(node):
            if isinstance(node, dict):
                for key, val in node.items():
                    if isinstance(val, (dict, list)):
                        label_set.add(key)
                        collect_labels(val)
            elif isinstance(node, list):
                for item in node:
                    if isinstance(item, str):
                        label_set.add(item)
                    else:
                        collect_labels(item)
        
        collect_labels(subcategories)
        # If no labels found, use the keys from subcategories
        if not label_set and isinstance(subcategories, dict):
            label_set = set(subcategories.keys())
        
        allowed_labels[category] = label_set
    
    return allowed_labels

def normalize_to_known_label(raw_label, allowed_labels, category_name=""):
    """Normalize extracted value to closest known label if confidence is high enough."""
    if not raw_label or raw_label.lower().strip() in {"n/a", "not mentioned", "none", "unknown", ""}:
        return "n/a"
    
    # Clean the raw label
    raw_clean = raw_label.strip().lower()
    
    # Try exact match first (case insensitive)
    for label in allowed_labels:
        if raw_clean == label.lower().strip():
            return label.lower().strip().replace(" ", "_")
    
    # Try fuzzy matching with lower threshold for better matching
    if allowed_labels:
        match, score = process.extractOne(raw_label, list(allowed_labels))
        print(f"🔍 Fuzzy match for '{raw_label}' -> '{match}' (score: {score})")
        
        if score >= 75:  # Lowered threshold from 85 to 75
            return match.strip().lower().replace(" ", "_")
    
    # If no good match found, return a cleaned version but keep it recognizable
    clean_result = raw_clean.replace(" ", "_")
    print(f"⚠️ No good match found for '{raw_label}' in category '{category_name}'. Returning: {clean_result}")
    return clean_result

def create_qa_prompt(question, category):
    """Create a specific question for each category to extract relevant information."""
    
    qa_prompts = {
        "gender": "What is the gender of the person mentioned in this text? Answer with only the gender (male, female, non-binary, etc.) or 'N/A' if not mentioned.",
        
        "age": "What is the age or age group of the person mentioned in this text? Answer with only the age/age range (adult, elderly, child, teen, etc.) or 'N/A' if not mentioned.",
        
        "disability status": "Is any disability mentioned for the person in this text? Answer with only the type of disability (physical_disability, sensory_disability, cognitive_disability, etc.) or 'N/A' if not mentioned.",
        
        "race": "What is the race or ethnicity of the person mentioned in this text? Answer with only the race/ethnicity or 'N/A' if not mentioned.",
        
        "country": "Which country is mentioned in relation to the person in this text? Answer with only the country name or 'N/A' if not mentioned.",
        
        "state": "Which state or province is mentioned in relation to the person in this text? Answer with only the state/province name or 'N/A' if not mentioned.",
        
        "region": "Which region or area is mentioned in relation to the person in this text? Answer with only the region name or 'N/A' if not mentioned.",
        
        "languages spoken": "What languages does the person speak according to this text? Answer with only the language name(s) or 'N/A' if not mentioned.",
        
        "education level": "What is the education level of the person mentioned in this text? Answer with only the education level (high_school, bachelors_degree, masters_degree, etc.) or 'N/A' if not mentioned.",
        
        "social media usage": "What social media platforms or usage is mentioned for the person in this text? Answer with only the platform name(s) or usage pattern or 'N/A' if not mentioned.",
        
        "religion": "What religion is mentioned for the person in this text? Answer with only the religion name or 'N/A' if not mentioned.",
        
        "marital status": "What is the marital status of the person mentioned in this text? Answer with only the marital status (single, married, divorced, etc.) or 'N/A' if not mentioned.",
        
        "profession": "What is the profession or job of the person mentioned in this text? Answer with only the profession name or 'N/A' if not mentioned.",
        
        "household income classification": "What income level is mentioned for the person in this text? Answer with only the income classification (low_income, middle_class, high_income, etc.) or 'N/A' if not mentioned.",
        
        "housing situation": "What housing situation is mentioned for the person in this text? Answer with only the housing type (apartment, house, etc.) or 'N/A' if not mentioned."
    }
    
    return qa_prompts.get(category, f"What {category} is mentioned for the person in this text? Answer with only the specific value or 'N/A' if not mentioned.")

def ask_gpt_qa(question, category, allowed_labels=None):
    """Use GPT as a QA agent to extract specific category information from the text."""
    qa_question = create_qa_prompt(question, category)
    
    # Add context about allowed values if available
    context_note = ""
    if allowed_labels and len(allowed_labels) > 0:
        examples = list(allowed_labels)[:8]
        context_note = f"\n\nImportant: Try to use one of these standard values if possible: {', '.join(examples)}. If none fit exactly, use the closest match or a brief, specific term."
    
    full_prompt = f"{qa_question}{context_note}\n\nText to analyze:\n\"{question}\""
    
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system", 
                    "content": "You are a precise information extraction assistant. Extract only the specific information requested from the given text. Give very brief, keyword-style answers. If the information is not explicitly mentioned, respond with 'N/A'. Avoid long descriptions or sentences."
                },
                {
                    "role": "user", 
                    "content": full_prompt
                }
            ],
            temperature=0,
            max_tokens=20  # Reduced from 50 to force shorter answers
        )
        
        raw_answer = response.choices[0].message.content.strip()
        
        if raw_answer.lower() in {"n/a", "not mentioned", "none", "unknown"}:
            return "n/a"
            
        if allowed_labels:
            normalized = normalize_to_known_label(raw_answer, allowed_labels, category)
            return f"{normalized} (qa)"         
        else:
            clean_answer = raw_answer.strip().lower().replace(" ", "_")
            return f"{clean_answer} (qa)"
            
    except Exception as e:
        print(f"⚠️ Error from GPT QA API on category '{category}': {e}")
        return "n/a" 

def is_meaningful_result(result):
    """Check if a result is meaningful (not n/a or empty)."""
    if not result:
        return False
    
    result_clean = str(result).strip().lower()
    return result_clean not in {"n/a", "none", "not mentioned", "unknown", ""}

def label_question_with_qa_enhanced(question, keyword_categories):
    """
    Uses keyword matching first, then QA for any categories that return 'n/a'.
    Fixed version that properly checks original pipeline results.
    """
    print(f"\n=== Processing question: {question[:100]}... ===")
    
    # Step 1: Get keyword-based results
    pipeline_labels, matched_categories, category_hits = categorize_question(
        question, keyword_categories, label_size=15, similarity_threshold=90
    )
    
    print(f"Original pipeline results: {pipeline_labels}")
    
    # Step 2: Get allowed labels per category
    allowed_label_map = flatten_keyword_categories(keyword_categories)
    
    # Step 3: Process each category - use QA only if pipeline result is truly empty/n/a
    final_labels = []
    qa_used_categories = []
    
    for i, category in enumerate(categories):
        original_result = pipeline_labels[i] if i < len(pipeline_labels) else None
        
        # Check if the original result is meaningful
        if is_meaningful_result(original_result):
            # Keep the keyword result, just clean it
            clean_result = str(original_result).strip().lower().replace(" ", "_")
            final_labels.append(clean_result)
            print(f"✓ Keeping keyword result for '{category}': {clean_result}")
        else:
            # Use QA for empty/n/a results
            print(f"🔍 Using QA for category '{category}' (pipeline returned: {original_result})")
            allowed_labels = allowed_label_map.get(category, set())
            qa_result = ask_gpt_qa(question, category, allowed_labels)
            
            if qa_result != "n/a":
                qa_used_categories.append(category)
                final_labels.append(qa_result)
                print(f"✓ QA result for '{category}': {qa_result}")
            else:
                final_labels.append("n/a")
                print(f"✗ No result found for '{category}'")
    
    print(f"Final results: {final_labels}")
    print(f"QA was used for: {qa_used_categories}")
    
    return final_labels, matched_categories, category_hits, qa_used_categories

def debug_enhanced_labeling(question, keyword_categories):
    """Debug version that shows what's happening at each step."""
    
    print("=== DEBUGGING ENHANCED LABELING PROCESS ===")
    print(f"Question: {question[:100]}...")
    
    # Step 1: Get keyword results
    pipeline_labels, matched_categories, category_hits = categorize_question(
        question, keyword_categories, label_size=15, similarity_threshold=90
    )
    
    print(f"Original pipeline results: {pipeline_labels}")
    
    # Step 2: Show detailed analysis for each category
    allowed_label_map = flatten_keyword_categories(keyword_categories)
    
    for i, category in enumerate(categories):
        if i < len(pipeline_labels):
            result = pipeline_labels[i]
            meaningful = is_meaningful_result(result)
            allowed = allowed_label_map.get(category, set())
            
            print(f"Category {i} ({category}):")
            print(f"  Original result: '{result}'")
            print(f"  Is meaningful: {meaningful}")
            print(f"  Allowed labels: {list(allowed)[:5]}...")  # Show first 5
            print(f"  Will use: {'Keyword' if meaningful else 'QA'}")
    
    # Step 3: Process normally
    final_labels, matched_categories, category_hits, qa_used = label_question_with_qa_enhanced(
        question, keyword_categories
    )
    
    print("=" * 50)
    
    return final_labels, matched_categories, category_hits, qa_used

# Example usage
if __name__ == "__main__":
    # Test with your example
    test_question = "The patient is a 45-year-old male who works as a high school science teacher, holding a Master's degree in Education. He has been diagnosed with moderate hearing loss and utilizes hearing aids in both ears to assist with communication in his professional and daily life. Despite his condition, he maintains a middle-class household income, which he attributes to his steady employment and his spouse's part-time work."
    
    print("Fixed behavior:")
    print("- QA will only be used for categories where keyword matching returns empty/n/a results")
    print("- QA responses will be normalized to keyword format using allowed labels")
    print("- Original meaningful keyword results will be preserved")
    print("- Better fuzzy matching for QA normalization")
    print()
    
    # You would call this like:
    # debug_enhanced_labeling(test_question, your_keyword_categories)

Fixed behavior:
- QA will only be used for categories where keyword matching returns empty/n/a results
- QA responses will be normalized to keyword format using allowed labels
- Original meaningful keyword results will be preserved
- Better fuzzy matching for QA normalization



In [367]:
def process_csv(input_file, output_file, keyword_categories, similarity_threshold=80):
    """
    Reads a CSV file, applies labeling to the 'question' column, and saves a new CSV.
    Includes fuzzy matching statistics for all categories.
    """
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
        return

    if 'question' not in df.columns:     # change when changing csv's
        print("Error: The 'question' column is missing in the input CSV.")
        return

    total_questions = len(df)
    results = df.apply(lambda row: label_question_with_qa_enhanced(
        question=row['question'],
        keyword_categories=keyword_categories,
        #ground_truth_labels=row['ground_truth_labels']
    ), axis=1)
    df[['labels', 'matched_categories', 'category_hits', 'qa_used']] = pd.DataFrame(results.tolist(), index=df.index)

    # Aggregate subcategory hits
    category_hits_list = df.pop('category_hits').tolist()
    subcategory_totals = {}
    for category in keyword_categories.keys():
        subcategory_totals[category] = {}
        for hit in category_hits_list:
            for subcategory, value in hit[category].items():
                subcategory_totals[category][subcategory] = subcategory_totals[category].get(subcategory, 0) + value

    # Calculate percentages
    subcategory_percentages = {}
    for category in keyword_categories.keys():
       subcategory_percentages[category] = {
       subcategory: (count / total_questions) * 100
       for subcategory, count in subcategory_totals[category].items()
    }

    total_categories_matched = df['matched_categories'].sum()
    recognition_percentage = (total_categories_matched / (total_questions * len(keyword_categories))) * 100

    df.to_csv(output_file, index=False)

    print(f"Recognition Percentage: {recognition_percentage:.2f}%")
    print("\nSubcategory-wise hit counts and percentages:")
    for category in keyword_categories.keys():
        print(f"\nCategory: {category}")
        for subcategory, count in subcategory_totals[category].items():
            print(f"  {subcategory}: {count} questions matched ({subcategory_percentages[category][subcategory]:.2f}%)")

input_csv = 'synthetic_patient_descriptions_and_ground_truth.csv'
output_csv = 'synthetic_patient_descriptions_labels_reroute.csv'
process_csv(input_csv, output_csv, keyword_categories, similarity_threshold=90)


=== Processing question: Maria Juarez, a 55-year-old woman of Mexican descent, resides in a rural area of New Mexico. She rec... ===
Original pipeline results: ['female', 'adult', 'N/A', 'central_american', 'mexico', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'Spirituality', 'N/A', 'N/A', 'N/A', 'N/A']
✓ Keeping keyword result for 'gender': female
✓ Keeping keyword result for 'age': adult
🔍 Using QA for category 'disability status' (pipeline returned: N/A)
✓ QA result for 'disability status': physical_disability (qa)
✓ Keeping keyword result for 'race': central_american
✓ Keeping keyword result for 'country': mexico
🔍 Using QA for category 'state' (pipeline returned: N/A)
✓ QA result for 'state': new_mexico (qa)
🔍 Using QA for category 'region' (pipeline returned: N/A)
✓ QA result for 'region': new_mexico (qa)
🔍 Using QA for category 'languages spoken' (pipeline returned: N/A)
✗ No result found for 'languages spoken'
🔍 Using QA for category 'education level' (pipeline returned: N/A)
✗ No resu