In [123]:
import pandas as pd
from fuzzywuzzy import fuzz

# Load the CSV file
df = pd.read_csv('synthetic_patient_descriptions_gpt4o.csv')

# Display the first few rows to check the structure
df.head()


Unnamed: 0,question,included_fields
0,As a 34-year-old woman who primarily speaks Sp...,"religion, age, languages, gender"
1,I'm a 45-year-old female software engineer liv...,"profession, religion, college degree, gender, ..."
2,As a Black software developer from the Pacific...,"race, disability status, profession, region"
3,As a 34-year-old Black woman with a degree in ...,"race, college degree, country, marital status"
4,As a 34-year-old software developer from Canad...,"languages, country, profession, social media, ..."


In [125]:
print(df.columns)


Index(['question', 'included_fields'], dtype='object')


In [127]:
print(df.head())


                                            question  \
0  As a 34-year-old woman who primarily speaks Sp...   
1  I'm a 45-year-old female software engineer liv...   
2  As a Black software developer from the Pacific...   
3  As a 34-year-old Black woman with a degree in ...   
4  As a 34-year-old software developer from Canad...   

                                     included_fields  
0                   religion, age, languages, gender  
1  profession, religion, college degree, gender, ...  
2        race, disability status, profession, region  
3      race, college degree, country, marital status  
4  languages, country, profession, social media, ...  


In [129]:
keyword_categories = {
        'gender': {
            'male': ['he', 'him', 'father', 'brother', 'husband', 'dad', 'son','boy', 'man', 'gentleman', 'masculine', 'sir'],
            'female': ['she', 'her', 'mother', 'sister', 'wife', 'mom', 'daughter', 
                       'pregnancy','female', 'girl', 'woman', 'lady', 'feminine', 'madam'],
            'other gender': ['they', 'them', 'transgender', 'non-binary', 'gender identity', 'genderqueer', 
                            'genderfluid', 'agender', 'bigender', 'two-spirit', 
                            'demiboy', 'demigirl', 'androgyne', 'neutrois']
        },       
        'age': {
            'young': {
                'range': list(range(0, 18)),
                'life_stage': ['infant', 'toddler', 'child', 'teenager', 'minor']
            },
            'adult': {
                'range': list(range(18, 65)),
                'life_stage': ['young adult', 'adult', 'middle-aged']
            },
            'senior': {
                'range': list(range(65, 120)),
                'life_stage': ['elderly', 'senior citizen', 'retiree']
            }
        },
        'disability_status': {
            'has_disability': {
                'neurodevelopmental_disability': ['autism', 'Down syndrome', 'cerebral palsy', 'epilepsy', 'dyslexia', 'ADHD'],
                'mental_health_conditions': ['PTSD', 'anxiety disorder', 'mental health condition'],
                'speech_and_cognitive_disability': ['speech disorder', 'cognitive disability', 
                                                      'intellectual disability', 'developmental disability'],
                'sensory_disability': ['hearing impaired', 'visually impaired', 'deaf', 'blind'],
                'physical_disability': ['mobility aid', 'wheelchair', 'amputee', 'prosthetic', 'temporary disability',
                                        'post-surgery recovery', 'temporary mobility aid', 'chronic pain'],
                'assistive_supports': ['assistive device', 'service animal', 'braille', 'sign language',
                                        'rehabilitation', 'inclusion support'],
                'general_disability': ['disabled', 'impairment', 'handicap', 'challenged', 'accessible',
                                        'accessibility', 'special needs', 'neurodiverse', 'disability']
            },
            'does_not_have_disability': {
                'has_no_disabuility': ['able-bodied', 'no disability', 'not disabled', 'fully abled',
                                        'physically fit', 'unimpaired', 'independent mobility', 'no special needs']
            }
        },
        'race': {
            'asian': {
                'east_asian': [
                    'chinese', 'japanese', 'korean', 'mongolian', 'taiwanese', 'hongkonger', 'macanese'
                ],
                'south_asian': [
                    'indian', 'pakistani', 'bangladeshi', 'nepali', 'sri lankan', 'maldivian', 'bhutanese'
                ],
                'southeast_asian': [
                    'filipino', 'vietnamese', 'thai', 'burmese', 'cambodian', 'laotian', 'malaysian', 'indonesian', 'bruneian', 'singaporean', 'east timorese'
                ]
            },
            'black_or_african_descent': {
                'african': [
                    'nigerian', 'ethiopian', 'ghanaian', 'kenyan', 'south african', 'somali', 'congolese', 'sudanese', 'zimbabwean', 'ugandan', 'tanzanian'
                ],
                'african_american': [
                    'african-american', 'black'
                ],
                'caribbean': [
                    'jamaican', 'haitian', 'barbadian', 'trinidadian', 'afro-caribbean', 'indo-caribbean'
                ],
                'afro_latino': [
                    'afro-latino', 'afro-brazilian', 'afro-cuban', 'afro-colombian'
                ]
            },
            'white': {
                'north_american': [
                    'american', 'canadian'
                ],
                'european': [
                    'british', 'irish', 'german', 'french', 'italian', 'spanish', 'portuguese', 'dutch',
                    'greek', 'swedish', 'norwegian', 'finnish', 'danish', 'polish', 'russian', 'ukrainian',
                    'czech', 'hungarian', 'serbian', 'romanian', 'bulgarian'
                ],
                'australian_new_zealand': [
                    'australian', 'new zealander'
                ]
            },
            'latinx_or_hispanic': {
                'central_american': [
                    'mexican', 'guatemalan', 'honduran', 'nicaraguan', 'salvadoran', 'panamanian', 'costa rican'
                ],
                'caribbean_hispanic': [
                    'puerto rican', 'cuban', 'dominican'
                ],
                'south_american': [
                    'colombian', 'argentinian', 'chilean', 'peruvian', 'venezuelan', 'ecuadorian', 'paraguayan', 'bolivian', 'uruguayan'
                ],
                'hispanic': [
                    'hispanic'
                ],
                'latino': [
                    'latino', 'latina', 'latinx'
                ]
            },
            'indigenous': {
                'native_north_american': [
                    'native american', 'first nations', 'alaskan native'
                ],
                'indigenous_central_south_american': [
                    'quechua', 'aymara', 'mapuche', 'guarani'
                ],
                'indigenous_oceanian': [
                    'aboriginal australian', 'torres strait islander', 'maori'
                ],
                'indigenous_arctic': [
                    'inuit', 'sami', 'yupik', 'aleut'
                ]
            },
            'middle_eastern_or_north_african': {
                'middle_eastern': [
                    'arab', 'persian', 'kurdish', 'turkish', 'assyrian', 'armenian', 'azerbaijani'
                ],
                'north_african': [
                    'berber', 'egyptian', 'moroccan', 'algerian', 'tunisian', 'libyan'
                ]
            },
            'pacific_islander': {
                'polynesian': [
                    'hawaiian', 'samoan', 'tongan', 'tahitian'
                ],
                'melanesian': [
                    'fijian', 'papuan', 'solomon islander', 'ni-vanuatu'
                ],
                'micronesian': [
                    'guamanian', 'marshallese', 'kiribati', 'palauan'
                ],
            },
            'jewish': ['jewish', 'ashkenazi', 'sephardic', 'mizrahim'],
            'roma': ['roma', 'romani', 'gypsy'],
            'multiracial_or_other': {
                'multiracial': ['biracial', 'mixed race']
            }
        },
        'country': {
            'usa': ['united states', 'usa', 'u.s.', 'united states of america', 'american','rural america', 'urban america'],
            'canada': ['canada', 'canadian','rural canada', 'urban canada'],
            'uk': ['united kingdom', 'uk', 'britain', 'england', 'scotland', 'wales', 'northern ireland','british'],
            'germany': ['germany', 'deutschland', 'german','berlin'],
            'france': ['france', 'french','paris'],
            'india':['india', 'bharat', 'hindustan','indian','rural india', 'urban india'],
            'china': ['china','chinese','beijing'],
            'japan': ['japan','japanese','tokyo'],
            'australia': ['australia','australian', 'aussie'],
            'brazil': ['brazil','brazilian'],
            'mexico': ['mexico','mexican'],
            'italy': ['italy','italian','rome'],
            'spain': ['spain','spanish','madrid'],
            'russia': ['russia','russian','moscow'],
            'south africa': ['south africa','south african'],
            'other_country': ['nigeria', 'ethiopia', 'kenya', 'saudi arabia', 'iran', 'pakistan',
                        'bangladesh', 'philippines', 'vietnam', 'colombia', 'argentina', 'peru']
        },
        'state': {
            'Alabama': ['alabama', 'birmingham', 'montgomery', 'rural alabama', 'urban alabama'],
            'Alaska': ['alaska', 'anchorage', 'juneau', 'rural alaska', 'urban alaska'],
            'Arizona': ['arizona', 'phoenix', 'tucson', 'rural arizona', 'urban arizona'],
            'Arkansas': ['arkansas', 'little rock', 'rural arkansas', 'urban arkansas'],
            'California': ['california', 'cali', 'los angeles', 'san francisco', 
                           'sacramento', 'San Diego', 'Oakland', 'rural california', 'urban california'],
            'Colorado': ['colorado', 'denver', 'boulder', 'rural colorado', 'urban colorado'],
            'Connecticut': ['connecticut', 'hartford', 'new haven', 'rural connecticut', 'urban connecticut'],
            'Delaware': ['delaware', 'dover', 'wilmington', 'rural delaware', 'urban delaware'],
            'Florida': ['florida', 'fl', 'miami', 'orlando', 'tampa', 'Jacksonville', 'rural florida', 'urban florida'],
            'Georgia': ['georgia', 'atlanta', 'savannah', 'rural georgia', 'urban georgia'],
            'Hawaii': ['hawaii', 'honolulu', 'maui', 'rural hawaii', 'urban hawaii'],
            'Idaho': ['idaho', 'boise', 'rural idaho', 'urban idaho'],
            'Illinois': ['illinois', 'chicago', 'springfield', 'rural illinois', 'urban illinois'],
            'Indiana': ['indiana', 'indianapolis', 'rural indiana', 'urban indiana'],
            'Iowa': ['iowa', 'des moines', 'rural iowa', 'urban iowa'],
            'Kansas': ['kansas', 'topeka', 'wichita', 'rural kansas', 'urban kansas'],
            'Kentucky': ['kentucky', 'louisville', 'lexington', 'rural kentucky', 'urban kentucky'],
            'Louisiana': ['louisiana', 'new orleans', 'baton rouge', 'rural louisiana', 'urban louisiana'],
            'Maine': ['maine', 'portland', 'augusta', 'rural maine', 'urban maine'],
            'Maryland': ['maryland', 'baltimore', 'annapolis', 'rural maryland', 'urban maryland'],
            'Massachusetts': ['massachusetts', 'boston', 'cambridge', 'rural massachusetts', 'urban massachusetts'],
            'Michigan': ['michigan', 'detroit', 'lansing', 'rural michigan', 'urban michigan'],
            'Minnesota': ['minnesota', 'minneapolis', 'st. paul', 'rural minnesota', 'urban minnesota'],
            'Mississippi': ['mississippi', 'jackson', 'rural mississippi', 'urban mississippi'],
            'Missouri': ['missouri', 'st. louis', 'kansas city', 'rural missouri', 'urban missouri'],
            'Montana': ['montana', 'helena', 'billings', 'rural montana', 'urban montana'],
            'Nebraska': ['nebraska', 'lincoln', 'omaha', 'rural nebraska', 'urban nebraska'],
            'Nevada': ['nevada', 'las vegas', 'reno', 'rural nevada', 'urban nevada'],
            'New Hampshire': ['new hampshire', 'concord', 'manchester', 'rural new hampshire', 'urban new hampshire'],
            'New Jersey': ['new jersey', 'trenton', 'newark', 'rural new jersey', 'urban new jersey'],
            'New Mexico': ['new mexico', 'santa fe', 'albuquerque', 'rural new mexico', 'urban new mexico'],
            'New York': ['new york', 'nyc', 'albany', 'manhattan', 'rural new york', 'urban new york'],
            'North Carolina': ['north carolina', 'charlotte', 'raleigh', 'rural north carolina', 'urban north carolina'],
            'North Dakota': ['north dakota', 'bismarck', 'fargo', 'rural north dakota', 'urban north dakota'],
            'Ohio': ['ohio', 'columbus', 'cleveland', 'rural ohio', 'urban ohio'],
            'Oklahoma': ['oklahoma', 'oklahoma city', 'tulsa', 'rural oklahoma', 'urban oklahoma'],
            'Oregon': ['oregon', 'portland', 'salem', 'rural oregon', 'urban oregon'],
            'Pennsylvania': ['pennsylvania', 'philadelphia', 'pittsburgh', 'harrisburg', 'rural pennsylvania', 'urban pennsylvania'],
            'Rhode Island': ['rhode island', 'providence', 'rural rhode island', 'urban rhode island'],
            'South Carolina': ['south carolina', 'charleston', 'columbia', 'rural south carolina', 'urban south carolina'],
            'South Dakota': ['south dakota', 'pierre', 'sioux falls', 'rural south dakota', 'urban south dakota'],
            'Tennessee': ['tennessee', 'nashville', 'memphis', 'rural tennessee', 'urban tennessee'],
            'Texas': ['texas', 'houston', 'dallas', 'austin', 'san antonio', 'Fort Worth', 'El Paso', 'rural texas', 'urban texas'],
            'Utah': ['utah', 'salt lake city', 'rural utah', 'urban utah'],
            'Vermont': ['vermont', 'montpelier', 'burlington', 'rural vermont', 'urban vermont'],
            'Virginia': ['virginia', 'richmond', 'virginia beach', 'rural virginia', 'urban virginia'],
            'Washington': ['washington', 'seattle', 'olympia', 'rural washington', 'urban washington'],
            'West Virginia': ['west virginia', 'charleston', 'rural west virginia', 'urban west virginia'],
            'Wisconsin': ['wisconsin', 'madison', 'milwaukee', 'rural wisconsin', 'urban wisconsin'],
            'Wyoming': ['wyoming', 'cheyenne', 'rural wyoming', 'urban wyoming'],
            'Puerto Rico': ['puerto rico', 'san juan', 'hurricane-related health', 'rural puerto rico'],
            'Guam': ['guam'],
            'U.S. Virgin Islands': ['u.s. virgin islands', 'st. thomas']
        },
        'region': {
            'Northeast_usa': ['maine', 'new hampshire', 'vermont', 'massachusetts', 'rhode island', 'connecticut',
                              'new york', 'new jersey', 'pennsylvania','rural northeast', 'urban northeast'],
            'Midwest_usa': ['ohio', 'michigan', 'indiana', 'illinois', 'wisconsin', 'great lakes',
                            'minnesota', 'iowa', 'missouri', 'north dakota', 'south dakota', 'nebraska', 'kansas', 'heartland',
                            'rust belt','rural midwest', 'urban midwest'],
            'Southeast_usa': ['delaware', 'maryland', 'virginia', 'west virginia', 'north carolina', 'south carolina', 'florida',
                          'kentucky', 'tennessee', 'alabama', 'mississippi','arkansas', 'louisiana',
                          'deep south', 'appalachia', 'sun belt','rural southeast', 'urban southeast'],
            'Southwest_usa': ['oklahoma', 'texas', 'new mexico', 'arizona','border states', 'sun belt',
                          'rural southwest', 'urban southwest'],
            'West_usa': ['montana', 'idaho', 'wyoming', 'colorado', 'utah', 'nevada', 'mountain west',
                     'washington', 'oregon', 'california', 'alaska', 'hawaii', 'pacific northwest',
                     'sun belt', 'rural west', 'urban west'],
            'US_Territories': ['puerto rico', 'u.s. virgin islands','guam', 'american samoa', 'northern mariana islands']
        },
        'languages': {
            'english': ['english', 'british english', 'american english'],
            'spanish': ['spanish', 'español', 'castilian'],
            'french': ['french', 'français'],
            'german': ['german', 'deutsch'],
            'chinese': ['chinese', 'mandarin', 'cantonese', 'simplified chinese', 'traditional chinese', 
                        'putonghua'],
            'portuguese': ['portuguese', 'português', 'brazilian portuguese'],
            'arabic': ['arabic', 'arab'],
            'hindi': ['hindi', 'hindustani'],
            'russian': ['russian'],
            'japanese': ['japanese'],
            'italian': ['italian', 'italiano'],
            'korean': ['korean'],
            'dutch': ['dutch', 'nederlands'],
            'turkish': ['turkish', 'türkçe'],
            'swedish': ['swedish', 'svenska'],
            'polish': ['polish', 'polski'],
            'greek': ['greek'],
            'romanian': ['romanian', 'română'],
            'hebrew': ['hebrew'],
            'thai': ['thai'],
            'vietnamese': ['vietnamese'],
            'tagalog': ['tagalog', 'filipino'],
            'persian': ['persian', 'farsi'],
            'urdu': ['urdu'],
            'bengali': ['bengali'],
            'punjabi': ['punjabi'],
            'tamil': ['tamil'],
            'telugu': ['telugu'],
            'malayalam': ['malayalam', 'malay'],
            'indonesian': ['indonesian', 'bahasa indonesia'],
            'finnish': ['finnish', 'suomi'],
            'danish': ['danish', 'dansk'],
            'norwegian': ['norwegian'],
            'indigenous': ['navajo', 'cherokee', 'ojibwe', 'inuktitut', 'hawaiian', 'sami', 'maori'],
            'african': ['swahili', 'yoruba', 'amharic', 'hausa', 'zulu'],
            'sign_languages': ['american sign language', 'ASL', 'british sign language', 'BSL', 'auslan'],
            'creole': ['haitian creole', 'jamaican patois', 'tok pisin'],
        },
        'college_degree': {
            'High School Degree': ['high school', 'highschool', 'secondary school', 'secondary education', 'hsc', 'matriculation',
                                   'general education diploma', 'a-levels', 'baccalaureate'],
            'Bachelor Degree': ['undergrad', 'undergraduate', 'bachelor', "bachelor's degree",
                         'bachelor of arts', 'bachelor of science','b.tech', 'bachelor of engineering', 'bachelor of technology',
                         'licenciatura', 'laurea'],
            'Masters Degree': ['master', 'masters', 'graduate', "master's degree", 'graduate degree','master of arts', 'master of science',
                        'master of engineering', 'master of technology'],
            'Doctoral Degree': ['doctorate', 'phd', 'doctoral', 'doctoral degree', 'doctoral studies', 'doctor of philosophy',
                         'doctor', 'doctor of medicine', 'juris doctor', 'doctor of pharmacy', 'doctor of nursing practice',
                         'postdoctoral'],
            'Diploma/Certificate': ['associate degree','technical diploma', 'vocational diploma', 'trade school', 'apprenticeship',
                                    'certificate', 'certification program', 'technical training', 'professional certification', 'license']
        },
        'Social_Media': {
            'Has Social Media': ['facebook', 'twitter', 'x', 'instagram','youtube', 'snapchat', 'pinterest', 'reddit', 'tumblr', 
                                'threads', 'bereal', 'tiktok', 'clubhouse','whatsapp', 'telegram', 'wechat', 'wechat moments', 
                                'kakaotalk', 'line', 'discord','linkedin','twitch','vkontakte', 'odnoklassniki', 'sharechat',
                                 'patientslikeme', 'healthunlocked', 'myfitnesspal'],
            'No Social Media': ['no social media', 'off social media', 'quit social media', 'don’t use social media',
                                'not on facebook', 'not on twitter', 'no instagram', 'social media-free',
                                'avoid social media', 'deleted social media', 'deactivated account',
                                'offline', 'private person', 'not on any platform', 'anti-social media',
                                'minimal digital footprint', 'tech minimalist']
        },
        'religion': {
            'Abrahamic Religions': {
                'Christianity': ['christian', 'christianity', 'jesus', 'jesus christ', 'bible', 'church', 'catholic', 
                                'protestant', 'evangelical', 'orthodox', 'born again', 'pastor', 'gospel', 'holy spirit', 
                                'mormon', 'jehovah’s witness', 'seventh-day adventist', 'pentecostal', 'baptist', 
                                'methodist', 'fasting'],
                'Islam': ['islam', 'muslim', 'quran', 'mosque', 'prophet muhammad', 'ramadan', 'sharia', 
                            'eid', 'hijab', 'fasting'],
                'Judaism': ['jewish', 'judaism', 'torah', 'synagogue', 'rabbi', 'kosher', 'yom kippur', 
                            'hanukkah', 'shabbat', 'talmud']
            },
            'Dharmic Religions': {
                'Hinduism': ['hindu', 'hinduism', 'karma', 'moksha', 'yoga', 'vedas', 'bhagavad gita', 
                            'upanishads', 'puja', 'diwali', 'holi', 'shiva', 'vishnu', 'krishna', 'ayurveda'],
                'Buddhism': ['buddhist', 'buddhism', 'buddha', 'samsara', 'nirvana', 'dharma', 'vajrayana', 
                            'theravada', 'mahayana'],
                'Sikhism': ['sikh', 'sikhism', 'guru nanak', 'gurdwara', 'khalsa', 'turban', 'guru granth sahib'],
                'Jainism': ['jainism']
            },
            'East Asian Religions': {
                'Taoism': ['taoism', 'taoist', 'daoism', 'yin-yang', 'tao te ching'],
                'Shinto': ['shinto', 'kami', 'shrine', 'shintoism', 'torii', 'shinto priest']
            },
            'Other Religion': {
                'Spirituality': ['spiritual', 'spirituality', 'spiritualism', 'new age', 'esoteric', 
                                'mysticism', 'meditation', 'energy healing'],
                'Agnosticism': ['agnostic', 'agnosticism', 'uncertain', 'agnosticity', 'questioning belief'],
                'Atheism': ['atheist', 'atheism', 'godless', 'secular', 'non-believer', 'freethinker', 'humanist'],
                'Indigenous & Animistic Beliefs': ['native american spirituality', 'shamanism', 'aboriginal dreamtime', 'inuit animism'],
                'Other Religions': ['baha’i', 'zoroastrianism', 'rastafarianism', 'wicca', 'paganism']
            }
        },
        'marital status': {
            'Currently Married or Partnered': {
                'Married': ['married', 'spouse', 'husband', 'wife', 'partner', 'civil union', 
                            'common law marriage', 'domestic partnership', 'same-sex marriage', 
                            'arranged marriage', 'customary marriage'],
                'Engaged': ['engaged', 'fiancé', 'fiancée', 'betrothed'],
                'Cohabiting': ['cohabiting', 'living together', 'cohabit', 'roommate partner']
            },
            'Not Currently Married': {
                'Single': ['single', 'unmarried', 'never married', 'bachelor', 'bachelorette'],
                'Divorced or Separated': ['divorced', 'separated', 'ex-husband', 'ex-wife', 'dissolution of marriage', 
                                            'broken up', 'divorcee', 'separated but not divorced', 'pending divorce'],
                'Widowed': ['widowed', 'widower', 'widow', 'lost spouse', 'bereaved partner'],
                'Complicated Relationship': ['it\'s complicated', 'on a break', 'uncertain relationship']
            }
        },
        'profession': {
            'Legal & Law Enforcement': {
                'Law': ['lawyer', 'attorney', 'law', 'jurisprudence', 'court', 'litigation', 
                        'legal counsel', 'barrister', 'solicitor', 'paralegal', 
                        'defense attorney', 'prosecutor', 'advocate'],
                'Police & Security': ['police officer', 'detective', 'investigator', 'patrol officer', 
                                    'sheriff', 'security officer', 'FBI agent', 'law enforcement'],
                'Military': ['soldier', 'marine', 'airman', 'navy', 'army', 'military officer', 
                            'veteran', 'combat engineer', 'infantry']
            },
            'Technology & Engineering': {
                'Information Technology': ['information technology', 'software engineer', 'developer', 
                                            'programmer', 'web developer', 'network administrator', 'system analyst', 
                                            'data scientist', 'cloud architect', 'AI engineer', 'ML engineer', 
                                            'cybersecurity specialist', 'DevOps engineer', 'full-stack developer', 
                                            'front-end developer', 'back-end developer', 'database administrator'],
                'Engineering': ['engineer', 'engineering', 'mechanical engineer', 'civil engineer', 
                                'electrical engineer', 'aerospace engineer', 'chemical engineer', 
                                'structural engineer', 'robotics engineer', 'environmental engineer', 
                                'automotive engineer', 'biomedical engineer']
            },
            'Healthcare & Social Services': {
                'Medical': ['doctor', 'physician', 'surgeon', 'nurse', 'medical professional', 
                            'healthcare provider', 'general practitioner', 'pediatrician', 'dentist', 
                            'orthopedic', 'radiologist', 'gynecologist', 'psychiatrist', 'anesthesiologist'],
                'Social Work & Counseling': ['social worker', 'counselor', 'case manager', 'therapist', 
                                            'mental health counselor', 'advocate', 'community organizer', 
                                            'child welfare specialist']
            },
            'Education & Academia': {
                'Teaching': ['teacher', 'educator', 'professor', 'instructor', 'tutor', 
                            'mentor', 'principal', 'lecturer', 'academic', 
                            'trainer', 'curriculum developer', 'coach'],
                'Student Roles': ['student', 'intern', 'trainee', 'apprentice']
            },
            'Science & Research': {
                'Scientists': ['scientist', 'researcher', 'chemist', 'biologist', 'physicist', 
                                'astronomer', 'laboratory', 'biotechnology', 'research scientist', 
                                'data analyst', 'environmental scientist', 'geneticist', 
                                'neuroscientist', 'pharmacologist']
            },
            'Creative & Media': {
                'Art & Design': ['artist', 'painter', 'sculptor', 'designer', 'photographer', 
                                'illustrator', 'visual artist', 'graphic designer', 
                                'fashion designer', 'digital artist', 'animator', 'video editor'],
                'Writing & Journalism': ['writer', 'author', 'journalist', 'novelist', 'content creator', 
                                        'blogger', 'editor', 'poet', 'copywriter', 'scriptwriter', 
                                        'columnist', 'biographer', 'screenwriter']
            },
            'Business & Finance': {
                'Entrepreneurship': ['entrepreneur', 'business owner', 'startup', 'founder', 'CEO', 
                                    'businessman', 'businesswoman', 'small business', 
                                    'co-founder', 'investor', 'angel investor', 'startup founder'],
                'Finance': ['accountant', 'auditor', 'investment banker', 'financial analyst', 
                            'CFA', 'wealth manager', 'banker', 'consultant', 'fund manager', 
                            'stockbroker', 'bookkeeper']
            },
            'Skilled Trades & Services': {
                'Construction & Technical': ['construction worker', 'contractor', 'builder', 'carpenter', 
                                            'plumber', 'electrician', 'welder', 'architect'],
                'Retail & Customer Service': ['retail worker', 'cashier', 'store manager', 'sales associate', 
                                                'shopkeeper', 'merchandiser', 'customer service'],
                'Hospitality': ['chef', 'cook', 'waiter', 'waitress', 'bartender', 
                                'hotel manager', 'housekeeper', 'concierge']
            },
            'Other Profession': {
                'Self-Employment & Gig Economy': ['self-employed', 'freelancer', 'independent contractor', 'consultant', 
                                                    'gig worker', 'rideshare driver', 'food delivery worker', 'online tutor'],
                'Unemployed': ['unemployed', 'not working', 'job seeker', 'between jobs'],
                'Other Professions': ['secretary', 'office manager', 'receptionist', 'clerical worker', 
                                        'truck driver', 'pilot', 'bus driver', 'delivery driver', 
                                        'farmer', 'agricultural worker', 'rancher', 'librarian', 
                                        'school counselor', 'firefighter', 'paramedic', 'postal worker']
            }
        },
        'income': {
            'Lower Class': {
                'label': '<= $30,000',
                'range': [1000, 30000],
                'income_terms': ['low income', 'poverty', 'minimum wage', 'below poverty line', 
                                'living paycheck to paycheck', 'low-wage job', 'part-time work', 
                                'unemployment benefits', 'disability benefits']
            },
            'Lower-Middle Class': {
                'label': '$30,001 - $58,020',
                'range': [30001, 58020],
                'income_terms': ['lower-middle class', 'working class', 'modest income', 'entry-level salary', 
                                'starting salary', 'average wage', 'lower middle income', 'blue-collar', 
                                'hourly pay', 'gig worker', 'side hustle']
            },
            'Middle Class': {
                'label': '$58,021 - $94,000',
                'range': [58021, 94000],
                'income_terms': ['middle class', 'middle income', 'average income', 'moderate salary', 
                                'moderate earnings', 'stable income', 'standard wage', 'dual-income household', 
                                'salaried employee', 'white-collar worker', 'median income', 'modest', 'comfortable', 'stable']
            },
            'Upper-Middle Class': {
                'label': '$94,001 - $153,000',
                'range': [94001, 153000],
                'income_terms': ['upper-middle class', 'upper middle income', 'higher salary', 'professional income', 
                                'upper middle wage', 'well-off', 'comfortable income', 'career growth salary', 
                                'six-figure job', 'middle management', 'senior professional']
            },
            'Upper Class': {
                'label': '> $153,000',
                'range': [153001, 999999999],  # Large upper bound for open-ended range
                'income_terms': ['upper class', 'high income', 'luxury', 'six-figure salary', 'wealthy', 'millionaire', 
                                'affluent', 'high earnings', 'top 1%', 'high net worth', 'elite income', 'executive salary', 
                                'C-suite', 'venture capitalist', 'investor', 'entrepreneur', 'business mogul', 
                                'seven-figure salary', 'financially independent', 'trust fund', 'inheritance']
            }
        },
        'residence': {
            'Single-Family Home': ['single-family home', 'detached house', 'household', 'private house', 
                        'bungalow', 'ranch house', 'mobile home', 'tiny house'],
            'Two-Family Home': ['two-family home', 'duplex', 'semi-detached house', 'in-law suite'],
            'Three-Family Home': ['three-family home', 'triplex', 'three-family residence', 'multi-unit residence'],
            'Apartment': ['apartment', 'condo', 'flat', 'studio', 'unit', 'rented apartment', 
                            'apartment complex', 'shared apartment', 'loft', 'penthouse', 'co-op', 
                            'high-rise', 'low-rise', 'walk-up'],
            'Shared Housing': ['shared housing', 'housemates', 'roommates', 'co-living', 'boarding house', 
                                'group housing', 'dormitory', 'hostel', 'fraternity house', 
                                'sorority house', 'shared room', 'communal living'],
            'Homeless': ['homeless', 'no fixed address', 'shelter resident', 'living on the streets', 
                        'houseless', 'temporary housing', 'emergency shelter', 'transitional housing', 
                        'motel', 'couch surfing', 'vehicle living'],
            'Other Housing': ['nursing home', 'assisted living', 'senior housing', 'military housing', 
                        'on-base housing', 'shanty', 'informal settlement', 'compound', 'village hut'],
            'Unspecified Residence': ['unspecified residence', 'temporary address', 'housing status unknown']
        }

} 

In [131]:
import pandas as pd
import re 

def extract_age(question):
    """
    Extracts age only if it appears in an age-related phrase (e.g., '18 years old', 'aged 30').
    """
    age_patterns = [
        r'(\d{1,3})\s*years?\s*old',
        r'aged\s*(\d{1,3})',
        r'(\d{1,3})-year-old',
        r'(\d{1,3})\s*yrs?\s*old',
        r'(\d{1,3})\s*yo',
        r'\b(\d{1,3})\b\s*(?:years?|yrs?)?',  # Matches "36" or "36 years"
        r'\bI\'?m\s*(\d{1,3})\b'  # Matches "I’m 36" or "I am 36"
    ]

    for pattern in age_patterns:
        match = re.search(pattern, question, re.IGNORECASE)
        if match:
            age = int(match.group(1))
            return age
    return None

def classify_age(question, keyword_categories):
    """
    Determines the correct age category based on numerical age or keywords.
    """
    if 'age' not in keyword_categories:
        print("Error: 'age' category missing")
        return 'N/A'

    age = extract_age(question)
    if age is not None:
        for subcategory, data in keyword_categories['age'].items():
            if not isinstance(data.get('range'), list):
                print(f"Error: Invalid range for {subcategory}")
                continue
            if age in data['range']:
                #print(f"Age {age} mapped to {subcategory}")  # Debug
                return subcategory

    # Keyword matching with hyphen normalization
    question_words = [word.replace('-', ' ') for word in question.lower().split()]
    for subcategory, data in keyword_categories['age'].items():
        for keyword in data.get('life_stage', []):
            keyword_clean = keyword.replace('-', ' ')
            threshold = 100 if len(keyword) <= 5 else 80  # Lowered threshold
            if threshold == 100:
                if any(keyword_clean.lower() == word.lower() for word in question_words):
                    #print(f"Matched age keyword: {keyword} (exact) -> {subcategory}")  # Debug
                    return subcategory
            else:
                if fuzz and any(fuzz.ratio(keyword_clean.lower(), word.lower()) >= threshold for word in question_words):
                    score = max(fuzz.ratio(keyword_clean.lower(), word.lower()) for word in question_words)
                    #print(f"Matched age keyword: {keyword} (fuzzy, score: {score}) -> {subcategory}")  # Debug
                    return subcategory
    return 'N/A'

In [133]:
def get_region_from_state(question, keyword_categories):
    """
    Identifies a region based on the presence of a state name.
    Uses dynamic similarity threshold:
    - 100% (exact match) for keywords with 5 or fewer letters
    - 90% for keywords longer than 5 letters
    """
    question = question.lower()
    for region, states in keyword_categories['region'].items():
        for state in states:
            threshold = 100 if len(state) <= 5 else 90
            if threshold == 100:
                # Exact match for short keywords
                if any(state.lower() == word.lower() for word in question.split()):
                    return region
            else:
                # Fuzzy match for longer keywords
                if any(fuzz.ratio(state.lower(), word.lower()) >= threshold for word in question.split()):
                    return region
    return 'N/A'


def classify_income(question, keyword_categories):
    """
    Classifies income based on numerical values or keywords.
    Uses dynamic similarity threshold:
    - 100% (exact match) for keywords with 5 or fewer letters
    - 90% for keywords longer than 5 letters
    """
    income_match = re.search(r'\$?\s*(\d{1,3}(?:,\d{3})*)\s*(?:USD)?\b', question, re.IGNORECASE)
    if income_match:
        income = int(income_match.group(1).replace(',', ''))
        for subcategory, data in keyword_categories['income'].items():
            min_income, max_income = data['range']
            if min_income <= income <= max_income:
                return subcategory

    for subcategory, data in keyword_categories['income'].items():
        for keyword in data['income_terms']:
            threshold = 100 if len(keyword) <= 5 else 90
            if threshold == 100:
                # Exact match for short keywords
                if any(keyword.lower() == word.lower() for word in question.lower().split()):
                    return subcategory
            else:
                # Fuzzy match for longer keywords
                if any(fuzz.ratio(keyword.lower(), word.lower()) >= threshold for word in question.lower().split()):
                    return subcategory

    return 'N/A'

In [135]:
from fuzzywuzzy import fuzz
import pandas as pd

def categorize_question(question, keyword_categories, label_size=15, similarity_threshold=90):
    if pd.isna(question) or not isinstance(question, str):
        return ['N/A'] * label_size, 0, {category: {} for category in keyword_categories.keys()}

    question = question.lower()
    labels = []
    matched_categories = 0
    category_hits = {category: {} for category in keyword_categories.keys()}

    for category, subcategories in keyword_categories.items():
        assigned_label = 'N/A'

        if category == 'region':
            assigned_label = get_region_from_state(question, keyword_categories)
            if assigned_label != 'N/A':
                category_hits[category][assigned_label] = 1

        elif category == 'age':
            assigned_label = classify_age(question, keyword_categories)
            if assigned_label != 'N/A':
                category_hits[category][assigned_label] = 1

        elif category == 'income':
            assigned_label = classify_income(question, keyword_categories)
            if assigned_label != 'N/A':
                category_hits[category][assigned_label] = 1
        elif category == "languages":
            # Match multiple languages
            matched_languages = set()
            for lang in subcategories:
                threshold = 100 if len(lang) <= 5 else 90
                if threshold == 100:
                    if any(lang.lower() == word.lower() for word in question.split()):
                        matched_languages.add(lang)
                else:
                    if any(fuzz.ratio(lang.lower(), word.lower()) >= threshold for word in question.split()):
                        matched_languages.add(lang)

            if matched_languages:
                assigned_label = "_".join(sorted(matched_languages))
                for lang in matched_languages:
                    category_hits[category][lang] = 1

        else:
            if isinstance(subcategories, dict):
                for parent, child_dict in subcategories.items():
                    if isinstance(child_dict, dict):
                        for child, keywords in child_dict.items():
                            for keyword in keywords:
                                threshold = 100 if len(keyword) <= 5 else 90
                                if threshold == 100:
                                    if any(keyword.lower() == word.lower() for word in question.split()):
                                        assigned_label = child
                                        category_hits[category][assigned_label] = 1
                                        break
                                else:
                                    if any(fuzz.ratio(keyword.lower(), word.lower()) >= threshold for word in question.split()):
                                        assigned_label = child
                                        category_hits[category][assigned_label] = 1
                                        break
                            if assigned_label != 'N/A':
                                break
                    elif isinstance(child_dict, list):  # One-level deep
                        for keyword in child_dict:
                            threshold = 100 if len(keyword) <= 5 else 90
                            if threshold == 100:
                                if any(keyword.lower() == word.lower() for word in question.split()):
                                    assigned_label = parent
                                    category_hits[category][assigned_label] = 1
                                    break
                            else:
                                if any(fuzz.ratio(keyword.lower(), word.lower()) >= threshold for word in question.split()):
                                    assigned_label = parent
                                    category_hits[category][assigned_label] = 1
                                    break
                    if assigned_label != 'N/A':
                        break
            elif isinstance(subcategories, list):  # Flat structure
                for keyword in subcategories:
                    threshold = 100 if len(keyword) <= 5 else 90
                    if threshold == 100:
                        if any(keyword.lower() == word.lower() for word in question.split()):
                            assigned_label = keyword
                            category_hits[category][keyword] = 1
                            break
                    else:
                        if any(fuzz.ratio(keyword.lower(), word.lower()) >= threshold for word in question.split()):
                            assigned_label = keyword
                            category_hits[category][keyword] = 1
                            break

        labels.append(assigned_label)
        if assigned_label != 'N/A':
            matched_categories += 1

    required_size = max(len(keyword_categories), label_size)
    labels = labels[:required_size]
    labels += ['N/A'] * (required_size - len(labels))

    return labels, matched_categories, category_hits


In [137]:
def process_csv(input_file, output_file, keyword_categories, similarity_threshold=80):
    """
    Reads a CSV file, applies labeling to the 'question' column, and saves a new CSV.
    Includes fuzzy matching statistics for all categories.
    """
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
        return

    if 'question' not in df.columns:     # change when changing csv's
        print("Error: The 'question' column is missing in the input CSV.")
        return

    total_questions = len(df)
    results = df['question'].apply(lambda q: categorize_question(q, keyword_categories, similarity_threshold=similarity_threshold))
    
    df[['labels', 'matched_categories', 'category_hits']] = pd.DataFrame(results.tolist(), index=df.index)

    # Aggregate subcategory hits
    category_hits_list = df.pop('category_hits').tolist()
    subcategory_totals = {}
    for category in keyword_categories.keys():
        subcategory_totals[category] = {}
        for hit in category_hits_list:
            for subcategory, value in hit[category].items():
                subcategory_totals[category][subcategory] = subcategory_totals[category].get(subcategory, 0) + value

    # Calculate percentages
    subcategory_percentages = {}
    for category in keyword_categories.keys():
        subcategory_percentages[category] = {
            subcategory: (count / total_questions) * 100
            for subcategory, count in subcategory_totals[category].items()
        }

    total_categories_matched = df['matched_categories'].sum()
    recognition_percentage = (total_categories_matched / (total_questions * len(keyword_categories))) * 100

    df.to_csv(output_file, index=False)

    print(f"Recognition Percentage: {recognition_percentage:.2f}%")
    print("\nSubcategory-wise hit counts and percentages:")
    for category in keyword_categories.keys():
        print(f"\nCategory: {category}")
        for subcategory, count in subcategory_totals[category].items():
            print(f"  {subcategory}: {count} questions matched ({subcategory_percentages[category][subcategory]:.2f}%)")

# Example usage
input_csv = 'synthetic_patient_descriptions_gpt4o.csv'
output_csv = 'synthetic_patient_descriptions_gpt4o_output.csv'
process_csv(input_csv, output_csv, keyword_categories, similarity_threshold=90)

Recognition Percentage: 38.00%

Subcategory-wise hit counts and percentages:

Category: gender
  female: 73 questions matched (48.67%)
  male: 13 questions matched (8.67%)
  other gender: 1 questions matched (0.67%)

Category: age
  adult: 107 questions matched (71.33%)
  senior: 1 questions matched (0.67%)
  young: 2 questions matched (1.33%)

Category: disability_status
  general_disability: 42 questions matched (28.00%)

Category: race
  european: 42 questions matched (28.00%)
  african_american: 21 questions matched (14.00%)
  north_american: 19 questions matched (12.67%)
  multiracial: 1 questions matched (0.67%)
  hispanic: 1 questions matched (0.67%)
  south_asian: 2 questions matched (1.33%)
  afro_latino: 1 questions matched (0.67%)
  australian_new_zealand: 1 questions matched (0.67%)
  east_asian: 1 questions matched (0.67%)

Category: country
  spain: 37 questions matched (24.67%)
  canada: 21 questions matched (14.00%)
  uk: 1 questions matched (0.67%)
  usa: 21 questions 