In [25]:
import pandas as pd
from fuzzywuzzy import fuzz

# Load the CSV file
df = pd.read_csv('IRF_testing.csv')

# Display the first few rows to check the structure
df.head()


Unnamed: 0,text,Thwarted_Belongingness,Perceived_Burdensomeness
0,"[I'm not seeking any medical advice, just want...",0,0
1,Im afraid. The suicide scale is true. Its gets...,1,1
2,I have found that one of the main issues keepi...,1,1
3,:(( not sure wats wrong with me. i think i‚Äôv...,1,1
4,Help? I'm just feeling really shitty at how I ...,0,0


In [27]:
print(df.columns)


Index(['text', 'Thwarted_Belongingness', 'Perceived_Burdensomeness'], dtype='object')


In [29]:
print(df.head())


                                                text  Thwarted_Belongingness  \
0  [I'm not seeking any medical advice, just want...                       0   
1  Im afraid. The suicide scale is true. Its gets...                       1   
2  I have found that one of the main issues keepi...                       1   
3  :(( not sure wats wrong with me. i think i‚Äôv...                       1   
4  Help? I'm just feeling really shitty at how I ...                       0   

   Perceived_Burdensomeness  
0                         0  
1                         1  
2                         1  
3                         1  
4                         0  


In [31]:
keyword_categories = {
        'gender': {
            'male': {
                'pronouns': ['he', 'him'],
                'roles': ['father', 'brother', 'husband', 'dad', 'son'],
                'descriptors': ['male', 'boy', 'man', 'gentleman', 'masculine', 'sir']
            },
            'female': {
                'pronouns': ['she', 'her'],
                'roles': ['mother', 'sister', 'wife', 'mom', 'daughter', 'pregnancy'],
                'descriptors': ['female', 'girl', 'woman', 'lady', 'feminine', 'madam']
            },
            'others': {
                'pronouns': ['they', 'them'],
                'identities': ['transgender', 'non-binary', 'gender identity', 'genderqueer', 
                            'genderfluid', 'agender', 'bigender', 'two-spirit', 
                            'demiboy', 'demigirl', 'androgyne', 'neutrois']
            }
        },       
        'age': {
            'young': {
                'label': 'young (<18)',
                'range': list(range(0, 18)),
                'life_stage': ['infant', 'toddler', 'child', 'teenager', 'minor']
            },
            'adult': {
                'label': 'adult (18-64)',
                'range': list(range(18, 65)),
                'life_stage': ['young adult', 'adult', 'middle-aged']
            },
            'senior': {
                'label': 'senior (65+)',
                'range': list(range(65, 120)),
                'life_stage': ['elderly', 'senior citizen', 'retiree']
            }
        },
        'disability_status': {
            'has_disability': {
                'conditions': ['autism', 'Down syndrome', 'cerebral palsy', 'epilepsy', 'dyslexia', 'ADHD',
                                'PTSD', 'anxiety disorder', 'speech disorder', 'cognitive disability',
                                'intellectual disability', 'developmental disability', 'mental health condition'],
                'sensory_impairments': ['hearing impaired', 'visually impaired', 'deaf', 'blind'],
                'physical_impairments': ['mobility aid', 'wheelchair', 'amputee', 'prosthetic', 'temporary disability',
                                        'post-surgery recovery', 'temporary mobility aid', 'chronic pain'],
                'aids_support': ['assistive device', 'service animal', 'braille', 'sign language',
                                'rehabilitation', 'inclusion support'],
                'descriptors': ['disabled', 'impairment', 'handicap', 'challenged', 'accessible',
                                'accessibility', 'special needs', 'neurodiverse', 'disability']
            },
            'does_not_have_disability': {
                'descriptors': ['able-bodied', 'no disability', 'not disabled', 'fully abled',
                                'physically fit', 'unimpaired', 'independent mobility', 'no special needs']
            }
        },
        'race': {
            'asian': {
                'regions': ['east asian', 'south asian', 'southeast asian'],
                'ethnicities': ['asian', 'chinese', 'japanese', 'korean', 'indian', 'pakistani',
                                'bangladeshi', 'filipino', 'thai', 'vietnamese', 'mongolian']
            },
            'african': {
                'regions': ['african', 'caribbean'],
                'ethnicities': ['african-american', 'black', 'nigerian', 'ethiopian', 'ghanaian',
                                'kenyan', 'south african', 'caribbean', 'afro-latino']
            },
            'white': {
                'regions': ['european', 'north american', 'australian'],
                'ethnicities': ['white', 'caucasian', 'american', 'canadian', 'australian',
                                'british', 'irish', 'german', 'french', 'italian']
            },
            'hispanic': {
                'identities': ['hispanic', 'latino', 'latina', 'latinx'],
                'ethnicities': ['mexican', 'puerto rican', 'cuban', 'dominican',
                                'colombian', 'argentinian', 'chilean', 'peruvian']
            },
            'indigenous': {
                'identities': ['native american', 'indigenous', 'first nations', 'tribal'],
                'groups': ['aboriginal', 'inuit', 'maori', 'sami']
            },
            'caribbean': {
                'subgroups': ['caribbean', 'afro-caribbean', 'indo-caribbean'],
                'ethnicities': ['jamaican', 'haitian', 'barbadian']
            },
            'other': {
                'multiracial': ['multiracial'],
                'pacific_islanders': ['pacific islander', 'hawaiian', 'polynesian', 'melanesian', 'micronesian'],
                'middle_eastern': ['middle eastern', 'arab', 'persian', 'turkish', 'berber'],
                'jewish': ['jewish', 'ashkenazi', 'sephardic'],
                'roma': ['roma', 'gypsy']
            }
        },  
        'country': {
            'usa': {
                'names': ['united states', 'usa', 'u.s.', 'united states of america'],
                'demonyms': ['american'],
                'regional_terms': ['rural america', 'urban america']
            },
            'canada': {
                'names': ['canada'],
                'demonyms': ['canadian'],
                'regional_terms': ['rural canada', 'urban canada']
            },
            'uk': {
                'names': ['united kingdom', 'uk', 'britain'],
                'subregions': ['england', 'scotland', 'wales', 'northern ireland'],
                'demonyms': ['british']
            },
            'germany': {
                'names': ['germany', 'deutschland'],
                'demonyms': ['german'],
                'cities': ['berlin']
            },
            'france': {
                'names': ['france'],
                'demonyms': ['french'],
                'cities': ['paris']
            },
            'india': {
                'names': ['india', 'bharat', 'hindustan'],
                'demonyms': ['indian'],
                'regional_terms': ['rural india', 'urban india']
            },
            'china': {
                'names': ['china'],
                'demonyms': ['chinese'],
                'cities': ['beijing']
            },
            'japan': {
                'names': ['japan'],
                'demonyms': ['japanese'],
                'cities': ['tokyo']
            },
            'australia': {
                'names': ['australia'],
                'demonyms': ['australian', 'aussie']
            },
            'brazil': {
                'names': ['brazil'],
                'demonyms': ['brazilian']
            },
            'mexico': {
                'names': ['mexico'],
                'demonyms': ['mexican']
            },
            'italy': {
                'names': ['italy'],
                'demonyms': ['italian'],
                'cities': ['rome']
            },
            'spain': {
                'names': ['spain'],
                'demonyms': ['spanish'],
                'cities': ['madrid']
            },
            'russia': {
                'names': ['russia'],
                'demonyms': ['russian'],
                'cities': ['moscow']
            },
            'south africa': {
                'names': ['south africa'],
                'demonyms': ['south african']
            },
            'other_countries': {
                'names': ['nigeria', 'ethiopia', 'kenya', 'saudi arabia', 'iran', 'pakistan',
                        'bangladesh', 'philippines', 'vietnam', 'colombia', 'argentina', 'peru']
            }
        },
        'state': {
            'Alabama': ['AL', 'alabama', 'birmingham', 'montgomery', 'rural alabama', 'urban alabama'],
            'Alaska': ['AK', 'alaska', 'anchorage', 'juneau', 'rural alaska', 'urban alaska'],
            'Arizona': ['AZ', 'arizona', 'phoenix', 'tucson', 'rural arizona', 'urban arizona'],
            'Arkansas': ['AR', 'arkansas', 'little rock', 'rural arkansas', 'urban arkansas'],
            'California': ['CA', 'california', 'cali', 'los angeles', 'san francisco', 
                           'sacramento', 'San Diego', 'Oakland', 'rural california', 'urban california'],
            'Colorado': ['CO', 'colorado', 'denver', 'boulder', 'rural colorado', 'urban colorado'],
            'Connecticut': ['CT', 'connecticut', 'hartford', 'new haven', 'rural connecticut', 'urban connecticut'],
            'Delaware': ['DE', 'delaware', 'dover', 'wilmington', 'rural delaware', 'urban delaware'],
            'Florida': ['FL', 'florida', 'fl', 'miami', 'orlando', 'tampa', 'Jacksonville', 'rural florida', 'urban florida'],
            'Georgia': ['GA', 'georgia', 'atlanta', 'savannah', 'rural georgia', 'urban georgia'],
            'Hawaii': ['HI', 'hawaii', 'honolulu', 'maui', 'rural hawaii', 'urban hawaii'],
            'Idaho': ['ID', 'idaho', 'boise', 'rural idaho', 'urban idaho'],
            'Illinois': ['IL', 'illinois', 'chicago', 'springfield', 'rural illinois', 'urban illinois'],
            'Indiana': ['IN', 'indiana', 'indianapolis', 'rural indiana', 'urban indiana'],
            'Iowa': ['IA', 'iowa', 'des moines', 'rural iowa', 'urban iowa'],
            'Kansas': ['KS', 'kansas', 'topeka', 'wichita', 'rural kansas', 'urban kansas'],
            'Kentucky': ['KY', 'kentucky', 'louisville', 'lexington', 'rural kentucky', 'urban kentucky'],
            'Louisiana': ['LA', 'louisiana', 'new orleans', 'baton rouge', 'rural louisiana', 'urban louisiana'],
            'Maine': ['ME', 'maine', 'portland', 'augusta', 'rural maine', 'urban maine'],
            'Maryland': ['MD', 'maryland', 'baltimore', 'annapolis', 'rural maryland', 'urban maryland'],
            'Massachusetts': ['MA', 'massachusetts', 'boston', 'cambridge', 'rural massachusetts', 'urban massachusetts'],
            'Michigan': ['MI', 'michigan', 'detroit', 'lansing', 'rural michigan', 'urban michigan'],
            'Minnesota': ['MN', 'minnesota', 'minneapolis', 'st. paul', 'rural minnesota', 'urban minnesota'],
            'Mississippi': ['MS', 'mississippi', 'jackson', 'rural mississippi', 'urban mississippi'],
            'Missouri': ['MO', 'missouri', 'st. louis', 'kansas city', 'rural missouri', 'urban missouri'],
            'Montana': ['MT', 'montana', 'helena', 'billings', 'rural montana', 'urban montana'],
            'Nebraska': ['NE', 'nebraska', 'lincoln', 'omaha', 'rural nebraska', 'urban nebraska'],
            'Nevada': ['NV', 'nevada', 'las vegas', 'reno', 'rural nevada', 'urban nevada'],
            'New Hampshire': ['NH', 'new hampshire', 'concord', 'manchester', 'rural new hampshire', 'urban new hampshire'],
            'New Jersey': ['NJ', 'new jersey', 'trenton', 'newark', 'rural new jersey', 'urban new jersey'],
            'New Mexico': ['NM', 'new mexico', 'santa fe', 'albuquerque', 'rural new mexico', 'urban new mexico'],
            'New York': ['NY', 'new york', 'nyc', 'albany', 'manhattan', 'rural new york', 'urban new york'],
            'North Carolina': ['NC', 'north carolina', 'charlotte', 'raleigh', 'rural north carolina', 'urban north carolina'],
            'North Dakota': ['ND', 'north dakota', 'bismarck', 'fargo', 'rural north dakota', 'urban north dakota'],
            'Ohio': ['OH', 'ohio', 'columbus', 'cleveland', 'rural ohio', 'urban ohio'],
            'Oklahoma': ['OK', 'oklahoma', 'oklahoma city', 'tulsa', 'rural oklahoma', 'urban oklahoma'],
            'Oregon': ['OR', 'oregon', 'portland', 'salem', 'rural oregon', 'urban oregon'],
            'Pennsylvania': ['PA', 'pennsylvania', 'philadelphia', 'pittsburgh', 'harrisburg', 'rural pennsylvania', 'urban pennsylvania'],
            'Rhode Island': ['RI', 'rhode island', 'providence', 'rural rhode island', 'urban rhode island'],
            'South Carolina': ['SC', 'south carolina', 'charleston', 'columbia', 'rural south carolina', 'urban south carolina'],
            'South Dakota': ['SD', 'south dakota', 'pierre', 'sioux falls', 'rural south dakota', 'urban south dakota'],
            'Tennessee': ['TN', 'tennessee', 'nashville', 'memphis', 'rural tennessee', 'urban tennessee'],
            'Texas': ['TX', 'texas', 'houston', 'dallas', 'austin', 'san antonio', 'Fort Worth', 'El Paso', 'rural texas', 'urban texas'],
            'Utah': ['UT', 'utah', 'salt lake city', 'rural utah', 'urban utah'],
            'Vermont': ['VT', 'vermont', 'montpelier', 'burlington', 'rural vermont', 'urban vermont'],
            'Virginia': ['VA', 'virginia', 'richmond', 'virginia beach', 'rural virginia', 'urban virginia'],
            'Washington': ['WA', 'washington', 'seattle', 'olympia', 'rural washington', 'urban washington'],
            'West Virginia': ['WV', 'west virginia', 'charleston', 'rural west virginia', 'urban west virginia'],
            'Wisconsin': ['WI', 'wisconsin', 'madison', 'milwaukee', 'rural wisconsin', 'urban wisconsin'],
            'Wyoming': ['WY', 'wyoming', 'cheyenne', 'rural wyoming', 'urban wyoming'],
            'Puerto Rico': ['PR', 'puerto rico', 'san juan', 'hurricane-related health', 'rural puerto rico'],
            'Guam': ['GU', 'guam'],
            'U.S. Virgin Islands': ['VI', 'u.s. virgin islands', 'st. thomas']
        },
        'region': {
            'Northeast': {
                'New England': ['maine', 'new hampshire', 'vermont', 'massachusetts', 'rhode island', 'connecticut', 'new england'],
                'Mid-Atlantic': ['new york', 'new jersey', 'pennsylvania'],
                'General': ['rural northeast', 'urban northeast']
            },
            'Midwest': {
                'East North Central': ['ohio', 'michigan', 'indiana', 'illinois', 'wisconsin', 'great lakes'],
                'West North Central': ['minnesota', 'iowa', 'missouri', 'north dakota', 'south dakota', 'nebraska', 'kansas', 'heartland'],
                'Other': ['rust belt'],
                'General': ['rural midwest', 'urban midwest']
            },
            'Southeast': {
                'South Atlantic': ['delaware', 'maryland', 'virginia', 'west virginia', 'north carolina', 'south carolina', 'florida'],
                'East South Central': ['kentucky', 'tennessee', 'alabama', 'mississippi'],
                'West South Central (shared)': ['arkansas', 'louisiana'],
                'Cultural': ['deep south', 'appalachia', 'sun belt'],
                'General': ['rural southeast', 'urban southeast']
            },
            'Southwest': {
                'Core States': ['oklahoma', 'texas', 'new mexico', 'arizona'],
                'Cultural': ['border states', 'sun belt'],
                'General': ['rural southwest', 'urban southwest']
            },
            'West': {
                'Mountain West': ['montana', 'idaho', 'wyoming', 'colorado', 'utah', 'nevada', 'mountain west'],
                'Pacific': ['washington', 'oregon', 'california', 'alaska', 'hawaii', 'pacific northwest'],
                'General': ['sun belt', 'rural west', 'urban west']
            },
            'Territories': {
                'Caribbean': ['puerto rico', 'u.s. virgin islands'],
                'Pacific Islands': ['guam', 'american samoa', 'northern mariana islands']
            }
        },
        'languages': {
            'english': ['english', 'british english', 'american english'],
            'spanish': ['spanish', 'español', 'castilian'],
            'french': ['french', 'français'],
            'german': ['german', 'deutsch'],
            'chinese': ['chinese', 'mandarin', 'cantonese', 'simplified chinese', 'traditional chinese', 
                        'putonghua'],
            'portuguese': ['portuguese', 'português', 'brazilian portuguese'],
            'arabic': ['arabic', 'arab'],
            'hindi': ['hindi', 'hindustani'],
            'russian': ['russian'],
            'japanese': ['japanese'],
            'italian': ['italian', 'italiano'],
            'korean': ['korean'],
            'dutch': ['dutch', 'nederlands'],
            'turkish': ['turkish', 'türkçe'],
            'swedish': ['swedish', 'svenska'],
            'polish': ['polish', 'polski'],
            'greek': ['greek'],
            'romanian': ['romanian', 'română'],
            'hebrew': ['hebrew'],
            'thai': ['thai'],
            'vietnamese': ['vietnamese'],
            'tagalog': ['tagalog', 'filipino'],
            'persian': ['persian', 'farsi'],
            'urdu': ['urdu'],
            'bengali': ['bengali'],
            'punjabi': ['punjabi'],
            'tamil': ['tamil'],
            'telugu': ['telugu'],
            'malayalam': ['malayalam', 'malay'],
            'indonesian': ['indonesian', 'bahasa indonesia'],
            'finnish': ['finnish', 'suomi'],
            'danish': ['danish', 'dansk'],
            'norwegian': ['norwegian'],
            'indigenous': ['navajo', 'cherokee', 'ojibwe', 'inuktitut', 'hawaiian', 'sami', 'maori'],
            'african': ['swahili', 'yoruba', 'amharic', 'hausa', 'zulu'],
            'sign_languages': ['american sign language', 'ASL', 'british sign language', 'BSL', 'auslan'],
            'creole': ['haitian creole', 'jamaican patois', 'tok pisin'],
        },
        'college_degree': {
            'High School': {
                'Standard': ['high school', 'highschool', 'secondary school', 'secondary education', 'hsc', 'matriculation'],
                'International': ['general education diploma', 'a-levels', 'baccalaureate']
            },
            'Bachelor': {
                'General': ['undergrad', 'undergraduate', 'bachelor', "bachelor's degree"],
                'Arts & Science': ['bachelor of arts', 'bachelor of science'],
                'Engineering & Technology': ['b.tech', 'bachelor of engineering', 'bachelor of technology'],
                'International': ['licenciatura', 'laurea']
            },
            'Masters': {
                'General': ['master', 'masters', 'graduate', "master's degree", 'graduate degree'],
                'Arts & Science': ['master of arts', 'master of science'],
                'Engineering & Technology': ['master of engineering', 'master of technology']
            },
            'Doctoral': {
                'Academic': ['doctorate', 'phd', 'doctoral', 'doctoral degree', 'doctoral studies', 'doctor of philosophy'],
                'Professional': ['doctor', 'doctor of medicine', 'juris doctor', 'doctor of pharmacy', 'doctor of nursing practice'],
                'Postdoctoral': ['postdoctoral']
            },
            'Diploma/Certificate': {
                'Degree Equivalents': ['associate degree'],
                'Vocational': ['technical diploma', 'vocational diploma', 'trade school', 'apprenticeship'],
                'Certifications': ['certificate', 'certification program', 'technical training', 'professional certification', 'license']
            }
        },
        'SNS': {
            'Has Social Media': {
                'General Social Media': ['sns', 'social media', 'facebook', 'twitter', 'x', 'instagram', 
                                        'youtube', 'snapchat', 'pinterest', 'reddit', 'tumblr', 
                                        'threads', 'bereal', 'tiktok', 'clubhouse'],
                'Messaging Platforms': ['whatsapp', 'telegram', 'wechat', 'wechat moments', 
                                        'kakaotalk', 'line', 'discord'],
                'Professional Networks': ['linkedin'],
                'Streaming & Gaming': ['twitch'],
                'Regional Platforms': ['vkontakte', 'odnoklassniki', 'sharechat'],
                'Health & Wellness Communities': ['patientslikeme', 'healthunlocked', 'myfitnesspal']
            }
        },
        'religion': {
            'Abrahamic Religions': {
                'Christianity': ['christian', 'christianity', 'jesus', 'jesus christ', 'bible', 'church', 'catholic', 
                                'protestant', 'evangelical', 'orthodox', 'born again', 'pastor', 'gospel', 'holy spirit', 
                                'mormon', 'jehovah’s witness', 'seventh-day adventist', 'pentecostal', 'baptist', 
                                'methodist', 'fasting'],
                'Islam': ['islam', 'muslim', 'quran', 'mosque', 'prophet muhammad', 'ramadan', 'sharia', 
                            'eid', 'hijab', 'fasting'],
                'Judaism': ['jewish', 'judaism', 'torah', 'synagogue', 'rabbi', 'kosher', 'yom kippur', 
                            'hanukkah', 'shabbat', 'talmud']
            },
            'Dharmic Religions': {
                'Hinduism': ['hindu', 'hinduism', 'karma', 'moksha', 'yoga', 'vedas', 'bhagavad gita', 
                            'upanishads', 'puja', 'diwali', 'holi', 'shiva', 'vishnu', 'krishna', 'ayurveda'],
                'Buddhism': ['buddhist', 'buddhism', 'buddha', 'samsara', 'nirvana', 'dharma', 'vajrayana', 
                            'theravada', 'mahayana'],
                'Sikhism': ['sikh', 'sikhism', 'guru nanak', 'gurdwara', 'khalsa', 'turban', 'guru granth sahib'],
                'Jainism': ['jainism']
            },
            'East Asian Religions': {
                'Taoism': ['taoism', 'taoist', 'daoism', 'yin-yang', 'tao te ching'],
                'Shinto': ['shinto', 'kami', 'shrine', 'shintoism', 'torii', 'shinto priest']
            },
            'New Religious Movements and Others': {
                'Spirituality': ['spiritual', 'spirituality', 'spiritualism', 'new age', 'esoteric', 
                                'mysticism', 'meditation', 'energy healing'],
                'Agnosticism': ['agnostic', 'agnosticism', 'uncertain', 'agnosticity', 'questioning belief'],
                'Atheism': ['atheist', 'atheism', 'godless', 'secular', 'non-believer', 'freethinker', 'humanist'],
                'Indigenous & Animistic Beliefs': ['native american spirituality', 'shamanism', 'aboriginal dreamtime', 'inuit animism'],
                'Other Religions': ['baha’i', 'zoroastrianism', 'rastafarianism', 'wicca', 'paganism']
            }
        },
        'marital status': {
            'Currently Married or Partnered': {
                'Married': ['married', 'spouse', 'husband', 'wife', 'partner', 'civil union', 
                            'common law marriage', 'domestic partnership', 'same-sex marriage', 
                            'arranged marriage', 'customary marriage'],
                'Engaged': ['engaged', 'fiancé', 'fiancée', 'betrothed'],
                'Cohabiting': ['cohabiting', 'living together', 'cohabit', 'roommate partner']
            },
            'Not Currently Married': {
                'Single': ['single', 'unmarried', 'never married', 'bachelor', 'bachelorette'],
                'Divorced or Separated': ['divorced', 'separated', 'ex-husband', 'ex-wife', 'dissolution of marriage', 
                                            'broken up', 'divorcee', 'separated but not divorced', 'pending divorce'],
                'Widowed': ['widowed', 'widower', 'widow', 'lost spouse', 'bereaved partner'],
                'Complicated': ['it\'s complicated', 'on a break', 'uncertain relationship']
            }
        },
        'profession': {
            'Legal & Law Enforcement': {
                'Law': ['lawyer', 'attorney', 'law', 'jurisprudence', 'court', 'litigation', 
                        'legal counsel', 'barrister', 'solicitor', 'paralegal', 
                        'defense attorney', 'prosecutor', 'advocate'],
                'Police & Security': ['police officer', 'detective', 'investigator', 'patrol officer', 
                                    'sheriff', 'security officer', 'FBI agent', 'law enforcement'],
                'Military': ['soldier', 'marine', 'airman', 'navy', 'army', 'military officer', 
                            'veteran', 'combat engineer', 'infantry']
            },
            'Technology & Engineering': {
                'Information Technology': ['IT', 'information technology', 'software engineer', 'developer', 
                                            'programmer', 'web developer', 'network administrator', 'system analyst', 
                                            'data scientist', 'cloud architect', 'AI engineer', 'ML engineer', 
                                            'cybersecurity specialist', 'DevOps engineer', 'full-stack developer', 
                                            'front-end developer', 'back-end developer', 'database administrator'],
                'Engineering': ['engineer', 'engineering', 'mechanical engineer', 'civil engineer', 
                                'electrical engineer', 'aerospace engineer', 'chemical engineer', 
                                'structural engineer', 'robotics engineer', 'environmental engineer', 
                                'automotive engineer', 'biomedical engineer']
            },
            'Healthcare & Social Services': {
                'Medical': ['doctor', 'physician', 'surgeon', 'nurse', 'medical professional', 
                            'healthcare provider', 'general practitioner', 'pediatrician', 'dentist', 
                            'orthopedic', 'radiologist', 'gynecologist', 'psychiatrist', 'anesthesiologist'],
                'Social Work & Counseling': ['social worker', 'counselor', 'case manager', 'therapist', 
                                            'mental health counselor', 'advocate', 'community organizer', 
                                            'child welfare specialist']
            },
            'Education & Academia': {
                'Teaching': ['teacher', 'educator', 'professor', 'instructor', 'tutor', 
                            'mentor', 'principal', 'lecturer', 'academic', 
                            'trainer', 'curriculum developer', 'coach'],
                'Student Roles': ['student', 'intern', 'trainee', 'apprentice']
            },
            'Science & Research': {
                'Scientists': ['scientist', 'researcher', 'chemist', 'biologist', 'physicist', 
                                'astronomer', 'laboratory', 'biotechnology', 'research scientist', 
                                'data analyst', 'environmental scientist', 'geneticist', 
                                'neuroscientist', 'pharmacologist']
            },
            'Creative & Media': {
                'Art & Design': ['artist', 'painter', 'sculptor', 'designer', 'photographer', 
                                'illustrator', 'visual artist', 'graphic designer', 
                                'fashion designer', 'digital artist', 'animator', 'video editor'],
                'Writing & Journalism': ['writer', 'author', 'journalist', 'novelist', 'content creator', 
                                        'blogger', 'editor', 'poet', 'copywriter', 'scriptwriter', 
                                        'columnist', 'biographer', 'screenwriter']
            },
            'Business & Finance': {
                'Entrepreneurship': ['entrepreneur', 'business owner', 'startup', 'founder', 'CEO', 
                                    'businessman', 'businesswoman', 'small business', 
                                    'co-founder', 'investor', 'angel investor', 'startup founder'],
                'Finance': ['accountant', 'auditor', 'investment banker', 'financial analyst', 
                            'CFA', 'wealth manager', 'banker', 'consultant', 'fund manager', 
                            'stockbroker', 'bookkeeper']
            },
            'Skilled Trades & Services': {
                'Construction & Technical': ['construction worker', 'contractor', 'builder', 'carpenter', 
                                            'plumber', 'electrician', 'welder', 'architect'],
                'Retail & Customer Service': ['retail worker', 'cashier', 'store manager', 'sales associate', 
                                                'shopkeeper', 'merchandiser', 'customer service'],
                'Hospitality': ['chef', 'cook', 'waiter', 'waitress', 'bartender', 
                                'hotel manager', 'housekeeper', 'concierge']
            },
            'Other': {
                'Self-Employment & Gig Economy': ['self-employed', 'freelancer', 'independent contractor', 'consultant', 
                                                    'gig worker', 'rideshare driver', 'food delivery worker', 'online tutor'],
                'Unemployed': ['unemployed', 'not working', 'job seeker', 'between jobs'],
                'Other Professions': ['secretary', 'office manager', 'receptionist', 'clerical worker', 
                                        'truck driver', 'pilot', 'bus driver', 'delivery driver', 
                                        'farmer', 'agricultural worker', 'rancher', 'librarian', 
                                        'school counselor', 'firefighter', 'paramedic', 'postal worker']
            }
        },
        'income': {
            'Lower Class': {
                'label': '<= $30,000',
                'range': [0, 30000],
                'income_terms': ['income', 'salary', 'wage', 'earnings', 'pay', 'compensation', 
                                'low income', 'poverty', 'minimum wage', 'below poverty line', 
                                'living paycheck to paycheck', 'low-wage job', 'part-time work', 
                                'unemployment benefits', 'disability benefits']
            },
            'Lower-Middle Class': {
                'label': '$30,001 - $58,020',
                'range': [30001, 58020],
                'income_terms': ['lower-middle class', 'working class', 'modest income', 'entry-level salary', 
                                'starting salary', 'average wage', 'lower middle income', 'blue-collar', 
                                'hourly pay', 'gig worker', 'side hustle']
            },
            'Middle Class': {
                'label': '$58,021 - $94,000',
                'range': [58021, 94000],
                'income_terms': ['middle class', 'middle income', 'average income', 'moderate salary', 
                                'moderate earnings', 'stable income', 'standard wage', 'dual-income household', 
                                'salaried employee', 'white-collar worker', 'median income']
            },
            'Upper-Middle Class': {
                'label': '$94,001 - $153,000',
                'range': [94001, 153000],
                'income_terms': ['upper-middle class', 'upper middle income', 'higher salary', 'professional income', 
                                'upper middle wage', 'well-off', 'comfortable income', 'career growth salary', 
                                'six-figure job', 'middle management', 'senior professional']
            },
            'Upper Class': {
                'label': '> $153,000',
                'range': [153001, 999999999],  # Large upper bound for open-ended range
                'income_terms': ['upper class', 'high income', 'luxury', 'six-figure salary', 'wealthy', 'millionaire', 
                                'affluent', 'high earnings', 'top 1%', 'high net worth', 'elite income', 'executive salary', 
                                'C-suite', 'venture capitalist', 'investor', 'entrepreneur', 'business mogul', 
                                'seven-figure salary', 'financially independent', 'trust fund', 'inheritance']
            }
        },
        'residence': {
            'Single-Family Home': {
                'Terms': ['single-family home', 'detached house', 'household', 'private house', 
                        'bungalow', 'ranch house', 'mobile home', 'tiny house']
            },
            'Two-Family Home': {
                'Terms': ['two-family home', 'duplex', 'semi-detached house', 'in-law suite']
            },
            'Three-Family Home': {
                'Terms': ['three-family home', 'triplex', 'three-family residence', 'multi-unit residence']
            },
            'Apartment': {
                'Terms': ['apartment', 'condo', 'flat', 'studio', 'unit', 'rented apartment', 
                    'apartment complex', 'shared apartment', 'loft', 'penthouse', 'co-op', 
                    'high-rise', 'low-rise', 'walk-up']
            },
            'Shared Housing': {
                'Terms': ['shared housing', 'housemates', 'roommates', 'co-living', 'boarding house', 
                        'group housing', 'dormitory', 'hostel', 'fraternity house', 
                        'sorority house', 'shared room', 'communal living']
            },
            'Homeless': {
                'Terms': ['homeless', 'no fixed address', 'shelter resident', 'living on the streets', 
                        'houseless', 'temporary housing', 'emergency shelter', 'transitional housing', 
                        'motel', 'couch surfing', 'vehicle living']
            },
            'Other': {
                'Terms': ['nursing home', 'assisted living', 'senior housing', 'military housing', 
                        'on-base housing', 'shanty', 'informal settlement', 'compound', 'village hut']
            },
            'Neutral': {
                'Terms': ['unspecified residence', 'temporary address', 'housing status unknown']
            }
        }

} 

In [33]:
import pandas as pd
import re 

def extract_age(question):
    """
    Extracts age only if it appears in an age-related phrase (e.g., '18 years old', 'aged 30').
    """
    age_patterns = [
        r'(\d{1,3})\s*years?\s*old',
        r'aged\s*(\d{1,3})',
        r'(\d{1,3})-year-old',
        r'(\d{1,3})\s*yrs?\s*old',
        r'(\d{1,3})\s*yo',
        r'(\d{1,3})\s*years?',
        r'(\d{1,3})\s*yrs?'
    ]

    for pattern in age_patterns:
        match = re.search(pattern, question, re.IGNORECASE)
        if match:
            age = int(match.group(1))
            return age
    return None

def classify_age(question, keyword_categories):
    """
    Determines the correct age category based on numerical age or keywords.
    """
    age = extract_age(question)
    if age is not None:
        for subcategory, data in keyword_categories['age'].items():
            if age in data['range']:
                return subcategory

    for subcategory, data in keyword_categories['age'].items():
        for keyword in data['life_stage']:
            if any(fuzz.ratio(keyword.lower(), word.lower()) >= 80 for word in question.lower().split()):
                return subcategory

    return 'N/A'

In [35]:
def get_region_from_state(question, keyword_categories):
    """
    Identifies a region based on the presence of a state name with fuzzy matching.
    """
    question = question.lower()
    for region, states in keyword_categories['region'].items():
        for state in states:
            if any(fuzz.ratio(state.lower(), word.lower()) >= 80 for word in question.split()):
                return region
    return 'N/A'

def classify_income(question, keyword_categories):
    """
    Classifies income based on numerical values or keywords with fuzzy matching.
    """
    income_match = re.search(r'\$?\s*(\d{1,3}(?:,\d{3})*)\s*(?:USD)?\b', question, re.IGNORECASE)
    if income_match:
        income = int(income_match.group(1).replace(',', ''))
        for subcategory, data in keyword_categories['income'].items():
            min_income, max_income = data['range']
            if min_income <= income <= max_income:
                return subcategory

    for subcategory, data in keyword_categories['income'].items():
        for keyword in data['income_terms']:
            if any(fuzz.ratio(keyword.lower(), word.lower()) >= 80 for word in question.lower().split()):
                return subcategory

    return 'N/A'

In [37]:
# Function to categorize a question based on predefined keywords
# Function to categorize a question based on predefined keywords
def categorize_question(question, keyword_categories, label_size=15, similarity_threshold=80):
    """
    Assigns a single subcategory per category based on keyword matches or numerical ranges.
    Uses fuzzy matching for text-based categories.
    """
    if pd.isna(question) or not isinstance(question, str):
        return ['N/A'] * label_size, 0, {category: 0 for category in keyword_categories.keys()}

    question = question.lower()
    labels = []
    matched_categories = 0
    category_hits = {category: 0 for category in keyword_categories.keys()}

    # Handle special categories
    for category, subcategories in keyword_categories.items():
        assigned_label = 'N/A'

        if category == 'region':
            assigned_label = get_region_from_state(question, keyword_categories)
        elif category == 'age':
            assigned_label = classify_age(question, keyword_categories)
        elif category == 'income':
            assigned_label = classify_income(question, keyword_categories)
        else:
            # Handle both nested and flat structures
            keywords = []
            if isinstance(subcategories, dict) and all(isinstance(v, dict) for v in subcategories.values()):
                # Nested structure (e.g., gender, disability_status)
                for subcategory, data in subcategories.items():
                    sub_keywords = []
                    for key, value in data.items():
                        if isinstance(value, list):
                            sub_keywords.extend(value)
                    # Fuzzy matching for this subcategory
                    for keyword in sub_keywords:
                        if any(fuzz.ratio(keyword.lower(), word.lower()) >= similarity_threshold for word in question.split()):
                            assigned_label = subcategory
                            break
                    if assigned_label != 'N/A':
                        break
            else:
                # Flat structure (e.g., languages, religion)
                if isinstance(subcategories, dict):
                    keywords = subcategories.get('keywords', [])
                elif isinstance(subcategories, list):
                    keywords = subcategories
                for keyword in keywords:
                    if any(fuzz.ratio(keyword.lower(), word.lower()) >= similarity_threshold for word in question.split()):
                        assigned_label = keyword  # Use keyword as label for flat categories
                        break

        labels.append(assigned_label)
        if assigned_label != 'N/A':
            matched_categories += 1
            category_hits[category] = 1

    # Adjust label size
    required_size = max(len(keyword_categories), label_size)
    labels = labels[:required_size]
    labels += ['N/A'] * (required_size - len(labels))

    return labels, matched_categories, category_hits
    

In [41]:
def process_csv(input_file, output_file, keyword_categories, similarity_threshold=80):
    """
    Reads a CSV file, applies labeling to the 'question' column, and saves a new CSV.
    Includes fuzzy matching statistics for all categories.
    """
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
        return

    if 'text' not in df.columns:     # change when changing csv's
        print("Error: The 'question' column is missing in the input CSV.")
        return

    total_questions = len(df)
    results = df['text'].apply(lambda q: categorize_question(q, keyword_categories, similarity_threshold=similarity_threshold))
    
    df[['labels', 'matched_categories', 'category_hits']] = pd.DataFrame(results.tolist(), index=df.index)

    category_hits_list = df.pop('category_hits').tolist()
    category_totals = {category: sum(hit[category] for hit in category_hits_list) for category in keyword_categories.keys()}
    category_percentages = {category: (count / total_questions) * 100 for category, count in category_totals.items()}
    total_categories_matched = df['matched_categories'].sum()
    recognition_percentage = (total_categories_matched / (total_questions * len(keyword_categories))) * 100

    df.to_csv(output_file, index=False)

    print(f"Recognition Percentage (with {similarity_threshold}% similarity threshold): {recognition_percentage:.2f}%")
    print("\nCategory-wise hit counts and percentages:")
    for category, count in category_totals.items():
        print(f"{category}: {count} questions matched ({category_percentages[category]:.2f}%)")

# Example usage
input_csv = 'IRF_testing.csv'
output_csv = 'IRF_output.csv'
process_csv(input_csv, output_csv, keyword_categories, similarity_threshold=80)

Recognition Percentage (with 80% similarity threshold): 31.67%

Category-wise hit counts and percentages:
gender: 974 questions matched (92.15%)
age: 142 questions matched (13.43%)
disability_status: 72 questions matched (6.81%)
race: 334 questions matched (31.60%)
country: 332 questions matched (31.41%)
state: 0 questions matched (0.00%)
region: 412 questions matched (38.98%)
languages: 0 questions matched (0.00%)
college_degree: 128 questions matched (12.11%)
SNS: 669 questions matched (63.29%)
religion: 387 questions matched (36.61%)
marital status: 93 questions matched (8.80%)
profession: 807 questions matched (76.35%)
income: 477 questions matched (45.13%)
residence: 194 questions matched (18.35%)
