# Easy Feature Extraction

The purpose of this notebook is to provide "one click" solution to reproducing the dataset.  Note, this assumes that you are building dataset from the enclosed .csv files rather than web scraping and calling pushshift.io APIs.  If you want to build .csv files use the notebooks for getting the data into .csv files first:

- reddit2db.ipynb - calls Pushshift.io APIs and stored output in MongoDB.  The instructions on how to run MongoDB are included inside of the notebook.
- reddit_db_2_csv.ipynb - queries database and saves only relevant information in to .csv file.
- migraine.com_data.ipynb - web scrapes Migraine.com and stores the data in .csv file.
- tbd.wbsite.com_data.ipynb - TBD

## Note

Before running make sure that the cell below was executed once in your environment.  This download is needed fro Spacy library and it should be downloaded once.

For subsequent runs this should stay commented out.

In [1]:
# ! python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import re
import unittest
import copy

In [3]:
reddis_data_filename = 'reddis_migraine_posts.csv'
migraine_dot_com = 'migraine.com.csv'
patient_info_com = 'patient.info.csv'
csv_files = [reddis_data_filename, migraine_dot_com, patient_info_com]

## Read Input Data

In [4]:
def read_reddis_data(files):
    dfs = []
    for file in files:
        dfs.append(pd.read_csv(f'data/{file}', header=0))
        df = pd.concat(dfs)
        df = df.dropna(subset=['Text'])
    return df

In [5]:
posts_and_commnets = read_reddis_data(csv_files)

In [6]:
posts_and_commnets = list(posts_and_commnets[['Author', 'Text']].to_records(index=False))

## Execution Pipeline

In [7]:
def pipe(functions):
    def run_pipe(input):
        result = input
        for function in functions:
            result = function(result)
        return result
    return run_pipe

## Define Author Index

In [8]:
# Author index
from collections import defaultdict


author_index = defaultdict(dict)

## Gender Discovery Function

In [9]:
# regex patterns
male_matchers = [
    re.compile('my\s+wife', re.IGNORECASE),
    re.compile('my\s.*girlfriend', re.IGNORECASE),
    re.compile('\s[0-9][0-9](m\s|\(m\)|\s\(m\))', re.IGNORECASE),
    re.compile('\s[0-9][0-9].*male', re.IGNORECASE),
    re.compile('male.*[0-9][0-9]', re.IGNORECASE)
]

female_matchers = [
    re.compile('my\s+husband', re.IGNORECASE),
    re.compile('I( am|\'m)\s.*pregnant', re.IGNORECASE),
    re.compile('I\s.*menstruation', re.IGNORECASE),
    re.compile('my\s.*boyfriend', re.IGNORECASE),
    re.compile('\s[0-9][0-9](f|\(f\)|\s\(f\))', re.IGNORECASE),
    re.compile('\s[0-9][0-9].*female', re.IGNORECASE),
    re.compile('female.*[0-9][0-9]', re.IGNORECASE)
]

In [10]:
# Gender discovery functions
def discover_gender(matchers):
    def find_in_text(text):
        return any([
            matcher.search(text) for matcher in matchers
        ])
    return find_in_text

find_females = discover_gender(female_matchers)
find_males = discover_gender(male_matchers)

In [11]:
def identify_gender(text):
    if find_males(text):
        return 'male'
    elif find_females(text):
        return 'female'
    return 'unknown'

In [12]:
def identify_gender_in_posts(idx):
    gender_idx = copy.deepcopy(idx)
    def process_entry(author, text):
        gender_idx[author]['gender'] = identify_gender(text)

    for author, text in posts_and_commnets:
        process_entry(author, text)
    return gender_idx

## Medicine Information Functions

In [13]:
drug_list = [
    'Amitriptyline',
    'Elavil',
    'Divalproex',
    'Depakote',
    'Eletriptan',
    'Relpax',
    'triptan',
    'Metoprolol',
    'Lopressor',
    'Toprol',
    'Propranolol',
    'Inderal',
    'beta blocker',
    'Rizatriptan',
    'Maxalt',
    'Sumatriptan',
    'Imitrex',
    'Topiramate',
    'Topamax',
    'Trokendi',
    'Venlafaxine',
    'Effexor',
    'Zolmitriptan',
    'Zomig',
    'OnabotulinumtoxinA',
    'Botox',
    'Erenumab',
    'Aimovig',
    'CGRP',
    'Nurtec',  # found in the subreddit post
    'Topomax',  # popular misspelling of Topamax,
    'nortiptyline',  # found in the subreddit post
    'metoclopramide',  # found in the subreddit post
    'caffeine pill',  # found in the subreddit posts_and_comments
    'naproxen',
    'magnesium',
    'Delta 8',
    'Aimovig',
    'sulfate',
    'Xanax',
    'amitryptiline',
    'Amoxicillin'
]


In [14]:
# Regex patterns
drug_matchers = [
    {
        'regex': re.compile('([0-9]x).*([0-9]+\.?[0-9]+mg|[0-9]+\.?[0-9]+\smg|\.?[0-9]+mg|\.?[0-9]+\smg)', flags=re.IGNORECASE),
        'dosage_group': 2,
        'qty_group': 1
    },
    {
        'regex': re.compile('([0-9]+\.?[0-9]+mg|[0-9]+\.?[0-9]+\smg|\.?[0-9]+mg|\.?[0-9]+\smg).*([0-9]x)', flags=re.IGNORECASE),
        'dosage_group': 1,
        'qty_group': 2
    },
    {
        'regex': re.compile('([0-9]+\.?[0-9]+mg|[0-9]+\.?[0-9]+\smg|\.?[0-9]+mg|\.?[0-9]+\smg).*(three times a day|four times a day|twice a day|one a day|twice daily)', flags=re.IGNORECASE),
        'dosage_group': 1,
        'qty_group': 2
    },
    {
        'regex': re.compile('([0-9]+mg|[0-9]+\smg).*(nightly|daily|dose|day)', flags=re.IGNORECASE),
        'dosage_group': 1,
        'qty_group': 2
    },
    {
        'regex': re.compile('([0-9]+\.?[0-9]+mg|[0-9]+\.?[0-9]+\smg|\.?[0-9]+mg|\.?[0-9]+\smg)', flags=re.IGNORECASE),
        'dosage_group': 1,
        'qty_group': -1
    }
]

In [15]:
# Medicine discovery functions
def normalize_qty(qty_text):
    if qty_text == 'daily' or qty_text == 'dose' or qty_text == 'day' or qty_text == 'one a day' or qty_text == '1x':
        return '1x'

    if qty_text == 'twice daily' or qty_text == 'twice a day':
        return '2x'

    if qty_text == 'three times a day':
        return '3x'

    return qty_text

def find_medicine_name(text):
    meds_matched = []
    for drug in drug_list:
        if re.search(drug, text, re.IGNORECASE):
            meds_matched.append(drug)
    return meds_matched

def find_dosage(reg_res, matcher):
    if matcher['qty_group'] == -1:
        qty = '1x'
    else:
        qty = normalize_qty(reg_res.group(matcher['qty_group']))
    return reg_res.group(matcher['dosage_group']), qty

def discover_medicine_dosage(matchers):
    def process_medicine_dosage(text):
        for matcher in matchers:
            if (reg_res := matcher['regex'].search(text)):
                dosage, qty = find_dosage(reg_res, matcher)
                if dosage:
                    med = find_medicine_name(text)
                    if med:
                        return (
                            med[0],
                            dosage,
                            qty
                        )

        return 'unknown', 'unknown', 'unknown'
    return process_medicine_dosage

find_medicine_dosage = discover_medicine_dosage(drug_matchers)

In [16]:
def identify_medicine_in_posts(idx):
    medicine_idx = copy.deepcopy(idx)
    def process_entry(author, text):
        med, dosage, qty = find_medicine_dosage(text)
        medicine_idx[author]['medicine'] = med
        medicine_idx[author]['dosage'] = dosage
        medicine_idx[author]['qty'] = qty

    for author, text in posts_and_commnets:
        process_entry(author, text)
    return medicine_idx

## Suicidal Thoughts Functions

In [17]:
# Regex expressions
positive_suicide_matchers = [
    re.compile('(am|have|had|felt|having|me|was|been|think|about|feeling).*(suicidal|suicide)', re.IGNORECASE),
    re.compile('(my near|made me|have been|thought about|).*(suicidal|suicide)', re.IGNORECASE)
]

negative_suicide_matchers = [
    re.compile('(am|have|had|felt|having|me|was|been|think|about|feeling) (not|never).*(suicidal|suicide)', re.IGNORECASE),
    re.compile('(my near|made me|have been|thought about|) (not|never).*(suicidal|suicide)', re.IGNORECASE)
]

In [18]:
def search_for_suicide(text):
    return any([matcher.search(text) for matcher in positive_suicide_matchers]) \
        and not any([matcher.search(text) for matcher in negative_suicide_matchers])


In [19]:
def identify_suicidal_thoughts_in_posts(idx):
    suicidal_thoughts_idx = copy.deepcopy(idx)
    def process_entry(author, text):
        if search_for_suicide(text):
            suicidal_thoughts_idx[author]['suicidal'] = 'yes'
        else:
            suicidal_thoughts_idx[author]['suicidal'] = 'no'

    for author, text in posts_and_commnets:
        process_entry(author, text)
    return suicidal_thoughts_idx

## Author's Age Functions

In [20]:
# Regex expressions
age_matchers = [
    {'matcher': re.compile("I('m| am) ([0-9][0-9]*)", re.IGNORECASE), 'group': 2},
    {'matcher': re.compile("I('m| am) in my ([0-9][0-9]*)", re.IGNORECASE), 'group': 2},
    {'matcher': re.compile("([0-9][0-9]*) years old", re.IGNORECASE), 'group': 1},
    {'matcher': re.compile("I('m| am) now ([0-9][0-9]*)", re.IGNORECASE), 'group': 2},
    {'matcher': re.compile("I('m| am) now at ([0-9][0-9]*)", re.IGNORECASE), 'group': 2},
    {'matcher': re.compile("([0-9][0-9]*)(f\b|m\b|f$|m$)", re.IGNORECASE), 'group': 1},
    {'matcher': re.compile("([0-9][0-9]*) (f\b|m\b|f$|m$)", re.IGNORECASE), 'group': 1},
    {'matcher': re.compile("([0-9][0-9]*)\((f|m)\)", re.IGNORECASE), 'group': 1},
    {'matcher': re.compile("([0-9][0-9]*) \((f|m)\)", re.IGNORECASE), 'group': 1}
]

In [21]:
# Find age in text or return 0 if no age information
def find_age(text):
    for matcher in age_matchers:
        if (r := matcher['matcher'].search(text)):
            return int(r.group(matcher['group']))
    return 0

In [22]:
def identify_authors_age_in_posts(idx):
    age_idx = copy.deepcopy(idx)
    def process_entry(author, text):
        age = find_age(text)
        age_idx[author]['age'] = age

    for author, text in posts_and_commnets:
        process_entry(author, text)
    return age_idx

## Migraine Triggers Functions

In [23]:
# To this once
# ! python -m spacy download en_core_web_sm

import spacy
nlp = spacy.load('en_core_web_sm')

In [24]:
migraine_triggers = {
    'alcohol',
    'anxiety',
    'caffeine',
    'cheese',
    'chocolate',
    'coffee',
    'dehydration',
    'exercise',
    'foods',
    'heat',
    'hormones',
    'light',
    'lights',
    'medication',
    'meds',
    'nausea',
    'pain',
    'pressure',
    'sleep',
    'stress',
    'sugar',
    'tension',
    'water',
    'weather',
    'wine'
}

In [25]:
def find_triggers_in_posts(nlp):
    def process(idx):
        trigger_idx = copy.deepcopy(idx)
        pattern = re.compile('(trigger|triggers)', re.IGNORECASE)

        def normalize_triggers(word):
            if word == 'pressure':
                return 'barometric pressure'
            if word == 'water':
                return 'dehydration'
            if word == 'meds':
                return 'medication'
            return word

        def find_triggers_in_text(text):
            triggers = []
            doc = nlp(text)
            dep_type = None
            for token in doc:
                if (token.dep_ == 'nsubj' or token.dep_ == 'dobj' or token.dep_ == 'pobj') \
                    and pattern.search(token.text):
                    dep_type = token.dep_
                if token.pos_ == 'NOUN' and token.dep_ != dep_type and (token.dep_ == 'punc' or token.dep_ == 'dobj' or token.dep_ == 'conj'):
                    if token.text in migraine_triggers:
                        triggers.append(
                            normalize_triggers(token.text)
                        )
            return triggers

        def process_entry(author, text):
            triggers = []
            if pattern.search(text) is None:
                trigger_idx[author]['triggers'] = triggers
                return
            for sentence in sentences_with_triggers(text):
                triggers.extend(find_triggers_in_text(sentence))
            trigger_idx[author]['triggers'] = triggers

        def sentences_with_triggers(text):
            doc = nlp(text)
            sentences = [str(sent) for sent in doc.sents if pattern.search(str(sent))]
            return sentences

        for author, text in posts_and_commnets:
            process_entry(author, text)
        return trigger_idx
    return process

In [26]:
identify_authors_triggers_in_posts = find_triggers_in_posts(nlp)

## Author's with Aura Functions

In [27]:
positive_aura_matchers = [
    re.compile('(i|my).*(aura|auras)', re.IGNORECASE),
    re.compile('(with|first).*(aura|auras)', re.IGNORECASE),
    re.compile('(aura|auras).*(me)', re.IGNORECASE)
]

negative_aura_matchers = [
    re.compile('(i|my).*(\snot\s|without).*(aura|auras)', re.IGNORECASE),
    re.compile("(i|my).*(don't|w/o).*(aura|auras)", re.IGNORECASE)
]

In [28]:
def search_for_auras(text):
    return any([matcher.search(text) for matcher in positive_aura_matchers]) \
        and not any([matcher.search(text) for matcher in negative_aura_matchers])

In [29]:
def identify_authors_aura_in_posts(idx):
    aura_idx = copy.deepcopy(idx)
    def process_entry(author, text):
        aura_idx[author]['aura'] = str(bool(search_for_auras(text))).lower()

    for author, text in posts_and_commnets:
        process_entry(author, text)
    return aura_idx

## Author's with ADHD Functions

In [30]:
positive_adhd_matchers = [
    re.compile('(i|my).*(adhd)', re.IGNORECASE),
    re.compile('(take|treat|treating|diagnosed|prescription).*(adhd)', re.IGNORECASE),
    re.compile('(adhd).*(here)', re.IGNORECASE)
]

negative_adhd_matchers = [
    re.compile('(i|my).*(\snot\s|without).*(aura|auras)', re.IGNORECASE),
    re.compile("(i|my).*(don't|\sno\s).*(aura|auras)", re.IGNORECASE)
]

In [31]:
def search_for_adhd(text):
    return any([matcher.search(text) for matcher in positive_adhd_matchers]) \
        and not any([matcher.search(text) for matcher in negative_adhd_matchers])

In [32]:
def identify_authors_adhd_in_posts(idx):
    adhd_idx = copy.deepcopy(idx)
    def process_entry(author, text):
        adhd_idx[author]['adhd'] = str(bool(search_for_adhd(text))).lower()

    for author, text in posts_and_commnets:
        process_entry(author, text)
    return adhd_idx

# Create Author Reverse Index

In [33]:
# Feature functions
feature_discovery = [
    identify_gender_in_posts,
    identify_medicine_in_posts,
    identify_suicidal_thoughts_in_posts,
    identify_authors_age_in_posts,
    identify_authors_triggers_in_posts,
    identify_authors_aura_in_posts,
    identify_authors_adhd_in_posts
]

In [34]:
pipeline = pipe(feature_discovery)
author_index = pipeline(author_index)

KeyboardInterrupt: 

# Final Features Counts

In [None]:
# Features like suicidal, ADHD, aura are set for all of the authors
# so we ignore those 
def has_features(entry):
    if entry['age'] == 0 and \
       len(entry['triggers']) == 0 and \
       entry['medicine'] == 'unknown' and \
       entry['gender'] == 'unknown':
        return False
    return True

def has_all_features(entry):
    if entry['age'] != 0 and \
       len(entry['triggers']) != 0 and \
       entry['medicine'] != 'unknown' and \
       entry['gender'] != 'unknown':
        return True
    return False

In [None]:
# Count at least one and all features
total_at_least_one = 0
total_all = 0

for author, entry in author_index.items():
    if has_features(entry):
        total_at_least_one += 1
    if has_all_features(entry):
        total_all += 1

print(f'Authors with at least one feature: {total_at_least_one}')
print(f'Authors with all features: {total_all}')

Authors with at least one feature: 4298
Authors with all features: 3


# Build the Dataset

In [None]:
import uuid

for _, entry in author_index.items():
    entry['id'] = uuid.uuid4()


In [None]:
data_list = [entry for _, entry in author_index.items() if has_features(entry)]
output_df = pd.DataFrame(data=data_list)
len(output_df)

4298

In [None]:
output_dataset_filename = 'migraine_all_group11.csv'
output_df.to_csv(f'data/{output_dataset_filename}')