In [15]:
import pandas
import itertools as it
import string
from collections import Counter

In [16]:
# Translation table for str.translate: every ASCII punctuation character
# (by code point) is mapped to the code point of a single space.
punctuation_translator = dict.fromkeys(map(ord, string.punctuation), ord(" "))

In [17]:
# Read the stopword list (one entry per line) into a set so that later
# membership checks are cheap.
with open('data/stopwords.txt') as fh:
    stopwords = {word for word in fh.read().splitlines()}

In [18]:
def normalise(text):
    """Return ``text`` lower-cased, punctuation-free and single-spaced.

    Characters covered by the module-level ``punctuation_translator`` table
    are translated first, newlines become spaces, and every run of
    whitespace is collapsed to one space with no leading or trailing blanks.
    """
    without_punctuation = text.translate(punctuation_translator)
    single_line = without_punctuation.replace("\n", " ")
    words = single_line.lower().split()
    return " ".join(words)

In [19]:
def tokenise(text):
    """Split ``text`` into a list of whitespace-delimited tokens.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty tokens. An empty or whitespace-only string
        yields an empty list.
    """
    # str.split() with no separator splits on runs of whitespace and never
    # produces empty strings, whereas split(" ") turns "" into [""] and
    # repeated spaces into empty tokens.
    return text.split()

In [20]:
def parse_treatment_definitons(definion_file):
    """Build the treatment lookup structures from a definitions file.

    Every line is a comma-separated list of spellings. The first entry on a
    line is the canonical name, and every entry on the line (the first one
    included) is registered as a spelling that maps to that name.

    Args:
        definion_file: An iterable of text lines, e.g. an open file.

    Returns:
        A ``(treatment_set, treatment_mapping, max_length)`` tuple:
        the set of spellings as token tuples, a dict from token tuple to
        canonical name, and the largest token count of any spelling
        (never less than 1).
    """
    treatment_set = set()
    treatment_mapping = {}
    max_length = 1
    for line in definion_file:
        aliases = line.strip().split(',')
        name = aliases[0].strip()
        for alias in aliases:
            # Drop empty tokens so that blank lines, trailing commas and
            # punctuation-only entries never register an "empty" treatment,
            # which would otherwise match any text that normalises to nothing.
            tokens = tuple(t for t in tokenise(normalise(alias)) if t)
            if not tokens:
                continue
            max_length = max(len(tokens), max_length)
            treatment_set.add(tokens)
            treatment_mapping[tokens] = name
    return treatment_set, treatment_mapping, max_length

In [21]:
def window_sliding(iterable, n):
    """Iterate over every length-``n`` window of ``iterable``, ``None``-padded.

    The input is treated as if it had ``n - 1`` ``None`` values before and
    after it, so windows that overhang either end are produced as well, e.g.
    ``list(window_sliding([1, 2, 3], 2))`` is
    ``[(None, 1), (1, 2), (2, 3), (3, None)]``.

    Args:
        iterable: The items to slide over; may be a one-shot iterator.
        n: The window width.

    Returns:
        A ``zip`` iterator of ``n``-tuples.
    """
    # Each window position reads from its own tee'd copy of the input.
    # Chaining the raw iterable n times only works for re-iterable inputs
    # such as lists; with an iterator the positions would steal items from
    # one another.
    columns = (
        it.chain(it.repeat(None, n - 1 - i), gen, it.repeat(None, i))
        for i, gen in enumerate(it.tee(iterable, n)))
    return zip(*columns)

def find_treatments(text):
    """Return the canonical names of the treatments mentioned in ``text``.

    The text is normalised and tokenised, then scanned with sliding windows
    of every width from the module-level ``max_treatment_length`` down to 1.
    Each window found in ``treatment_set`` contributes its
    ``treatment_mapping`` name, so a name can appear more than once.

    Returns:
        The list of matched names (widest windows first), or ``None`` when
        nothing matched.
    """
    tokens = tokenise(normalise(text))
    matches = []
    width = max_treatment_length
    while width > 0:
        matches.extend(
            treatment_mapping[candidate]
            for candidate in map(tuple, window_sliding(tokens, width))
            if candidate in treatment_set
        )
        width -= 1
    return matches if matches else None

In [22]:
def filter_stopwords(text):
    """Return the tokens of ``text`` worth counting.

    The text is normalised and tokenised; tokens of one or two characters
    and tokens present in the module-level ``stopwords`` set are discarded.
    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in tokenise(normalise(text)):
        if len(token) <= 2 or token in stopwords:
            continue
        kept.append(token)
    return kept

In [23]:
# Load the tweets, one JSON object per line. The open handle is given to
# pandas directly: passing the file contents as a literal JSON string is
# deprecated in recent pandas releases and needlessly holds the whole file
# in an intermediate string.
with open('merged_tweets.jsonl', 'r') as fh:
    df = pandas.read_json(fh, lines=True, convert_dates=True)

In [24]:
# Keep only the rows that have no `retweeted_status` value
# (presumably this drops retweets -- confirm against the data source).
df = df[df['retweeted_status'].isna()]

In [25]:
# Narrow the frame to the columns used below. The order matters to any
# later code that reads rows by position.
df = df.loc[:, ["id", "created_at", "text", "retweet_count"]]

In [26]:
# Order the tweets by creation time, oldest first.
df = df.sort_values(by="created_at")

In [27]:
# Number of rows remaining after the filtering above.
df.shape[0]

16494

In [28]:
# Non-null values per column, grouped by the day of the year of `created_at`.
df.groupby(df["created_at"].dt.dayofyear).count()

Unnamed: 0_level_0,id,created_at,text,retweet_count
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
306,83,83,83,83
307,246,246,246,246
308,232,232,232,232
309,229,229,229,229
310,285,285,285,285
311,327,327,327,327
312,322,322,322,322
313,327,327,327,327
314,297,297,297,297
315,251,251,251,251


In [29]:
# Build the treatment lookup tables. The context manager makes sure the
# definitions file is closed as soon as parsing has finished.
with open("data/treatment_definitons.txt", 'r') as fh:
    treatment_set, treatment_mapping, max_treatment_length = parse_treatment_definitons(fh)

In [30]:
# Annotate every tweet with the treatments found in its text
# (null when nothing matched).
df = df.assign(treatments=df['text'].apply(find_treatments))

In [31]:
# Annotate every tweet with its tokens after stopword filtering.
df = df.assign(filtered_tokens=df['text'].apply(filter_stopwords))

In [32]:
# Keep only the tweets in which at least one treatment was found.
df = df[df['treatments'].notna()]

In [33]:
# Tally treatment mentions and filtered tokens over all remaining tweets.
# Columns are read by name rather than by tuple position, so the tallies do
# not silently break if the column order changes. Counter.update adds the
# counts in place instead of building a throwaway Counter per row and
# rescanning the whole accumulator on every `+=`.
treatment_counts = Counter()
word_counts = Counter()
for row in df.itertuples():
    treatment_counts.update(row.treatments)
    word_counts.update(row.filtered_tokens)

In [34]:
# The eleven most frequently mentioned treatments, most common first.
treatment_counts.most_common(n=11)

[('medicine', 759),
 ('Sound Therapy', 216),
 ('Transcranial Magnetic Stimulation', 150),
 ('Masking', 129),
 ('Tinnintus Retraining Therapy', 52),
 ('Ginkgo', 51),
 ('Acupuncture', 38),
 ('Antidepressants', 27),
 ('Cognitive Behavioural Therapy', 25),
 ('Zinc', 23),
 ('Hyperbaric Oxygen Therapy', 23)]

In [43]:
# The hundred most frequent non-stopword tokens, most common first.
word_counts.most_common(n=100)

[('tinnitus', 1379),
 ('https', 879),
 ('therapy', 396),
 ('treatment', 392),
 ('cure', 331),
 ('sound', 251),
 ('youtube', 186),
 ('ear', 134),
 ('ringing', 131),
 ('hearing', 126),
 ('video', 116),
 ('tms', 107),
 ('treat', 93),
 ('help', 90),
 ('playlist', 82),
 ('relief', 81),
 ('powerful', 76),
 ('remedies', 66),
 ('masking', 66),
 ('ocd', 65),
 ('bipolar', 65),
 ('parkinson', 65),
 ('whew', 65),
 ('migraines', 65),
 ('ptsd', 65),
 ('ears', 62),
 ('binaural', 57),
 ('amp', 52),
 ('hypnosis', 49),
 ('sounds', 45),
 ('remedy', 44),
 ('brain', 44),
 ('beats', 43),
 ('people', 40),
 ('health', 40),
 ('masker', 39),
 ('acupuncture', 38),
 ('patients', 38),
 ('fitness', 32),
 ('improve', 32),
 ('aid', 31),
 ('loss', 30),
 ('frequency', 30),
 ('homeopathic', 29),
 ('pain', 28),
 ('vertigo', 28),
 ('trt', 27),
 ('effective', 27),
 ('suffer', 26),
 ('cognitive', 26),
 ('patient', 25),
 ('cbt', 25),
 ('ginkgo', 25),
 ('retraining', 25),
 ('minutes', 25),
 ('digital', 24),
 ('ric', 24),
 ('m