In [None]:
import pandas
import itertools as it
import string
from collections import Counter
import json
import re
import numpy
import nltk
import spacy

##### Load Sentence Splitting and POS Tag Models

In [None]:
# Load NLTK's pickled Punkt English sentence tokenizer; sentence_splitting() uses it.
# NOTE(review): presumably requires the NLTK "punkt" resource to be installed beforehand — confirm.
sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Load the spaCy "en_core_web_lg" pipeline; add_pos_tags() runs every post through it.
spacy_nlp = spacy.load("en_core_web_lg")

##### Define various helper functions

In [None]:
def normalise(text):
    """Lower-case ``text`` and replace noisy surface forms with placeholder tokens.

    The rules run strictly in the order listed, because later rules depend on
    what earlier ones already rewrote (URLs are replaced before "/" is padded,
    "<3" becomes " <heart> " before digits become " <number> ", and so on).
    Every placeholder is padded with spaces; whitespace is not collapsed here.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    # (kind, pattern, replacement): "literal" rules use str.replace,
    # "regex" rules use re.sub.
    rules = (
        ("literal", "\n", " "),
        ("regex", r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", " <url> "),
        ("literal", "/", " / "),
        ("regex", r"@\w+", " <user> "),
        # Emoticons: eyes, a mandatory nose character, then the mouth
        # (or the mirrored form where one is listed).
        ("regex", r"[8:=;]['`\-][)d]+|[)d]+['`\-][8:=;]", " <smile> "),
        ("regex", r"[8:=;]['`\-]p+", " <lolface> "),
        ("regex", r"[8:=;]['`\-]\(+|\)+['`\-][8:=;]", " <sadface> "),
        ("regex", r"[8:=;]['`\-][\/|l*]", " <neutralface> "),
        ("literal", r"<3", " <heart> "),
        ("regex", r"[-+]?[.\d]*[\d]+[:,.\d]*", " <number> "),
        ("literal", r"#", " <hashtag> "),
        # A run of punctuation collapses to its last character, space-padded.
        ("regex", r"([!?.,()])+", r" \1 "),
    )

    result = text.lower()
    for kind, pattern, replacement in rules:
        if kind == "regex":
            result = re.sub(pattern, replacement, result)
        else:
            result = result.replace(pattern, replacement)
    return result

In [None]:
def tokenise(text):
    """Split ``text`` on runs of whitespace and return the tokens as a tuple.

    ``str.split()`` without arguments ignores leading and trailing whitespace
    and never yields empty or padded tokens, so no extra stripping is needed.
    """
    return tuple(text.split())

In [None]:
def sentence_splitting(text):
    """Split ``text`` into sentences with the module-level ``sentence_detector``."""
    sentences = sentence_detector.tokenize(text)
    return sentences

In [None]:
def parse_treatment_definitons(definion_file):
    """Parse treatment definitions into lookup structures for phrase matching.

    Each line of ``definion_file`` is a comma-separated list of surface forms
    for one treatment. The first entry is the canonical name that every entry
    on the line (including the first) maps to. Each entry is run through
    ``normalise`` and ``tokenise`` so that it is comparable with tokenised,
    normalised post text.

    Blank lines, lines without a canonical name, and empty entries (from
    trailing or doubled commas) are skipped, so no empty-tuple phrase is ever
    registered.

    Parameters
    ----------
    definion_file : iterable of str
        The definition lines, e.g. an open text file.

    Returns
    -------
    tuple
        ``(treatment_set, treatment_mapping, max_length)`` where
        ``treatment_set`` is the set of token tuples, ``treatment_mapping``
        maps each token tuple to its canonical name, and ``max_length`` is the
        longest phrase length in tokens (at least 1).
    """
    treatment_set = set()
    treatment_mapping = {}
    max_length = 1
    for line in definion_file:
        # Strip every entry so whitespace around commas never leaks into the name.
        aliases = [alias.strip() for alias in line.split(',')]
        name = aliases[0]
        if not name:
            # Blank line or missing canonical name: nothing to map to.
            continue
        for alias in aliases:
            treatment = tuple(tokenise(normalise(alias)))
            if not treatment:
                # Empty entry; an empty phrase can never match a window.
                continue
            max_length = max(len(treatment), max_length)
            treatment_set.add(treatment)
            # A phrase listed on several lines maps to the last line's name.
            treatment_mapping[treatment] = name
    return treatment_set, treatment_mapping, max_length

In [None]:
def window_sliding(iterable, n):
    """Return every length-``n`` sliding window over ``iterable``, padded with ``None``.

    The sequence is padded with ``n - 1`` ``None`` values on both sides, so an
    input of ``k`` items yields ``k + n - 1`` windows: the first window ends
    with the first item and the last window starts with the last item.

    Parameters
    ----------
    iterable : iterable
        Items to slide over. One-shot iterators are supported because each
        shifted copy reads from its own ``itertools.tee`` branch.
    n : int
        Window size.

    Returns
    -------
    list of tuple
        The windows, in order.
    """
    # Copy i is shifted right by (n - 1 - i) positions; zipping the shifted
    # copies column-wise produces the windows. Each chain must consume its own
    # tee branch: reusing ``iterable`` would make every chain drain the same
    # iterator when the input is not re-iterable.
    shifted_copies = (
        it.chain(it.repeat(None, n - 1 - i), branch, it.repeat(None, i))
        for i, branch in enumerate(it.tee(iterable, n)))
    return list(zip(*shifted_copies))

def find_treatments(tokens):
    """Return the canonical names of all known treatments mentioned in ``tokens``.

    Slides windows of every length from ``max_treatment_length`` down to 1
    over ``tokens`` and looks each window up in the module-level
    ``treatment_set`` / ``treatment_mapping``.

    Parameters
    ----------
    tokens : sequence of str
        Tokenised, normalised text.

    Returns
    -------
    list of str
        Each matched canonical name once, in order of first discovery (longer
        phrases first, then by position). The order is deterministic, unlike
        ``list(set(...))``, whose order for strings varies between interpreter
        runs.
    """
    found_treatments = []
    for length in range(max_treatment_length, 0, -1):
        # Windows that include None padding can never be in treatment_set.
        for window in window_sliding(tokens, length):
            phrase = tuple(window)
            if phrase in treatment_set:
                found_treatments.append(treatment_mapping[phrase])
    # dict keys keep insertion order, so this de-duplicates without shuffling.
    return list(dict.fromkeys(found_treatments))

In [None]:
# Parse the treatment definitions; the context manager closes the file once
# parsing is done instead of leaking the handle.
with open("data/treatment_definitons.txt", 'r') as fh:
    treatment_set, treatment_mapping, max_treatment_length = parse_treatment_definitons(fh)

In [None]:
def get_mapped_treatments(df):
    """Annotate one sentence-window row with its treatments and merged text.

    ``df`` is a single row (the function is applied with ``axis=1``) whose
    entries ``0``, ``1`` and ``2`` hold the sentences of the window. Starting
    at sentence ``0``, sentences are kept while they are non-empty, mention at
    least one treatment, and mention no treatment beyond those found in
    sentence ``0``.

    The row gains two entries and is returned:

    * ``"treatments"``: the treatments of the first sentence, or ``None`` when
      the first sentence is empty or mentions none;
    * ``"merged_sentence"``: the kept sentences joined by single spaces.
    """
    window_sentences = (df[0], df[1], df[2])
    treatments = None
    kept_sentences = []

    for sentence in window_sentences:
        if not sentence:
            break
        mentioned = find_treatments(tokenise(sentence))
        if not mentioned:
            break
        if treatments is None:
            # The first sentence fixes the set of treatments for the window.
            treatments = mentioned
        elif set(mentioned) - set(treatments):
            # A follow-up sentence introduces a new treatment: stop merging.
            break
        kept_sentences.append(sentence)

    df["treatments"] = treatments
    df["merged_sentence"] = " ".join(kept_sentences)
    return df

In [None]:
def filter_stopwords(tokens):
    """Return the tokens that are longer than two characters and not stopwords.

    NOTE(review): ``stopwords`` is a module-level name that is not defined in
    the visible part of the notebook — confirm where it is loaded.
    """
    kept = []
    for token in tokens:
        if len(token) <= 2 or token in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
def calculate_embeddings(tokens):
    """Average the embeddings of all tokens that have one.

    Tokens for which ``embeddings.get`` returns ``None`` are ignored.

    NOTE(review): ``embeddings`` is a module-level lookup that is not defined
    in the visible part of the notebook — confirm where it is loaded.
    """
    looked_up = (embeddings.get(token) for token in tokens)
    known_vectors = [vector for vector in looked_up if vector is not None]
    return numpy.mean(known_vectors, axis=0)

In [None]:
def add_pos_tags(df):
    """Run every text through spaCy and add token / POS / tag columns to ``df``.

    Adds three columns, in this order, each holding one list per row (or
    ``None`` for documents spaCy reports as not parsed):

    * ``"tokens"``: token texts
    * ``"pos"``: values of ``pos_``
    * ``"tag"``: values of ``tag_``

    ``df`` is modified in place and also returned.

    NOTE(review): ``n_threads`` and ``Doc.is_parsed`` look like spaCy 2.x API —
    confirm against the installed spaCy version.
    """
    columns = {'tokens': [], 'pos': [], 'tag': []}
    texts = df['text'].astype('unicode').values

    for parsed in spacy_nlp.pipe(texts, batch_size=50, n_threads=3):
        if not parsed.is_parsed:
            # Keep row alignment: one placeholder per column.
            for values in columns.values():
                values.append(None)
            continue
        columns['tokens'].append([token.text for token in parsed])
        columns['pos'].append([token.pos_ for token in parsed])
        columns['tag'].append([token.tag_ for token in parsed])

    for column_name, values in columns.items():
        df[column_name] = values
    return df

##### Load posts into dataframe, do basic normalisations

In [None]:
# Read the filtered posts CSV, parsing the "timestamp" column as datetimes.
with open('data/tinnitustalk_posts_filtered.csv', 'r') as fh:
    df = pandas.read_csv(fh, parse_dates=["timestamp"])

In [None]:
# Keep only posts with non-null "text"; the following steps call string methods on every text.
df = df.loc[df['text'].notnull()]

In [None]:
# normalise() already turns newlines into spaces, so it can be applied directly.
df["text"] = df["text"].apply(normalise)

##### Get, count and export POS-Tags

In [None]:
# Adds the "tokens", "pos" and "tag" columns (one list per post) produced by spaCy.
df = add_pos_tags(df)

In [None]:
# Count how often each token appears with a noun tag or an adjective tag.
noun_tags = {"NN", "NNS", "NNP", "NNPS"}
adj_tags = {"JJ", "JJR", "JJS"}
nouns = Counter()
adjectives = Counter()
for tokens, tags in zip(df["tokens"], df["tag"]):
    # add_pos_tags() stores None for documents spaCy did not parse; skip those
    # rows instead of failing when iterating over None.
    if tokens is None or tags is None:
        continue
    for token, tag in zip(tokens, tags):
        if tag in noun_tags:
            nouns[token] += 1
        elif tag in adj_tags:
            adjectives[token] += 1

In [None]:
# Export the noun and adjective frequencies as JSON objects (token -> count).
for path, counts in (("data/nouns_tt.json", nouns), ("data/adjectives_tt.json", adjectives)):
    with open(path, 'w') as fh:
        json.dump(counts, fh)

##### Split into sentences, explode the dataframe to have one row per three-sentence window instead of one row per post

In [None]:
# Per post: split the normalised text into sentences, then build the padded
# windows of 3 consecutive sentences (None fills positions outside the post).
df["sentences"] = df["text"].apply(lambda x: window_sliding(sentence_splitting(x), 3))

In [None]:
# One row per sentence window: each post's windows become a small frame with
# columns 0, 1, 2 whose rows all carry that post's index label.
n_df = pandas.concat([pandas.DataFrame(v, index=numpy.repeat(k,len(v))) for k,v in df.sentences.to_dict().items()])

In [None]:
# Attach the post-level columns to every window row by aligning on the index.
# NOTE(review): n_df repeats index labels; a column-wise concat over duplicate
# labels may be fragile across pandas versions — df.join(n_df) might be the
# safer spelling; confirm.
merged_df = pandas.concat([df, n_df], axis=1)

In [None]:
# Drop the columns that are not used below; the window columns 0, 1, 2 remain.
df = merged_df.drop(["sentences", "position_in_thread", "thread_name", "text"], axis=1)

##### Find mentioned treatments, merge sentences, filter out sentences without treatment

In [None]:
# Row-wise: adds "treatments" (None when the window's first sentence mentions
# no treatment) and "merged_sentence".
df = df.apply(get_mapped_treatments, axis=1)

In [None]:
# Keep only the windows in which a treatment was found.
df = df.loc[df['treatments'].notnull()]

In [None]:
# Number of distinct authors among the remaining rows.
len(df.author_id.unique())

In [None]:
# The three window columns are not used below; "merged_sentence" holds the kept text.
df = df.drop([0, 1, 2], axis=1)

##### Tokenize sentences 

In [None]:
# Replaces the spaCy "tokens" column with whitespace tokens of the merged sentence.
df["tokens"] = df["merged_sentence"].apply(tokenise)

##### Get statistics on unique texts

In [None]:
# How many rows share each distinct token tuple, sorted from most to least frequent.
unique_text_counts = df["tokens"].groupby(df.tokens).count().sort_values(ascending=False).values

In [None]:
# Mean number of rows per distinct token tuple.
unique_text_counts.mean()

In [None]:
# Median number of rows per distinct token tuple.
numpy.median(unique_text_counts)

##### Store DF to CSV

In [None]:
# Persist the filtered frame; note this path is relative to the working
# directory, unlike the other outputs, which go to data/.
df.to_csv("tt_with_treatments.csv")

##### Get treatment frequencies, store them

In [None]:
# Count, over all rows, how often each treatment appears in "treatments".
treatment_counts = Counter(it.chain.from_iterable(df["treatments"]))

In [None]:
# Treatments ordered from most to least frequently mentioned.
treatment_counts.most_common()

In [None]:
# Export the treatment frequencies as a JSON object (treatment name -> count).
with open("data/treatments_tt.json", 'w') as fh:
    json.dump(treatment_counts, fh)