In [1]:
import pandas as pd
import glob
import re
import snorkel

from textblob import TextBlob

from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LabelModel, MajorityLabelVoter

In [4]:
# Read the cleaned training workbook and discard the index column that was
# saved along with it.
df = pd.read_excel('../../data/data tofu/Train - clean.xlsx').drop(columns='Unnamed: 0')
# The remaining columns are renamed purely by position.
df.columns = ['text', 'sen', 'label', 'topic']
# Keep only rows that have paragraph text.
df = df.loc[df['text'].notna()]
# Working frame for labeling: just the `text` and `label` columns.
frame = df.drop(columns=['sen', 'topic'])
print(len(frame))

2549


In [5]:
# Integer codes returned by the labeling functions.
# ABSTAIN means "no vote"; the other three are the topic classes.
ABSTAIN = -1
ECO, HEA, ENV = 0, 1, 2

In [6]:
# Matches the literal '*' characters used in the keyword workbook.
# Raw string: '\*' in a normal string literal is an invalid escape sequence.
regex = re.compile(r'\*')


def _clean_keywords(column):
    """Turn one column of the keyword workbook into a list of match strings.

    Args:
        column: pandas Series holding the raw keywords of one category;
            empty cells are NaN.

    Returns:
        List of lower-cased keywords with every '*' removed. Empty cells
        and entries that are blank after cleaning are skipped.
    """
    cleaned = []
    for raw in column.dropna():
        # str() tolerates non-string cells, which regex.sub would reject.
        # Lower-casing is required because the labeling functions search
        # in lower-cased text.
        keyword = regex.sub('', str(raw)).lower()
        # A blank keyword is a substring of every text and would make a
        # labeling function fire on all rows.
        if keyword.strip():
            cleaned.append(keyword)
    return cleaned


# One column of keywords per category.
bins = pd.read_excel('Bin of words TOFU software 22-12-19.xlsx')
eco = list(bins.economic)
eco_keywords = _clean_keywords(bins.economic)
hea = list(bins.health)
hea_keywords = _clean_keywords(bins.health)
env = list(bins.environment)
env_keywords = _clean_keywords(bins.environment)

In [111]:
@labeling_function()
def lf_keyword_economic(x):
    """Vote ECO when the paragraph contains any economic keyword.

    Args:
        x: Data point exposing the paragraph under ``x.text``.

    Returns:
        ``ECO`` if at least one entry of ``eco_keywords`` occurs as a
        substring of the lower-cased text, otherwise ``ABSTAIN``.
    """
    # Lower-case once instead of once per keyword; str() keeps the lookup
    # working if a cell was not parsed as a string.
    text = str(x.text).lower()
    return ECO if any(word in text for word in eco_keywords) else ABSTAIN

@labeling_function()
def lf_keyword_health(x):
    """Vote HEA when the paragraph contains any health keyword.

    Args:
        x: Data point exposing the paragraph under ``x.text``.

    Returns:
        ``HEA`` if at least one entry of ``hea_keywords`` occurs as a
        substring of the lower-cased text, otherwise ``ABSTAIN``.
    """
    # Lower-case once instead of once per keyword; str() keeps the lookup
    # working if a cell was not parsed as a string.
    text = str(x.text).lower()
    return HEA if any(word in text for word in hea_keywords) else ABSTAIN

@labeling_function()
def lf_keyword_environment(x):
    """Vote ENV when the paragraph contains any environment keyword.

    Args:
        x: Data point exposing the paragraph under ``x.text``.

    Returns:
        ``ENV`` if at least one entry of ``env_keywords`` occurs as a
        substring of the lower-cased text, otherwise ``ABSTAIN``.
    """
    # Lower-case once instead of once per keyword; str() keeps the lookup
    # working if a cell was not parsed as a string.
    text = str(x.text).lower()
    return ENV if any(word in text for word in env_keywords) else ABSTAIN

In [112]:
# Labeling functions whose votes are collected for every paragraph.
lfs = [
    lf_keyword_environment,
    lf_keyword_health,
    lf_keyword_economic,
]

# Run the labeling functions over the rows of `frame`; the result is kept
# in L_train for the label model.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=frame)

100%|██████████| 2549/2549 [00:16<00:00, 152.09it/s]


In [113]:
# Combine the labeling-function votes into one label per paragraph by
# majority vote. There are three classes (ECO, HEA, ENV), so cardinality
# is 3. MajorityLabelVoter has nothing to train: its fit() is a no-op, so
# no fit call (or epochs/seed) is needed before predicting.
label_model = MajorityLabelVoter(cardinality=3, verbose=True)
# Rows whose top vote is tied (including rows with no votes) get ABSTAIN (-1).
# NOTE: this replaces the `label` column that was loaded with the data.
frame["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

In [114]:
# Count the paragraphs per predicted class, using the named class constants
# rather than repeating their numeric values. A boolean mask summed directly
# avoids building a filtered frame and a list just to take its length.
en = int((frame.label == ENV).sum())
he = int((frame.label == HEA).sum())
ec = int((frame.label == ECO).sum())
# Drop the paragraphs for which the vote abstained.
frame = frame[frame.label != ABSTAIN]
print('Amount of labeled paragraphs: {}'.format(len(frame)))
print('Amount of paragraphs labeled as economic: {}'.format(ec))
print('Amount of paragraphs labeled as health: {}'.format(he))
print('Amount of paragraphs labeled as environmental: {}'.format(en))

Amount of labeled paragraphs: 833
Amount of paragraphs labeled as economic: 283
Amount of paragraphs labeled as health: 476
Amount of paragraphs labeled as environmental: 74


In [115]:
# Write the labeled frame to an Excel workbook (row index included).
output_path = '../../data/data frame/Train - lf.xlsx'
frame.to_excel(output_path)