In [None]:
import codecs
import glob
import os
import numpy as np
import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.model_selection import train_test_split


def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """Read every matching file in a folder into a dict keyed by article id.

    Args:
        folder_name: Directory that contains the article files.
        file_pattern: Glob pattern (relative to ``folder_name``) selecting the files.

    Returns:
        dict mapping article id (str) to the full decoded text of the file.
        Files are read in sorted filename order.
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        # Keep the part of the base name before the first "." and drop its first
        # seven characters (presumably the literal prefix "article" -- confirm
        # against the dataset's file naming).
        article_id = os.path.basename(filename).split(".")[0][7:]
        # codecs.open reads the bytes without newline translation, so character
        # offsets into the returned text refer to the file content as stored.
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles


def read_predictions_from_file(filename):
    """Parse a tab-separated span label file.

    Each non-empty line must hold four tab-separated fields in the order
    ``article_id``, ``label``, ``span_start``, ``span_end``. Lines that are
    empty after stripping trailing whitespace (for example a blank line at the
    end of the file) are skipped instead of failing the four-way unpack.

    Args:
        filename: Path of the label file.

    Returns:
        Tuple ``(articles_id, span_starts, span_ends, gold_labels)`` of four
        parallel lists. All values are kept as strings, exactly as read.

    Raises:
        ValueError: If a non-empty line does not contain exactly four fields.
    """
    articles_id, span_starts, span_ends, gold_labels = [], [], [], []
    with open(filename, "r", encoding="utf8") as f:
        for row in f:
            row = row.rstrip()
            if not row:
                continue
            article_id, gold_label, span_start, span_end = row.split("\t")
            articles_id.append(article_id)
            gold_labels.append(gold_label)
            span_starts.append(span_start)
            span_ends.append(span_end)
    return articles_id, span_starts, span_ends, gold_labels


def load_data(data_folder, labels_file):
    """Load the article texts and the span annotations that refer to them.

    Args:
        data_folder: Directory passed to ``read_articles_from_file_list``.
        labels_file: Label file passed to ``read_predictions_from_file``.

    Returns:
        ``(articles, article_ids, span_starts, span_ends, labels)``: the
        article dict followed by the four parallel annotation lists.
    """
    article_texts = read_articles_from_file_list(data_folder)
    ids, starts, ends, techniques = read_predictions_from_file(labels_file)
    return article_texts, ids, starts, ends, techniques


def sents_token_bounds(text, min_sentinel=100000):
    """Return the start offsets of the sentences in ``text`` plus an end sentinel.

    Sentences are found with ``PunktSentenceTokenizer().span_tokenize``. A
    final sentinel is appended so callers can always look up a boundary at or
    after a span end. The sentinel used to be the fixed value 100000, which is
    smaller than valid offsets in longer texts; it now grows with the text
    while never dropping below ``min_sentinel``, so results for shorter texts
    are unchanged.

    Args:
        text: The text to segment.
        min_sentinel: Lower bound for the trailing sentinel value.

    Returns:
        1-D numpy array of sentence start offsets followed by the sentinel.
    """
    sents_starts = [start for start, _ in PunktSentenceTokenizer().span_tokenize(text)]
    sents_starts.append(max(len(text), min_sentinel))
    return np.array(sents_starts)


def clear(text):
    """Strip surrounding whitespace and turn every tab or newline into a space.

    The result never contains ``\\t`` or ``\\n``, which keeps it on a single
    field/line when written to a tab-separated file.
    """
    trimmed = text.strip()
    return trimmed.translate(str.maketrans("\t\n", "  "))


def get_context(article, span_start, span_end):
    """Return the cleaned sentence-level context that surrounds a span.

    The context runs from the last sentence boundary at or before
    ``span_start`` to the first boundary at or after ``span_end``, using the
    boundaries from ``sents_token_bounds``.

    If no boundary satisfies one of the two conditions (for example a span end
    that lies above the largest boundary value), the context falls back to the
    corresponding edge of the article instead of raising ``IndexError``.

    Args:
        article: Full article text.
        span_start: Start offset of the span within ``article``.
        span_end: End offset of the span within ``article``.

    Returns:
        The context text after ``clear`` has been applied.
    """
    bounds = sents_token_bounds(article)
    starts_before = bounds[bounds <= span_start]
    ends_after = bounds[bounds >= span_end]
    context_start = starts_before[-1] if starts_before.size else 0
    context_end = ends_after[0] if ends_after.size else len(article)
    return clear(article[context_start:context_end])


def balance_pandas(data, random_state=None):
    """Oversample minority classes so every label is as frequent as the largest.

    The original rows are kept. For each label, rows are drawn with
    replacement from that label's group until the group reaches the size of
    the most frequent label.

    Args:
        data: DataFrame with a ``'label'`` column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            oversampling can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A new DataFrame: the original rows followed by the sampled rows
        (original index values are kept, so the index may contain duplicates).
    """
    max_size = data['label'].value_counts().max()
    parts = [data]
    for _, group in data.groupby('label'):
        deficit = max_size - len(group)
        if deficit > 0:  # the largest class needs no extra rows
            parts.append(group.sample(deficit, replace=True, random_state=random_state))
    return pd.concat(parts)


def dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels):
    """Build one row per annotated span with its text and sentence context.

    The columns are assembled from plain lists rather than row-wise
    ``DataFrame.apply``, so the full article text is no longer copied into a
    temporary column for every row and the builtin ``id`` is not shadowed.

    Args:
        articles: Mapping from article id to article text.
        ref_articles_id: Article id of each span.
        ref_span_starts: Start offset of each span (anything ``astype(int)`` accepts).
        ref_span_ends: End offset of each span (anything ``astype(int)`` accepts).
        train_gold_labels: Label of each span.

    Returns:
        DataFrame with columns ``article_id``, ``span_start``, ``span_end``,
        ``span`` (``clear`` applied to the article slice), ``context``
        (from ``get_context``) and ``label``.

    Raises:
        KeyError: If an id in ``ref_articles_id`` is missing from ``articles``.
    """
    span_starts = np.array(ref_span_starts).astype(int)
    span_ends = np.array(ref_span_ends).astype(int)
    texts = [articles[article_id] for article_id in ref_articles_id]

    spans = [clear(text[start:end]) for text, start, end in zip(texts, span_starts, span_ends)]
    contexts = [get_context(text, start, end) for text, start, end in zip(texts, span_starts, span_ends)]

    return pd.DataFrame({
        'article_id': ref_articles_id,
        'span_start': span_starts,
        'span_end': span_ends,
        'span': spans,
        'context': contexts,
        'label': train_gold_labels,
    }, columns=['article_id', 'span_start', 'span_end', 'span', 'context', 'label'])


def get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, train_file, dev_file,
                     split_by_ids=False, dev_size=0.3, random_state=40, balance=False, shuffle=True):
    """Build the span table, split it into train/dev parts and write both files.

    Args:
        articles, ref_articles_id, ref_span_starts, ref_span_ends, labels:
            Forwarded unchanged to ``dataset_to_pandas``.
        train_file: Output path for the training part.
        dev_file: Output path for the development part.
        split_by_ids: If True, split the unique article ids so all spans of an
            article land in the same part; otherwise split individual rows.
        dev_size: ``test_size`` passed to ``train_test_split``.
        random_state: Seed for the split and for the final shuffle.
        balance: If True, oversample the training part with ``balance_pandas``
            (called without a seed here).
        shuffle: If True, shuffle the training rows and reset their index.

    Returns:
        None. Both parts are written with ``save_dataset``.
    """
    data = dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels)
    if split_by_ids:
        train_ids, dev_ids = train_test_split(data.article_id.unique(), test_size=dev_size, random_state=random_state)
        train = data[data.article_id.isin(train_ids)]
        dev = data[data.article_id.isin(dev_ids)]
    else:
        train, dev = train_test_split(data, test_size=dev_size, random_state=random_state)

    if balance:
        train = balance_pandas(train)
    if shuffle:
        # Seed the shuffle as well; previously it ignored random_state, so the
        # row order of the written train file changed on every run.
        train = train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    save_dataset(train, train_file)
    save_dataset(dev, dev_file)

    
def get_test_file(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, test_file):
    """Build the span table for a dataset and write it, unsplit, to ``test_file``."""
    save_dataset(
        dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels),
        test_file,
    )
    

def save_dataset(data, file_path):
    """Write ``data`` to ``file_path`` as tab-separated text without the index column."""
    data.to_csv(path_or_buf=file_path, index=False, sep='\t')

In [None]:
# Training split: folder of article text files and the matching span label file.
train_folder = "/content/drive/MyDrive/project_5_data/datasets/train-articles"
train_labels = "/content/drive/MyDrive/project_5_data/datasets/train-task-flc-tc.labels"

# Path of the technique-name list (not read in this cell).
propaganda_techniques_names = "/content/drive/MyDrive/project_5_data/propaganda-techniques-scorer/data/propaganda-techniques-names-semeval2020task11.txt"

(
    articles,
    ref_articles_id,
    ref_span_starts,
    ref_span_ends,
    labels,
) = load_data(train_folder, train_labels)

In [None]:

# Development split: article folder and its gold span label file.
dev_folder = "/content/drive/MyDrive/project_5_data/datasets/dev-articles"
dev_labels = "/content/drive/MyDrive/project_5_data/datasets/dev-task-flc-tc.labels"

import spacy
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook -- confirm it is still needed.
nlp = spacy.blank("en") # create a blank English pipeline (no trained model is loaded)

In [None]:
# Load the development articles and their gold spans with the same loader as the training data.
(
    articles_dev,
    ref_articles_id_dev,
    ref_span_starts_dev,
    ref_span_ends_dev,
    labels_dev,
) = load_data(dev_folder, dev_labels)

In [None]:
import pandas as pd

# One row per gold development span. Offsets stay as the strings read from the
# label file. 'text' is not supplied in the data dict, so it is created as an
# all-NaN column to be filled in by a later cell.
cols = ['art_id', 'starts', 'ends', 'text']

# Plain list copies; later cells use `art_ids`, `starts` and `ends` directly.
art_ids = list(ref_articles_id_dev)
starts = list(ref_span_starts_dev)
ends = list(ref_span_ends_dev)

dev_df = pd.DataFrame({'art_id': art_ids, 'starts': starts, 'ends': ends}, columns=cols)
print(dev_df)



  




         art_id starts  ends text
0     730093263    123   128  NaN
1     730093263    352   357  NaN
2     730093263   1370  1393  NaN
3     730093263   2434  2439  NaN
4     730093263   2699  2807  NaN
...         ...    ...   ...  ...
1058  999001419   4828  4851  NaN
1059  999001419    383   397  NaN
1060  999001419   1244  1261  NaN
1061  999001419   1319  1334  NaN
1062  999001419   3641  3657  NaN

[1063 rows x 4 columns]


In [None]:
def get_text_from_span(articles_dev,art_ids,starts,ends):
  """Return the article substring covered by each span.

  For position ``k`` the result holds
  ``articles_dev[art_ids[k]][int(starts[k]):int(ends[k])]``; offsets may be
  given as strings or ints.
  """
  span_texts = [
    articles_dev[art_id][int(starts[pos]):int(ends[pos])]
    for pos, art_id in enumerate(art_ids)
  ]
  assert len(span_texts) == len(starts)
  return span_texts


    


In [None]:
# Text of every gold development span, in the same order as the rows of dev_df.
sents = get_text_from_span(articles_dev, art_ids, starts, ends)

In [None]:
# Fill the placeholder 'text' column with the extracted span texts.
dev_df['text'] = sents

In [None]:
# Preview the first rows to check that 'text' now holds the span strings.
dev_df.head()


Unnamed: 0,art_id,starts,ends,text
0,730093263,123,128,white
1,730093263,352,357,black
2,730093263,1370,1393,“true American heroes.”
3,730093263,2434,2439,black
4,730093263,2699,2807,"If these two men had survived, and Quentin Lam..."


In [None]:

# Build the span/context table from the training articles and write a row-wise
# 70/30 split to 'train_tc.tsv' and 'dev_tc.tsv' in the working directory.
get_train_dev_files(
    articles,
    ref_articles_id,
    ref_span_starts,
    ref_span_ends,
    labels,
    'train_tc.tsv',
    'dev_tc.tsv',
    split_by_ids=False,
    dev_size=0.3,
    random_state=40,
    balance=False,
    shuffle=True,
)

In [None]:
# Reload the training part that was just written to disk.
df = pd.read_csv('train_tc.tsv', sep='\t')

In [None]:
# Preview the first ten training rows (span text, sentence context and label).
df.head(10)

Unnamed: 0,article_id,span_start,span_end,span,context,label
0,783702663,8097,8164,Which is just Allen’s more nuanced way of sayi...,"His opinion is: “Take it seriously, but with a...","Whataboutism,Straw_Men,Red_Herring"
1,774145019,412,417,DOING,Everything that the President is DOING belies ...,Loaded_Language
2,729303442,1555,1564,An outcry,"In a subsequent Facebook post, Lew wrote, “My ...",Loaded_Language
3,699142854,1402,1429,the authoritarian theocrats,By merging Iran’s external aggression with its...,"Name_Calling,Labeling"
4,752287274,0,34,Muslim Dem IT Aides' Family Member,"Muslim Dem IT Aides' Family Member Paid $45,00...","Name_Calling,Labeling"
5,793921939,7500,7505,swamp,How do you drain the swamp by adding to it?,Repetition
6,778664280,10832,11050,It is truly homophobic for any bishop to lie t...,It is truly homophobic for any bishop to lie t...,Doubt
7,701447437,1423,1442,The plague is a lie,"The plague is a lie,” Helene Raveloharisoa tol...","Exaggeration,Minimisation"
8,783702663,37874,38077,Thus did Paul describe the conciliar “opening ...,[emphasis added] Thus did Paul describe the co...,Causal_Oversimplification
9,999000145,247,384,Trump encouraged them to go out and vote Repub...,"At a rally of 8,500 in Indiana on Friday, Trum...",Appeal_to_fear-prejudice


In [None]:
# Longest span, measured in characters. This value is later passed as `maxlen`
# to pad_sequences, where it is applied to sequences of token ids.
g = list(df['span'])
maxl = max(map(len, g))
print ('Maximum sequence length in the list of sentences:', maxl)

Maximum sequence length in the list of sentences: 577


In [None]:

from keras.preprocessing.text import Tokenizer

# Fit a lower-casing word tokenizer on the training span texts.
# The backslash in `filters` is written as "\\": the previous "\]" is an invalid
# escape sequence that newer Python versions warn about. The resulting filter
# string is unchanged.
tokenizer = Tokenizer(num_words=50000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['span'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6922 unique tokens.


In [None]:
from keras.preprocessing.sequence import pad_sequences

# Token-id sequences for every training span, padded to `maxl` entries.
X = tokenizer.texts_to_sequences(df['span'].values)
X = pad_sequences(X, maxlen=maxl)

# One-hot targets with one column per distinct label; get_dummies orders the
# columns by sorted label value, so column k corresponds to the k-th label in
# sorted order. The former `columns=` argument only applies to DataFrame input
# and had no effect on a Series. The dtype is pinned so the array is uint8
# regardless of the pandas version's default.
Y = pd.get_dummies(df['label'], dtype='uint8').values


In [None]:
# Hold out 10% of the encoded training spans for a quick evaluation of the model.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)



(3860, 577) (3860, 14)
(429, 577) (429, 14)


In [None]:
# One-hot target vector of the first training row.
Y[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=uint8)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
# `keras.utils` exposes to_categorical directly; the `np_utils` submodule path
# is not available in every Keras release.
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

# Embedding -> spatial dropout -> LSTM -> softmax over the label columns of Y.
# The vocabulary size and the number of classes are read from the fitted
# tokenizer and the target matrix instead of being repeated as literals, so the
# model stays consistent if either of them changes.
model = Sequential()
model.add(Embedding(tokenizer.num_words, 100, input_length=maxl))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 577, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 577, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 14)                1414      
                                                                 
Total params: 5,081,814
Trainable params: 5,081,814
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for at most 20 epochs; stop once the validation loss has failed to
# improve by at least 0.0001 for three consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model.fit(
    X_train,
    Y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/20
 3/55 [>.............................] - ETA: 1:27 - loss: 0.5021 - accuracy: 0.8490

KeyboardInterrupt: ignored

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
accr = model.evaluate(X_test, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 1.770
  Accuracy: 0.510


In [None]:
# Distinct labels in sorted order. The result is stored under its own name:
# assigning it to `labels` would overwrite the per-span gold labels returned by
# load_data, and re-running the get_train_dev_files cell would then fail.
# Sorting also makes the printed order stable between runs.
label_names = sorted(df['label'].unique())
print(label_names)
print(len(label_names))



['Flag-Waving', 'Whataboutism,Straw_Men,Red_Herring', 'Bandwagon,Reductio_ad_hitlerum', 'Loaded_Language', 'Doubt', 'Causal_Oversimplification', 'Appeal_to_fear-prejudice', 'Slogans', 'Exaggeration,Minimisation', 'Thought-terminating_Cliches', 'Repetition', 'Name_Calling,Labeling', 'Black-and-White_Fallacy', 'Appeal_to_Authority']
14


In [None]:
# new_complaint = ["no government operation ever goes perfectly	But no government operation ever goes perfectly."]

def get_pred(text, verbose=True):
  """Predict the technique label for one span text.

  Args:
    text: Span text to classify.
    verbose: If True (default, as before) print the raw probabilities and
      the chosen label.

  Returns:
    The label whose output unit has the highest predicted probability.
  """
  seq = tokenizer.texts_to_sequences([text])
  padded = pad_sequences(seq, maxlen=maxl)
  pred = model.predict(padded)
  # The training targets come from pd.get_dummies(df['label']), whose columns
  # are ordered by sorted label value. The index-to-label lookup must use the
  # same order; list(set(...)) has an arbitrary order and returned wrong labels.
  label_names = sorted(df['label'].unique())
  predicted = label_names[int(np.argmax(pred))]
  if verbose:
    print(pred, predicted)
  return predicted

In [None]:
# Predicted technique for every development span, in the order of `sents`.
out = [get_pred(span_text) for span_text in sents]
  



[[3.76410579e-04 2.94238119e-03 4.89209488e-04 1.42834673e-04
  1.19800105e-04 6.22689491e-04 6.33987645e-03 1.93135266e-03
  5.73403418e-01 7.07996562e-02 3.38775277e-01 2.10024067e-03
  1.48452120e-03 4.72393294e-04]] Exaggeration,Minimisation
[[0.00117272 0.00167881 0.00143655 0.0008293  0.00101562 0.00337054
  0.0150092  0.01713253 0.01714536 0.8157948  0.0969143  0.0231227
  0.00162701 0.00375048]] Thought-terminating_Cliches
[[0.01129174 0.00839619 0.0101535  0.00981978 0.00660747 0.00606653
  0.03830741 0.4575002  0.00799868 0.15707748 0.12857404 0.13099292
  0.00830528 0.01890877]] Slogans
[[0.00117272 0.00167881 0.00143655 0.0008293  0.00101562 0.00337054
  0.0150092  0.01713253 0.01714536 0.8157948  0.0969143  0.0231227
  0.00162701 0.00375048]] Thought-terminating_Cliches
[[0.01974844 0.03175307 0.01512785 0.01442749 0.07227718 0.36029795
  0.3225896  0.00495874 0.00897958 0.04659981 0.00187016 0.00938838
  0.02327621 0.06870557]] Causal_Oversimplification
[[0.00256882 0.002

In [None]:
# Attach the predicted technique to every gold development span and drop the
# span text, which is not part of the submission format.
# errors='ignore' keeps the cell re-runnable after 'text' has been removed.
dev_df = dev_df.drop(columns=['text'], errors='ignore')
dev_df['technique'] = out






In [None]:
# Column order written to the submission file: article id, technique, start, end.
submission_columns = ['art_id', 'technique', 'starts', 'ends']
dev_df = dev_df.reindex(columns=submission_columns)


In [None]:
# Write the predictions as a headerless, index-free tab-separated file.
dev_df.to_csv(
    'output_tc_baseline.tsv',
    sep='\t',
    index=False,
    header=False,
)


In [None]:
# `sys` is imported here because this cell runs before the cell that imports it;
# on a fresh kernel the bare call raised NameError.
import sys

# Make /content importable (the scorer cell below imports the `src` package).
# The membership check avoids adding the entry again when the cell is re-run.
if "/content" not in sys.path:
    sys.path.insert(0, "/content")

In [None]:
import sys
import argparse
import logging.handlers
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import src.annotation as an
import src.annotations as ans
import src.propaganda_techniques as pt

# Shared "<time> - <level> - <message>" layout for scorer log records.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Stdout handler at INFO level. It is only configured here; this cell does not
# attach it to the logger.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

logger = logging.getLogger("propaganda_scorer")
logger.setLevel(logging.INFO)


def main(user_submission_file="/content/output_tc_baseline.tsv",
         gold_file="/content/drive/MyDrive/project_5_data/datasets/dev-task-flc-tc.labels",
         propaganda_techniques_list_file="/content/drive/MyDrive/project_5_data/propaganda-techniques-scorer/data/propaganda-techniques-names-semeval2020task11.txt",
         output_for_script=True):
    """Score a technique-classification submission against gold annotations.

    Calling ``main()`` with no arguments behaves as before; the defaults are
    the paths that used to be hard-coded in the body.

    Args:
        user_submission_file: Path of the submission file to score.
        gold_file: Path of the gold annotation file.
        propaganda_techniques_list_file: Path of the technique-name list used
            to initialise ``Propaganda_Techniques``.
        output_for_script: Second argument passed to ``TC_score_to_string``
            (presumably selects the compact script-friendly output -- confirm
            against the scorer package).

    Returns:
        None. Both result strings are printed.

    Raises:
        SystemExit: If the submission fails the format comparison against the
            gold annotations.
    """
    propaganda_techniques = pt.Propaganda_Techniques(propaganda_techniques_list_file)
    an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques)

    user_annotations = ans.Annotations()
    user_annotations.load_annotation_list_from_file(user_submission_file)
    for article in user_annotations.get_article_id_list():
        user_annotations.get_article_annotations_obj(article).sort_spans()

    gold_annotations = ans.Annotations()
    gold_annotations.load_annotation_list_from_file(gold_file)
    for article in gold_annotations.get_article_id_list():
        gold_annotations.get_article_annotations_obj(article).sort_spans()

    logger.info("Checking format: User Predictions -- Gold Annotations")
    # Scoring only proceeds when both comparisons against the gold annotations succeed.
    if not user_annotations.compare_annotations_identical_article_lists(gold_annotations) or not user_annotations.compare_annotations_identical(gold_annotations):
        logger.error("wrong format, no scoring will be performed")
        sys.exit()
    logger.info("OK: submission file format appears to be correct")
    res_for_output, res_for_script = user_annotations.TC_score_to_string(gold_annotations, output_for_script)

    logger.info("Scoring submission" + res_for_output)
    print(res_for_script)
    print(res_for_output)


if __name__ == "__main__":

    # Adapted from the SemEval 2020 Task 11 subtask TC scorer script
    # (task-TC_scorer.py). The original command-line parsing (-s submission file,
    # -r gold file, -p technique list, -l log file, -d debug on stdout,
    # -o script-friendly output) is not used here: main() is called with no
    # arguments.
    main()


0.074318	0.074318	0.074318	0.0	0.13636363636363638	0.0	0.0	0.17857142857142855	0.06896551724137931	0.08016877637130802	0.02197802197802198	0.0	0.0	0.18055555555555558	0.09259259259259259	0.02702702702702703	0.09523809523809523

F1=0.074318
Precision=0.074318
Recall=0.074318
F1_Appeal_to_Authority=0.0
F1_Appeal_to_fear-prejudice=0.13636363636363638
F1_Bandwagon,Reductio_ad_hitlerum=0.0
F1_Black-and-White_Fallacy=0.0
F1_Causal_Oversimplification=0.17857142857142855
F1_Doubt=0.06896551724137931
F1_Exaggeration,Minimisation=0.08016877637130802
F1_Flag-Waving=0.02197802197802198
F1_Loaded_Language=0.0
F1_Name_Calling,Labeling=0.0
F1_Repetition=0.18055555555555558
F1_Slogans=0.09259259259259259
F1_Thought-terminating_Cliches=0.02702702702702703
F1_Whataboutism,Straw_Men,Red_Herring=0.09523809523809523

