In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import spacy
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pickle
from general_functions import *

# Load the small English spaCy pipeline without the dependency parser, then
# switch on the "senter" pipe (spaCy's sentence recognizer) so that
# `doc.sents`, which process_article iterates over, is still available.
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")

# Silence the logging module: logging.disable(level) suppresses every call at
# or below `level`, and sys.maxsize is above all standard levels.
import logging, sys
logging.disable(sys.maxsize)

# CONFIG

In [None]:
#CONFIG FOR HYPERPARAMETERS
# Every list below is iterated by the main loop; one output pickle is written
# per combination of values.
top_pos_list = [10] #TPOS and TNEG: max number of top-scoring sentences to select (the main loop sets top_neg = top_pos)
dup_cnt_list = [5]  #DCNT: number of extra copies inserted for each selected sentence
# Dataset names passed to tfds.load in the main loop.
src_type_list = ["newsroom","cnn_dailymail","multi_news"]
take_list = [1] #How many samples: number of examples taken from the loaded split via ds.take

# Get Sentiment

In [None]:
# CODE FROM https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
def preprocess(text):
    """Mask user mentions and links before the text is tokenized.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character is replaced by ``@user``; a token that starts
    with ``http`` is replaced by ``http``. Every other token is kept as is,
    and the tokens are joined back together with single spaces.
    """
    def _mask(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
# CODE FROM https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
def rank_sentence(text):
    """Return the sentiment class probabilities for one sentence.

    The sentence is normalised with ``preprocess``, tokenized with the
    module-level ``tokenizer`` and scored with the module-level ``model``.

    Parameters
    ----------
    text : str
        The sentence to score.

    Returns
    -------
    numpy.ndarray
        Softmax-normalised scores, one per model output class, in the model's
        own output order. ``get_top_sentences`` reads index 0 as the negative
        score and index 2 as the positive score.
    """
    text = preprocess(text)
    # Cap the input length: the RoBERTa-base encoder behind this checkpoint
    # accepts at most 512 tokens, and an over-long sentence would otherwise
    # raise inside the model and cause the whole article to be skipped.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the single sentence in the batch.
    scores = output[0][0].detach().numpy()
    # The original per-label ranking loop only assigned unused locals (and
    # needed the downloaded `labels` list), so it has been dropped.
    return softmax(scores)

# Get Top Sentences

In [None]:
def get_top_sent_idx(top_cnt,val_array,min_score=0.1):
    """Return the positions of the highest-scoring entries of ``val_array``.

    Parameters
    ----------
    top_cnt : int
        Maximum number of positions to return. Values larger than
        ``len(val_array)`` are clamped; values <= 0 yield an empty list.
    val_array : sequence of float
        One score per sentence, indexed by sentence position.
    min_score : float, optional
        Only entries whose score is strictly greater than this threshold are
        kept (default 0.1, the value that used to be hard-coded).

    Returns
    -------
    list of int
        Distinct positions ordered by descending score. Entries with equal
        scores keep their original relative order.
    """
    top_cnt = min(top_cnt, len(val_array))
    if top_cnt <= 0:
        return []

    # Rank positions instead of values: looking a value up again with
    # list.index() returns the first match every time, so tied scores used to
    # produce duplicate indices and silently miss the other tied sentences.
    # sorted() is stable even with reverse=True, so ties stay in input order.
    ranked = sorted(range(len(val_array)), key=lambda pos: val_array[pos], reverse=True)

    return [pos for pos in ranked[:top_cnt] if val_array[pos] > min_score]

In [None]:
def get_top_sentences(scores_array):
    """Pick the most negative and the most positive sentences.

    Each item of ``scores_array`` is indexed as ``item[0][k]``: the first
    element is the score vector of a sentence. Column 2 is used as the
    positive score and column 0 as the negative score. The number of
    sentences requested comes from the module-level ``top_pos`` and
    ``top_neg``.

    Returns a ``(neg_sent_idx, pos_sent_idx)`` tuple - negative first.
    """
    positive_scores = [item[0][2] for item in scores_array]
    negative_scores = [item[0][0] for item in scores_array]

    pos_sent_idx = get_top_sent_idx(top_pos, positive_scores)
    neg_sent_idx = get_top_sent_idx(top_neg, negative_scores)

    return neg_sent_idx, pos_sent_idx

# Create Duplicate Article

In [None]:
def create_article_dup(pos_sent_idx,neg_sent_idx,sentence_idx,highlights):
    """Rebuild the article text with the selected sentences repeated.

    ``sentence_idx`` is walked in order; each row is read as
    ``row[0]`` (sentence number) and ``row[1]`` (sentence text). A sentence
    whose number is in ``pos_sent_idx`` gets ``dup_cnt`` extra copies, and
    another ``dup_cnt`` extra copies if its number is in ``neg_sent_idx``;
    the extra copies are placed directly before the regular occurrence.
    ``dup_cnt`` is read from module scope. ``highlights`` is accepted but
    not used here.

    Returns the pieces joined with single spaces.
    """
    pieces = []

    for row in sentence_idx:
        sent_no, sent_text = row[0], row[1]

        # Extra copies for a top positive sentence.
        if sent_no in pos_sent_idx:
            pieces.extend([sent_text] * dup_cnt)

        # Extra copies for a top negative sentence.
        if sent_no in neg_sent_idx:
            pieces.extend([sent_text] * dup_cnt)

        # The sentence itself always appears once.
        pieces.append(sent_text)

    return ' '.join(pieces)

In [None]:
def process_article(article,highlights,counter):
    """Score every sentence of one article and build its duplicated version.

    The article is cleaned with ``clean_article_new`` (using the module-level
    ``src_type``), split into sentences by the module-level ``nlp`` pipeline,
    and each sentence longer than 20 characters is scored with
    ``rank_sentence``; shorter ones get the placeholder score ``[0,0,0]``.
    ``counter`` is accepted but not used here.

    Returns
    -------
    tuple
        ``(scores_sent, final_text, pos_sent_idx, neg_sent_idx, sentence_idx)``
        where ``scores_sent`` holds ``[score, position]`` pairs and
        ``sentence_idx`` holds ``[position, text]`` pairs.
    """
    cleaned = clean_article_new(article,src_type)
    parsed = nlp(cleaned)

    scores_sent = []
    sentence_idx = []
    for position, sentence in enumerate(parsed.sents):
        content = sentence.text
        # Very short fragments are not sent to the model.
        score = rank_sentence(content) if len(content) > 20 else [0,0,0]
        scores_sent.append([score, position])
        sentence_idx.append([position, content])

    # Select the top sentences, then rebuild the text with them repeated.
    neg_sent_idx, pos_sent_idx = get_top_sentences(scores_sent)
    final_text = create_article_dup(pos_sent_idx, neg_sent_idx, sentence_idx, str(highlights))

    return scores_sent, final_text, pos_sent_idx, neg_sent_idx, sentence_idx

In [None]:
def run_sentence_sentiment_process():
    """Process the currently selected dataset sample and pickle the results.

    Reads the module-level ``ds``, ``ds_info``, ``take``, ``src_type`` and
    ``file_name`` set by the main loop. The first ``take`` examples of ``ds``
    are converted to a DataFrame, each row is run through
    ``process_article``, and the collected results are written to
    ``file_name`` with pickle.

    Each stored record is
    ``[example_counter, scores_sent, final_text, pos_sent_idx, neg_sent_idx,
    sentence_idx, article, highlights]``. ``example_counter`` only advances
    for rows that were processed successfully.

    Raises
    ------
    ValueError
        If ``src_type`` has no known article/summary column mapping.
    """
    # Dataset name -> (article column, summary column).
    # NOTE(review): the gigaword column names are carried over unchanged from
    # the original code - confirm them against the dataset's feature spec.
    column_keys = {
        'multi_news': ("document", "summary"),
        'cnn_dailymail': ("article", "highlights"),
        'scientific_papers': ("article", "abstract"),
        'billsum': ("text", "summary"),
        'newsroom': ("text", "summary"),
        'gigaword': ("article", "headline"),
    }
    # Fail fast: previously an unknown src_type left the column names
    # undefined, so every row failed with a swallowed NameError and an empty
    # pickle was written.
    if src_type not in column_keys:
        raise ValueError(
            "Unsupported src_type %r; expected one of %s" % (src_type, sorted(column_keys))
        )
    article_k, summary_k = column_keys[src_type]

    df = tfds.as_dataframe(ds.take(take),ds_info)

    example_counter = 0
    final_list_all = []

    for index,row in df.iterrows():
        print(index)

        try:
            article = str(row[article_k])
            highlights = str(row[summary_k])

            scores_sent, final_text, pos_sent_idx, neg_sent_idx, sentence_idx = process_article(article,highlights,example_counter)
            final_list_all.append([example_counter,scores_sent, final_text , pos_sent_idx,neg_sent_idx,sentence_idx,article,highlights])
            example_counter = example_counter + 1
        except Exception as exc:
            # Best effort per row: skip the failing article but report why.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop a long run.
            print("error", repr(exc))

    with open(file_name, 'wb') as handle:
        pickle.dump(final_list_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# CODE FROM https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

# Load the tokenizer, label names and model used by rank_sentence.
# Both from_pretrained calls and the label download may hit the network.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: a tab-separated text file; the second column of
# each non-empty row is kept as the label name.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# The reader works on the already-decoded list of lines, so it can be consumed
# after the response has been closed.
labels = [row[1] for row in csvreader if len(row) > 1]

# PT: load the PyTorch model class (rank_sentence requests 'pt' tensors).
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# START

In [None]:
#MAIN LOOP
# Sweep every hyperparameter combination. The loop variables are deliberately
# module-level: the helper functions read top_pos, top_neg, dup_cnt, src_type,
# take, ds, ds_info and file_name as globals.
for top_pos in top_pos_list:
    # The same number of sentences is selected on the negative side.
    top_neg = top_pos
    for dup_cnt in dup_cnt_list:
        for src_type in src_type_list:
            for take in take_list:
                # One output pickle per combination, named after its settings.
                file_name = f"{src_type}_Pos_{top_pos}_Neg_{top_neg}_dCnt_{dup_cnt}_take_{take}.pickle"
                print(file_name)

                ds, ds_info = tfds.load(src_type, split='test', with_info=True)
                assert isinstance(ds, tf.data.Dataset)

                run_sentence_sentiment_process()