In [1]:
# !pip install NewsSentiment

In [2]:
import hashlib
import re

import pickle

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from NewsSentiment import TargetSentimentClassifier
from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm 
import nltk
nltk.download('punkt')

True


In [3]:
NER_pretrained_model = "dslim/bert-base-NER"
embedding_model_name = "all-MiniLM-L6-v2"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(NER_pretrained_model)
model = AutoModelForTokenClassification.from_pretrained(NER_pretrained_model)
NER_nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer,  aggregation_strategy='simple')
tsc = TargetSentimentClassifier()

embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)






In [5]:
df = pd.read_csv('data/merged_data.csv')


In [6]:
df = df.dropna()
df = df.drop_duplicates(subset="content").reset_index(drop=True)


In [7]:
fix_fullstop_tokenizing_issue = lambda raw_sentence:  re.sub(r'(.)\.([A-Z\W])', r'\1. \2', raw_sentence)


def preprocess_sentence_splits(sentence_splits):
    processed_splits = []

    for sentence_list in sentence_splits:
        splits = []
        current_pos = 0
        for sentence in sentence_list:
            sentence_start = current_pos
            sentence_end = current_pos + len(sentence) - 1
            splits.append({"text": sentence, "start": sentence_start, "end": sentence_end})
            current_pos = sentence_end + 1 
        processed_splits.append(splits)
    return processed_splits

def fetch_NER_tokens(processed_sentence_splits):
  result = []
  for processed_sentence_split in processed_sentence_splits:
     split_list = [processed_sentence["text"] for processed_sentence in processed_sentence_split]
     result.append(NER_nlp_pipeline(split_list))
        
  return result


def filter_NER_tokens(NER_dict_lists, threshold=0.95):
  filtered_NER_dict_lists = []
  for article in NER_dict_lists:
    resp_article = [] 
    for sentence in article:
      resp_sentence = []
      for NER_dict in sentence:
         if(NER_dict["score"]>=threshold): resp_sentence.append(NER_dict)
      
      resp_article.append(resp_sentence)
    
    filtered_NER_dict_lists.append(resp_article)

  return filtered_NER_dict_lists


def process_NER_dict_lists(NER_dict_lists, sentence_splits):
    proc_NER_dict_lists = []
    for doc_ind, doc in enumerate(NER_dict_lists):
        resp_doc = []
        for sent_ind, sent in enumerate(doc):
            resp_sent = []
            for NER_dict_ind, NER_dict in enumerate(sent):
                resp_NER_dict  = NER_dict.copy()
                resp_NER_dict["word"] = sentence_splits[doc_ind][sent_ind]["text"][NER_dict["start"]:NER_dict["end"]]
                resp_sent.append(resp_NER_dict)
                
            resp_doc.append(resp_sent)
        
        proc_NER_dict_lists.append(resp_doc)

    return proc_NER_dict_lists


def truncate(text, limit=294, filler="filler"):
   if(len(text)>limit): return text[-limit:]
   if(len(text)==0): return filler
   return text
   

def fetch_sentence_window(NER_dict_lists, sentence_splits):
    sentence_window = []
    for doc_ind, doc in enumerate(NER_dict_lists):
        resp_doc = []
        for sent_ind, sent in enumerate(doc):
            resp_sent = []
            for NER_dict in sent:
              sentence_split = sentence_splits[doc_ind][sent_ind]["text"]
              start_pos = NER_dict["start"]
              end_pos = NER_dict["end"]

              
              left  = truncate(sentence_split[:start_pos])
              middle = truncate(sentence_split[start_pos:end_pos])
              right = truncate(sentence_split[end_pos:])

              resp_sent.append((left, middle, right))
                
            resp_doc.append(resp_sent)
        
        sentence_window.append(resp_doc)
        
    return sentence_window


def fetch_default_sentiments():
   return {'class_id': 1, 'class_label': 'neutral', 'class_prob': 0.0}
   


def fetch_NER_sentiment_from_windows(sentence_windows):
  resp_entity_sentiment_dict_list = []

  for doc_ind, doc in tqdm(enumerate(sentence_windows)):
    resp_docs = []
    for sentence_ind, sentence_windows in enumerate(doc):
      try:
         raw_sentiments = tsc.infer(targets=sentence_windows)
         refined_sentiments = [sentiment[0] for sentiment in raw_sentiments]
      except Exception as e:
         print("\n\n\n")
         print(f"Issue Handling doc index: {doc_ind} and sentence index: {sentence_ind}")
         print("\n\n\n")
         refined_sentiments = [fetch_default_sentiments() for _ in range (len(sentence_windows))]
      resp_docs.append(refined_sentiments)

  
    resp_entity_sentiment_dict_list.append(resp_docs)
  
  return resp_entity_sentiment_dict_list
          


def merged_entity_sentiments (entity_sentiment_dict_list,NER_dict_lists, texts):
   
   entity_sentiment_dict = {}
   for doc_ind, doc in enumerate(NER_dict_lists):
      text = texts[doc_ind]
      hashed_doc = hashlib.sha256(text.encode()).hexdigest()
      entity_sentiment_dict[hashed_doc] = {}
      for sent_ind, sent in enumerate(doc):
         for token_ind, token in enumerate(sent):
            NER_token_word = NER_dict_lists[doc_ind][sent_ind][token_ind]["word"]
            sentiment_dict = entity_sentiment_dict_list[doc_ind][sent_ind][token_ind]
            class_label, class_score = sentiment_dict["class_label"],sentiment_dict["class_prob"]
            if(NER_token_word not in entity_sentiment_dict[hashed_doc] ):
               
               entity_sentiment_dict[hashed_doc][NER_token_word] = {"negative":[],"neutral":[],"positive":[]}
            
            entity_sentiment_dict[hashed_doc][NER_token_word][class_label].append(class_score)
            
   return  entity_sentiment_dict        
    



def aggregate_sentiment_scores(merged_entity_sentiment_dict_list):
  
  refined_entity_sentiment_dict = {}
  for doc_hash, token_dict in merged_entity_sentiment_dict_list.items():
    refined_entity_sentiment_dict[doc_hash] = []
    for NER_token, NER_token_dict in token_dict.items():
       
       positive_scores_list = NER_token_dict["positive"]
       neutral_scores_list = NER_token_dict["neutral"]
       negative_scores_list = NER_token_dict["negative"]
       
       positive_mean_score,neutral_mean_score,negative_mean_score = np.nanmean(positive_scores_list), np.nanmean(neutral_scores_list), np.nanmean(negative_scores_list)
       
       mean_sentiment_scores = [positive_mean_score, neutral_mean_score, negative_mean_score ]
       max_score = np.nanmax(mean_sentiment_scores)
       max_score_arg = np.nanargmax(mean_sentiment_scores)

       
       if(max_score_arg==0): refined_entity_sentiment_dict[doc_hash].append({NER_token:{"label":"positive","score":max_score}})
       elif(max_score_arg==1): refined_entity_sentiment_dict[doc_hash].append({NER_token:{"label":"neutral","score":max_score}})
       elif(max_score_arg==2): refined_entity_sentiment_dict[doc_hash].append({NER_token:{"label":"negative","score":max_score}})
       
  return refined_entity_sentiment_dict


In [8]:
df["processed_texts"] = df["content"].apply(fix_fullstop_tokenizing_issue)

df["article_hash"] = df["processed_texts"].apply(lambda text:hashlib.sha256(text.encode()).hexdigest() )

df["text_embeddings"] = df["processed_texts"].apply(embeddings.embed_query)

processed_texts = list(df["processed_texts"].values)




In [9]:
print(f"Tokenizing Sentences of {len(processed_texts)} docs")
sentence_splits = [nltk.sent_tokenize(text) for text in processed_texts]
total_sentences = sum([len(sen) for sen in sentence_splits])
print(f"Indexing sentence splits of {total_sentences} sentences")
processed_sentence_splits = preprocess_sentence_splits(sentence_splits)
print("Sentence NER tagging")
NER_dict_lists = fetch_NER_tokens(processed_sentence_splits)
filtered_NER_dict_lists = filter_NER_tokens(NER_dict_lists)
processed_NER_dict_lists = process_NER_dict_lists(filtered_NER_dict_lists, processed_sentence_splits)
print("Context Window generation")
sentence_windows = fetch_sentence_window(processed_NER_dict_lists, processed_sentence_splits )
num_sentence_windows = len([ner for doc in sentence_windows for sentence_split in doc for ner in sentence_split])
print(f"Sentence Targeted Sentiment Classification for {num_sentence_windows} sentence windows")
entity_sentiment_dict_list = fetch_NER_sentiment_from_windows(sentence_windows)
merged_entity_sentiment_dict_list = merged_entity_sentiments(entity_sentiment_dict_list,processed_NER_dict_lists,processed_texts)
refined_entity_sentiment_dict = aggregate_sentiment_scores(merged_entity_sentiment_dict_list)

Tokenizing Sentences of 912 docs
Indexing sentence splits of 23611 sentences
Sentence NER tagging
Context Window generation
Sentence Targeted Sentiment Classification for 35765 sentence windows


Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.39batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.09batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.29batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.28batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.26batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.26batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.23batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.29batch/s]
Processing batches: 100%|██████████| 7/7 [00:03<00:00,  2.24batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.31batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.22batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.27batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.07batch/s]
Processing ba





Issue Handling doc index: 190 and sentence index: 27






Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai





Issue Handling doc index: 190 and sentence index: 28






Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai





Issue Handling doc index: 190 and sentence index: 29






Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai





Issue Handling doc index: 352 and sentence index: 46






Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.29batch/s]
Processing batches: 100%|██████████| 7/7 [00:03<00:00,  2.16batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  1.87batch/s]
Processing batches: 100%|██████████| 7/7 [00:03<00:00,  2.07batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.25batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.16batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.32batch/s]
Processing batches: 100%|██████████| 8/8 [00:03<00:00,  2.00batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.14batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.26batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.18batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.05batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.09batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.





Issue Handling doc index: 638 and sentence index: 30






Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 2/2 [00:01<00:00,  1.97batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.29batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.18batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.18batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.21batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.05batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.22batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.32batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.23batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.30batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.32batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.30batch/s]
Processing batches: 0batch [00:00, ?b





Issue Handling doc index: 779 and sentence index: 4






Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:  64%|██████▎   | 7/11 [00:03<00:02,  1.86batch/s]






Issue Handling doc index: 779 and sentence index: 5






Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.20batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.25batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.19batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.16batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.27batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.23batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.20batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.24batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.25batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.25batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.23batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.07batch/s]
Processing batches: 100%|██████████| 





Issue Handling doc index: 786 and sentence index: 38






Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.10batch/s]
Processing batches: 100%|██████████| 6/6 [00:02<00:00,  2.12batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.21batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.22batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.16batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.26batch/s]
Processing batches: 100%|██████████| 7/7 [00:03<00:00,  2.17batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.18batch/s]
Processing batches: 100%|██████████| 6/6 [00:02<00:00,  2.16batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.12batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.14batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.19batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.27batch/s]
Processing ba





Issue Handling doc index: 787 and sentence index: 32






Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.24batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.07batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.22batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.19batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.18batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.25batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.21batch/s]
Processing batches: 10





Issue Handling doc index: 901 and sentence index: 3






Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.22batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.35batch/s]
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:   0%|          | 0/2 [00:00<?, ?batch/s]






Issue Handling doc index: 901 and sentence index: 7






Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.32batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.24batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.31batch/s]
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:   0%|          | 0/1 [00:00<?, ?batch/s]






Issue Handling doc index: 901 and sentence index: 12






Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.27batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.27batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.23batch/s]
Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.10batch/s]
Processing batches: 100%|██████████| 5/5 [00:02<00:00,  2.24batch/s]
Processing batches: 100%|██████████| 6/6 [00:02<00:00,  2.22batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.28batch/s]
Processing batches: 0batch [00:00, ?batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.32batch/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00,  2.21batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.23batch/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.28batch/s]
Processing batches: 100%|██████████| 4/4 [00:01<00:00,  2.19batch/s]
Processing batches: 100%|██████████| 

In [10]:
import pickle

with open('output/refined_entity_sentiment_dict.pickle', 'wb') as f:
    pickle.dump(refined_entity_sentiment_dict, f)


In [11]:
entity_sentiment_dict = {"hash_id":[],"words":[], "labels":[], "scores":[]}

for hash, entity_dicts in refined_entity_sentiment_dict.items():
    entity_sentiment_dict["hash_id"].append(hash)
    words, labels, scores = [],[],[]
    for entity_dict in entity_dicts:
        for word, label_dict in entity_dict.items():
            words.append(word)
            labels.append(label_dict["label"])
            scores.append(label_dict["score"])
    
    entity_sentiment_dict["words"].append(words)
    entity_sentiment_dict["labels"].append(labels)
    entity_sentiment_dict["scores"].append(scores)




In [12]:
df_entity_sentiment = pd.DataFrame.from_dict(entity_sentiment_dict)


In [13]:
df_final = df.set_index("article_hash").join(df_entity_sentiment.set_index("hash_id"), how='left').reset_index().rename(columns={"index":"article_hash"})

### Recomendation Generation

In [14]:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
text_embeddings = list(df_final["text_embeddings"].values)


In [16]:
mlb = MultiLabelBinarizer()

one_hot_encoded = mlb.fit_transform(df_final['words'])

cosine_sim_matrix = cosine_similarity(one_hot_encoded)

In [17]:
def fetch_nearest_neighbors(row_index,cosine_sim_matrix, s):

    similarity_scores = cosine_sim_matrix[row_index]
    
    sorted_indices = similarity_scores.argsort()[::-1]
    

    sorted_indices = sorted_indices[1:]
    

    nearest_neighbors_indices = sorted_indices[:s]
    
    return nearest_neighbors_indices



def fetch_semantically_divergent(query_embedding, embedding_list, d):

    similarities = cosine_similarity([query_embedding], embedding_list)[0]

    # similarities = [np.abs(similarity) for similarity in similarities]

    sorted_indices = np.argsort(similarities)
    
    most_similar_indices = sorted_indices[:d]
    
    return most_similar_indices




In [18]:
recommended_indices = []
similarity_num, divergent_num = 12,4 

for query_index in range(df_final.shape[0]):
    query_embedding  = text_embeddings[query_index]
    nearest_neighbors_indices = fetch_nearest_neighbors(query_index, cosine_sim_matrix, s=similarity_num)
    similar_pre_picked_embeddings =  [text_embeddings[similar_index] for similar_index in nearest_neighbors_indices]

    divergent_indices = fetch_semantically_divergent(query_embedding,similar_pre_picked_embeddings,d=divergent_num )
    divergent_recommended_indices = [nearest_neighbors_indices[divergent_indice] for divergent_indice in divergent_indices]

    recommended_indices.append(divergent_recommended_indices)
    

In [19]:
df_final["recommended_hashes"] = pd.Series(recommended_indices)

In [20]:
df_frontend = df_final[['article_hash', 'media_source', 'author', 'headline',
       'description', 'url', 'image_url', 'publish_date',
       'current_date', 'text_embeddings',
       'words', 'labels', 'scores', 'recommended_hashes']]

### Token Word JSON profile

In [21]:
tokens_dict = {}
for doc_ind ,(hash_id, token_dict_list) in enumerate(merged_entity_sentiment_dict_list.items()):
    for token_word, token_dict in token_dict_list.items():
        if token_word not in tokens_dict:
            tokens_dict[token_word] = {"neutral":[],"positive":[],"negative":[],"indices":[],"media_source":{}} 
        tokens_dict[token_word]["indices"].append(doc_ind)
        tokens_dict[token_word]["neutral"].extend(token_dict["neutral"])
        tokens_dict[token_word]["positive"].extend(token_dict["positive"])
        tokens_dict[token_word]["negative"].extend(token_dict["negative"])
        media_source = df_final[df_final["article_hash"]==hash_id]["media_source"].values[0]
        if media_source not in tokens_dict[token_word]["media_source"]:
            tokens_dict[token_word]["media_source"][media_source] = {"neutral":[],"positive":[],"negative":[]}
        
        tokens_dict[token_word]["media_source"][media_source]["neutral"].extend(token_dict["neutral"])
        tokens_dict[token_word]["media_source"][media_source]["positive"].extend(token_dict["positive"])
        tokens_dict[token_word]["media_source"][media_source]["negative"].extend(token_dict["negative"])





In [22]:
for token_word, dic in tokens_dict.items():
    tokens_dict[token_word]["neutral"] = np.nanmean(tokens_dict[token_word]["neutral"])
    tokens_dict[token_word]["positive"] = np.nanmean(tokens_dict[token_word]["positive"])
    tokens_dict[token_word]["negative"] = np.nanmean(tokens_dict[token_word]["negative"])

    for media_source, media_dic in tokens_dict[token_word]["media_source"].items():
        tokens_dict[token_word]["media_source"][media_source]["neutral"] = np.nanmean(tokens_dict[token_word]["media_source"][media_source]["neutral"])
        tokens_dict[token_word]["media_source"][media_source]["positive"] = np.nanmean(tokens_dict[token_word]["media_source"][media_source]["positive"])
        tokens_dict[token_word]["media_source"][media_source]["negative"] = np.nanmean(tokens_dict[token_word]["media_source"][media_source]["negative"])



  tokens_dict[token_word]["media_source"][media_source]["positive"] = np.nanmean(tokens_dict[token_word]["media_source"][media_source]["positive"])
  tokens_dict[token_word]["media_source"][media_source]["negative"] = np.nanmean(tokens_dict[token_word]["media_source"][media_source]["negative"])
  tokens_dict[token_word]["media_source"][media_source]["neutral"] = np.nanmean(tokens_dict[token_word]["media_source"][media_source]["neutral"])
  tokens_dict[token_word]["positive"] = np.nanmean(tokens_dict[token_word]["positive"])
  tokens_dict[token_word]["negative"] = np.nanmean(tokens_dict[token_word]["negative"])
  tokens_dict[token_word]["neutral"] = np.nanmean(tokens_dict[token_word]["neutral"])


In [23]:
import json
with open('output/tokens_dict.json', 'w') as fp:
    json.dump(tokens_dict, fp)

### News Source JSON Profile

In [24]:
media_dict = {}
for doc_ind ,(hash_id, token_dict_list) in enumerate(merged_entity_sentiment_dict_list.items()):
    media_source = df_final[df_final["article_hash"]==hash_id]["media_source"].values[0]
    for token_word, token_dict in token_dict_list.items():
        if media_source not in media_dict:
            media_dict[media_source] = {"neutral": [], "positive": [], "negative": [], "words": {}}
        
        media_dict[media_source]["neutral"].extend(token_dict["neutral"])
        media_dict[media_source]["positive"].extend(token_dict["positive"])
        media_dict[media_source]["negative"].extend(token_dict["negative"])

        if token_word not in media_dict[media_source]["words"]:
            media_dict[media_source]["words"][token_word] = {"neutral":[],"positive":[],"negative":[],"indices":[]}
        
        media_dict[media_source]["words"][token_word]["indices"].append(doc_ind)
        media_dict[media_source]["words"][token_word]["neutral"].extend(token_dict["neutral"])
        media_dict[media_source]["words"][token_word]["positive"].extend(token_dict["positive"])
        media_dict[media_source]["words"][token_word]["negative"].extend(token_dict["negative"])



In [25]:
for media_source, media_dic in media_dict.items():
    media_dict[media_source]["neutral"] = np.nanmean(media_dict[media_source]["neutral"])
    media_dict[media_source]["positive"] = np.nanmean(media_dict[media_source]["positive"])
    media_dict[media_source]["negative"] = np.nanmean(media_dict[media_source]["negative"])

    
    for token_word, dic in media_dic["words"].items():
        media_dict[media_source]["words"][token_word]["neutral"] = np.nanmean(media_dict[media_source]["words"][token_word]["neutral"])
        media_dict[media_source]["words"][token_word]["positive"] = np.nanmean(media_dict[media_source]["words"][token_word]["positive"])
        media_dict[media_source]["words"][token_word]["negative"] = np.nanmean(media_dict[media_source]["words"][token_word]["negative"])


  media_dict[media_source]["words"][token_word]["neutral"] = np.nanmean(media_dict[media_source]["words"][token_word]["neutral"])
  media_dict[media_source]["words"][token_word]["negative"] = np.nanmean(media_dict[media_source]["words"][token_word]["negative"])
  media_dict[media_source]["words"][token_word]["positive"] = np.nanmean(media_dict[media_source]["words"][token_word]["positive"])
  media_dict[media_source]["negative"] = np.nanmean(media_dict[media_source]["negative"])


In [26]:
import json
with open('output/media_dict.json', 'w') as fp:
    json.dump(media_dict, fp)

### Exporting Datasets

In [27]:
df_frontend.to_csv("output/frontend_processed_articles.csv", index=False)

In [28]:
df_final.to_csv("output/final_processed_articles.csv", index=False)

[Reference paper for poster](https://aclanthology.org/2021.eacl-main.142.pdf)