In [None]:
!pip install transformers
!pip install pip --upgrade

In [None]:
import time

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#  Name of the pretrained sentiment model (also used as the local save directory)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

#  save_pretrained() writes into a local directory named after MODEL, and on
#  the next run from_pretrained(MODEL) resolves to that directory. Save the
#  tokenizer next to the model so the directory holds everything both
#  loaders need; with only the model saved, reloading the tokenizer fails.
tokenizer.save_pretrained(MODEL)
model.save_pretrained(MODEL)

#  Length of every chunk fed to the model, including the two special tokens
chunksize = 100

In [None]:
def chunk_text(txt):
  """Tokenize `txt` and pack it into fixed-length chunks for the classifier.

  The text is tokenized without special tokens and cut into pieces of at most
  `chunksize - 2` tokens. Every piece is then wrapped in the tokenizer's own
  start/end tokens and right-padded with its pad token up to `chunksize`.

  The special-token ids are read from the tokenizer instead of being
  hard-coded: 101/102/0 are BERT's [CLS]/[SEP]/[PAD] ids, whereas in
  RoBERTa's vocabulary 0 is <s>, 1 is <pad> and 2 is </s>.

  Relies on the module-level `tokenizer` and `chunksize`.

  Args:
    txt: the text to split.

  Returns:
    dict with 'attention_mask' (int32) and 'input_ids' (int64), both of shape
    (number of chunks, chunksize), suitable for `model(**input_dict)`.
  """

  #  Tokenize text (special tokens are added per chunk below)
  tokens = tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors='pt')

  #  Split the ids into pieces that leave room for the two special tokens
  pieces = tokens['input_ids'][0].long().split(chunksize - 2)

  start_id = tokenizer.cls_token_id
  end_id = tokenizer.sep_token_id

  #  Start from fully padded / fully masked rows, then fill in the real tokens
  input_ids = torch.full((len(pieces), chunksize), tokenizer.pad_token_id, dtype=torch.long)
  attention_mask = torch.zeros((len(pieces), chunksize), dtype=torch.int)

  for row, piece in enumerate(pieces):
    end = piece.shape[0] + 1  # position of the end token in this row
    input_ids[row, 0] = start_id
    input_ids[row, 1:end] = piece
    input_ids[row, end] = end_id
    #  Attend to the start token, the text tokens and the end token only
    attention_mask[row, :end + 1] = 1

  #  Create a map for the model later
  input_dict = {
      'attention_mask' : attention_mask,
      'input_ids' : input_ids
  }

  return input_dict

In [None]:
#  Read the source CSV
data = pd.read_csv(
    '/content/drive/MyDrive/Magistras/Data/test-anchor.csv',
    encoding='ISO-8859-13',
)

#  Keep the rows whose 'Year' lies strictly between the two bounds
#  (presumably ISO-formatted date strings, i.e. the year 2019 - confirm)
in_range = (data['Year'] > '2018-12-31') & (data['Year'] < '2020-01-01')

#  Renumber the rows (the old index is kept as a column) and blank out NaNs
data = (
    data[in_range]
    .reset_index()
    .replace(np.nan, '', regex=True)
)

#  Full text = the two content columns joined end to end
data['content'] = data['content_original'] + data['content2_original']

In [None]:
#  Number of rows left in `data` after the 'Year' filter
len(data)

2908

In [None]:
start = time.time()

print(len(data))

#  Share of negative / neutral chunks per show, one record per row of `data`.
#  Records are collected in a list and turned into a DataFrame in one go:
#  DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
SENTIMENT_COLUMNS = ['Negative', 'Neutral']
records = []

for i, text in enumerate(data['content']):

  #  Chunk text with the function
  input_dict = chunk_text(text)

  #  Feed it to the model; this is inference only, so skip gradient tracking
  with torch.no_grad():
    output = model(**input_dict)

  #  Highest-scoring label of every chunk. Softmax does not change which
  #  label scores highest within a row, so the argmax of the raw scores is enough.
  top_label = output[0].argmax(dim=1).numpy()

  #  Calculate percentage of negative (label 0) and neutral (label 1) chunks
  records.append({
      'Negative' : np.mean(top_label == 0),
      'Neutral' : np.mean(top_label == 1)
  })

  #  Every 100 shows: save what has been scored so far and report progress
  if i % 100 == 0:
    pd.DataFrame(records, columns=SENTIMENT_COLUMNS).to_csv('anchor_shows_sentiment_added_2019.csv')
    print(i)

#  Dataframe with the negative/neutral sentiment columns
df = pd.DataFrame(records, columns=SENTIMENT_COLUMNS)
df.to_csv('anchor_shows_sentiment_added_2019_2.csv')

end = time.time()
print(end - start)

In [None]:
#  Load previously saved sentiment columns
#  NOTE(review): this file name is not one of the CSVs written by the scoring
#  loop ('..._2019.csv' / '..._2019_2.csv') - confirm it is the intended input.
df2 = pd.read_csv(
    '/content/anchor_shows_sentiment_added.csv',
    encoding='ISO-8859-13',
)

#  Put the sentiment columns next to the show data (column-wise, joined on index)
test = pd.concat([data, df2], axis='columns')

In [None]:
#  Mean 'Negative' value for every (Anchor, YearQ) pair
tt = test.groupby(['Anchor', 'YearQ'])['Negative'].mean()

#  Show the result as a one-column table
tt.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Negative
Anchor,YearQ,Unnamed: 2_level_1
Baier,2020-01-01,0.229348
Baier,2020-04-01,0.224781
Baier,2020-07-01,0.209768
Baier,2020-10-01,0.199795
Cooper,2020-01-01,0.219561
Cooper,2020-04-01,0.268858
Cooper,2020-07-01,0.289986
Cooper,2020-10-01,0.295429
Cuomo,2020-01-01,0.196604
Cuomo,2020-04-01,0.232615
