<a href="https://colab.research.google.com/github/spatiebalk/text_mining_project/blob/master/score_generated_texts_hp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Scoring generated texts

Inputs to `score` are a list of candidate sentences and a list of reference sentences. 

In [1]:
# Mount Google Drive inside the Colab VM; every data path used in this
# notebook lives below this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
# Standard library
import json
import time

# NLTK: word-level tokenizer and the data loader used for the sentence splitter
import nltk
import nltk.data
from nltk.tokenize import word_tokenize

# The 'punkt' package has to be downloaded before the pickled English
# sentence detector can be loaded from the NLTK data directory.
nltk.download('punkt')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Read the whole reference text from Drive. The context manager closes the
# file handle as soon as the contents are in memory, and the explicit
# encoding keeps the read independent of the platform's default.
with open('/content/gdrive/My Drive/TxMM/harrypotter.txt', encoding='utf-8') as ref_file:
  text = ref_file.read()

# Split the reference into sentences, then every sentence into word tokens.
# The resulting list of token lists is what the BLEU cells pass as references.
text_tokens = [word_tokenize(sent) for sent in sent_detector.tokenize(text.strip())]

In [4]:
from nltk.translate.bleu_score import sentence_bleu
!pip install rouge-score
from rouge_score import rouge_scorer
!pip install transformers
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

Collecting rouge-score
  Downloading https://files.pythonhosted.org/packages/1f/56/a81022436c08b9405a5247b71635394d44fe7e1dbedc4b28c740e09c2840/rouge_score-0.0.4-py2.py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 12.7MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 46.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |██████

In [5]:
# Tokenizer and model are loaded from the same pretrained checkpoint so the
# token ids produced by one match the vocabulary expected by the other.
BERT_CHECKPOINT = 'bert-base-uncased'
BERT_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
BERT_model = BertModel.from_pretrained(BERT_CHECKPOINT)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




# GRU

In [6]:
# Texts generated by the GRU model. The scoring cells below look the samples
# up by the string keys "0" .. "99".
gru_samples_path = '/content/gdrive/My Drive/TxMM/GRU_results/hp_generated_texts.json'
with open(gru_samples_path) as samples_file:
    generated_text = json.load(samples_file)

## BLEU score
The score ranges from 0 to 1; higher means more similar to the reference text.

In [8]:
# BLEU for each of the 100 GRU samples, using every sentence of the reference
# text as a reference. The weights put all mass on the 3-gram precision.
TRIGRAM_ONLY = (0, 0, 1, 0)

t_start = time.time()
score_dict_BLEU = {}

for idx in range(100):
  sample_tokens = word_tokenize(generated_text[str(idx)])
  score_dict_BLEU[idx] = sentence_bleu(text_tokens, sample_tokens, weights=TRIGRAM_ONLY)

elapsed_minutes = (time.time() - t_start) / 60
print("This took {:.2f} minutes".format(elapsed_minutes))

# Saving is disabled in this cell; uncomment to (over)write the stored scores.
# with open('/content/gdrive/My Drive/TxMM/GRU_results/hp_scores_BLEU.json', 'w') as fp:
#   json.dump(score_dict_BLEU, fp)

# Remove the dict from the kernel namespace once the cell is done with it.
del score_dict_BLEU

1016


## ROUGE score
Higher is more similar.

In [None]:
# ROUGE-3 (with stemming) for each of the 100 GRU samples against the full
# reference text.
score_dict_ROUGE = {}
start = time.time()

# The scorer configuration is identical for every sample, so it is built once
# instead of once per loop iteration.
scorer = rouge_scorer.RougeScorer(['rouge3'], use_stemmer=True)

for i in range(0, 100):
  gen_text = generated_text[str(i)]

  # score(target, prediction): the reference text is the target.
  rouge3 = scorer.score(text, gen_text)["rouge3"]

  # Read the result by field name rather than by tuple position.
  score_dict_ROUGE[i] = {"P": rouge3.precision, "R": rouge3.recall, "F1": rouge3.fmeasure}

end = time.time()
print("This took {:.2f} minutes".format((end-start)/60))

# Saving is disabled in this cell; uncomment to (over)write the stored scores.
# with open('/content/gdrive/My Drive/TxMM/GRU_results/hp_scores_ROUGE.json', 'w') as fp:
#   json.dump(score_dict_ROUGE, fp)

# Remove the dict from the kernel namespace once the cell is done with it.
del score_dict_ROUGE

This took 0.22 minutes


## BERT scores
Higher is more similar (cosine similarity is used).

Split the reference text into 3200 chunks of about 1900 characters each (roughly 512 tokens per chunk). For each chunk, calculate the average BERT embedding and add it to a list. Later we compare the average embedding of each generated text with each of these 3200 chunk embeddings and average the cosine similarities to score the generated text using BERT.

In [None]:
# Build one mean-pooled BERT embedding per chunk of the reference text.
ref_text_embeddings = []

steps = 3200
# Chunk length in characters, derived from `steps` so the two values cannot
# drift apart. Characters beyond steps * par_len are not embedded.
par_len = len(text) // steps

assert par_len > 0, "reference text is shorter than `steps` characters"
assert steps * par_len <= len(text)

# Inference only: without no_grad() every forward pass would also record an
# autograd graph that is never used, which costs memory and time.
with torch.no_grad():
  for i in tqdm(range(0, steps)):

    par = text[i*par_len:i*par_len+par_len]
    # truncation=True cuts a chunk that tokenizes to more tokens than the
    # model accepts, so the tail of such a chunk is ignored.
    inputs = BERT_tokenizer(par, truncation=True, return_tensors="pt")
    outputs = BERT_model(**inputs)

    # Average the per-token hidden states (dim=1), then drop the singleton
    # batch dimension to obtain a 1-D vector for this chunk.
    last_hidden_states = outputs.last_hidden_state
    avg_embedding = torch.mean(last_hidden_states, dim=1)
    avg_embedding = torch.squeeze(avg_embedding)
    avg_embedding = avg_embedding.detach().numpy()

    ref_text_embeddings.append(avg_embedding)

100%|██████████| 3200/3200 [1:30:38<00:00,  1.70s/it]


In [None]:
def BERTscore(ref_text_embeddings, gen_text):
  """Return the mean cosine similarity between one embedding and all reference embeddings.

  Args:
    ref_text_embeddings: non-empty sequence of 1-D embedding vectors, one per
      reference chunk.
    gen_text: despite the name, the embedding vector of the generated text
      (not the raw string). It must have the same length as the reference
      vectors.

  Returns:
    The average of the cosine similarities as a built-in ``float``.

  Raises:
    ValueError: if ``ref_text_embeddings`` is empty.
  """
  if len(ref_text_embeddings) == 0:
    raise ValueError("ref_text_embeddings must contain at least one embedding")

  # Stack the references into an (n_refs, dim) matrix so that a single
  # pairwise call replaces one call per reference embedding.
  ref_matrix = np.vstack([np.asarray(embed).reshape(1, -1) for embed in ref_text_embeddings])
  gen_row = np.asarray(gen_text).reshape(1, -1)

  # Result is an (n_refs, 1) column: similarity of every reference to gen_row.
  sims = cosine_similarity(ref_matrix, gen_row)

  # Accumulate in float64 and convert to a plain Python float: NumPy float32
  # scalars are neither instances of `float` nor JSON-serializable.
  return float(sims.mean(dtype=np.float64))

In [None]:
# Score each of the 100 GRU samples by the average cosine similarity between
# its mean-pooled BERT embedding and the reference chunk embeddings.
score_dict_BERT = {}
start = time.time()

# Inference only: no_grad() avoids recording an autograd graph per sample.
with torch.no_grad():
  for i in range(0, 100):
    gen_text = generated_text[str(i)]

    # create BERT embedding (truncation=True cuts over-long samples)
    inputs = BERT_tokenizer(gen_text, truncation=True, return_tensors="pt")
    outputs = BERT_model(**inputs)

    # Average the per-token hidden states and drop the batch dimension.
    last_hidden_states = outputs.last_hidden_state
    avg_embedding = torch.mean(last_hidden_states, dim=1)
    avg_embedding = torch.squeeze(avg_embedding)
    avg_embedding = avg_embedding.detach().numpy()

    # compare gen_text with ref_text
    sim = BERTscore(ref_text_embeddings, avg_embedding)

    score_dict_BERT[i] = sim

end = time.time()
print("This took {:.2f} minutes".format((end-start)/60))

# Saving is disabled in this cell; uncomment to (over)write the stored scores.
# with open('/content/gdrive/My Drive/TxMM/GRU_results/hp_scores_BERT.json', 'w') as fp:
#   json.dump(score_dict_BERT, fp)

# Remove the dict from the kernel namespace once the cell is done with it.
del score_dict_BERT

# GPT-2

In [9]:
# Texts generated by GPT-2. This rebinds `generated_text`, so every scoring
# cell from here on works on the GPT-2 samples instead of the GRU ones.
gpt2_samples_path = '/content/gdrive/My Drive/TxMM/GPT-2_results/hp_generated_texts.json'
with open(gpt2_samples_path) as samples_file:
    generated_text = json.load(samples_file)

## BLEU score
The score ranges from 0 to 1; higher means more similar to the reference text.

In [None]:
# BLEU for each of the 100 GPT-2 samples, using every sentence of the
# reference text as a reference. The weights put all mass on the 3-gram
# precision. Only the first MAX_SAMPLE_CHARS characters of a sample are scored.
TRIGRAM_ONLY = (0, 0, 1, 0)
MAX_SAMPLE_CHARS = 1020

t_start = time.time()
score_dict_BLEU = {}

for idx in range(100):
  clipped_sample = generated_text[str(idx)][:MAX_SAMPLE_CHARS]
  sample_tokens = word_tokenize(clipped_sample)
  score_dict_BLEU[idx] = sentence_bleu(text_tokens, sample_tokens, weights=TRIGRAM_ONLY)

elapsed_minutes = (time.time() - t_start) / 60
print("This took {:.2f} minutes".format(elapsed_minutes))

# Persist the scores next to the GPT-2 samples on Drive.
with open('/content/gdrive/My Drive/TxMM/GPT-2_results/hp_scores_BLEU.json', 'w') as scores_file:
  json.dump(score_dict_BLEU, scores_file)

# Remove the dict from the kernel namespace once it has been written.
del score_dict_BLEU

## ROUGE score
Higher is more similar.

In [None]:
# ROUGE-3 (with stemming) for each of the 100 GPT-2 samples against the full
# reference text. Only the first 1020 characters of a sample are scored.
score_dict_ROUGE = {}
start = time.time()

# The scorer configuration is identical for every sample, so it is built once
# instead of once per loop iteration.
scorer = rouge_scorer.RougeScorer(['rouge3'], use_stemmer=True)

for i in range(0, 100):
  gen_text = generated_text[str(i)]
  gen_text = gen_text[:1020]

  # score(target, prediction): the reference text is the target.
  rouge3 = scorer.score(text, gen_text)["rouge3"]

  # Read the result by field name rather than by tuple position.
  score_dict_ROUGE[i] = {"P": rouge3.precision, "R": rouge3.recall, "F1": rouge3.fmeasure}

end = time.time()
print("This took {:.2f} minutes".format((end-start)/60))

# Persist the scores next to the GPT-2 samples on Drive.
with open('/content/gdrive/My Drive/TxMM/GPT-2_results/hp_scores_ROUGE.json', 'w') as fp:
  json.dump(score_dict_ROUGE, fp)

# Remove the dict from the kernel namespace once it has been written.
del score_dict_ROUGE

## BERT scores
Higher is more similar (cosine similarity is used).

The reference text is split into 3200 chunks of about 1900 characters each (roughly 512 tokens per chunk), and the average BERT embedding of each chunk is stored in a list. Here we compare the average embedding of each generated text with each of these 3200 chunk embeddings and average the cosine similarities to score the generated text using BERT.

In [None]:
# Score each of the 100 GPT-2 samples by the average cosine similarity between
# its mean-pooled BERT embedding and the reference chunk embeddings.
# Only the first 1020 characters of a sample are scored.
score_dict_BERT = {}
start = time.time()

# Inference only: no_grad() avoids recording an autograd graph per sample.
with torch.no_grad():
  for i in range(0, 100):
    gen_text = generated_text[str(i)]
    gen_text = gen_text[:1020]

    # create BERT embedding (truncation=True cuts over-long samples)
    inputs = BERT_tokenizer(gen_text, truncation=True, return_tensors="pt")
    outputs = BERT_model(**inputs)

    # Average the per-token hidden states and drop the batch dimension.
    last_hidden_states = outputs.last_hidden_state
    avg_embedding = torch.mean(last_hidden_states, dim=1)
    avg_embedding = torch.squeeze(avg_embedding)
    avg_embedding = avg_embedding.detach().numpy()

    # compare gen_text with ref_text
    sim = BERTscore(ref_text_embeddings, avg_embedding)

    # BERTscore returns a scalar, so it must not be indexed, and casting it
    # with int() would truncate a cosine similarity to 0. Store it as a plain
    # float, which json.dump can serialize.
    score_dict_BERT[i] = float(sim)

end = time.time()
print("This took {:.2f} minutes".format((end-start)/60))

# Persist the scores next to the GPT-2 samples on Drive.
with open('/content/gdrive/My Drive/TxMM/GPT-2_results/hp_scores_BERT.json', 'w') as fp:
  json.dump(score_dict_BERT, fp)

# Remove the dict from the kernel namespace once it has been written.
del score_dict_BERT