# Fine-Tuning Generated Titles through OOV Replacement and Grammar Check

#### **Downloading and Importing required libraries**

In [None]:
# Python packages used below: compress-pickle (imported as `pickle` to load the
# compressed word_dict) and rouge (ROUGE scoring).
!pip install compress-pickle
!pip install rouge
# Install OpenJDK 8 and select it as the system `java`.
# NOTE(review): presumably required because language-check drives the Java-based
# LanguageTool — confirm before changing the Java version.
!sudo apt install openjdk-8-jdk
!sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!pip install language-check

In [None]:
import bz2
import collections
import os
import re
import time
import warnings

import compress_pickle as pickle
import language_check
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction
from rouge import Rouge
from tqdm.notebook import trange,tqdm

# Download the NLTK 'punkt' resource (tokenizer data needed by word_tokenize).
nltk.download('punkt')

#### **Necessary Utility functions**

In [None]:
# Directory holding the trained model artefacts (word_dict.bz, result.txt, original.txt).
default_path = "/Training/Model_with_attention/"
# Directory where the fine-tuned titles and the evaluation CSV are written.
result_path = "/Training/Fine_tuning/"
# Directory holding the *.valid.bz2 dataset files.
# `dataset_path` was referenced below but never defined, which raised a NameError
# on a fresh kernel. TODO(review): the value here is a placeholder — point it at
# the real dataset directory (keep the trailing slash).
dataset_path = "/Dataset/"

valid_article_path = dataset_path + "abstract.valid.bz2"
valid_title_path   = dataset_path + "title.valid.bz2"

def clean_str(sentence):
    """Return ``sentence`` with every run of '#' and '.' characters replaced by one space."""
    noise_pattern = re.compile("[#.]+")
    return noise_pattern.sub(" ", sentence)


def get_text_list(data_path, toy=False, clean=True):
    """Read a bz2-compressed text file and return its lines as stripped strings.

    Args:
        data_path: path of the bz2 file; each line is decoded with ``bytes.decode()``.
        toy: when true (and ``clean`` is true), only the first 20000 lines are used.
        clean: when true, every line is passed through ``clean_str``. When false,
            ``toy`` is ignored and every 5th line from index 5000 up to (but not
            including) 10000 is returned without cleaning.

    Returns:
        list of str, one entry per selected line.
    """
    with bz2.open(data_path, "r") as handle:
        raw_lines = handle.readlines()
        if not clean:
            # Uncleaned sample: fixed stride-5 window over lines 5000..9999.
            return [raw.decode().strip() for raw in raw_lines[5000:10000:5]]
        selected = raw_lines[:20000] if toy else raw_lines
        return [clean_str(raw.decode().strip()) for raw in tqdm(selected)]

def get_generated_title(data_path):
    """Read a plain-text file and return its lines, stripped and passed through ``clean_str``.

    Args:
        data_path: path of a text file with one title per line.

    Returns:
        list of str, one cleaned title per line of the file.
    """
    titles = []
    with open(data_path, 'r') as handle:
        for raw_line in tqdm(handle.readlines()):
            titles.append(clean_str(raw_line.strip()))
    return titles

def get_word_dict():
    """Load and return the object stored in ``default_path + "word_dict.bz"``.

    The file is read with ``pickle.load(..., compression='bz2')``, where ``pickle``
    is the name this notebook gives to ``compress_pickle``.
    """
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    dict_file = default_path + "word_dict.bz"
    with open(dict_file, "rb") as handle:
        return pickle.load(handle, compression='bz2')

#### **Title Modification (`<unk>` Replacement and Grammar Check)**

In [None]:
# Shared LanguageTool instance for 'en-US'; correct() calls tool.check() on each title.
tool = language_check.LanguageTool('en-US')
# BLEU smoothing function (NLTK SmoothingFunction.method0) passed to sentence_bleu
# in calculate_bleu().
smoothing = SmoothingFunction().method0

def get_unk_tokens(word_dict, article, max_tokens=250, top_k=5):
    """Rank the out-of-vocabulary tokens of ``article`` by accumulated positional weight.

    Only the first ``max_tokens`` tokens produced by ``word_tokenize`` are inspected.
    Every occurrence of a token that is not in ``word_dict`` adds
    ``get_weight(position, n)`` to that token's score, where ``n`` is the number of
    inspected tokens.

    Args:
        word_dict: container supporting ``in``; tokens found in it are ignored.
        article: article text to scan.
        max_tokens: number of leading tokens to inspect (default 250).
        top_k: maximum number of candidates to return (default 5).

    Returns:
        list of ``(weight, token)`` tuples sorted from highest to lowest, with at
        most ``top_k`` entries. Empty when the article has no unknown tokens.
    """
    # `defaultdict` was never imported on its own; use the already-imported module.
    unk_weights = collections.defaultdict(float)
    tokens = word_tokenize(article)[:max_tokens]
    n = len(tokens)
    for i, token in enumerate(tokens):
        if token not in word_dict:
            unk_weights[token] += get_weight(i, n)
    # Sort first, then truncate: the original sliced before sorting, which returned
    # the first five tokens encountered instead of the five highest-weighted ones.
    ranked = sorted(((weight, token) for token, weight in unk_weights.items()), reverse=True)
    return ranked[:top_k]

def get_weight(index, token_len):
    """Return the positional weight for a token at ``index`` out of ``token_len`` tokens.

    The relative position ``index / token_len`` is compared against ascending
    upper bounds; earlier positions receive larger weights:
    <=0.1 -> 0.35, <=0.2 -> 0.3, <=0.4 -> 0.2, <=0.7 -> 0.1, otherwise 0.05.
    """
    relative_position = index / token_len
    # (inclusive upper bound on relative position, weight), checked in order.
    bands = ((0.1, 0.35), (0.2, 0.3), (0.4, 0.2), (0.7, 0.1))
    for upper_bound, weight in bands:
        if relative_position <= upper_bound:
            return weight
    return 0.05

def correct(text):
    """Return ``text`` after applying the corrections for the matches ``tool.check`` reports."""
    return language_check.correct(text, tool.check(text))

def update_title(word_dict, article, title):
    """Fill '<unk>' placeholders in ``title`` with unknown tokens from ``article``.

    Candidates come from ``get_unk_tokens(word_dict, article)`` and are substituted
    in the order returned, one placeholder per candidate, left to right. The result
    is then passed through ``correct``.

    Returns:
        tuple ``(corrected_title, candidates, replace_count)`` where ``candidates``
        is the list returned by ``get_unk_tokens`` and ``replace_count`` is the
        number of substitutions attempted.
    """
    candidates = get_unk_tokens(word_dict, article)
    # Number of substitutions is capped by both the placeholders and the candidates.
    n_replacements = min(title.count('<unk>'), len(candidates))
    for _, token in candidates[:n_replacements]:
        title = title.replace('<unk>', token, 1)
    return (correct(title), candidates, n_replacements)

def calculate_bleu(title, reference):
    """Return the sentence-level BLEU score of ``title`` against a single ``reference``.

    Both strings are tokenized with ``word_tokenize``; the module-level
    ``smoothing`` function is passed to ``sentence_bleu``.
    """
    hypothesis_tokens = word_tokenize(title)
    reference_token_lists = [word_tokenize(reference)]
    return sentence_bleu(
        reference_token_lists,
        hypothesis_tokens,
        smoothing_function=smoothing,
    )

#### **Fine Tuning Generated Headlines**

In [None]:
# Load the vocabulary, the validation abstracts and the raw model output.
word_dict = get_word_dict()

abstract_list = get_text_list(valid_article_path)
titles_generated = get_generated_title(default_path + "result.txt")

# update_title() returns (corrected_title, unk_candidates, replace_count).
# Only the corrected title string is written out; collecting the whole tuple
# made the '\n'.join() below fail with a TypeError.
fine_tuned_titles = []
for i in trange(len(titles_generated)):
    fine_tuned_titles.append(update_title(word_dict, abstract_list[i], titles_generated[i])[0])

# One fine-tuned title per line, in the same order as the generated titles.
with open(result_path + "result.txt", "w") as f:
    f.write('\n'.join(fine_tuned_titles))

##### **BLEU** and **ROUGE** score calculation

In [None]:
# Evaluate the fine-tuned titles against the original titles with BLEU and ROUGE,
# then write one row per example to a CSV under result_path.
rouge = Rouge()

print("Loading Data...")
original_title = get_generated_title(default_path + 'original.txt')
generated_title = get_generated_title(result_path + "result.txt")
abstract = get_text_list(valid_article_path)

print('Tokenizing Data...')
# sentence_bleu expects a list of reference token lists, hence the extra nesting.
tokens_original = [[word_tokenize(s)] for s in tqdm(original_title)]
tokens_generated = [word_tokenize(s) for s in tqdm(generated_title)]
token_abstract = [word_tokenize(s) for s in tqdm(abstract)]

# Shortened abstract (first 40 tokens) for display in the output table.
minmized_abstract = [' '.join(line[:40]) + '...' for line in token_abstract]

smoothing = SmoothingFunction().method0
print('Calculating BLEU Score')
bleu_score = []
for i in trange(len(tokens_original)):
    bleu_score.append(sentence_bleu(tokens_original[i],tokens_generated[i],smoothing_function=smoothing))
bleu = np.array(bleu_score)
print("BLEU score report")
print("Min Score:",bleu.min(),"Max Score:",bleu.max(),"Avg Score:",bleu.mean())

print('Calculating Rouge Score')
rouge1f,rouge1p,rouge1r = [],[],[]
rouge2f,rouge2p,rouge2r = [],[],[]
rougelf,rougelp,rougelr = [],[],[]
for i in trange(len(tokens_original)):
    # Rouge.get_scores(hyps, refs): the generated title is the hypothesis and the
    # original title is the reference. The arguments were previously swapped,
    # which exchanged the reported precision and recall values.
    score = rouge.get_scores(generated_title[i], original_title[i])[0]
    rouge1f.append(score['rouge-1']['f'])
    rouge1p.append(score['rouge-1']['p'])
    rouge1r.append(score['rouge-1']['r'])
    rouge2f.append(score['rouge-2']['f'])
    rouge2p.append(score['rouge-2']['p'])
    rouge2r.append(score['rouge-2']['r'])
    rougelf.append(score['rouge-l']['f'])
    rougelp.append(score['rouge-l']['p'])
    rougelr.append(score['rouge-l']['r'])

rouge1f,rouge1p,rouge1r = np.array(rouge1f),np.array(rouge1p),np.array(rouge1r)
rouge2f,rouge2p,rouge2r = np.array(rouge2f),np.array(rouge2p),np.array(rouge2r)
rougelf,rougelp,rougelr = np.array(rougelf),np.array(rougelp),np.array(rougelr)

# Column order matches the zip() order below.
output_columns = [
    'Abstract','Original_Headline','Generated_Headline_x','Bleu_Score_x',
    'Rouge-1_F_x','Rouge-1_P_x','Rouge-1_R_x',
    'Rouge-2_F_x','Rouge-2_P_x','Rouge-2_R_x',
    'Rouge-l_F_x','Rouge-l_P_x','Rouge-l_R_x',
]
df = pd.DataFrame(
    zip(minmized_abstract,original_title,generated_title,bleu,
        rouge1f,rouge1p,rouge1r,rouge2f,rouge2p,rouge2r,rougelf,rougelp,rougelr),
    columns=output_columns,
)
df.to_csv(result_path+'output_with_fine_tuning.csv',index=False)

print('Done!!')