In [None]:
# Install the Python dependencies with %pip so they land in the environment
# of the running kernel rather than whichever `pip` is first on PATH.
%pip install rouge
# NOTE(review): the clone is not pinned to a commit, while later cells rely on
# the ProphetNet/src/prophetnet and ProphetNet/src/vocab.txt paths — confirm
# the checked-out revision still has that layout.
!git clone https://github.com/microsoft/ProphetNet
%pip install fairseq==v0.9.0

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import os
from rouge import Rouge 
import string
from IPython.display import display, Markdown
import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Read data

In [None]:
# Directory holding the crypto-news CSV files; the path is relative to the
# notebook's working directory.
PATH_TO_CRYPTO_NEWS = Path('../input/news-about-major-cryptocurrencies-20132018-40k/')


In [None]:
# Load the training and validation splits (2013-2017 and 2018 respectively,
# going by the file names) from the dataset directory.
train_df, valid_df = (
    pd.read_csv(PATH_TO_CRYPTO_NEWS / csv_name)
    for csv_name in (
        'crypto_news_parsed_2013-2017_train.csv',
        'crypto_news_parsed_2018_validation.csv',
    )
)

In [None]:
# Replace missing validation articles with a single space so every validation
# row still yields one line. The result is assigned back to the column:
# calling fillna(inplace=True) on a column selection is deprecated and leaves
# the frame untouched when pandas Copy-on-Write is enabled.
valid_df['text'] = valid_df['text'].fillna(' ')

# Training rows with any missing value, or with a blank (' ') title, are dropped.
train_df = train_df.dropna()
train_df = train_df[train_df['title'] != ' ']

# Terminate every example with a newline so the columns can be written
# one example per line.
# NOTE(review): assumes titles and texts contain no embedded newlines;
# otherwise source and target files would fall out of alignment — verify.
title_val = valid_df['title'] + '\n'
text_val = valid_df['text'] + '\n'

title_tr = train_df['title'] + '\n'
text_tr = train_df['text'] + '\n'

In [None]:
# Dump each column to a plain-text file, one example per line (the values
# already end with '\n'). The encoding is set explicitly to UTF-8 because
# preprocess() reads these files back as UTF-8; relying on the platform
# default could corrupt non-ASCII characters.
for out_name, column in (
    ("val_text.txt", text_val),
    ("val_target.txt", title_val),
    ("tr_target.txt", title_tr),
    ("tr_text.txt", text_tr),
):
    with open(out_name, 'w', encoding='utf-8') as f:
        f.writelines(column.values.tolist())

# Preprocessing the data
For the current dataset, I additionally truncate the source to at most 399 tokens and the target to at most 14 (`preprocess` keeps `max_len - 1` tokens, with `max_len` set to 400 and 15 respectively).

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
import tqdm
from transformers import BertTokenizer

def preprocess(fin, fout, keep_sep=False, max_len=512):
    """Convert a one-example-per-line text file into BERT word-piece tokens.

    Each input line is stripped, the quote marks `` and '' are mapped to a
    double quote and ` to a single quote, and the line is split into
    sentences on ``<S_SEP>``. Every sentence is detokenized with the Treebank
    detokenizer, re-tokenized with the ``bert-base-uncased`` tokenizer, and
    its pieces are joined with single spaces. The result is cut to
    ``max_len - 1`` space-separated tokens and written as one line of ``fout``.

    Args:
        fin: Path of the input file, read as UTF-8.
        fout: Path of the output file, written as UTF-8 (overwritten).
        keep_sep: If true, sentences are joined with ``" [X_SEP] "``;
            otherwise with a single space.
        max_len: Length budget; at most ``max_len - 1`` tokens are kept.
    """
    twd = TreebankWordDetokenizer()
    bpe = BertTokenizer.from_pretrained('bert-base-uncased')
    separator = " [X_SEP] " if keep_sep else " "
    # Context managers guarantee both files are flushed and closed before any
    # later step reads the output, even if tokenization raises midway.
    with open(fin, 'r', encoding='utf-8') as src, \
            open(fout, 'w', encoding='utf-8') as dst:
        # readlines() gives tqdm a known total for the progress bar.
        for line in tqdm.tqdm(src.readlines()):
            line = line.strip().replace('``', '"').replace('\'\'', '"').replace('`', '\'')
            sentences = [
                twd.detokenize(x.strip().split(' '), convert_parentheses=True)
                for x in line.split('<S_SEP>')
            ]
            pieces = [" ".join(bpe.tokenize(s)) for s in sentences]
            output_string = separator.join(pieces)
            # Truncate on space-separated tokens, keeping max_len - 1 of them.
            output_string = " ".join(output_string.split(' ')[:max_len - 1])
            dst.write('{}\n'.format(output_string))

In [None]:
# -p keeps the cell re-runnable: no error if the directory already exists.
!mkdir -p preprocessed_data

In [None]:
max_art_length = 400
max_title_length = 15

# (raw file, tokenized file, keep sentence separators?, length budget)
preprocess_jobs = [
    ('val_text.txt', 'preprocessed_data/valid.src', False, max_art_length),
    ('val_target.txt', 'preprocessed_data/valid.tgt', True, max_title_length),
    ('tr_text.txt', 'preprocessed_data/train.src', False, max_art_length),
    ('tr_target.txt', 'preprocessed_data/train.tgt', True, max_title_length),
]
for raw_path, out_path, with_sep, length_cap in preprocess_jobs:
    preprocess(raw_path, out_path, keep_sep=with_sep, max_len=length_cap)

In [None]:
# Run fairseq-preprocess on the tokenized train/valid files (src -> tgt),
# using the vocabulary shipped with ProphetNet for both sides; the results
# are written to dest_data/processed.
!fairseq-preprocess \
--user-dir ProphetNet/src/prophetnet/ \
--task translation_prophetnet \
--source-lang src --target-lang tgt \
--trainpref preprocessed_data/train \
--validpref preprocessed_data/valid \
--destdir dest_data/processed \
--srcdict ProphetNet/src/vocab.txt  \
--tgtdict ProphetNet/src/vocab.txt \
--bpe bert \
--workers 20

[ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/pdf/2001.04063.pdf)

I fine-tuned `prophetnet_large` for 10 epochs with a learning rate of 0.0001.
The best model was obtained at epoch 5, with a validation perplexity of 11.11.

```
DATA_DIR='dest_data/processed'
USER_DIR='ProphetNet/src/prophetnet/'
ARCH='ngram_transformer_prophet_large'
CRITERION='ngram_language_loss'
SAVE_DIR='finetune/cryptonews'
PRETRAINED_MODEL='prophetnet_large_pretrained_160G_14epoch_model.pt'

!fairseq-train \
--user-dir $USER_DIR --task translation_prophetnet --arch $ARCH \
--optimizer adam --adam-betas '(0.9, 0.999)' --clip-norm 1 \
--lr 0.0001 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 1000 \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion $CRITERION  \
--update-freq 32  --max-sentences 10 \
--num-workers 4 \
--bpe bert \
--load-from-pretrained-model $PRETRAINED_MODEL \
--load-sep \
--ddp-backend=no_c10d --max-epoch 10 \
--max-source-positions 402 --max-target-positions 17 \
--seed 1 \
--save-dir $SAVE_DIR \
--keep-last-epochs 1 \
$DATA_DIR
```

**Generating titles for the validation set**



In [None]:
# Decoding settings and file locations used by the generation command below.
BEAM = 5
LENPEN = 1.2
CHECK_POINT = '../input/prophetnetlarge-finetuned/checkpoint_best.pt'
TEMP_FILE = 'predict_outputs.txt'
OUTPUT_FILE = 'sorted_outputs.txt'

# Decode the validation subset with the fine-tuned checkpoint; the command's
# standard output is redirected to TEMP_FILE.
!fairseq-generate dest_data/processed \
--path $CHECK_POINT \
--user-dir ProphetNet/src/prophetnet \
--task translation_prophetnet \
--batch-size 32 --gen-subset valid \
--beam $BEAM --max-len-a 0 --max-len-b 15 --min-len 6 \
--num-workers 4 --no-repeat-ngram-size 3 --lenpen $LENPEN \
2>&1 > $TEMP_FILE

In [None]:
# Keep the lines starting with "H", drop their first two characters, sort
# numerically on what remains, keep the tab-separated fields from the third
# onward, and remove " ##" to glue word pieces back together.
# NOTE(review): presumably the H lines are "H-<id>\t<score>\t<text>" as
# emitted by fairseq-generate, so sorting restores dataset order — confirm.
!grep ^H $TEMP_FILE | cut -c 3- | sort -n | cut -f3- | sed "s/ ##//g" > $OUTPUT_FILE

In [None]:
# Read the generated titles back, one per line. OUTPUT_FILE is the same
# constant the shell pipeline wrote to, so renaming the file in one place
# cannot leave this cell reading a stale copy.
with open(OUTPUT_FILE, 'r') as f:
    predicted = f.readlines()


In [None]:
# Normalize the reference titles: surround every punctuation character with
# spaces, lowercase, and collapse double spaces in a single pass.
# NOTE(review): presumably this mirrors the spacing of the generated titles
# so both sides are compared in the same form — confirm.
punctuation = string.punctuation
pad_punctuation = str.maketrans({mark: f' {mark} ' for mark in punctuation})
true_val_titles = valid_df['title'].tolist()
true_titles = [
    title.translate(pad_punctuation).lower().replace('  ', ' ')
    for title in true_val_titles
]

In [None]:
# Strip the line breaks left by readlines() and lowercase each generated title.
predicted = [line.replace('\n', '').lower() for line in predicted]

In [None]:
%%time
from rouge import Rouge

# With ignore_empty=True the library pairs hypotheses and references with
# zip(), so a length mismatch would be truncated silently and every score
# after the first missing line would compare the wrong pair. Fail loudly.
assert len(predicted) == len(true_titles), (
    f'{len(predicted)} generated titles vs {len(true_titles)} reference titles'
)

# Corpus-level scores averaged over all non-empty hypothesis/reference pairs.
rouge = Rouge()
scores = rouge.get_scores(hyps=predicted, refs=true_titles, avg=True, ignore_empty=True)

In [None]:
# Display the averaged ROUGE scores held in `scores`.
scores

In [None]:
# Single summary number: the mean of the 'f' scores of ROUGE-1, ROUGE-2 and ROUGE-L.
f_scores = [scores[name]['f'] for name in ('rouge-1', 'rouge-2', 'rouge-l')]
final_metric = sum(f_scores) / 3
final_metric

# Eyeballing the results: good and bad cases

In [None]:
# Per-example metric (mean of the ROUGE-1/2/L 'f' scores), kept aligned with
# `predicted` and `valid_df`: position i always describes example i.
# Passing ignore_empty=True here would drop empty pairs from the result and
# shift every later index, so empty pairs are scored 0.0 explicitly instead.
scores_by_example = np.zeros(len(predicted))
scored_idx = [
    i for i, (hyp, ref) in enumerate(zip(predicted, true_titles))
    if len(hyp) > 0 and len(ref) > 0
]
if scored_idx:
    pair_scores = rouge.get_scores(
        hyps=[predicted[i] for i in scored_idx],
        refs=[true_titles[i] for i in scored_idx],
        avg=False,
    )
    scores_by_example[scored_idx] = [
        (x['rouge-1']['f'] + x['rouge-2']['f'] + x['rouge-l']['f']) / 3
        for x in pair_scores
    ]

In [None]:
def print_result(index):
    """Render one validation example as four Markdown block quotes.

    Shows, in order: the example's entry in ``scores_by_example`` rounded to
    three decimals, the reference title, the generated title and the article
    text.

    Args:
        index: Position of the example in ``valid_df``, ``predicted`` and
            ``scores_by_example``.
    """
    rouge_text = str(round(scores_by_example[index], 3))
    display(Markdown('> **Rouge:** ' + rouge_text))

    reference_title = valid_df['title'].iloc[index]
    display(Markdown('> **Title:** ' + reference_title))

    generated_title = predicted[index]
    display(Markdown('> **Generated:** ' + generated_title))

    article_text = valid_df['text'].iloc[index]
    display(Markdown('> **Text:** ' + article_text))

In [None]:
# Sort example indices by ascending score once, then take both ends.
ranking = scores_by_example.argsort()
top_best_10, top_worst_10 = ranking[-10:], ranking[:10]

In [None]:
# Ten highest-scoring examples, shown in ascending order of score.
for example_idx in top_best_10:
    print_result(example_idx)

In [None]:
# Ten lowest-scoring examples, starting with the lowest score.
for example_idx in top_worst_10:
    print_result(example_idx)