In [16]:
import openreview
import pandas as pd
import numpy as np
import datetime as date
import os
import tqdm
import ast
import json
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
import string
import math
from sentence_transformers import SentenceTransformer, util
import torch

tqdm.tqdm.pandas()

In [8]:
df_rr_pairs_all = pd.read_csv('../data/model_inputs/df_sbert_rr_pairs_all.csv')

df_rr_pairs_all=df_rr_pairs_all.drop(['Unnamed: 0'], axis = 1)

In [9]:
df_rr_pairs_all.columns

Index(['year_x', 'type_x', 'invitation_x', 'readers_x', 'writers_x',
       'signatures_x', 'title_x', 'comment_x', 'id_x', 'original_x',
       'number_x', 'cdate_x', 'tcdate_x', 'tmdate_x', 'ddate_x', 'forum_x',
       'referent_x', 'replyto_x', 'nonreaders_x', 'details_x', 'rating_x',
       'review_x', 'confidence_x', 'year_y', 'type_y', 'invitation_y',
       'readers_y', 'writers_y', 'signatures_y', 'title_y', 'comment_y',
       'id_y', 'original_y', 'number_y', 'cdate_y', 'tcdate_y', 'tmdate_y',
       'ddate_y', 'forum_y', 'referent_y', 'replyto_y', 'nonreaders_y',
       'details_y', 'rating_y', 'review_y', 'confidence_y'],
      dtype='object')

In [10]:
def mySentTokenize(texts):
    # strip away splitters but sometimes formulas (more: +)
    sents = [sent.strip(" \t*#=-_<>") for parag in texts.split('\n') for sent in sent_tokenize(parag)]
    sents = [sent for sent in sents if sent]
    
    # fix i.e. e.g. and et al.
    cleaned_sents = []
    for sent in sents:
        if cleaned_sents:
            last_lc = cleaned_sents[-1].lower()
            if last_lc.endswith('i.e.') or last_lc.endswith('ie.') or \
              last_lc.endswith('e.g.') or last_lc.endswith('eg.') or \
              (last_lc.endswith('et al.') and sent[0].islower()):
                cleaned_sents[-1] += ' ' + sent
                continue
        cleaned_sents.append(sent)

    # return [sent for sent in sents if len(word_tokenize(sent)) > 9]
    # # eliminate single indicies but also 'Thanks!', 'Pros', 'Cons' and other section titles
    # sents = [sent for sent in cleaned_sents if len([word for word in word_tokenize(sent) if word not in string.punctuation]) > 1]
    sents = [sent for sent in cleaned_sents if len([word for word in word_tokenize(sent) if word not in string.punctuation]) > 9]
    return [sent for sent in sents if not (sent[-1] == ':' and len(word_tokenize(sent)) <= 10)]


In [17]:
df_rr_pairs_all['review_x_sent_tokens'] = df_rr_pairs_all['review_x'].progress_apply(lambda x: mySentTokenize(str(x)))
df_rr_pairs_all['comment_y_sent_tokens'] = df_rr_pairs_all['comment_y'].progress_apply(lambda x: mySentTokenize(str(x)))

100%|████████████████████████████████████| 10435/10435 [00:32<00:00, 319.44it/s]
100%|████████████████████████████████████| 10435/10435 [00:24<00:00, 431.56it/s]


In [28]:
df_rr_pairs_all[['forum_x','id_x','id_y','comment_y_sent_tokens']]

Unnamed: 0,forum_x,id_x,id_y,comment_y_sent_tokens
0,r1Ue8Hcxg,rkeMJc-Ee,,[]
1,HJlgm-B9lx,SyHgRujVl,,[]
2,rJq_YBqxx,rJJHggGNx,BkSQh9u4e,[We have added an appendix that described the ...
3,rJq_YBqxx,rJJHggGNx,By45ECGEl,[We think our system is less complicated compa...
4,r1S083cgx,rJpkoyG4g,,[]
...,...,...,...,...
10430,HJePno0cYm,SJgjpKAko7,Hylxe2VcA7,"[As shown in our paper, Transformer is the sta..."
10431,H1g2NhC5KQ,H1lXb9okiX,rylhcb16TX,"[To make the architecture clearer, we updated ..."
10432,B1x9siCcYQ,r11pKi1i7,HJl2qAoFCQ,[We just would like to highlight a couple of t...
10433,HyGh4sR9YQ,HJg26wuJsX,HkeZ_0Dq07,[We are glad that you identify our paper as a ...


In [32]:
out = []
model = SentenceTransformer('all-MiniLM-L6-v2')
for i, (rebuttal, review, forum) in tqdm.tqdm(df_rr_pairs_all[['comment_y_sent_tokens','review_x_sent_tokens','forum_x']].iterrows()):
    input_row = {}
    if rebuttal == []:
        input_row['forum'] = forum
        input_row['sim_score'] = 0
        input_row['consine_scores'] = np.NaN
        out.append(input_row)
    else:
        #Compute embedding for both lists
        embeddings1 = model.encode(review, convert_to_tensor=True)
        embeddings2 = model.encode(rebuttal, convert_to_tensor=True)

        #Compute cosine-similarits
        #todo find how to save all the embeddings with index of the sentences i guess
        cosine_scores = util.cos_sim(embeddings1, embeddings2)        
        max_cos_scores, y = torch.max(cosine_scores,dim=1)
        input_row['consine_scores'] = cosine_scores
        input_row['sim_score'] = torch.mean(max_cos_scores).item()
        input_row['forum'] = forum
        input_row['review'] = review
        input_row['rebuttal'] = rebuttal
        out.append(input_row)

#print(out)
df_cos_sim = pd.DataFrame(out)
df_cos_sim.to_csv('../data/model_output/df_cos_sim.csv',index=False)

10435it [2:38:04,  1.10it/s]


OSError: Cannot save file into a non-existent directory: '../data/model_outputs'

In [36]:
df_cos_sim.to_csv('../data/model_output/df_cos_sim.csv', index=False)