# Libraries

In [95]:
import pandas as pd
import numpy as np
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pickle
from sklearn.manifold import TSNE
import mplcursors
%matplotlib widget
from sentence_transformers import SentenceTransformer
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package punkt to /home/a2211506/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load the dataset

If you don't have a dataset, please run following commands.
- `pip install gdown`
- `gdown https://drive.google.com/uc?id=19rEco5OT4Um-0DNzna27UfZ1wjSG5oKC`  
(`wget` and `curl` does not correspond to `Google Drive`. You can get another format of datasets in [ACL Anthology Corpus - Full Text](https://github.com/shauryr/ACL-anthology-corpus).)

In [96]:
acl_full_text = pd.read_parquet('acl_corpus_full-text.parquet')

In [97]:
acl_full_text

Unnamed: 0,acl_id,title,abstract,full_text
0,O02-2002,A Study on Word Similarity using Context Vecto...,There is a need to measure word similarity whe...,There is a need to measure word similarity whe...
1,L02-1310,,,
2,R13-1042,"Headerless, Quoteless, but not Hopeless? Using...",Thread disentanglement is the task of separati...,Thread disentanglement is the task of separati...
3,W05-0819,Aligning words in English-Hindi parallel corpora,"In this paper, we describe a word alignment al...","In this paper, we describe a word alignment al..."
4,L02-1309,,,
...,...,...,...,...
80008,P99-1056,The grapho-phonological system of written Fren...,The processes through which readers evoke ment...,The processes through which readers evoke ment...
80009,P99-1051,Acquiring Lexical Generalizations from Corpora...,This paper examines the extent to which verb d...,This paper examines the extent to which verb d...
80010,P99-1000,Using Aggregation for Selecting Content when G...,"As co-chairs, we had two particular objectives...","As co-chairs, we had two particular objectives..."
80011,P99-1,,,


In [98]:
acl_full_text

Unnamed: 0,acl_id,title,abstract,full_text
0,O02-2002,A Study on Word Similarity using Context Vecto...,There is a need to measure word similarity whe...,There is a need to measure word similarity whe...
1,L02-1310,,,
2,R13-1042,"Headerless, Quoteless, but not Hopeless? Using...",Thread disentanglement is the task of separati...,Thread disentanglement is the task of separati...
3,W05-0819,Aligning words in English-Hindi parallel corpora,"In this paper, we describe a word alignment al...","In this paper, we describe a word alignment al..."
4,L02-1309,,,
...,...,...,...,...
80008,P99-1056,The grapho-phonological system of written Fren...,The processes through which readers evoke ment...,The processes through which readers evoke ment...
80009,P99-1051,Acquiring Lexical Generalizations from Corpora...,This paper examines the extent to which verb d...,This paper examines the extent to which verb d...
80010,P99-1000,Using Aggregation for Selecting Content when G...,"As co-chairs, we had two particular objectives...","As co-chairs, we had two particular objectives..."
80011,P99-1,,,


# Comparison between 19's to 20's

In [99]:
acl_full_text
acl_full_text.insert(1, 'year', 0)
acl_full_text.insert(2, 'era_id', 0)

In [100]:
for index, row in acl_full_text.iterrows():
    if(int(row[0][1]) == 0 or int(row[0][1]) == 1 or int(row[0][1]) == 2):
        year = int(f'20{row[0][1:3]}')
        era_id = 20
    else:
        year = int(f'19{row[0][1:3]}')
        era_id = 19
    acl_full_text.iat[index, 1] = year
    acl_full_text.iat[index, 2] = era_id

In [101]:
origin_data = acl_full_text[acl_full_text['acl_id'].str.startswith('P')] # Extract ACL papers
origin_data

Unnamed: 0,acl_id,year,era_id,title,abstract,full_text
55,P07-1084,2007,20,"Bilingual Terminology Mining -Using Brain, not...",Current research in text mining favours the qu...,Current research in text mining favours the qu...
75,P12-3005,2012,20,langid.py: An Off-the-shelf Language Identific...,"We present langid.py, an off-the-shelf languag...","We present langid.py, an off-the-shelf languag..."
76,P12-3016,2012,20,ACCURAT Toolkit for Multi-Level Alignment and ...,The lack of parallel corpora and linguistic re...,The lack of parallel corpora and linguistic re...
78,P13-1068,2013,20,Large tagset labeling using Feed Forward Neura...,Standard methods for part-of-speech tagging su...,Standard methods for part-of-speech tagging su...
80,P19-1161,2019,20,Counterfactual Data Augmentation for Mitigatin...,Gender stereotypes are manifest in most of the...,Gender stereotypes are manifest in most of the...
...,...,...,...,...,...,...
80008,P99-1056,1999,19,The grapho-phonological system of written Fren...,The processes through which readers evoke ment...,The processes through which readers evoke ment...
80009,P99-1051,1999,19,Acquiring Lexical Generalizations from Corpora...,This paper examines the extent to which verb d...,This paper examines the extent to which verb d...
80010,P99-1000,1999,19,Using Aggregation for Selecting Content when G...,"As co-chairs, we had two particular objectives...","As co-chairs, we had two particular objectives..."
80011,P99-1,1999,19,,,


In [102]:
docs = origin_data.iloc[:]["abstract"]

In [103]:
model = SentenceTransformer('all-MiniLM-L6-v2') 
sentences = list(docs)
#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

In [104]:
data = [[(i)] + list(embeddings[i]) for i in range(len(docs))]
column_names = ["type"]+["x_"+str(dim) for dim in range(len(embeddings[0]))]
out = pd.DataFrame(data, columns=column_names)
# out = out.sort_values(["type"])
out.to_csv("acl_full-text_128dim_feat.csv", index=None)
out.insert(1, 'year', origin_data['year'].values)
out.insert(2, 'title', origin_data['title'].values)
out.insert(3, 'era_id', origin_data['era_id'].values)
out

Unnamed: 0,type,year,title,era_id,x_0,x_1,x_2,x_3,x_4,x_5,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
0,0,2007,"Bilingual Terminology Mining -Using Brain, not...",20,0.018361,-0.069289,-0.045787,-0.043936,-0.018778,-0.021305,...,0.034927,0.087479,0.006538,0.018238,0.010428,0.004054,0.034271,0.006479,0.102239,-0.011539
1,1,2012,langid.py: An Off-the-shelf Language Identific...,20,-0.105661,-0.071641,0.016773,0.029877,0.055789,0.004766,...,0.031761,-0.009294,-0.067167,-0.002346,0.067897,0.026724,-0.035206,0.074351,0.032862,0.036407
2,2,2012,ACCURAT Toolkit for Multi-Level Alignment and ...,20,-0.035305,0.002230,-0.027929,-0.065353,-0.033240,-0.010883,...,-0.018830,0.054944,-0.051018,-0.008120,0.091906,0.000924,-0.009921,-0.073989,0.058452,-0.002942
3,3,2013,Large tagset labeling using Feed Forward Neura...,20,-0.053140,-0.046510,0.060817,0.003574,-0.023673,-0.035749,...,-0.013833,0.046963,-0.008109,-0.038660,0.006526,0.051414,0.055248,0.082104,0.011507,0.023412
4,4,2019,Counterfactual Data Augmentation for Mitigatin...,20,0.045053,0.051824,0.097364,0.006804,-0.012303,-0.034496,...,0.017489,-0.026561,-0.073428,0.044345,-0.005372,0.094221,0.085115,0.072723,0.025918,-0.102300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6843,6843,1999,The grapho-phonological system of written Fren...,19,0.008246,-0.060258,0.026317,-0.046699,-0.074417,0.036662,...,0.050305,-0.023778,-0.001076,0.043697,-0.003842,0.000818,0.008940,0.064710,-0.008511,-0.080687
6844,6844,1999,Acquiring Lexical Generalizations from Corpora...,19,0.037969,-0.074707,0.098113,0.012809,-0.000898,-0.016804,...,0.036504,0.056846,-0.042679,-0.013597,0.019778,0.020553,0.036398,0.002618,0.045448,-0.045846
6845,6845,1999,Using Aggregation for Selecting Content when G...,19,-0.050357,0.010708,-0.037823,0.015405,0.047308,0.030114,...,0.072255,0.113188,-0.046559,-0.028938,-0.014051,0.018255,0.047413,0.031819,-0.069908,0.041277
6846,6846,1999,,19,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717


# Decennial Comparison

In [105]:
df1 = out[(out['year'] == 2000)].sample(n=80)
# df2 = out[(out['year'] == 2005)].sample(n=80)
df3 = out[(out['year'] == 2010)].sample(n=80)
# df4 = out[(out['year'] == 2010)].sample(n=80)
df5 = out[(out['year'] == 2019)].sample(n=80)
decennial_out = pd.concat([df1, df3, df5], axis=0) # df2, df3, df4,
decennial_out

Unnamed: 0,type,year,title,era_id,x_0,x_1,x_2,x_3,x_4,x_5,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
6562,6562,2000,"Generic NLP Technologies: Language, Knowledge ...",20,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
2114,2114,2000,A Constraint-based Approach to English Prosodi...,20,0.045043,-0.033382,0.065596,-0.114945,-0.086161,0.050376,...,-0.023803,0.051378,-0.004509,0.031187,-0.006987,-0.014140,0.091878,0.087572,0.081552,-0.085340
2430,2430,2000,a cb ed gf ih qp sr gt gd u )v 6w yx x 6x ¦ 6 ...,20,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
2229,2229,2000,"¢¡ ¤£ ¥¡ §¦ © ¦ ¦ © ! ""£ "" $# % ""£ ¥& ' ¦ £ ¥ ...",20,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
4972,4972,2000,Query-Relevant Summarization using FAQs,20,-0.006374,0.042739,-0.037649,0.034485,0.042604,0.085942,...,0.020160,-0.034567,0.063022,-0.037991,0.068156,-0.003041,0.012255,0.062987,0.007088,0.072261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,1825,2019,Revisiting Joint Modeling of Cross-document En...,20,-0.028619,-0.049142,0.063358,-0.024484,0.099629,0.077400,...,0.036631,0.067196,0.011151,-0.050000,0.065749,0.036370,-0.007256,0.017618,0.010934,0.131057
1572,1572,2019,Putting Evaluation in Context: Contextual Embe...,20,-0.006061,-0.088416,-0.034406,0.031932,0.081604,-0.005672,...,0.062917,0.045372,0.006774,-0.038584,0.077082,0.062210,-0.021635,0.015855,-0.007763,-0.043921
4302,4302,2019,Neural Text Simplification of Clinical Letters...,20,-0.026094,-0.014658,0.023012,-0.018591,0.011769,0.019559,...,0.051751,0.084831,0.053418,0.055694,-0.038669,0.068791,0.016495,0.037822,-0.009372,0.023822
4155,4155,2019,Are we there yet? Encoder-decoder neural netwo...,20,-0.046121,-0.090919,0.041789,-0.020282,0.006140,0.054499,...,0.091933,0.017866,0.021257,-0.024974,-0.019788,0.026923,0.025894,0.118947,-0.054728,-0.007261


In [106]:
title_list = out['title'].values
X = decennial_out.iloc[:, 4:]
pca = PCA(n_components=2)
pca.fit(X)
x_pcx = pca.transform(X)
x_pcx.shape

(240, 2)

In [107]:
pcx_2000_2010_2019 =  pd.DataFrame(data=x_pcx, columns=['x1', 'x2'])
pcx_2000_2010_2019.shape

(240, 2)

In [108]:
pcx_2000_2010_2019.insert(0, 'title', decennial_out['title'].values)
pcx_2000_2010_2019.insert(1, 'lab_or_not', 'notlab')
pcx_2000_2010_2019.insert(2, 'award', 'normal')
pcx_2000_2010_2019.insert(3, 'year', decennial_out['year'].values)
pcx_2000_2010_2019

Unnamed: 0,title,lab_or_not,award,year,x1,x2
0,"Generic NLP Technologies: Language, Knowledge ...",notlab,normal,2000,0.900063,-0.025013
1,A Constraint-based Approach to English Prosodi...,notlab,normal,2000,-0.112024,0.404659
2,a cb ed gf ih qp sr gt gd u )v 6w yx x 6x ¦ 6 ...,notlab,normal,2000,0.900063,-0.025013
3,"¢¡ ¤£ ¥¡ §¦ © ¦ ¦ © ! ""£ "" $# % ""£ ¥& ' ¦ £ ¥ ...",notlab,normal,2000,0.900063,-0.025013
4,Query-Relevant Summarization using FAQs,notlab,normal,2000,-0.177430,-0.241778
...,...,...,...,...,...,...
235,Revisiting Joint Modeling of Cross-document En...,notlab,normal,2019,-0.293890,-0.263055
236,Putting Evaluation in Context: Contextual Embe...,notlab,normal,2019,-0.279168,-0.107920
237,Neural Text Simplification of Clinical Letters...,notlab,normal,2019,-0.208981,-0.066609
238,Are we there yet? Encoder-decoder neural netwo...,notlab,normal,2019,-0.197103,-0.027206


In [109]:
import plotly.express as px

pcx_2000_2010_2019["year"] = pcx_2000_2010_2019["year"].astype(str) # change a legend color to Discrete form Continuous Color
fig = px.scatter(pcx_2000_2010_2019, x="x1", y="x2", color='year', hover_data=[pcx_2000_2010_2019['title'].values])
fig.show()

fig.update_layout(title=f"Comparison of 2000, 2015 and 2019 ACL papers(SBERT)")
fig.write_html(f"Comparison_2000_2010_2015_2000(SBERT).html")

# Best Pape 2019

In [110]:
out_2019 = out[(out['year'] == 2019)]
out_2019

Unnamed: 0,type,year,title,era_id,x_0,x_1,x_2,x_3,x_4,x_5,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
4,4,2019,Counterfactual Data Augmentation for Mitigatin...,20,0.045053,0.051824,0.097364,0.006804,-0.012303,-0.034496,...,0.017489,-0.026561,-0.073428,0.044345,-0.005372,0.094221,0.085115,0.072723,0.025918,-0.102300
7,7,2019,Unsupervised Discovery of Gendered Language th...,20,0.055325,-0.026664,0.026544,0.035051,-0.025747,0.028869,...,0.042687,-0.009107,-0.044295,0.062166,0.015808,0.011218,0.014497,0.052990,0.027738,-0.078972
197,197,2019,Margin-based Parallel Corpus Mining with Multi...,20,-0.043548,-0.073220,-0.017195,-0.006280,0.074463,0.062449,...,0.020356,0.067960,-0.048375,-0.060403,0.074845,0.020359,0.031584,-0.009068,-0.054869,0.029275
214,214,2019,Searching for Effective Neural Extractive Summ...,20,-0.087911,-0.016023,0.083686,0.014348,0.102073,0.052615,...,-0.022277,0.001834,0.037592,-0.062821,0.059469,0.020610,-0.007068,0.078916,-0.054624,-0.012028
216,216,2019,Style Transformer: Unpaired Text Style Transfe...,20,-0.090578,-0.026025,0.070133,0.055996,0.007759,0.059559,...,0.018578,0.012675,0.015493,0.084550,0.030756,0.020959,-0.017649,0.082438,0.026562,-0.056312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6806,6806,2019,Gender Stereotypes Differ between Male and Fem...,20,0.046365,0.002689,0.039982,0.063683,-0.007623,-0.039271,...,0.016852,-0.021938,-0.065961,0.065509,0.014462,0.008181,0.118501,0.013749,0.004447,-0.097342
6809,6809,2019,Natural Language Generation: Recently Learned ...,20,-0.034928,-0.045406,0.002385,0.063168,-0.065263,0.067551,...,0.077954,0.003297,0.036508,0.023411,0.079902,-0.005770,0.033541,0.121778,0.062387,0.002115
6827,6827,2019,Towards Turkish Abstract Meaning Representation,20,0.035353,0.065390,0.082225,0.001338,-0.001349,0.011120,...,0.123215,0.043724,0.031627,0.040976,0.042600,-0.017804,0.082463,0.089982,0.016809,-0.049283
6830,6830,2019,Predicting the Outcome of Deliberative Democra...,20,-0.004017,-0.119011,0.006992,0.003682,0.042108,-0.002981,...,0.057444,0.069133,-0.005857,0.003294,0.040088,0.074956,0.047436,0.075340,-0.021177,-0.023391


In [111]:
title_list = out_2019['title'].values
X = out_2019.iloc[:, 4:]
pca = PCA(n_components=2)
pca.fit(X)
x_pcx = pca.transform(X)
x_pcx.shape

(771, 2)

In [112]:
pcx_2019 =  pd.DataFrame(data=x_pcx, columns=['x1', 'x2'])

In [113]:
pcx_2019.insert(0, 'title', out[out['year'] == 2019]['title'].values)
pcx_2019.insert(1, 'award', 'normal')
pcx_2019.insert(2, 'lab_or_not', 'notlab')

In [114]:
pcx_2019

Unnamed: 0,title,award,lab_or_not,x1,x2
0,Counterfactual Data Augmentation for Mitigatin...,normal,notlab,0.139320,-0.332224
1,Unsupervised Discovery of Gendered Language th...,normal,notlab,0.259765,-0.252255
2,Margin-based Parallel Corpus Mining with Multi...,normal,notlab,-0.200356,-0.276796
3,Searching for Effective Neural Extractive Summ...,normal,notlab,-0.053926,0.118147
4,Style Transformer: Unpaired Text Style Transfe...,normal,notlab,-0.157264,0.012630
...,...,...,...,...,...
766,Gender Stereotypes Differ between Male and Fem...,normal,notlab,0.330663,-0.273882
767,Natural Language Generation: Recently Learned ...,normal,notlab,0.100569,0.037523
768,Towards Turkish Abstract Meaning Representation,normal,notlab,0.063818,-0.124647
769,Predicting the Outcome of Deliberative Democra...,normal,notlab,0.268622,0.004009


In [115]:
best_paper_awards = ['OpenKiwi: An Open Source Framework for Quality Estimation',
'Emotion-Cause Pair Extraction: A New Task to Emotion Analysis in Texts.',
'A Simple Theoretical Model of Importance for Summarization',
'Transferable Multi-Domain State Generator for Task-Oriented Dialogue Systems',
'We need to talk about standard splits',
'Zero-shot Word Sense Disambiguation using Sense Definition Embeddings',
'Do you know that Florence is packed with visitors? Evaluating state-of-the-art models of speaker commitment.',
'Bridging the Gap between Training and Inference for Neural Machine Translation.',
]

In [116]:
for index, row in pcx_2019.iterrows():
    if (row['title'] in best_paper_awards):
        pcx_2019.iat[index, 1] = 'award'

pcx_2019[pcx_2019['award'] == 'award']

Unnamed: 0,title,award,lab_or_not,x1,x2
169,OpenKiwi: An Open Source Framework for Quality...,award,notlab,-0.157024,-0.108724
223,Zero-shot Word Sense Disambiguation using Sens...,award,notlab,-0.074407,-0.123164
375,Transferable Multi-Domain State Generator for ...,award,notlab,-0.043104,0.301466
543,A Simple Theoretical Model of Importance for S...,award,notlab,0.117568,0.083693
581,We need to talk about standard splits,award,notlab,0.124279,-0.065231


In [117]:
import plotly.express as px
fig = px.scatter(pcx_2019, x="x1", y="x2", color="award", hover_data=[pcx_2019['title'].values])
fig.show()
fig.update_layout(title=f"Best Paper in 2019")
fig.write_html(f"BestPaper_2019(SBERT).html")

# 乾研究室の論文を可視化

In [118]:
award_inui_lab = ['An Empirical Study of Span Representations in Argumentation Structure Parsing']

In [119]:
for index, row in pcx_2019.iterrows():
    if (row['title'] in award_inui_lab):
        pcx_2019.iat[index, 2] = 'inuilab'

pcx_2019[pcx_2019['lab_or_not'] == 'inuilab']

Unnamed: 0,title,award,lab_or_not,x1,x2
55,An Empirical Study of Span Representations in ...,normal,inuilab,0.069995,0.010502


In [120]:
import plotly.express as px
fig = px.scatter(pcx_2019, x="x1", y="x2", color="lab_or_not", hover_data=[pcx_2019['title'].values])
fig.show()
fig.update_layout(title=f"LabPaper(2019)")
fig.write_html(f"LabPaper(2019).html")