# ThesisVisualizationTools sample code 
## Requirements
- GPU
    - If you want to run it on Colab, please change the runtime type to GPU in the runtime settings.



# Libraries

In [1]:
!pip install -q sentence_transformers

[K     |████████████████████████████████| 85 kB 5.5 MB/s 
[K     |████████████████████████████████| 5.5 MB 48.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 52.1 MB/s 
[K     |████████████████████████████████| 163 kB 65.6 MB/s 
[K     |████████████████████████████████| 7.6 MB 41.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pickle
from sentence_transformers import SentenceTransformer
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler

# Load the dataset

If you don't have the dataset yet, please run the following commands.
- `pip install gdown`
- `gdown https://drive.google.com/uc?id=19rEco5OT4Um-0DNzna27UfZ1wjSG5oKC`  
(`wget` and `curl` do not work with Google Drive links, which is why `gdown` is used. The dataset is also available in other formats at [ACL Anthology Corpus - Full Text](https://github.com/shauryr/ACL-anthology-corpus).)

In [3]:
!pip install -q  gdown 

In [4]:
!gdown https://drive.google.com/uc?id=19rEco5OT4Um-0DNzna27UfZ1wjSG5oKC

Downloading...
From: https://drive.google.com/uc?id=19rEco5OT4Um-0DNzna27UfZ1wjSG5oKC
To: /content/acl_corpus_full-text.parquet
100% 527M/527M [00:13<00:00, 38.0MB/s]


In [5]:
# Read the downloaded corpus (one row per paper) from the working directory.
acl_full_text = pd.read_parquet(path="acl_corpus_full-text.parquet")

In [6]:
# Preview the loaded corpus; the columns shown are acl_id, title, abstract and full_text.
acl_full_text

Unnamed: 0,acl_id,title,abstract,full_text
0,O02-2002,A Study on Word Similarity using Context Vecto...,There is a need to measure word similarity whe...,There is a need to measure word similarity whe...
1,L02-1310,,,
2,R13-1042,"Headerless, Quoteless, but not Hopeless? Using...",Thread disentanglement is the task of separati...,Thread disentanglement is the task of separati...
3,W05-0819,Aligning words in English-Hindi parallel corpora,"In this paper, we describe a word alignment al...","In this paper, we describe a word alignment al..."
4,L02-1309,,,
...,...,...,...,...
80008,P99-1056,The grapho-phonological system of written Fren...,The processes through which readers evoke ment...,The processes through which readers evoke ment...
80009,P99-1051,Acquiring Lexical Generalizations from Corpora...,This paper examines the extent to which verb d...,This paper examines the extent to which verb d...
80010,P99-1000,Using Aggregation for Selecting Content when G...,"As co-chairs, we had two particular objectives...","As co-chairs, we had two particular objectives..."
80011,P99-1,,,


# Preprocessing

In [7]:
# Reserve the `year` and `era_id` columns right after `acl_id`, initialised to 0.
# The membership check keeps the cell re-runnable: `DataFrame.insert` raises
# ValueError when the column already exists.
for position, column in ((1, 'year'), (2, 'era_id')):
    if column not in acl_full_text.columns:
        acl_full_text.insert(position, column, 0)

In [8]:
def _year_from_acl_id(acl_id):
    """Return the four-digit publication year encoded in an ACL Anthology ID.

    IDs such as ``P07-1084`` carry a two-digit year right after the venue
    letter: a leading digit of 0, 1 or 2 is read as 20xx, anything else as
    19xx. IDs that begin with a four-digit year followed by a dot are
    returned as that year directly instead of being mis-read through the
    two-digit rule.

    NOTE(review): whether this corpus actually contains IDs of the second
    form is not visible from this notebook - confirm against the data.

    Raises:
        ValueError: if the character after the venue letter is not a digit.
    """
    if acl_id[:4].isdigit() and acl_id[4:5] == '.':
        return int(acl_id[:4])
    two_digit_year = acl_id[1:3]
    century = '20' if int(two_digit_year[0]) <= 2 else '19'
    return int(century + two_digit_year)


# Derive both columns from `acl_id` by label instead of writing cell by cell
# with `iterrows` + `iat`, which relied on the index labels being row positions.
publication_years = acl_full_text['acl_id'].map(_year_from_acl_id)
acl_full_text['year'] = publication_years
acl_full_text['era_id'] = publication_years // 100  # 19 for 19xx, 20 for 20xx

In [9]:
# Extract ACL papers: keep only the rows whose `acl_id` begins with 'P'.
origin_data = acl_full_text.loc[acl_full_text['acl_id'].str.startswith('P')]
origin_data

Unnamed: 0,acl_id,year,era_id,title,abstract,full_text
55,P07-1084,2007,20,"Bilingual Terminology Mining -Using Brain, not...",Current research in text mining favours the qu...,Current research in text mining favours the qu...
75,P12-3005,2012,20,langid.py: An Off-the-shelf Language Identific...,"We present langid.py, an off-the-shelf languag...","We present langid.py, an off-the-shelf languag..."
76,P12-3016,2012,20,ACCURAT Toolkit for Multi-Level Alignment and ...,The lack of parallel corpora and linguistic re...,The lack of parallel corpora and linguistic re...
78,P13-1068,2013,20,Large tagset labeling using Feed Forward Neura...,Standard methods for part-of-speech tagging su...,Standard methods for part-of-speech tagging su...
80,P19-1161,2019,20,Counterfactual Data Augmentation for Mitigatin...,Gender stereotypes are manifest in most of the...,Gender stereotypes are manifest in most of the...
...,...,...,...,...,...,...
80008,P99-1056,1999,19,The grapho-phonological system of written Fren...,The processes through which readers evoke ment...,The processes through which readers evoke ment...
80009,P99-1051,1999,19,Acquiring Lexical Generalizations from Corpora...,This paper examines the extent to which verb d...,This paper examines the extent to which verb d...
80010,P99-1000,1999,19,Using Aggregation for Selecting Content when G...,"As co-chairs, we had two particular objectives...","As co-chairs, we had two particular objectives..."
80011,P99-1,1999,19,,,


In [10]:
# The abstracts are the texts that get embedded below.
docs = origin_data["abstract"]

In [11]:
# Encode every abstract into a fixed-size vector with the pretrained
# `all-MiniLM-L6-v2` sentence-transformers model.
sentences = docs.tolist()
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [12]:
# One row per paper: a running id (`type`) followed by the embedding values x_0 ... x_{d-1}.
column_names = ["type"] + [f"x_{dim}" for dim in range(len(embeddings[0]))]
data = [[row_id, *embeddings[row_id]] for row_id in range(len(docs))]
out = pd.DataFrame(data, columns=column_names)

# Persist the raw features before the metadata columns are added.
# NOTE(review): the file name says "128dim" although the frame has one column per
# embedding dimension (x_0 ... x_383 in the output below); the name is left as is.
out.to_csv("acl_full-text_128dim_feat.csv", index=None)

# Attach year and title so rows can be filtered and labelled later on.
out.insert(1, 'year', origin_data['year'].values)
out.insert(2, 'title', origin_data['title'].values)
out

Unnamed: 0,type,year,title,x_0,x_1,x_2,x_3,x_4,x_5,x_6,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
0,0,2007,"Bilingual Terminology Mining -Using Brain, not...",0.018361,-0.069289,-0.045787,-0.043936,-0.018778,-0.021305,0.046412,...,0.034927,0.087479,0.006538,0.018238,0.010428,0.004054,0.034271,0.006479,0.102239,-0.011539
1,1,2012,langid.py: An Off-the-shelf Language Identific...,-0.105661,-0.071641,0.016773,0.029877,0.055789,0.004766,0.031019,...,0.031761,-0.009294,-0.067167,-0.002346,0.067897,0.026724,-0.035206,0.074351,0.032862,0.036407
2,2,2012,ACCURAT Toolkit for Multi-Level Alignment and ...,-0.035305,0.002230,-0.027929,-0.065353,-0.033240,-0.010883,-0.041478,...,-0.018830,0.054944,-0.051018,-0.008120,0.091906,0.000924,-0.009921,-0.073989,0.058452,-0.002942
3,3,2013,Large tagset labeling using Feed Forward Neura...,-0.053140,-0.046510,0.060817,0.003574,-0.023673,-0.035749,0.012342,...,-0.013833,0.046963,-0.008109,-0.038660,0.006526,0.051414,0.055248,0.082104,0.011507,0.023412
4,4,2019,Counterfactual Data Augmentation for Mitigatin...,0.045053,0.051824,0.097364,0.006804,-0.012303,-0.034496,0.071024,...,0.017489,-0.026561,-0.073428,0.044345,-0.005372,0.094221,0.085115,0.072723,0.025918,-0.102300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6843,6843,1999,The grapho-phonological system of written Fren...,0.008246,-0.060258,0.026317,-0.046699,-0.074417,0.036662,0.062985,...,0.050305,-0.023778,-0.001076,0.043697,-0.003842,0.000818,0.008940,0.064710,-0.008511,-0.080687
6844,6844,1999,Acquiring Lexical Generalizations from Corpora...,0.037969,-0.074707,0.098113,0.012809,-0.000898,-0.016804,0.050174,...,0.036504,0.056846,-0.042679,-0.013597,0.019778,0.020553,0.036398,0.002618,0.045448,-0.045846
6845,6845,1999,Using Aggregation for Selecting Content when G...,-0.050357,0.010708,-0.037823,0.015405,0.047308,0.030114,0.002136,...,0.072255,0.113188,-0.046559,-0.028938,-0.014051,0.018255,0.047413,0.031819,-0.069909,0.041277
6846,6846,1999,,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,0.115433,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717


# Decennial Comparison

In [13]:
# Draw 80 papers from each of the compared years (2000, 2010, 2019).
# A fixed seed makes the sample - and therefore the PCA projection and the
# figure below - reproducible across kernel restarts.
SAMPLE_SEED = 42
df1 = out[out['year'] == 2000].sample(n=80, random_state=SAMPLE_SEED)
df3 = out[out['year'] == 2010].sample(n=80, random_state=SAMPLE_SEED)
df5 = out[out['year'] == 2019].sample(n=80, random_state=SAMPLE_SEED)
decennial_out = pd.concat([df1, df3, df5], axis=0)
decennial_out

Unnamed: 0,type,year,title,x_0,x_1,x_2,x_3,x_4,x_5,x_6,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
6511,6511,2000,S UT V XW QV `Y a Eb $c da EW Qe ¦a Ef `g ¦T i...,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,0.115433,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
143,143,2000,Lexicalized Stochastic Modeling of Constraint-...,0.012664,-0.121479,0.043401,-0.002518,0.048654,-0.010552,0.015556,...,0.053638,0.017729,-0.013209,-0.046627,0.043719,-0.031709,0.051861,0.035461,0.002857,-0.053933
4972,4972,2000,Query-Relevant Summarization using FAQs,-0.006374,0.042739,-0.037649,0.034485,0.042604,0.085942,0.091994,...,0.020160,-0.034567,0.063022,-0.037991,0.068156,-0.003041,0.012255,0.062987,0.007088,0.072261
3816,3816,2000,D E GF ¨H I QP SR UT WV ¢X `Y ba dc fe hg i qp...,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,0.115433,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
5694,5694,2000,,0.008570,0.018861,0.058193,-0.049902,-0.029992,0.077190,0.084569,...,0.058883,0.026053,0.027545,0.024396,-0.048527,-0.037013,0.023605,0.010817,0.073720,-0.076497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3093,3093,2019,Reliability-aware Dynamic Feature Composition ...,-0.062348,-0.150640,-0.040459,0.047742,0.047253,-0.008203,0.019848,...,0.023675,0.041713,0.026853,0.057794,0.048746,-0.045725,0.039063,-0.029375,0.037366,0.050851
1779,1779,2019,Revisiting Low-Resource Neural Machine Transla...,-0.010180,-0.032494,0.051061,0.048925,0.025601,0.003050,-0.034723,...,0.079970,0.073329,0.004255,-0.030866,0.047006,0.027952,0.018962,-0.006608,-0.079275,-0.003443
2798,2798,2019,Semantically Conditioned Dialog Response Gener...,0.009970,-0.102874,0.045304,0.020113,0.031783,-0.047598,-0.012982,...,-0.005918,0.006566,0.007046,-0.045648,0.061141,0.088045,0.035674,-0.000636,-0.031123,-0.007221
2407,2407,2019,"ClaimPortal: Integrated Monitoring, Searching,...",-0.056997,-0.002463,-0.028062,0.005520,0.135265,-0.024588,0.059676,...,0.001754,0.044520,0.027584,-0.002449,0.032630,0.036001,0.074970,0.015704,0.055119,0.013746


In [14]:
title_list = out['title'].values

# Everything from `x_0` onwards is an embedding dimension; the three columns
# before it (type, year, title) are metadata and are excluded from the PCA.
X = decennial_out.loc[:, 'x_0':]

# Project the sampled embeddings onto their first two principal components.
pca = PCA(n_components=2).fit(X)
x_pcx = pca.transform(X)
x_pcx.shape

(240, 2)

In [15]:
# Wrap the two principal components in a frame so Plotly can address them by name.
pcx_2000_2010_2019 = pd.DataFrame(x_pcx, columns=["x1", "x2"])
pcx_2000_2010_2019.shape

(240, 2)

In [16]:
# Put the plotting metadata in front of the coordinates: title, highlight flag, year.
# `.values` is used so the data is attached by position, ignoring the sampled index.
pcx_2000_2010_2019.insert(loc=0, column='title', value=decennial_out['title'].values)
pcx_2000_2010_2019.insert(loc=1, column='mypaper_or_not', value='notmypaper')
pcx_2000_2010_2019.insert(loc=2, column='year', value=decennial_out['year'].values)
pcx_2000_2010_2019

Unnamed: 0,title,mypaper_or_not,year,x1,x2
0,S UT V XW QV `Y a Eb $c da EW Qe ¦a Ef `g ¦T i...,notmypaper,2000,0.903897,0.023942
1,Lexicalized Stochastic Modeling of Constraint-...,notmypaper,2000,-0.266805,-0.138053
2,Query-Relevant Summarization using FAQs,notmypaper,2000,-0.156220,0.082331
3,D E GF ¨H I QP SR UT WV ¢X `Y ba dc fe hg i qp...,notmypaper,2000,0.903897,0.023942
4,,notmypaper,2000,0.203174,-0.347253
...,...,...,...,...,...
235,Reliability-aware Dynamic Feature Composition ...,notmypaper,2019,-0.323716,0.053280
236,Revisiting Low-Resource Neural Machine Transla...,notmypaper,2019,-0.307907,0.105377
237,Semantically Conditioned Dialog Response Gener...,notmypaper,2019,-0.239261,0.475755
238,"ClaimPortal: Integrated Monitoring, Searching,...",notmypaper,2019,-0.156190,-0.090019


In [41]:
# Cast the year to string so Plotly draws a discrete legend (one color/symbol
# per year) instead of a continuous color scale.
pcx_2000_2010_2019["year"] = pcx_2000_2010_2019["year"].astype(str)

fig = px.scatter(
    pcx_2000_2010_2019,
    x="x1",
    y="x2",
    color="year",
    symbol="year",
    hover_data=["title"],  # show the paper title on hover, labelled by column name
    range_x=[-0.4, 0.2],
)
fig.update_traces(marker_size=10)
# Set the title before showing/exporting so that both carry it; the sampled
# years are 2000, 2010 and 2019.
fig.update_layout(title="Comparison of 2000, 2010 and 2019 ACL papers(SBERT)")
# NOTE(review): the export file name does not match the plotted years; it is
# kept unchanged so that existing references to the file keep working.
fig.write_html("Comparison_2000_2010_2015_2000(SBERT).html")
fig.show()

# Visualization of a specific paper

## Preprocessing

In [18]:
# Restrict the feature table to papers whose year is 2019.
out_2019 = out.loc[out['year'] == 2019]
out_2019

Unnamed: 0,type,year,title,x_0,x_1,x_2,x_3,x_4,x_5,x_6,...,x_374,x_375,x_376,x_377,x_378,x_379,x_380,x_381,x_382,x_383
4,4,2019,Counterfactual Data Augmentation for Mitigatin...,0.045053,0.051824,0.097364,0.006804,-0.012303,-0.034496,0.071024,...,0.017489,-0.026561,-0.073428,0.044345,-0.005372,0.094221,0.085115,0.072723,0.025918,-0.102300
7,7,2019,Unsupervised Discovery of Gendered Language th...,0.055325,-0.026664,0.026544,0.035051,-0.025747,0.028869,0.084476,...,0.042687,-0.009107,-0.044295,0.062166,0.015808,0.011218,0.014497,0.052990,0.027738,-0.078972
197,197,2019,Margin-based Parallel Corpus Mining with Multi...,-0.043548,-0.073220,-0.017195,-0.006280,0.074463,0.062449,-0.023729,...,0.020356,0.067960,-0.048375,-0.060403,0.074845,0.020359,0.031584,-0.009068,-0.054869,0.029275
214,214,2019,Searching for Effective Neural Extractive Summ...,-0.087911,-0.016023,0.083686,0.014348,0.102073,0.052615,-0.038143,...,-0.022277,0.001834,0.037592,-0.062821,0.059469,0.020610,-0.007068,0.078916,-0.054624,-0.012028
216,216,2019,Style Transformer: Unpaired Text Style Transfe...,-0.090578,-0.026025,0.070133,0.055996,0.007759,0.059559,-0.058675,...,0.018578,0.012675,0.015493,0.084550,0.030756,0.020959,-0.017649,0.082438,0.026562,-0.056312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6806,6806,2019,Gender Stereotypes Differ between Male and Fem...,0.046365,0.002689,0.039982,0.063683,-0.007623,-0.039271,0.040909,...,0.016852,-0.021938,-0.065961,0.065508,0.014462,0.008181,0.118501,0.013749,0.004447,-0.097342
6809,6809,2019,Natural Language Generation: Recently Learned ...,-0.034928,-0.045406,0.002385,0.063168,-0.065263,0.067551,-0.040913,...,0.077954,0.003297,0.036508,0.023411,0.079902,-0.005770,0.033541,0.121778,0.062387,0.002115
6827,6827,2019,Towards Turkish Abstract Meaning Representation,0.035353,0.065390,0.082225,0.001338,-0.001349,0.011120,-0.003171,...,0.123215,0.043724,0.031627,0.040976,0.042600,-0.017804,0.082463,0.089982,0.016809,-0.049283
6830,6830,2019,Predicting the Outcome of Deliberative Democra...,-0.004017,-0.119011,0.006992,0.003682,0.042108,-0.002981,0.023505,...,0.057444,0.069133,-0.005857,0.003294,0.040088,0.074956,0.047436,0.075340,-0.021177,-0.023391


In [19]:
title_list = out_2019['title'].values

# Everything from `x_0` onwards is an embedding dimension; the three columns
# before it (type, year, title) are metadata and are excluded from the PCA.
X = out_2019.loc[:, 'x_0':]

# Project the 2019 embeddings onto their first two principal components.
pca = PCA(n_components=2).fit(X)
x_pcx = pca.transform(X)
x_pcx.shape

(771, 2)

In [20]:
# Frame with the 2D coordinates of every 2019 paper, preceded by its title
# and a highlight flag that defaults to 'notmypaper'.
pcx_2019 = pd.DataFrame(x_pcx, columns=["x1", "x2"])
pcx_2019.insert(loc=0, column='title', value=out.loc[out['year'] == 2019, 'title'].values)
pcx_2019.insert(loc=1, column='mypaper_or_not', value='notmypaper')
pcx_2019

Unnamed: 0,title,mypaper_or_not,x1,x2
0,Counterfactual Data Augmentation for Mitigatin...,notmypaper,0.139323,-0.332217
1,Unsupervised Discovery of Gendered Language th...,notmypaper,0.259768,-0.252261
2,Margin-based Parallel Corpus Mining with Multi...,notmypaper,-0.200356,-0.276794
3,Searching for Effective Neural Extractive Summ...,notmypaper,-0.053925,0.118150
4,Style Transformer: Unpaired Text Style Transfe...,notmypaper,-0.157264,0.012636
...,...,...,...,...
766,Gender Stereotypes Differ between Male and Fem...,notmypaper,0.330666,-0.273885
767,Natural Language Generation: Recently Learned ...,notmypaper,0.100570,0.037513
768,Towards Turkish Abstract Meaning Representation,notmypaper,0.063821,-0.124651
769,Predicting the Outcome of Deliberative Democra...,notmypaper,0.268621,0.004015


## Visualize

In [21]:
# Titles to highlight in the 2019 plot; they are matched exactly against the `title` column.
mypaper = ['An Empirical Study of Span Representations in Argumentation Structure Parsing']

In [22]:
# Flag every row whose title is listed in `mypaper`.
# A label-based assignment replaces the `iterrows` + `iat[index, 1]` loop, which
# depended on index labels matching row positions and on the flag being column 1.
pcx_2019.loc[pcx_2019['title'].isin(mypaper), 'mypaper_or_not'] = 'mypaper'

# Show the highlighted rows as a sanity check that the title was found.
pcx_2019[pcx_2019['mypaper_or_not'] == 'mypaper']

Unnamed: 0,title,mypaper_or_not,x1,x2
55,An Empirical Study of Span Representations in ...,mypaper,0.069995,0.010514


In [44]:
# Scatter of all 2019 papers in PCA space, zoomed to [-0.2, 0.2] on both axes,
# with the highlighted paper(s) drawn in a separate color and symbol.
# (`px` comes from the import cell at the top of the notebook.)
fig = px.scatter(
    pcx_2019,
    x="x1",
    y="x2",
    color="mypaper_or_not",
    symbol="mypaper_or_not",
    hover_data=["title"],  # show the paper title on hover, labelled by column name
    range_x=[-0.2, 0.2],
    range_y=[-0.2, 0.2],
    render_mode="svg",
)
fig.update_traces(marker_size=12)
# Set the title before showing/exporting so that both carry it.
fig.update_layout(title="MyPaperOrNot(2019)")
fig.write_html("MyPaperOrNot(2019).html")
fig.show()