# Libraries

In [97]:
import pandas as pd
import numpy as np
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/a2211506/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [57]:
!wget "https://aclanthology.org/anthology+abstracts.bib.gz" -O anthology_abstracts.bib.gz

--2022-08-31 16:06:58--  https://aclanthology.org/anthology+abstracts.bib.gz
aclanthology.org (aclanthology.org) をDNSに問いあわせています... 174.138.37.75
aclanthology.org (aclanthology.org)|174.138.37.75|:443 に接続しています... 接続しました。
HTTP による接続要求を送信しました、応答を待っています... 200 OK
長さ: 17301246 (16M) [application/x-gzip]
`anthology_abstracts.bib.gz' に保存中


2022-08-31 16:07:01 (8.36 MB/s) - `anthology_abstracts.bib.gz' へ保存完了 [17301246/17301246]



In [58]:
!gzip -d anthology_abstracts.bib.gz

# Preprocessing
## ENTRYTYPEについて
`anthology_abstracts`には`ENTRYTYPE`の取りうる値に`proceedings`と`inproceedings`の2つが存在する。
- `proceedings`: 議事録(アブストラクトが存在しない)
- `inproceedings`: 議事録(アブストラクトが存在する)
したがって、アブストラクトを得るためには、`ENTRYTYPE`が`inproceedings`のものを取得する。

In [59]:
!head anthology_abstracts.bib -n 273377  > anthology_abstracts_2020.bib

In [60]:
with open("anthology_abstracts_2020.bib") as f:
    s = f.read()

In [61]:
s = s.replace("apr", '"apr"').replace("may", '"may"').replace("jun", '"jun"').replace("jul", '"jul"').replace("aug", '"aug"').replace("sep", '"sep"').replace("oct", '"oct"').replace("nov", '"nov"').replace("dec", '"dec"').replace("jan", '"jan"').replace("feb", '"feb"').replace("mar", '"mar"')
with open("anthology_abstracts_2020_modified.bib", mode='w') as f:
    f.write(s)

In [62]:
# https://bibtexparser.readthedocs.io/en/master/
import bibtexparser
with open('anthology_abstracts_2020_modified.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

In [65]:
bib_database.entries[0]

{'url': 'https://aclanthology.org/2022.wordplay-1.0',
 'publisher': 'Association for Computational Linguistics',
 'address': 'Seattle, United States',
 'year': '2022',
 'month': 'jul',
 'editor': "C{\\^o}t{\\'e}, Marc-Alexandre  and\nYuan, Xingdi  and\nAmmanabrolu, Prithviraj",
 'title': 'Proceedings of the 3rd Wordplay: When Language Meets Games Workshop (Wordplay 2022)',
 'ENTRYTYPE': 'proceedings',
 'ID': 'wordplay-2022-wordplay'}

In [66]:
import pickle
with open("anthology+abstracts_modified2020.bib.pkl","wb") as f:
    pickle.dump(bib_database, f)

with open("anthology+abstracts_modified2020.bib.pkl", "rb") as f:
    anthology2020 = pickle.load(f)

In [81]:
!chmod -w anthology+abstracts_modified2020.bib.pkl

In [83]:
anthology2020_pd = (pd.DataFrame(anthology2020.entries))
anthology2020_pd = anthology2020_pd[anthology2020_pd["ENTRYTYPE"]  == "inproceedings"]
anthology2020_pd.iloc[0:3]

Unnamed: 0,url,publisher,address,year,month,editor,title,ENTRYTYPE,ID,abstract,pages,doi,booktitle,author,language,volume,journal,number,isbn
1,https://aclanthology.org/2022.wordplay-1.1,Association for Computational Linguistics,"Seattle, United States",2022,jul,,A Systematic Survey of Text Worlds as Embodied...,inproceedings,"""jan""sen-2022-systematic",Text Worlds are virtual environments for embod...,1--15,10.18653/v1/2022.wordplay-1.1,Proceedings of the 3rd Wordplay: When Language...,"Jansen, Peter",,,,,
2,https://aclanthology.org/2022.wordplay-1.2,Association for Computational Linguistics,"Seattle, United States",2022,jul,,A Minimal Computational Improviser Based on Or...,inproceedings,montfort-bartlett-fernandez-2022-minimal,A prototype system for playing a minimal impro...,16--24,10.18653/v1/2022.wordplay-1.2,Proceedings of the 3rd Wordplay: When Language...,"Montfort, Nick and\nBartlett Fernandez, Sebas...",,,,,
4,https://aclanthology.org/2022.woah-1.2,Association for Computational Linguistics,"Seattle, Washington (Hybrid)",2022,jul,,Towards Automatic Generation of Messages Count...,inproceedings,ashida-komachi-2022-towards,"With the widespread use of social media, onlin...",11--23,10.18653/v1/2022.woah-1.2,Proceedings of the Sixth Workshop on Online Ab...,"Ashida, Mana and\nKomachi, Mamoru",,,,,


In [88]:
docs = anthology2020_pd.iloc[:]["abstract"]

In [120]:
docs_tokenized = [(nltk.word_tokenize(str(doc))) for doc in (docs)]

In [123]:
docs_tokenized

[['Text',
  'Worlds',
  'are',
  'virtual',
  'environments',
  'for',
  'embodied',
  'agents',
  'that',
  ',',
  'unlike',
  '2D',
  'or',
  '3D',
  'environments',
  ',',
  'are',
  'rendered',
  'exclusively',
  'using',
  'textual',
  'descriptions',
  '.',
  'These',
  'environments',
  'offer',
  'an',
  'alternative',
  'to',
  'higher-fidelity',
  '3D',
  'environments',
  'due',
  'to',
  'their',
  'low',
  'barrier',
  'to',
  'entry',
  ',',
  'providing',
  'the',
  'ability',
  'to',
  'study',
  'semantics',
  ',',
  'compositional',
  'inference',
  ',',
  'and',
  'other',
  'high-level',
  'tasks',
  'with',
  'rich',
  'action',
  'spaces',
  'while',
  'controlling',
  'for',
  'perceptual',
  'input',
  '.',
  'This',
  'systematic',
  'survey',
  'outlines',
  'recent',
  'developments',
  'in',
  'tooling',
  ',',
  'environments',
  ',',
  'and',
  'agent',
  'modeling',
  'for',
  'Text',
  'Worlds',
  ',',
  'while',
  'examining',
  'recent',
  'trends',
  

# Documents to Vectors

In [128]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_tokenized)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=-1)

In [127]:
model.dv[0]

array([ 0.54577327,  0.62607336,  0.33639103, -0.6962394 ,  0.28107893],
      dtype=float32)