# PAPER-TITLE

## 1. Retrieving ISMIR Papers with Code from Arxiv 

Not all ISMIR papers from 2011 to 2020 have been uploaded to arXiv, which highlights the importance of an all-in-one platform that makes it easy to collect data for this kind of research. Here, via web scraping, we find the papers that were uploaded to arXiv and retrieve whether they provide their code/data.

In [None]:
import json
import os
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
from collections import Counter

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.firefox import GeckoDriverManager

## 1.1. Get corresponding arxiv links for the selected papers

In [None]:
# Directory (with trailing slash) that is walked for the proceedings ".json" files
path_to_proceeding_json = "selected_years/"

In [None]:
def _title_to_query(title):
    """Turn a paper title into a URL-safe term for an arXiv ``ti:`` search."""
    cleaned = title.lower().replace("'", "")
    # Punctuation is turned into spaces so that e.g. "beat-tracking" is
    # searched as two words instead of being glued into one token.
    for char in ".,:;?-":
        cleaned = cleaned.replace(char, " ")
    # quote_plus joins the words with "+" and percent-encodes everything else
    # (non-ASCII letters, "&", "#", ...) that would otherwise break the URL.
    return urllib.parse.quote_plus(" ".join(cleaned.split()))


def arxiv_query(max_result=1, json_dir=None):
    """Look up an arXiv abstract link for every paper title in the proceedings.

    Every ``.json`` file found under ``json_dir`` is expected to hold a list
    of dicts with a ``'title'`` key. Each title is sent to the arXiv API as a
    title search and the PDF link of the result is converted to its ``abs``
    page.

    Parameters
    ----------
    max_result : int
        ``max_results`` value passed to the arXiv API. When more than one
        result comes back, the last one returned is the one that is kept.
    json_dir : str, optional
        Directory to walk for proceedings files. Defaults to the notebook's
        ``path_to_proceeding_json``.

    Returns
    -------
    dict
        Maps the original paper title to the arXiv abstract URL. Titles with
        no hit are left out.
    """
    if json_dir is None:
        json_dir = path_to_proceeding_json

    # The links to the arXiv page are stored within this element
    link_pattern = re.compile(r'<link title="pdf" href="([^"]+)"')
    paper_urls = {}
    for dirpath, _dirnames, filenames in os.walk(json_dir):
        for filename in filenames:
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(dirpath, filename)
            try:
                with open(json_path, 'r', encoding='utf-8') as handle:
                    entries = json.load(handle)
            except (OSError, ValueError) as err:
                # Unreadable or malformed file: report it and keep going
                print("Skipping {}: {}".format(json_path, err))
                continue

            for entry in entries:
                title = entry.get('title') if isinstance(entry, dict) else None
                if not title:
                    continue

                # Create the query for the given paper to use the arXiv API
                url = 'http://export.arxiv.org/api/query?search_query=ti:{}&start=0&max_results={}'.format(_title_to_query(title), max_result)
                try:
                    with urllib.request.urlopen(url) as response:
                        feed = response.read().decode("utf-8")
                except urllib.error.URLError as err:
                    # A failed request only costs this one title, not the file
                    print("Query failed for {!r}: {}".format(title, err))
                    continue

                for pdf_url in link_pattern.findall(feed):
                    paper_urls[title] = pdf_url.replace("/pdf/", "/abs/", 1)
    return paper_urls


In [None]:
#paper_urls = arxiv_query()

In [None]:
# Inspect the title -> arXiv URL mapping (needs the arxiv_query() call, which is commented out above, to have been run)
paper_urls

In [None]:
# Tabulate the title -> arXiv link pairs so that the upcoming steps can all work on CSV files
df = pd.DataFrame(list(paper_urls.items()), columns=['titles', 'arxiv_links'])

In [None]:
# Save the title / arXiv link table; index=None keeps the row index out of the file
df.to_csv("titles_arxiv_links.csv", index=None)

## 1.2. Retrieve Links of Publicly Available Code/Data

In [None]:
# Reload the saved title / arXiv link table
df = pd.read_csv("titles_arxiv_links.csv")
# Column-oriented dict: papers['titles'] and papers['arxiv_links'] are each a dict of row values
papers = df.to_dict()

In [None]:
# Pair each title with its arXiv link positionally; a repeated title keeps its last link
papers_dict = dict(zip(papers['titles'].values(), papers['arxiv_links'].values()))

In [None]:
def get_code_links(urls):
    """Scrape each arXiv abstract page for its linked code repository.

    Parameters
    ----------
    urls : dict
        Maps a paper title to the URL of its arXiv abstract page.

    Returns
    -------
    dict
        ``{paper_title: {arxiv_title: {url: code_link}}}`` where
        ``arxiv_title`` is the heading text shown on the page and
        ``code_link`` is the ``href`` of the first link inside the
        ``pwc-output`` element (``None`` if that link has no ``href``).
        Pages on which the heading or the link cannot be located are
        left out of the result.
    """
    # NOTE(review): the `executable_path` keyword is kept from the original;
    # recent Selenium releases expect a Service object instead - confirm
    # against the installed version.
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
    paper_code = {}
    try:
        for title, url in urls.items():
            driver.get(url)
            driver.delete_all_cookies()
            try:
                heading = driver.find_element(By.XPATH, '//h1[@class="title mathjax"]')
                code_anchor = driver.find_element(By.XPATH, '//div[@id="pwc-output"]/p/a')
                arxiv_title = heading.text
                href = code_anchor.get_attribute('href')
            except WebDriverException:
                # Best effort: a page without the heading or without a code
                # link is simply skipped. Only Selenium errors are tolerated,
                # so genuine programming errors still surface.
                pass
            else:
                paper_code[title] = {arxiv_title: {url: href or None}}
            # Return to the first window in case the page opened another one
            driver.switch_to.window(driver.window_handles[0])
    finally:
        # Always shut the browser down, even when a page load fails
        driver.quit()
    return paper_code

In [None]:
# Scrape every arXiv page found above; yields {paper_title: {arxiv_title: {arxiv_url: code_link}}}
final_results = get_code_links(papers_dict)

In [None]:
# Flatten the nested {paper_title: {arxiv_title: {arxiv_link: code_link}}}
# structure into four parallel columns, one list per nesting level.
paper_title = list(final_results)
arxiv_title = [
    page_title
    for by_page_title in final_results.values()
    for page_title in by_page_title
]
arxiv_link = [
    page_url
    for by_page_title in final_results.values()
    for by_url in by_page_title.values()
    for page_url in by_url
]
code_link = [
    repo_url
    for by_page_title in final_results.values()
    for by_url in by_page_title.values()
    for repo_url in by_url.values()
]
csv_dic = {
    'paper_title': paper_title,
    'arxiv_title': arxiv_title,
    'arxiv_link': arxiv_link,
    'code_link': code_link,
}
    

In [None]:
# One row per scraped paper: paper_title, arxiv_title, arxiv_link, code_link
df_arxiv_res = pd.DataFrame(csv_dic)

In [None]:
# Save the scraped results (without the row index) for the manual filtering step
df_arxiv_res.to_csv('arxiv_implementation_results.csv', index=None)

In addition to this automated approach, the papers were manually annotated as `with code` or `without code`. 

Since not all ISMIR papers are uploaded to arXiv and the query may fail in some cases, the output file `arxiv_implementation_results.csv` is manually filtered by comparing each paper's title with the title retrieved from arXiv. The merged version is `ismir_2011_2020_with_implementation.csv`.

For the next steps, we need three CSV files containing:

1. Titles, year of all papers

2. Titles, year of the papers with code

3. Titles, year of the papers without code


In [None]:
# Map every paper title to its year. The year is taken from the name of the
# proceedings file the paper is listed in (the part before the first ".").
title_year_dic = {}
for dirpath, _dirnames, filenames in os.walk(path_to_proceeding_json):
    for filename in filenames:
        if filename.endswith(".json"):
            # Join with the walked directory so files in sub-folders resolve too
            with open(os.path.join(dirpath, filename), 'r', encoding='utf-8') as proceeding_file:
                proceeding = json.load(proceeding_file)
            for paper in proceeding:
                title_year_dic[paper['title']] = filename.split(".")[0]

# One row per paper, columns 'titles' and 'year'
df_all_papers = pd.DataFrame(list(title_year_dic.items()), columns=['titles', 'year'])
df_all_papers.to_csv("ismir_2011_2020_papers.csv", index=False)

In [None]:
# Preview the first three rows of the title/year table
df_all_papers.head(3)

In [None]:
# Load the manually filtered list of papers with code; its 'titles' column is used in the steps below
df_papers_with_code = pd.read_csv("ismir_2011_2020_with_implementation.csv")

In [None]:
# Papers without code = every paper whose title is missing from the with-code table.
# .copy() makes this an independent frame, so adding a column to it later
# (as categorise_papers does) does not raise pandas' SettingWithCopyWarning.
df_papers_without_code = df_all_papers[~df_all_papers['titles'].isin(df_papers_with_code['titles'])].copy()

In [None]:
# Preview the first three papers that have no code
df_papers_without_code.head(3)

In [None]:
# Save without the row index, consistent with every other CSV this notebook writes
df_papers_without_code.to_csv("ismir_2011_2020_without_implementation.csv", index=False)

## 2. **Creating subcategories of the papers**

ISMIR papers do not have a keywords section, which makes categorising the papers difficult. Since there is no common agreement on the subtopics of Music Information Retrieval, for this research we decided to use `http://www.music.mcgill.ca/~ich/classes/mumt621_15/MIR_topics.html` as a base and extended the keywords by iterating over the titles of the papers. This is not a robust solution, but such data is required for further analysis of open data/code in MIR.

These subtopics are stored in JSON format: `mir_topics.json`

In [None]:
# Load the topic -> list-of-keywords mapping that categorise_papers matches titles against
with open('mir_topics.json', 'r') as mir_out:
    mir_topics = json.load(mir_out)

In [None]:
def categorise_papers(df, topics=None):
    """Label every paper with the MIR topics whose keywords occur in its title.

    A title is lower-cased, ``-`` and ``:`` are turned into spaces, and the
    result is split on single spaces. A topic matches when any of its
    keywords equals one of those words, so a keyword that itself contains a
    space can never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``titles`` column of strings. A ``category`` column is
        added to (or overwritten in) this very frame.
    topics : dict, optional
        Maps a topic name to a list of keywords. Defaults to the notebook's
        ``mir_topics``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. ``category`` holds the matching topic names
        joined by ``,`` in alphabetical order, or ``"Other"`` when nothing
        matched.
    """
    if topics is None:
        topics = mir_topics

    # Lower-case every keyword once instead of once per paper
    keyword_sets = {
        topic: {keyword.lower() for keyword in keywords}
        for topic, keywords in topics.items()
    }

    df['category'] = None
    for idx, paper_title in df['titles'].items():
        words = set(paper_title.lower().replace("-", " ").replace(":", " ").split(" "))
        # Sorted so the label does not depend on set iteration order
        matched = sorted(
            topic
            for topic, keywords in keyword_sets.items()
            if not keywords.isdisjoint(words)
        )
        df.loc[idx, 'category'] = ",".join(matched) if matched else "Other"
    return df

In [None]:
# Add the 'category' column to each of the three paper tables.
# categorise_papers writes into the frame it is given and returns that frame.
df = categorise_papers(df_all_papers)
df_papers_with_code_cat = categorise_papers(df_papers_with_code)
df_papers_without_code_cat = categorise_papers(df_papers_without_code)

In [None]:
# Preview the first five categorised papers
df.head(5)

In [None]:
# Write the three categorised tables to CSV, without the row index.
# NOTE(review): the third export uses df_papers_without_code, not
# df_papers_without_code_cat; categorise_papers adds the column in place and
# returns the same frame, so this only matters if that frame was never categorised.
df.to_csv("ismir_2011_2020_papers_categorized.csv", index=None)
df_papers_with_code_cat.to_csv("ismir_2011_2020_papers_with_code_categorized.csv", index=None)
df_papers_without_code.to_csv("ismir_2011_2020_papers_without_code_categorized.csv", index=None)

## 3. Finding Papers That Published an Open Dataset

Intuitively, dataset/database creation in the MIR community has increased, but this can be verified by counting the published papers on dataset/database creation. To do that, our approach is to search for titles that include words such as `dataset` and `database`.

The next step is to manually check the results.

In [None]:
# Collect {title: year} for every paper whose title contains one of the
# keywords as a whole word (case-insensitive).
dataset_keywords = {'dataset', 'database'}
dataset = {}
for idx, row in df_all_papers.iterrows():
    # Dots are dropped and surrounding punctuation is stripped so that
    # "Dataset:" or "(database)" still count as a match.
    title_words = {
        single_word.strip(',:;!?()"\'').lower()
        for single_word in row.titles.replace(".", "").split()
    }
    if dataset_keywords & title_words:
        dataset[row.titles] = row.year

In [None]:
# Inspect the matched {title: year} pairs before the manual check
dataset

In [None]:
# Turn the {title: year} matches into a two-column table and save it without the row index
df_dataset = pd.DataFrame(list(dataset.items()), columns=['titles', 'year'])
df_dataset.to_csv("ismir_2011_2020_dataset.csv", index=None)