In [38]:
import pandas as pd
import numpy as np
import time

from copy import deepcopy

# constants
DATA_LOC = "/home/ubuntu/projects/datasets/AP_test.txt"

# data formats
FORMAT_ID = "#index"
FORMAT_PAPER_TITLE = "#*"
FORMAT_AUTHORS = "#@"
FORMAT_YEAR = "#t"
FORMAT_VENUE = "#c"
FORMAT_CITATIONS = "#%"
FORMAT_ABSTRACT = "#!"

# data keys
ID = "id"
TITLE = "title"
AUTHOR = "author"
YEAR = "year"
VENUE = "venue"
CITATION = "citation"
ABSTRACT = "abstract"


In [39]:
def generatetables():
    main_table = list()
    author_table = list()
    citation_table = list()
    datum = dict()
    with open(DATA_LOC, "r", encoding='utf-8') as file:
        for line in file.readlines():            
            if FORMAT_ID in line:
                # yield already existing datum and continue
                if len(datum.keys()) > 0:
                    main_table.append(datum)
                datum = dict() # re-initialize local datum
                datum[ID] = line.strip(FORMAT_ID).strip()
            elif FORMAT_PAPER_TITLE in line:
                datum[TITLE] = line.strip(FORMAT_PAPER_TITLE).strip()
            elif FORMAT_CITATIONS in line:
                citation_table.append({ID: datum[ID], CITATION: line.strip(FORMAT_CITATIONS).strip()})
            elif FORMAT_AUTHORS in line: 
                for author in line.strip(FORMAT_AUTHORS).strip().split(';'):
                    author_table.append({ID: datum[ID], AUTHOR: author})                    
            elif FORMAT_YEAR in line: 
                datum[YEAR] = line.strip(FORMAT_YEAR).strip()
            elif FORMAT_VENUE in line: 
                datum[VENUE] = line.strip(FORMAT_VENUE).strip()
            elif FORMAT_ABSTRACT in line: 
                datum[ABSTRACT] = line.strip(FORMAT_ABSTRACT).strip()
    return main_table, author_table, citation_table

In [40]:
# generate all tables
start = time.time()
main, author, cit = generatetables()
print(time.time()-start)

0.004236459732055664


In [41]:
# combine all tables and compute data
start = time.time()
main_df = pd.DataFrame(main, columns={ID, TITLE, YEAR, VENUE, ABSTRACT})
author_df = pd.DataFrame(author, columns={ID, AUTHOR})
cit_df = pd.DataFrame(cit, columns={ID, CITATION})
data = main_df.merge(author_df, how='outer', on=ID).merge(cit_df, how='outer', on=ID)
print(time.time()-start)

0.05514383316040039


In [42]:
#total count of data
len(data)

442

In [43]:
# solution A

# distinct authors
len(data.author.unique())

215

In [44]:
# distinct venues
len(data.venue.unique())

64

In [45]:
# distinct publications
len(data.title.unique())

142

In [46]:
# distinct citations
len(data.citation.unique())

140

# Solution B

No, the solutions are not accurate. The specified example contains the year of the publication in the publi

In [61]:
data.groupby([AUTHOR, ID])[ID].count()

author                     id 
                           2       1
                           20      1
A F. Smeaton               64      1
A Kreczmar                 34      1
A M. Davis                 142     1
A Salwicki                 34      1
A. A. Rampuria             7       1
A. D. Wyner                19      1
A. G. Akritas              7       1
Adele Goldberg             103     1
Alan L. Tharp              77      1
Allan Ramsay               74      1
Andre Schiper              71      1
Andreas Reuter             117    16
Andrzej Salwicki           59      1
Anne Lee Paxton            73      1
Arto Salomaa               15      1
Arun K. Majumdar           55      1
Avi Rushinek               75      1
Avi Wigderson              17      1
                           18      1
B N.T Foxon                113     1
Beat Hirsbrunner           71      1
Bertrand Meyer             68      1
Broy Manfred               35      1
Bruce W. Leverett          49      1
C D. Pa

In [53]:
data[data[AUTHOR] == 'Theo Haerder'].groupby(ID).count()

Unnamed: 0_level_0,year,title,venue,abstract,author,citation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
117,16,16,16,0,16,16
