In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

from collections import defaultdict
from copy import deepcopy

# constants
# DATA_LOC = "/home/ubuntu/projects/datasets/AP_train.txt"
DATA_LOC = "/Users/sahilgandhi/Datasets/AP_train.txt"

# data formats
FORMAT_ID = "#index"
FORMAT_PAPER_TITLE = "#*"
FORMAT_AUTHORS = "#@"
FORMAT_YEAR = "#t"
FORMAT_VENUE = "#c"
FORMAT_REFERENCES = "#%"
FORMAT_ABSTRACT = "#!"

# data keys
ID = "id"
TITLE = "title"
AUTHOR = "author"
YEAR = "year"
VENUE = "venue"
REFERENCE = "reference"
ABSTRACT = "abstract"

In [3]:
def generatetables():
    main_table = list()
    author_table = list()
    reference_table = list()
    datum = dict()
    with open(DATA_LOC, "r", encoding='utf-8') as file:
        for line in file.readlines():            
            if FORMAT_ID in line:
                # yield already existing datum and continue
                if len(datum.keys()) > 0:
                    main_table.append(datum)
                datum = dict() # re-initialize local datum
                datum[ID] = line.strip(FORMAT_ID).strip()
            elif FORMAT_PAPER_TITLE in line:
                datum[TITLE] = line.strip(FORMAT_PAPER_TITLE).strip()
            elif FORMAT_REFERENCES in line:
                reference_table.append({ID: datum[ID], REFERENCE: line.strip(FORMAT_REFERENCES).strip()})
            elif FORMAT_AUTHORS in line: 
                for author in line.strip(FORMAT_AUTHORS).strip().split(';'):
                    author_table.append({ID: datum[ID], AUTHOR: author.strip()})                    
            elif FORMAT_YEAR in line: 
                datum[YEAR] = line.strip(FORMAT_YEAR).strip()
            elif FORMAT_VENUE in line: 
                datum[VENUE] = line.strip(FORMAT_VENUE).strip()
            elif FORMAT_ABSTRACT in line: 
                datum[ABSTRACT] = line.strip(FORMAT_ABSTRACT).strip()
    return main_table, author_table, reference_table

# generate all tables
main, author, cit = generatetables()


In [4]:
# combine all tables and compute data
main_df = pd.DataFrame(main, columns={ID, TITLE, YEAR, VENUE, ABSTRACT})
author_df = pd.DataFrame(author, columns={ID, AUTHOR})
ref_df = pd.DataFrame(cit, columns={ID, REFERENCE})

consolidated = main_df.merge(author_df, how='outer', on=ID).merge(ref_df, how='outer', on=ID)

print('''
Length of Main DF: {0}\n
Length of Author DF: {1}\n
Length of References DF: {2}\n
Length of Consolidated DF: {3}
'''.format(len(main_df), len(author_df), len(ref_df), len(consolidated)))


Length of Main DF: 1976814

Length of Author DF: 4909851

Length of References DF: 7250328

Length of Consolidated DF: 23565643



### A. Compute the number of distinct authors, publication venues, publications, and citations/references

In [5]:
# distinct authors
print('''
Distinct Authors: {0}\n
Distinct Publication venues: {1}\n
Distinct Publications: {2}\n
Distinct references: {3}
'''.format(
    len(author_df.author.unique()),
    len(main_df.venue.unique()),
    len(main_df.title.unique()),
    len(ref_df.reference.unique())
))


Distinct Authors: 1478737

Distinct Publication venues: 255690

Distinct Publications: 1929486

Distinct references: 871091



### B. Are these numbers likely to be accurate? As an example look up all the publications venue names associated with the conference “Principles and Practice of Knowledge Discovery in Databases”13 – what do you notice?

In [6]:
main_df[main_df.venue.str.contains('Principles and Practice of Knowledge Discovery in Databases')][:5]

Unnamed: 0,venue,title,id,year,abstract
799595,PKDD '04 Proceedings of the 8th European Confe...,Summarization of dynamic content in web collec...,799597,2004,This paper describes a new research proposal o...
799732,PKDD '04 Proceedings of the 8th European Confe...,Proceedings of the 8th European Conference on ...,799734,2004,
799733,PKDD '04 Proceedings of the 8th European Confe...,Random matrices in data analysis,799735,2004,We show how carefully crafted random matrices ...
799734,PKDD '04 Proceedings of the 8th European Confe...,Data privacy,799736,2004,There is increasing need to build information ...
799735,PKDD '04 Proceedings of the 8th European Confe...,Breaking through the syntax barrier: searching...,799737,2004,The next wave in search technology will be dri...


No, the results are not accurate. As we can see in the results above, the strings have some form of 'year' mentioned in them. So even if the actual venue is not different, the string representation of the venue changes because of the change in the year/date.

### C. For each author, construct the list of publications. Plot a histogram of the number of publications per author (use a logarithmic scale on the y axis)

In [None]:
_pubs, _authors = author_df[author_df.author != ''].sort_values([AUTHOR]).values.T
_keys, _indices = np.unique(_authors, True)
author_pubs = pd.DataFrame({AUTHOR: _keys, 'publications': [list(x) for x in np.split(_pubs, _indices[1:])], 'pub count': [len(list(x)) for x in np.split(_pubs, _indices[1:])]})

In [None]:
plt.hist(author_pubs['pub count'], log=True)
plt.xlabel('No. of Publications')
plt.ylabel('No. of Authors')
plt.title('No. of publications per author')
plt.show()

### D. Calculate the mean and standard deviation of the number of publications per author. Also calculate the Q1 (1st quartile14), Q2 (2nd quartile, or median) and Q3 (3rd quartile) values.

In [None]:
author_pubs['pub count'].describe()

#### Compare the median to the mean and explain the difference between the two values based on the standard deviation and the 1st and 3rd quartiles.

The vast difference between the median and the mean is because the max is very high which skews the mean, but the majority of the authors have 'no. of publications' closes to 1 (the median).

### E. Now plot a histogram of the number of publications per venue..

In [None]:
_pubs, _venues = main_df[main_df[VENUE] != ''][[ID, VENUE]].sort_values([VENUE]).values.T
_keys, _indices = np.unique(_venues, True)
venue_pubs = pd.DataFrame({VENUE: _keys, 'publications': [list(x) for x in np.split(_pubs, _indices[1:])], 'pub_count': [len(list(x)) for x in np.split(_pubs, _indices[1:])]})

plt.hist(venue_pubs['pub_count'], log=True)
plt.xlabel('No. of Publications')
plt.ylabel('No. of Venues')
plt.title('No. of publications per venue')
plt.show()

#### ...as well as calculate the mean, standard deviation, median, Q1, and Q3 values...

In [None]:
venue_pubs['pub_count'].describe()

#### ...What is the venue with the largest number of publications in the dataset?

In [None]:
venue_pubs.sort_values(['pub_count'], ascending=False)[:1]

#### F. Plot a histogram of the number of references (number of publications a publication refers to) and citations (number of publications referring to a publication) per publication. 

In [None]:
publication_cits = ref_df[ref_df[REFERENCE] != ''].groupby([REFERENCE])[ID].count().reset_index(name='cit_count')
publication_refs = ref_df.groupby([ID])[REFERENCE].count().reset_index(name='ref_count')

In [None]:
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

ax1.hist(publication_cits['cit_count'], log=True)
ax1.set_xlabel('No. of Citations')
ax1.set_ylabel('No. of Publications')
ax1.set_title('Citations per publications')

ax2.hist(publication_refs['ref_count'], log=True)
ax2.set_xlabel('No. of References')
ax2.set_title('References per publications')

plt.show()

#### ...What is the publication with the largest number of references?

In [None]:
publication_refs.sort_values(['ref_count'], ascending=False)[:5]

#### What is the publication with the largest number of citations?

In [None]:
publication_cits.sort_values(['cit_count'], ascending=False)[:5]

# Do these make sense?

#### G. Calculate the so called “impact” factor for each venue. To do so, calculate the total number of citations for the publications in the venue, and then divide this number by the number of publications for the venue. Plot a histogram of the results

In [None]:
# impact = average citation for venue

pub_cit_dict = {row.reference:row.cit_count for row in publication_cits.itertuples()}

impact = list()
for row in venue_pubs.itertuples():
    tmp_dict = {VENUE: row.venue}
    _lst = list()
    for id in row.publications:
        _val = pub_cit_dict.get(id, None)
        if _val:
            _lst.append(_val)
    tmp_dict.update({'impact': 0 if len(_lst) == 0 else sum(_lst)/float(len(_lst))})
    impact.append(tmp_dict)


In [1]:
plt.hist([x['impact'] for x in impact], log=True)
plt.xlabel('Impact')
plt.ylabel('No. of Venues')
plt.title('Impact per venue')
plt.show()

NameError: name 'plt' is not defined

#### H. What is the venue with the highest apparent impact factor? 

In [None]:
sorted(impact, key=lambda x:x['impact'], reverse=True)[:1]

#### Do you believe this number? (http://mdanderson.libanswers.com/faq/26159)

#### I. Now repeat the calculation from item b., but restrict the calculation to venues with at least 10 publications. How does your histogram change? 

In [None]:
pub_cit_dict = {row.reference:row.cit_count for row in publication_cits.itertuples()}

restricted_impact = list()
for row in venue_pubs.itertuples():
    if len(row.publications) < 10 :
        continue
    tmp_dict = {VENUE: row.venue}
    _lst = list()
    _raw= list()
    for id in row.publications:
        _val = pub_cit_dict.get(id, None)
        if _val:
            _raw.append({'publication': id, 'citation_count': _val})
            _lst.append(_val)
    tmp_dict.update({'impact': 0 if len(_lst) == 0 else sum(_lst)/float(len(_lst)),
                     'raw': _raw})
    restricted_impact.append(tmp_dict)

In [None]:
plt.hist([x['impact'] for x in restricted_impact], log=True)
plt.xlabel('Impact')
plt.ylabel('No. of Venues')
plt.title('Impact per venue')
plt.show()

#### List the citation counts for all publications from the venue with the highest impact factor.

In [None]:
sorted(restricted_impact, key=lambda x:x['impact'], reverse=True)[:1]

#### How does the impact factor (mean number of citations) compare to the median number of citations?

#### J. Finally, construct a list of publications for each publication year. Use this list to plot the average number of references and average number of citations per publication as a function of time. Explain the differences you see in the trends.