In [1]:
%load_ext autoreload
%autoreload 2

In [26]:
from collections import Counter
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.split(NOTEBOOKS_DIR)[0]
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

FINAL_DF_FILEPATH = os.path.join(PROCESSED_DATA_DIR, 'final.csv')

In [3]:
df = pd.read_csv(FINAL_DF_FILEPATH, encoding='utf-8')

In [5]:
df['set_spec'].value_counts()

math                55755
physics:astro-ph    42702
physics:cond-mat    39016
cs                  16915
physics:hep-ph      13188
physics:physics     13064
physics:quant-ph     8701
physics:hep-th       7330
physics:gr-qc        6760
physics:hep-ex       4969
physics:nucl-th      2828
physics:hep-lat      2429
nlin                 2246
physics:nucl-ex      1890
q-bio                1811
stat                 1243
physics:nlin          641
q-fin                 493
econ                   12
Name: set_spec, dtype: int64

In [6]:
df_cs = df[df['set_spec'] == 'cs']
df_math = df[df['set_spec'] == 'math']

In [7]:
df_cs.head()

Unnamed: 0,identifier,url,title,set_spec,subjects,authors,dates,description
2,oai:arXiv.org:0810.4423,http://arxiv.org/abs/0810.4423,Efficient Algorithmic Techniques for Several M...,cs,"Computer Science - Computational Geometry,Comp...","Andreica, Mugurel Ionut","2008-10-24,2013-01-01","In this paper I present several novel, effic..."
5,oai:arXiv.org:0810.4426,http://arxiv.org/abs/0810.4426,Camera distortion self-calibration using the p...,cs,Computer Science - Computer Vision and Pattern...,"Rosten, Edward,Loveland, Rohan","2008-10-24,2009-01-04",In this paper we present a simple and robust...
10,oai:arXiv.org:0810.4431,http://arxiv.org/abs/0810.4431,An Eye Tracking Study into the Effects of Grap...,cs,Computer Science - Human-Computer Interaction,"Huang, Weidong",2008-10-24,Graphs are typically visualized as node-link...
19,oai:arXiv.org:0810.4440,http://arxiv.org/abs/0810.4440,Randomization Adaptive Self-Stabilization,cs,"Computer Science - Distributed, Parallel, and ...","Dolev, Shlomi,Tzachar, Nir",2008-10-24,We present a scheme to convert self-stabiliz...
21,oai:arXiv.org:0810.4442,http://arxiv.org/abs/0810.4442,Message passing resource allocation for the up...,cs,Computer Science - Information Theory,"Abrardo, Andrea,Detti, Paolo,Moretti, Marco",2008-10-24,We propose a novel distributed resource allo...


In [11]:
set(df_cs['dates'].map(len).values)

{10, 21}

In [15]:
# Most of our data only has one data, but quite a bit has two dates.
(df_cs['dates'].map(len) == 10).value_counts()

True     11984
False     4931
Name: dates, dtype: int64

In [18]:
df_cs_dates = df_cs[df_cs['dates'].map(len) == 10]['dates'].values

In [22]:
c = Counter(df_cs_dates)

In [25]:
x = list(c.keys())
y = [c[i] for i in x]

In [30]:
print(min(x))
print(max(x))

2007-03-31
2010-10-27


In [31]:
descriptions = df_cs['description'].values

In [32]:
len(descriptions)

16915

In [33]:
descriptions[0]

'  In this paper I present several novel, efficient, algorithmic techniques for\nsolving some multidimensional geometric data management and analysis problems.\nThe techniques are based on several data structures from computational geometry\n(e.g. segment tree and range tree) and on the well-known sweep-line method.\n'

In [34]:
descriptions[1]

'  In this paper we present a simple and robust method for self-correction of\ncamera distortion using single images of scenes which contain straight lines.\nSince the most common distortion can be modelled as radial distortion, we\nillustrate the method using the Harris radial distortion model, but the method\nis applicable to any distortion model. The method is based on transforming the\nedgels of the distorted image to a 1-D angular Hough space, and optimizing the\ndistortion correction parameters which minimize the entropy of the\ncorresponding normalized histogram. Properly corrected imagery will have fewer\ncurved lines, and therefore less spread in Hough space. Since the method does\nnot rely on any image structure beyond the existence of edgels sharing some\ncommon orientations and does not use edge fitting, it is applicable to a wide\nvariety of image types. For instance, it can be applied equally well to images\nof texture with weak but dominant orientations, or images with s

In [35]:
descriptions[2]

'  Graphs are typically visualized as node-link diagrams. Although there is a\nfair amount of research focusing on crossing minimization to improve\nreadability, little attention has been paid on how to handle crossings when\nthey are an essential part of the final visualizations. This requires us to\nunderstand how people read graphs and how crossings affect reading performance.\n  As an initial step to this end, a preliminary eye tracking experiment was\nconducted. The specific purpose of this experiment was to test the effects of\ncrossing angles and geometric-path tendency on eye movements and performance.\nSixteen subjects performed both path search and node locating tasks with six\ndrawings. The results showed that small angles can slow down and trigger extra\neye movements, causing delays for path search tasks, whereas crossings have\nlittle impact on node locating tasks. Geometric-path tendency indicates that a\npath between two nodes can become harder to follow when many branc

In [43]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [37]:
vectorizer = CountVectorizer()

In [38]:
tf = vectorizer.fit_transform(descriptions)

In [41]:
vectorizer.get_feature_names()

['00',
 '000',
 '0001',
 '0001071',
 '0004609',
 '000s',
 '0025',
 '005',
 '00509488',
 '0071v4',
 '0084',
 '01',
 '010',
 '01020103',
 '0103200',
 '010401',
 '013',
 '015',
 '0167',
 '017',
 '0170',
 '02',
 '0207023',
 '024',
 '025',
 '029',
 '03',
 '0307',
 '031',
 '0310049',
 '035n',
 '036111',
 '0367',
 '04',
 '040',
 '0410',
 '0410460v2',
 '0412187',
 '0423',
 '046',
 '0463',
 '0476',
 '04db',
 '05',
 '0501076',
 '0501315',
 '0506134',
 '0511096',
 '0595',
 '05a',
 '05kw',
 '06',
 '0602345',
 '0604',
 '0604017',
 '0605181',
 '0609101',
 '0609825v5',
 '0612509v2',
 '062',
 '0625',
 '0665',
 '068',
 '069',
 '07',
 '0701020v2',
 '0701096v2',
 '0703125',
 '0708',
 '0709',
 '0710',
 '0711',
 '073',
 '0746',
 '076',
 '0783',
 '079',
 '07cc',
 '08',
 '0801',
 '0802',
 '0803',
 '0805',
 '0807',
 '0809',
 '081',
 '0810',
 '0811',
 '0812',
 '083',
 '085',
 '0854',
 '088',
 '0883',
 '08bss',
 '09',
 '0901',
 '0902',
 '0903',
 '0905',
 '0907',
 '0908',
 '0910',
 '0912',
 '0919',
 '0977',
 '0_

In [44]:
tfidf_vectorizer = TfidfVectorizer()

In [45]:
tfidf = tfidf_vectorizer.fit_transform(descriptions)

In [50]:
svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
svd.fit(tfidf)
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

[0.00506781 0.00607851 0.00492608 0.0036822  0.0034717  0.00300686
 0.00284712 0.00245702 0.00235219 0.00219536]
0.03608484518816286
[33.07165716  9.8596289   8.85028021  7.65733894  7.44016161  6.93299996
  6.7283403   6.25975719  6.11560688  5.90884147]


In [51]:
from sklearn.utils.extmath import randomized_svd

U, Sigma, VT = randomized_svd(tfidf, 
                              n_components=10,
                              n_iter=7,
                              random_state=42)

In [52]:
Sigma

array([33.07165716,  9.8596289 ,  8.85028021,  7.65733894,  7.44016161,
        6.93299996,  6.7283403 ,  6.25975719,  6.11560688,  5.90884147])

In [55]:
VT.shape

(10, 38246)

In [56]:
df_cs.shape

(16915, 8)