In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import Counter
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.split(NOTEBOOKS_DIR)[0]
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

FINAL_DF_FILEPATH = os.path.join(PROCESSED_DATA_DIR, 'final.csv')

In [3]:
df = pd.read_csv(FINAL_DF_FILEPATH, encoding='utf-8')

In [4]:
df['set_spec'].value_counts()

math                70513
physics:astro-ph    52289
physics:cond-mat    48181
cs                  23133
physics:hep-ph      15842
physics:physics     15812
physics:quant-ph    10235
physics:hep-th       8575
physics:gr-qc        8251
physics:hep-ex       6504
physics:nucl-th      3312
physics:hep-lat      3049
nlin                 2607
physics:nucl-ex      2309
q-bio                2120
stat                 1670
physics:nlin          883
q-fin                 688
econ                   16
Name: set_spec, dtype: int64

In [5]:
df_cs = df[df['set_spec'] == 'cs']
df_math = df[df['set_spec'] == 'math']

In [6]:
df_cs.head()

Unnamed: 0,identifier,url,title,set_spec,subjects,authors,dates,description
10,oai:arXiv.org:1011.5311,http://arxiv.org/abs/1011.5311,Citations and impact of Dutch astronomy,cs,Astrophysics - Instrumentation and Methods for...,"Kamphuis, P.,van der Kruit, P. C.",2010-11-24,The aim of this study is to make a bibliomet...
13,oai:arXiv.org:1011.5314,http://arxiv.org/abs/1011.5314,"ML(n)BiCGStab: Reformulation, Analysis and Imp...",cs,"Mathematics - Numerical Analysis,Computer Scie...","Yeung, Man-Chung",2010-11-24,"With the aid of index functions, we re-deriv..."
16,oai:arXiv.org:1011.5317,http://arxiv.org/abs/1011.5317,Performance of CSMA in Multi-Channel Wireless ...,cs,Computer Science - Networking and Internet Arc...,"Bonald, Thomas,Feuillet, Mathieu","2010-11-24,2011-04-02",We analyze the performance of CSMA in multi-...
19,oai:arXiv.org:1011.5320,http://arxiv.org/abs/1011.5320,Computation of the shortest path between two c...,cs,"Computer Science - Computational Geometry,Math...","Chen, Wen-Haw,Chen, Sheng-Gwo",2010-11-24,"In this paper, we present the geodesic-like ..."
24,oai:arXiv.org:1011.5325,http://arxiv.org/abs/1011.5325,World of Movable Objects. Part 2,cs,Computer Science - Human-Computer Interaction,"Andreyev, Sergey",2010-11-24,This book is about the transformation of scr...


In [7]:
set(df_cs['dates'].map(len).values)

{10, 21}

In [8]:
# Most of our data only has one data, but quite a bit has two dates.
(df_cs['dates'].map(len) == 10).value_counts()

True     16048
False     7085
Name: dates, dtype: int64

In [9]:
df_cs_dates = df_cs[df_cs['dates'].map(len) == 10]['dates'].values

In [10]:
c = Counter(df_cs_dates)

In [11]:
x = list(c.keys())
y = [c[i] for i in x]

In [12]:
print(min(x))
print(max(x))

2007-03-31
2011-07-20


In [13]:
descriptions = df_cs['description'].values

In [32]:
len(descriptions)

16915

In [33]:
descriptions[0]

'  In this paper I present several novel, efficient, algorithmic techniques for\nsolving some multidimensional geometric data management and analysis problems.\nThe techniques are based on several data structures from computational geometry\n(e.g. segment tree and range tree) and on the well-known sweep-line method.\n'

In [34]:
descriptions[1]

'  In this paper we present a simple and robust method for self-correction of\ncamera distortion using single images of scenes which contain straight lines.\nSince the most common distortion can be modelled as radial distortion, we\nillustrate the method using the Harris radial distortion model, but the method\nis applicable to any distortion model. The method is based on transforming the\nedgels of the distorted image to a 1-D angular Hough space, and optimizing the\ndistortion correction parameters which minimize the entropy of the\ncorresponding normalized histogram. Properly corrected imagery will have fewer\ncurved lines, and therefore less spread in Hough space. Since the method does\nnot rely on any image structure beyond the existence of edgels sharing some\ncommon orientations and does not use edge fitting, it is applicable to a wide\nvariety of image types. For instance, it can be applied equally well to images\nof texture with weak but dominant orientations, or images with s

In [35]:
descriptions[2]

'  Graphs are typically visualized as node-link diagrams. Although there is a\nfair amount of research focusing on crossing minimization to improve\nreadability, little attention has been paid on how to handle crossings when\nthey are an essential part of the final visualizations. This requires us to\nunderstand how people read graphs and how crossings affect reading performance.\n  As an initial step to this end, a preliminary eye tracking experiment was\nconducted. The specific purpose of this experiment was to test the effects of\ncrossing angles and geometric-path tendency on eye movements and performance.\nSixteen subjects performed both path search and node locating tasks with six\ndrawings. The results showed that small angles can slow down and trigger extra\neye movements, causing delays for path search tasks, whereas crossings have\nlittle impact on node locating tasks. Geometric-path tendency indicates that a\npath between two nodes can become harder to follow when many branc

# CountVectorizer and TfidfVectorizer

In [43]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [37]:
vectorizer = CountVectorizer()

In [38]:
tf = vectorizer.fit_transform(descriptions)

In [41]:
vectorizer.get_feature_names()

['00',
 '000',
 '0001',
 '0001071',
 '0004609',
 '000s',
 '0025',
 '005',
 '00509488',
 '0071v4',
 '0084',
 '01',
 '010',
 '01020103',
 '0103200',
 '010401',
 '013',
 '015',
 '0167',
 '017',
 '0170',
 '02',
 '0207023',
 '024',
 '025',
 '029',
 '03',
 '0307',
 '031',
 '0310049',
 '035n',
 '036111',
 '0367',
 '04',
 '040',
 '0410',
 '0410460v2',
 '0412187',
 '0423',
 '046',
 '0463',
 '0476',
 '04db',
 '05',
 '0501076',
 '0501315',
 '0506134',
 '0511096',
 '0595',
 '05a',
 '05kw',
 '06',
 '0602345',
 '0604',
 '0604017',
 '0605181',
 '0609101',
 '0609825v5',
 '0612509v2',
 '062',
 '0625',
 '0665',
 '068',
 '069',
 '07',
 '0701020v2',
 '0701096v2',
 '0703125',
 '0708',
 '0709',
 '0710',
 '0711',
 '073',
 '0746',
 '076',
 '0783',
 '079',
 '07cc',
 '08',
 '0801',
 '0802',
 '0803',
 '0805',
 '0807',
 '0809',
 '081',
 '0810',
 '0811',
 '0812',
 '083',
 '085',
 '0854',
 '088',
 '0883',
 '08bss',
 '09',
 '0901',
 '0902',
 '0903',
 '0905',
 '0907',
 '0908',
 '0910',
 '0912',
 '0919',
 '0977',
 '0_

In [44]:
tfidf_vectorizer = TfidfVectorizer()

In [45]:
tfidf = tfidf_vectorizer.fit_transform(descriptions)

# SVD

In [50]:
svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
svd.fit(tfidf)
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)

[0.00506781 0.00607851 0.00492608 0.0036822  0.0034717  0.00300686
 0.00284712 0.00245702 0.00235219 0.00219536]
0.03608484518816286
[33.07165716  9.8596289   8.85028021  7.65733894  7.44016161  6.93299996
  6.7283403   6.25975719  6.11560688  5.90884147]


In [51]:
from sklearn.utils.extmath import randomized_svd

U, Sigma, VT = randomized_svd(tfidf, 
                              n_components=10,
                              n_iter=7,
                              random_state=42)

In [52]:
Sigma

array([33.07165716,  9.8596289 ,  8.85028021,  7.65733894,  7.44016161,
        6.93299996,  6.7283403 ,  6.25975719,  6.11560688,  5.90884147])

In [55]:
VT.shape

(10, 38246)

In [56]:
df_cs.shape

(16915, 8)

# Comp Sci - Word counts, n-gram counts

In [30]:
#import nltk
import string
#from nltk.stem.wordnet import WordNetLemmatizer

def tokenize(sentence):
    s = sentence.lower()
    #tokens = nltk.word_tokenize(s)
    translation = str.maketrans('', '', string.punctuation)
    s = s.translate(translation)
    tokens = s.split()
    #punctuation_set = set(string.punctuation)
    #tokens_no_punc = [t for t in tokens if t not in punctuation_set]
    #wordnet = WordNetLemmatizer()
    #lemmatized = [wordnet.lemmatize(word) for word in tokens_no_punc]

    #return lemmatized
    #return tokens_no_punc
    return tokens

In [31]:
descriptions_tokenized = [tokenize(s) for s in descriptions]

In [32]:
descriptions_tokenized[0]

['the',
 'aim',
 'of',
 'this',
 'study',
 'is',
 'to',
 'make',
 'a',
 'bibliometric',
 'comparison',
 'of',
 'the',
 'performance',
 'of',
 'research',
 'astronomers',
 'in',
 'the',
 'netherlands',
 'research',
 'school',
 'for',
 'astronomy',
 'nova',
 'with',
 'astronomers',
 'elsewhere',
 'by',
 'using',
 'the',
 'nasa',
 'astrophysics',
 'data',
 'system',
 'ads',
 'we',
 'use',
 'various',
 'indices',
 'for',
 'bibliometric',
 'performance',
 'for',
 'a',
 'sample',
 'of',
 'nova',
 'astronomers',
 'to',
 'compare',
 'to',
 'samples',
 'of',
 'astronomers',
 'worldwide',
 'and',
 'from',
 'the',
 'united',
 'states',
 'we',
 'give',
 'much',
 'weight',
 'to',
 'normalising',
 'bibliometric',
 'measures',
 'by',
 'number',
 'of',
 'authors',
 'and',
 'number',
 'of',
 'years',
 'since',
 'first',
 'publication',
 'in',
 'particular',
 'we',
 'calculate',
 'the',
 'hirshindex',
 'normalized',
 'to',
 'number',
 'of',
 'authors',
 'and',
 'for',
 'firstauthor',
 'papers',
 'second

In [42]:
words_cs = []
vocab_cs = set()

for sent in descriptions_tokenized:
    for word in sent:
        words_cs.append(word)
        vocab_cs.add(word)

In [44]:
len(words_cs)

3195865

In [45]:
len(vocab_cs)

72294

In [48]:
c = Counter(words_cs)

In [49]:
c.most_common(100)

[('the', 210396),
 ('of', 135913),
 ('a', 89100),
 ('and', 82074),
 ('to', 70410),
 ('in', 68731),
 ('is', 56807),
 ('we', 47420),
 ('for', 45004),
 ('that', 38684),
 ('this', 31748),
 ('with', 25215),
 ('on', 25204),
 ('are', 24275),
 ('by', 20408),
 ('an', 20202),
 ('as', 20201),
 ('be', 16889),
 ('which', 14676),
 ('can', 13848),
 ('paper', 12433),
 ('it', 11950),
 ('problem', 10746),
 ('from', 10739),
 ('algorithm', 10358),
 ('our', 10120),
 ('show', 8738),
 ('data', 8356),
 ('at', 8224),
 ('network', 7863),
 ('such', 7740),
 ('results', 7646),
 ('also', 7629),
 ('model', 7537),
 ('or', 7467),
 ('based', 7459),
 ('has', 7368),
 ('using', 7355),
 ('these', 7222),
 ('time', 7079),
 ('number', 7021),
 ('system', 6998),
 ('information', 6939),
 ('have', 6926),
 ('two', 6660),
 ('not', 6276),
 ('new', 6100),
 ('one', 5972),
 ('proposed', 5701),
 ('channel', 5639),
 ('systems', 5609),
 ('networks', 5574),
 ('used', 5515),
 ('algorithms', 5510),
 ('between', 5441),
 ('where', 5435),
 ('it

# Math descriptions and word count

In [40]:
math_descriptions = df_math['description'].values

In [41]:
math_descriptions_tokenized = [tokenize(s) for s in math_descriptions]

In [50]:
words_math = []
vocab_math = set()

for sent in math_descriptions_tokenized:
    for word in sent:
        words_math.append(word)
        vocab_math.add(word)

In [51]:
len(words_math)

6440380

In [52]:
len(vocab_math)

152388

In [53]:
c_math = Counter(words_math)

In [54]:
c_math.most_common(100)

[('the', 456964),
 ('of', 361459),
 ('a', 209841),
 ('and', 150926),
 ('in', 138187),
 ('we', 136198),
 ('is', 115536),
 ('to', 111747),
 ('for', 91751),
 ('that', 77353),
 ('this', 59302),
 ('on', 58318),
 ('with', 57407),
 ('are', 49687),
 ('by', 45323),
 ('an', 43544),
 ('as', 34350),
 ('which', 30412),
 ('be', 29601),
 ('show', 21267),
 ('paper', 20267),
 ('it', 19246),
 ('prove', 18691),
 ('also', 18240),
 ('from', 17461),
 ('these', 16804),
 ('space', 16510),
 ('results', 16374),
 ('group', 16219),
 ('some', 15865),
 ('if', 15647),
 ('can', 14501),
 ('such', 14456),
 ('study', 14405),
 ('two', 13909),
 ('theory', 13431),
 ('at', 13364),
 ('case', 13052),
 ('our', 13011),
 ('all', 12628),
 ('one', 12533),
 ('finite', 12338),
 ('or', 12149),
 ('has', 12080),
 ('function', 11883),
 ('give', 11460),
 ('number', 11196),
 ('result', 11126),
 ('problem', 11116),
 ('not', 11112),
 ('new', 11087),
 ('functions', 10933),
 ('where', 10805),
 ('given', 10801),
 ('over', 10796),
 ('then', 107

# Which top-100 words are in CS or Math but not both?

In [55]:
top_100_cs = [item[0] for item in c.most_common(100)]
top_100_math = [item[0] for item in c_math.most_common(100)]

In [65]:
cs_only = set(top_100_cs) - set(top_100_math)
cs_only_list = []
for word in cs_only:
    cs_only_list.append((word, c[word], c_math[word]))
cs_only_list_sorted = sorted(cs_only_list, key=lambda x: x[1], reverse=True)

In [67]:
for item in cs_only_list_sorted:
    print(f'{item[0]:12} {item[1]:7} {item[2]:7}')

algorithm      10358    3213
data            8356    3962
network         7863     714
based           7459    5744
information     6939    1512
proposed        5701    2212
channel         5639     207
systems         5609    6862
networks        5574     705
used            5515    4652
algorithms      5510    1206
each            5101    5816
performance     5002     563
approach        4813    5035
present         4665    6363
been            4601    3779
more            4438    5377
different       4300    4285
analysis        4253    4081
problems        4176    4294
both            4174    4878
graph           4154    5700
use             4133    5760
only            4042    6933
than            3972    4382
other           3871    4820
optimal         3830    2583
complexity      3777     835
codes           3615     300
into            3564    4722
but             3429    4723
work            3390    4901
rate            3311    2266
propose         3295    1706
bound         

In [69]:
math_only = set(top_100_math) - set(top_100_cs)
math_only_list = []
for word in math_only:
    math_only_list.append((word, c[word], c_math[word]))
math_only_list_sorted = sorted(math_only_list, key=lambda x: x[2], reverse=True)

In [70]:
for item in math_only_list_sorted:
    print(f'{item[0]:12} {item[1]:7} {item[2]:7}')

prove           2409   18691
space           2388   16510
group           1102   16219
theory          2724   13431
finite          2187   12338
function        3098   11883
give            2087   11460
result          2961   11126
functions       2267   10933
g               1390   10733
algebra          473   10530
equation         453   10267
equations        701    9818
class           2355    9681
x                971    9461
field           1241    9170
general         3125    8997
theorem         1025    8993
groups           607    8874
spaces           447    8746
solutions       1392    8724
type            1216    8365
order           3089    8309
particular      2467    8258
consider        2963    8216
let              447    8070
under           2980    7965
there           3082    7919
terms           2131    7757
certain         1478    7540
random          3083    7525
properties      2591    7519
quantum         1831    7454
conditions      1603    7434
structure     

In [71]:
def print_top_100_diff(df, set_spec1, set_spec2):
    df1 = df[df['set_spec'] == set_spec1]
    df2 = df[df['set_spec'] == set_spec2]

    desc1 = df1['description'].values
    desc2 = df2['description'].values

    tokens1 = [tokenize(s) for s in desc1]
    tokens2 = [tokenize(s) for s in desc2]

    words1 = []
    vocab1 = set()

    for sent in tokens1:
        for word in sent:
            words1.append(word)
            vocab1.add(word)
    wordcount1 = Counter(words1)
    
    words2 = []
    vocab2 = set()

    for sent in tokens2:
        for word in sent:
            words2.append(word)
            vocab2.add(word)
    wordcount2 = Counter(words2)
    
    top_100_1 = [item[0] for item in wordcount1.most_common(100)]
    top_100_2 = [item[0] for item in wordcount2.most_common(100)]
    
    only1 = set(top_100_1) - set(top_100_2)
    only1_list = []
    for word in only1:
        only1_list.append((word, wordcount1[word], wordcount2[word]))
    only1_list_sorted = sorted(only1_list, key=lambda x: x[1], reverse=True)

    for item in only1_list_sorted:
        print(f'{item[0]:12} {item[1]:7} {item[2]:7}')

# Math vs. CS

In [72]:
print_top_100_diff(df, 'math', 'cs')

prove          18691    2409
space          16510    2388
group          16219    1102
theory         13431    2724
finite         12338    2187
function       11883    3098
give           11460    2087
result         11126    2961
functions      10933    2267
g              10733    1390
algebra        10530     473
equation       10267     453
equations       9818     701
class           9681    2355
x               9461     971
field           9170    1241
general         8997    3125
theorem         8993    1025
groups          8874     607
spaces          8746     447
solutions       8724    1392
type            8365    1216
order           8309    3089
particular      8258    2467
consider        8216    2963
let             8070     447
under           7965    2980
there           7919    3082
terms           7757    2131
certain         7540    1478
random          7525    3083
properties      7519    2591
quantum         7454    1831
conditions      7434    1603
structure     

# CS vs. Math

In [73]:
print_top_100_diff(df, 'cs', 'math')

algorithm      10358    3213
data            8356    3962
network         7863     714
based           7459    5744
information     6939    1512
proposed        5701    2212
channel         5639     207
systems         5609    6862
networks        5574     705
used            5515    4652
algorithms      5510    1206
each            5101    5816
performance     5002     563
approach        4813    5035
present         4665    6363
been            4601    3779
more            4438    5377
different       4300    4285
analysis        4253    4081
problems        4176    4294
both            4174    4878
graph           4154    5700
use             4133    5760
only            4042    6933
than            3972    4382
other           3871    4820
optimal         3830    2583
complexity      3777     835
codes           3615     300
into            3564    4722
but             3429    4723
work            3390    4901
rate            3311    2266
propose         3295    1706
bound         

# physics:cond-mat vs Math

In [74]:
print_top_100_diff(df, 'physics:cond-mat', 'math')

magnetic       19923    1014
phase          19644    1879
temperature    17235     676
spin           16429    1238
state          15690    2982
energy         14024    3589
transition     13548    1229
states         12807    3113
density        11758    3149
systems        10215    6862
both            9436    4878
different       9185    4285
effect          9105     906
find            8807    3388
critical        8556    2460
surface         8429    3978
interaction     8389     974
lattice         8362    2638
found           8182    1732
observed        8112     686
dynamics        8105    2764
electron        8021     221
been            7851    3779
graphene        7522      60
coupling        7508     982
present         7487    6363
well            7357    4906
large           6936    4927
low             6922     711
potential       6920    2996
behavior        6888    2601
single          6693    1275
current         6690     711
experimental    6627     322
high          

# Math vs. Statistics

In [75]:
print_top_100_diff(df, 'math', 'stat')

prove          18691      65
space          16510     198
group          16219     101
if             15647     145
theory         13431     140
case           13052     211
all            12628     220
finite         12338     103
give           11460      87
result         11126      86
functions      10933     182
given          10801     227
then           10751     220
g              10733      16
n              10599      63
algebra        10530       3
equation       10267      23
equations       9818      24
class           9681     205
any             9631     121
x               9461      61
field           9170      77
general         8997     220
theorem         8993      31
groups          8874      69
spaces          8746      36
solutions       8724      44
type            8365      84
order           8309     178
particular      8258     197
consider        8216     175
system          8112     113
let             8070      13
first           7994     223
there         

# Statistics vs. Math

In [76]:
print_top_100_diff(df, 'stat', 'math')

data            1778    3962
models           954    5163
methods          698    3845
analysis         550    4081
approach         550    5035
based            550    5744
algorithm        541    3213
distribution     539    4741
proposed         490    2212
regression       473     794
used             464    4652
statistical      463    1140
estimation       453    1157
bayesian         435     326
use              384    5760
more             378    5377
variables        361    2999
both             346    4878
propose          344    1706
information      341    1512
process          325    4959
selection        317     640
statistics       316     955
likelihood       315     542
than             314    4382
performance      304     563
parameters       303    2366
distributions     301    2019
sample           296     926
each             296    5816
estimator        291    1158
different        285    4285
other            285    4820
inference        282     250
many         

# Machine Learning papers?
## First, find subjects that have "machine learning" in them

In [6]:
machine_learning_subjects = df[df['subjects'].str.lower().str.contains('machine learning')]['subjects']

In [9]:
machine_learning_subjects

78         Statistics - Machine Learning,Statistics - App...
94         Computer Science - Machine Learning,Statistics...
151                            Statistics - Machine Learning
224        Computer Science - Social and Information Netw...
271        Statistics - Machine Learning,Computer Science...
                                 ...                        
1601096    Computer Science - Machine Learning,Computer S...
1601186    Computer Science - Machine Learning,Statistics...
1601240    Computer Science - Cryptography and Security,C...
1601258    Computer Science - Computer Vision and Pattern...
1601269    Mathematics - Optimization and Control,Compute...
Name: subjects, Length: 48564, dtype: object

## Second, get a count of the subjects that have "machine learning" in them

In [11]:
ml_subject_counts = Counter([item for x in machine_learning_subjects.values for item in x.split(',')])

In [13]:
ml_subject_counts.most_common(20)

[('Computer Science - Machine Learning', 41875),
 ('Statistics - Machine Learning', 31114),
 ('Computer Science - Computer Vision and Pattern Recognition', 7092),
 ('Computer Science - Artificial Intelligence', 6945),
 ('Computer Science - Computation and Language', 3550),
 ('Computer Science - Neural and Evolutionary Computing', 2963),
 ('Mathematics - Optimization and Control', 2457),
 ('Computer Science - Information Theory', 1757),
 ('Mathematics - Statistics Theory', 1744),
 ('Computer Science - Information Retrieval', 1662),
 ('Statistics - Methodology', 1617),
 ('Computer Science - Cryptography and Security', 1464),
 ('Computer Science - Robotics', 1323),
 ('Computer Science - Social and Information Networks', 1198),
 ('Computer Science - Data Structures and Algorithms', 1179),
 ('Computer Science - Sound', 1143),
 ('Statistics - Applications', 1117),
 ('Statistics - Computation', 962),
 ('Electrical Engineering and Systems Science - Audio and Speech Processing',
  901),
 ('Elec