In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
from collections import Counter
import glob
import os
import pickle
import re
import time

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.manifold import TSNE
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 16})

NOTEBOOKS_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.split(NOTEBOOKS_DIR)[0]
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

FINAL_DF_FILEPATH = os.path.join(PROCESSED_DATA_DIR, 'final.csv')
ML_ONLY_FILEPATH = os.path.join(PROCESSED_DATA_DIR, 'machine_learning_only.csv')

In [4]:
df = pd.read_csv(FINAL_DF_FILEPATH, encoding='utf-8')
df_ml = pd.read_csv(ML_ONLY_FILEPATH, encoding='utf-8')

# Can we predict the macro category of a paper? What about micro-categories?
## Starting with macro categories, on the full dataset

In [6]:
df['macro_set'] = df['set_spec'].apply(lambda x: 'physics' if 'physics' in x else x)

In [7]:
df.head()

Unnamed: 0,identifier,url,title,set_spec,subjects,authors,dates,description,macro_set
0,oai:arXiv.org:1311.5328,http://arxiv.org/abs/1311.5328,Quenched invariance principle for a long-range...,math,"Mathematics - Probability,Primary 60K37, 60F17...","Zhongyang, Zhang,Li-Xin, Zhang","2013-11-21,2016-05-17",We consider a random walk on a random graph ...,math
1,oai:arXiv.org:1311.5329,http://arxiv.org/abs/1311.5329,Remarks on the energy release rate for an anti...,math,Mathematical Physics,"Morini, L.,Piccolroaz, A.,Mishuris, G.","2013-11-21,2014-04-10",This paper is concerned with the steady-stat...,math
2,oai:arXiv.org:1311.5330,http://arxiv.org/abs/1311.5330,Color spin wave functions of heavy tetraquark ...,physics:nucl-th,Nuclear Theory,"Park, Woosung,Lee, Su Houng",2013-11-21,"Using the variational method, we calculate t...",physics
3,oai:arXiv.org:1311.5331,http://arxiv.org/abs/1311.5331,"Stability, Electronic and Magnetic properties ...",physics:cond-mat,Condensed Matter - Materials Science,"Zhang, Jian-Min,Ming, Wenmei,Huang, Zhigao,Liu...",2013-11-21,Magnetic interaction with the gapless surfac...,physics
4,oai:arXiv.org:1311.5332,http://arxiv.org/abs/1311.5332,"On a Conjecture of Erd\H{o}s, Gallai, and Tuza",math,"Mathematics - Combinatorics,05C70","Puleo, Gregory J.","2013-11-21,2014-10-13","Erd\H{o}s, Gallai, and Tuza posed the follow...",math


In [8]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_ml = tfidf_vectorizer.fit_transform(df['description'])
features = np.array(tfidf_vectorizer.get_feature_names())

In [11]:
tfidf = tfidf_ml

In [9]:
tfidf.shape

(1603370, 526112)

In [10]:
df.shape

(1603370, 9)

In [14]:
X = tfidf
y = df['macro_set'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
nb = MultinomialNB()

In [18]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
y_pred = nb.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score

In [23]:
accuracy_score(y_test, y_pred)

0.8940857143569927

In [25]:
Counter(y_test)

Counter({'physics': 235783,
         'math': 93703,
         'cs': 59153,
         'nlin': 3271,
         'eess': 870,
         'stat': 3774,
         'q-bio': 2865,
         'econ': 312,
         'q-fin': 1112})

In [26]:
Counter(y_pred)

Counter({'physics': 244052,
         'math': 92744,
         'cs': 64033,
         'q-bio': 6,
         'stat': 5,
         'nlin': 3})

In [32]:
print(235783 / sum(Counter(y_test).values()))
print((235783 + 93703) / sum(Counter(y_test).values()))
print((235783 + 93703 + 59153) / sum(Counter(y_test).values()))

0.5882178309213333
0.8219826715197721
0.9695541645981095


## Let's try lumping all the non-physics/math/cs papers into one category, and then see how it performs

In [34]:
df['macro_set_reduced'] = df['macro_set'].apply(lambda x: x if x in ('physics', 'math', 'cs') else 'other')

In [35]:
df['macro_set_reduced'].value_counts()

physics    942170
math       376287
cs         236782
other       48131
Name: macro_set_reduced, dtype: int64

In [36]:
y_alt = df['macro_set_reduced'].values
y_alt_train, y_alt_test = train_test_split(y_alt, random_state=42)

In [59]:
nb_alt = MultinomialNB()
nb_alt.fit(X_train, y_alt_train)
y_alt_pred = nb_alt.predict(X_test)

In [60]:
accuracy_score(y_alt_test, y_alt_pred)

0.8946869472586524

In [61]:
incorrect_idxs = y_alt_pred != y_alt_test

In [62]:
incorrect = np.array(list(zip(list(y_alt_pred), list(y_alt_test))))[incorrect_idxs]

In [63]:
incorrect_strings = []
for row in incorrect:
    incorrect_strings.append(' -- '.join(row))

In [64]:
print('predicted -- actual: count')
Counter(incorrect_strings).most_common(100)

predicted -- actual: count


[('physics -- math', 9723),
 ('cs -- other', 5686),
 ('math -- cs', 5604),
 ('math -- physics', 5068),
 ('physics -- other', 4719),
 ('cs -- physics', 3756),
 ('cs -- math', 3448),
 ('physics -- cs', 2614),
 ('math -- other', 1525),
 ('other -- physics', 46),
 ('other -- math', 15),
 ('other -- cs', 10)]

# Let's try oversampling the minority classes

In [65]:
from imblearn.over_sampling import RandomOverSampler

In [66]:
ros = RandomOverSampler(random_state=42)

In [67]:
X_resampled, y_alt_resampled = ros.fit_resample(X, y_alt)

In [69]:
X_res_train, X_res_test, y_alt_res_train, y_alt_res_test = train_test_split(X_resampled, y_alt_resampled, random_state=42)

In [70]:
nb_res_alt = MultinomialNB()
nb_res_alt.fit(X_res_train, y_alt_res_train)
y_alt_res_pred = nb_alt.predict(X_res_test)

In [71]:
accuracy_score(y_alt_res_test, y_alt_res_pred)

0.6788849146120127

Well that didn't work! When I get more time, I'll try SMOTE or something. And also write some functions to be able to check the misclassified labels more easily for different models.