We want to use the BoW representation as a competitive baseline to predict citations. However, there are two concerns:

- The size of the corpus is in the millions. So we want sparse representations of the documents for speed and accuracy.
- We don't care about the CI for the individual features. Instead, we just want to explain as much variance through these features.

Thus, the strategy we use is as follows:

Get a sample of the corpus (eg. 1 million documents) and divide into training and test set. Train the best regression model and then make predictions for all documents in the corpus. Use this as a feature in the downstream regression.

In [1]:
import numpy as np
import os

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline

In [15]:
MILLION = 1000000
STATS_DIR = "/hg191/corpora/academic-data/semantic-scholar/stats/"
INDEG_FILE = os.path.join (STATS_DIR, "abs.ind")
DOCS_FILE = os.path.join (STATS_DIR, "abs.docs")
LINENUMS_FILE = os.path.join (STATS_DIR, "abs.bow-linenums")

In [3]:
with open (INDEG_FILE) as fin:
    indegrees = [int(line.strip().split(",")[1]) for line in fin]        

In [10]:
indices = set (np.random.choice (len(indegrees), MILLION, replace=False))

In [13]:
def readCorpus (filename, indices, verbose=True):
    corpus = list ()
    for i, line in enumerate (fin):
        if i in indices:
            corpus.append (line.strip())
        
        if verbose and i % MILLION == 0:
            print ("Lines processed: {0}".format (i))
    return corpus

In [None]:
corpus = readCorpus (DOCS_FILE, indices)

In [11]:
#corpus = list ()
#with open (DOCS_FILE) as fin:
#    for i, line in enumerate (fin):
#        if i in indices:
#            corpus.append (line.strip())
#        
#        if i % MILLION == 0:
#            print ("Lines processed: {0}".format (i))

Lines processed: 0
Lines processed: 1000000
Lines processed: 2000000
Lines processed: 3000000
Lines processed: 4000000
Lines processed: 5000000
Lines processed: 6000000
Lines processed: 7000000
Lines processed: 8000000
Lines processed: 9000000
Lines processed: 10000000
Lines processed: 11000000
Lines processed: 12000000
Lines processed: 13000000
Lines processed: 14000000
Lines processed: 15000000
Lines processed: 16000000
Lines processed: 17000000
Lines processed: 18000000
Lines processed: 19000000
Lines processed: 20000000
Lines processed: 21000000
Lines processed: 22000000
Lines processed: 23000000
Lines processed: 24000000
Lines processed: 25000000
Lines processed: 26000000
Lines processed: 27000000


In [62]:
y = np.array ([indegrees[i] for i in range (len(indegrees)) if i in indices])

In [16]:
with open (LINENUMS_FILE, "w") as fout:
    for index in indices:
        fout.write ("{0}\n".format (index))

In [68]:
count_vect = CountVectorizer(max_features=10000, min_df=3, max_df=0.9)
lm = LinearRegression()

In [73]:
k=50000
X = count_vect.fit_transform (corpus[:k])
cv_results = cross_validate(lm, X, np.log (y[:k] + 1), cv=5,
                            return_train_score=False)

In [74]:
print (np.mean(cv_results["test_score"]), np.std(cv_results["test_score"]))

-0.2462995478932446 0.17468092078831565


In [78]:
lm.fit(X, np.log (y[:k] + 1))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [80]:
lm.predict (count_vect.transform([corpus[0]]))

array([0.54827673])

In [63]:
lm.fit(X, np.log (y[:25000] + 1))
lm.score(X,np.log (y[:25000] + 1))

0.5273972302695069