We want to use the BoW representation as a competitive baseline to predict citations. However, there are two concerns:

- The size of the corpus is in the millions. So we want sparse representations of the documents for speed and accuracy.
- We don't care about the CI for the individual features. Instead, we just want to explain as much variance through these features.

Thus, the strategy we use is as follows:

Get a sample of the corpus (eg. 1 million documents) and divide into training and test set. Train the best regression model and then make predictions for all documents in the corpus. Use this as a feature in the downstream regression.

In [7]:
import numpy as np
import os
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline

In [25]:
MILLION = 1000000
STATS_DIR = "/hg191/corpora/legaldata/data/stats/"
COURTS_CATEGORIES_FILE = "/hg191/corpora/legaldata/data/court.categories"
CURRENT_YEAR = 2018
NOBS = 50000
TRAIN_OBS_RANGE = (0,int(0.95*NOBS))
TEST_OBS_RANGE = (int(0.95*NOBS),NOBS)

In [26]:
with open (os.path.join (STATS_DIR, "ops.list")) as fin:
    ids = [(i, int(line.strip())) for i,line in enumerate (fin)]
dict_ids = {elem[0]: elem[1] for elem in ids}

In [27]:
# initialize a seed for repeatability
np.random.seed(100)
indices = np.random.choice(len(ids), NOBS, replace=False)

In [28]:
ind = pd.read_csv (os.path.join (STATS_DIR, "ops.ind"), sep=",", header=None, names=["id", "ind"])

In [29]:
def df2dict (df, key, val):
    keys = df[key].values
    vals = df[val].values
    
    assert (len(keys) == len (vals))
    return {keys[i]: vals[i] for i in range (len(keys))}

In [30]:
I = df2dict (ind, "id", "ind")

In [31]:
def readCorpus (filename, indices, verbose=True):
    corpus = dict ()
    with open (filename) as fin:
        for i, line in enumerate (fin):
            if i in indices:
                corpus[i] = line.strip()
            if verbose and i % MILLION == 0:
                print ("Lines processed: {0}".format (i))
    return corpus

In [32]:
corpus = readCorpus (os.path.join (STATS_DIR, "ops.docs"), indices)

Lines processed: 0
Lines processed: 1000000
Lines processed: 2000000
Lines processed: 3000000


In [33]:
y = np.array ([I[ids[index][1]] for index in indices])
raw = [corpus[index] for index in indices]

In [34]:
count_vect = CountVectorizer(max_features=10000, min_df=3, max_df=0.9)
lm = LinearRegression()

In [36]:
X = count_vect.fit_transform (raw)
lm.fit(X, np.log (y+1))
yhat = np.exp(lm.predict (X))

In [37]:
from sklearn.metrics import mean_squared_log_error
print (mean_squared_log_error(y,yhat))

1.0269768991014914


In [38]:
ops = [dict_ids[index] for index in indices]
with open (os.path.join (STATS_DIR, "ops.temp.bowfeat"), "w") as fout:
    for i in range (len (ops)):
        fout.write ("{0},{1}\n".format (ops[i], yhat[i]))