# Classification with 279 Labels

## Prepare data

In [1]:
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt

print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

issue_codes = list(sc.issue_codes.keys())
issue_codes.append('-1')
issue_codes.sort()

labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for i,record in enumerate(sc.records(limit=-1)):
    if record['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        labels.append(labels_index[record['issue']])
    texts.append(record['text'])

print('Found %s documents.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.', 'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court'}
Found 8419 documents.
Found 279 labels.


In [2]:
import logging
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
dictionary = corpora.Dictionary.load_from_text('scotus.dict')

In [4]:
corpus = corpora.MmCorpus('scotus_corpus.mm')

2018-07-10 02:50:20,450 : INFO : loaded corpus index from scotus_corpus.mm.index
2018-07-10 02:50:20,451 : INFO : initializing cython corpus reader from scotus_corpus.mm
2018-07-10 02:50:20,452 : INFO : accepted corpus with 8419 documents, 56365 features, 7829471 non-zero entries


In [5]:
from gensim.models import TfidfModel

tfidf_model = TfidfModel(corpus)

2018-07-10 02:50:20,457 : INFO : collecting document frequencies
2018-07-10 02:50:20,458 : INFO : PROGRESS: processing document #0
2018-07-10 02:50:26,676 : INFO : calculating IDF weights for 8419 documents and 56364 features (7829471 matrix non-zeros)


In [6]:
corpus_tfidf = tfidf_model[corpus]

## LDA

In [7]:
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=300)

2018-07-10 02:50:26,828 : INFO : using symmetric alpha at 0.0033333333333333335
2018-07-10 02:50:26,829 : INFO : using symmetric eta at 0.0033333333333333335
2018-07-10 02:50:26,841 : INFO : using serial LDA version on this node
2018-07-10 02:50:29,186 : INFO : running online (single-pass) LDA training, 300 topics, 1 passes over the supplied corpus of 8419 documents, updating model once every 2000 documents, evaluating perplexity every 8419 documents, iterating 50x with a convergence threshold of 0.001000
2018-07-10 02:50:32,520 : INFO : PROGRESS: pass 0, at document #2000/8419
2018-07-10 02:51:29,176 : INFO : merging changes from 2000 documents into a model of 8419 documents
2018-07-10 02:51:32,613 : INFO : topic #112 (0.003): 0.006*"tax" + 0.003*"leasehold" + 0.003*"commerce" + 0.003*"interstate" + 0.003*"kaiser" + 0.003*"rent" + 0.002*"bulova" + 0.002*"indictment" + 0.002*"arbitration" + 0.002*"landlord"
2018-07-10 02:51:32,615 : INFO : topic #233 (0.003): 0.005*"sentence" + 0.004*"

In [8]:
corpus_lda = lda[corpus_tfidf]

In [9]:
lda.save('scotus.lda')

2018-07-10 03:01:43,448 : INFO : saving LdaState object under scotus.lda.state, separately None
2018-07-10 03:01:43,449 : INFO : storing np array 'sstats' to scotus.lda.state.sstats.npy
2018-07-10 03:01:43,623 : INFO : saved scotus.lda.state
2018-07-10 03:01:43,684 : INFO : saving LdaModel object under scotus.lda, separately ['expElogbeta', 'sstats']
2018-07-10 03:01:43,689 : INFO : not storing attribute state
2018-07-10 03:01:43,693 : INFO : not storing attribute dispatcher
2018-07-10 03:01:43,695 : INFO : not storing attribute id2word
2018-07-10 03:01:43,695 : INFO : storing np array 'expElogbeta' to scotus.lda.expElogbeta.npy
2018-07-10 03:01:43,868 : INFO : saved scotus.lda


In [10]:
# reshape data for classification
import gensim
X = np.transpose(gensim.matutils.corpus2dense(corpus_lda, num_terms=90018))

## Train test split

In [11]:
y = np.array(labels)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [13]:
print(X_train.shape)
print(y_train.shape)

(6819, 90018)
(6819,)


In [14]:
print(X_test.shape)
print(y_test.shape)

(842, 90018)
(842,)


## Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [16]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
score = logreg.score(X_test, y_test)
print(score)
score = logreg.score(X_val, y_val)
print(score)

0.10451306413301663
0.12269129287598944


In [18]:
from sklearn.metrics import classification_report

report = classification_report(y_val, logreg.predict(X_val), labels=np.arange(len(issue_codes)))
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.00      0.00      0.00         0
          2       0.00      0.00      0.00         8
          3       0.00      0.00      0.00        11
          4       0.00      0.00      0.00         1
          5       0.00      0.00      0.00         2
          6       0.00      0.00      0.00         2
          7       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.00      0.00      0.00         4
         11       0.00      0.00      0.00         2
         12       0.00      0.00      0.00         1
         13       0.00      0.00      0.00         3
         14       0.00      0.00      0.00         2
         15       0.15      0.75      0.25        24
         16       0.00      0.00      0.00         4
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# Logistic Regression Classification with 15 Labels

## Prepare data

In [19]:
sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

issue_codes = list(sc.issue_area_codes.keys())
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]

labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for i,record in enumerate(sc.records(limit=-1)):
    if record['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        labels.append(labels_index[record['issue'][:-4]])
    texts.append(record['text'])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

{'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.', 'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court'}
Found 8419 texts.
Found 15 labels.


## Train test split

In [20]:
y = np.array(labels)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [22]:
print(X_train.shape)
print(y_train.shape)

(6819, 90018)
(6819,)


In [23]:
print(X_test.shape)
print(y_test.shape)

(842, 90018)
(842,)


## Logistic regression

In [24]:
logreg = LogisticRegression()

In [25]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
score = logreg.score(X_test, y_test)
print(score)
score = logreg.score(X_val, y_val)
print(score)

0.43824228028503565
0.45382585751978893


In [27]:
from sklearn.metrics import classification_report

report = classification_report(y_val, logreg.predict(X_val), labels=np.arange(len(issue_codes)))
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.46      0.90      0.61       183
          2       0.46      0.43      0.44       121
          3       0.83      0.09      0.16        56
          4       0.00      0.00      0.00        33
          5       0.00      0.00      0.00         9
          6       0.00      0.00      0.00        11
          7       0.50      0.15      0.23        33
          8       0.44      0.70      0.54       145
          9       0.38      0.16      0.22       102
         10       0.00      0.00      0.00        33
         11       0.00      0.00      0.00         5
         12       1.00      0.04      0.08        25
         13       0.00      0.00      0.00         1
         14       0.00      0.00      0.00         0

avg / total       0.44      0.45      0.38       758



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
