# Classification with 279 Labels

## Prepare data

In [1]:
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt

print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

issue_codes = list(sc.issue_codes.keys())
issue_codes.append('-1')
issue_codes.sort()

labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for i,record in enumerate(sc.records(limit=-1)):
    if record['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        labels.append(labels_index[record['issue']])
    texts.append(record['text'])

print('Found %s documents.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}
Found 8419 documents.
Found 279 labels.


In [2]:
import logging
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
dictionary = corpora.Dictionary.load_from_text('scotus.dict')

In [4]:
corpus = corpora.MmCorpus('scotus_corpus.mm')

2018-04-01 23:38:00,900 : INFO : loaded corpus index from scotus_corpus.mm.index
2018-04-01 23:38:00,901 : INFO : initializing cython corpus reader from scotus_corpus.mm
2018-04-01 23:38:00,902 : INFO : accepted corpus with 8419 documents, 56365 features, 7829471 non-zero entries


In [5]:
from gensim.models import TfidfModel

tfidf_model = TfidfModel(corpus)

2018-04-01 23:38:00,907 : INFO : collecting document frequencies
2018-04-01 23:38:00,908 : INFO : PROGRESS: processing document #0
2018-04-01 23:38:06,497 : INFO : calculating IDF weights for 8419 documents and 56364 features (7829471 matrix non-zeros)


In [6]:
corpus_tfidf = tfidf_model[corpus]

## LDA

In [7]:
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=300)

2018-04-01 23:38:06,648 : INFO : using symmetric alpha at 0.0033333333333333335
2018-04-01 23:38:06,649 : INFO : using symmetric eta at 0.0033333333333333335
2018-04-01 23:38:06,660 : INFO : using serial LDA version on this node
2018-04-01 23:38:08,938 : INFO : running online (single-pass) LDA training, 300 topics, 1 passes over the supplied corpus of 8419 documents, updating model once every 2000 documents, evaluating perplexity every 8419 documents, iterating 50x with a convergence threshold of 0.001000
2018-04-01 23:38:12,053 : INFO : PROGRESS: pass 0, at document #2000/8419
2018-04-01 23:38:23,196 : INFO : merging changes from 2000 documents into a model of 8419 documents
2018-04-01 23:38:25,452 : INFO : topic #233 (0.003): 0.005*"enemy" + 0.004*"cement" + 0.004*"poy" + 0.004*"labor" + 0.003*"taxicab" + 0.003*"chin" + 0.003*"alstate" + 0.003*"ally" + 0.003*"collins" + 0.003*"pennsylvania"
2018-04-01 23:38:25,453 : INFO : topic #242 (0.003): 0.007*"tax" + 0.004*"confession" + 0.004*

In [8]:
corpus_lda = lda[corpus_tfidf]

In [9]:
lda.save('scotus.lda')

2018-04-01 23:39:56,667 : INFO : saving LdaState object under scotus.lda.state, separately None
2018-04-01 23:39:56,668 : INFO : storing np array 'sstats' to scotus.lda.state.sstats.npy
2018-04-01 23:39:56,714 : INFO : saved scotus.lda.state
2018-04-01 23:39:56,738 : INFO : saving LdaModel object under scotus.lda, separately ['expElogbeta', 'sstats']
2018-04-01 23:39:56,738 : INFO : not storing attribute state
2018-04-01 23:39:56,739 : INFO : not storing attribute dispatcher
2018-04-01 23:39:56,739 : INFO : storing np array 'expElogbeta' to scotus.lda.expElogbeta.npy
2018-04-01 23:39:56,874 : INFO : not storing attribute id2word
2018-04-01 23:39:56,876 : INFO : saved scotus.lda


In [10]:
# reshape data for classification
import gensim
X = np.transpose(gensim.matutils.corpus2dense(corpus_lda, num_terms=90018))

## Train test split

In [11]:
y = np.array(labels)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print(X_train.shape)
print(y_train.shape)

(6735, 90018)
(6735,)


In [14]:
print(X_test.shape)
print(y_test.shape)

(1684, 90018)
(1684,)


## Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [16]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
score = logreg.score(X_test, y_test)
print(score)

0.1342042755344418


# Logistic Regression Classification with 15 Labels

## Prepare data

In [18]:
sc = SupremeCourt()
print(sc.info)

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

issue_codes = list(sc.issue_area_codes.keys())
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]

labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

for i,record in enumerate(sc.records(limit=-1)):
    if record['issue'] == None: # some cases have None as an issue
        labels.append(labels_index['-1'])
    else:
        labels.append(labels_index[record['issue'][:-4]])
    texts.append(record['text'])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

{'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.'}
Found 8419 texts.
Found 15 labels.


## Train test split

In [19]:
y = np.array(labels)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
print(X_train.shape)
print(y_train.shape)

(6735, 90018)
(6735,)


In [22]:
print(X_test.shape)
print(y_test.shape)

(1684, 90018)
(1684,)


## Logistic regression

In [23]:
logreg = LogisticRegression()

In [24]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
score = logreg.score(X_test, y_test)
print(score)

0.40320665083135393
