# Logistic Regression Classification with 279 Labels

## Prepare data

In [1]:
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt

print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

# Label vocabulary: every issue code the dataset declares, plus a '-1'
# sentinel used for decisions whose 'issue' field is None.
issue_codes = sorted(list(sc.issue_codes.keys()) + ['-1'])

# dictionary mapping label name (issue code string) to numeric class id
labels_index = {code: idx for idx, code in enumerate(issue_codes)}

texts = []  # list of text samples
labels = []  # list of label ids, kept aligned with `texts`

for record in sc.records(limit=-1):
    issue = record['issue']
    # Some cases have None as an issue. Test with `is None`: `== None`
    # goes through __eq__ and is not a reliable identity check.
    labels.append(labels_index['-1' if issue is None else issue])
    texts.append(record['text'])

print('Found %s documents.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court', 'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.', 'name': 'supreme_court', 'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court'}
Found 8419 documents.
Found 279 labels.


In [2]:
import logging

# Configure logging at INFO level with a "timestamp : level : message" layout;
# the gensim progress lines recorded in the cells below use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
from gensim.models.doc2vec import TaggedDocument

# One string tag per document: the document's position in `texts`.
doc_tags = [str(i) for i in range(len(texts))]


def get_tagged_docs(doc_list, tags_list):
    """Wrap raw texts as ``TaggedDocument`` objects for Doc2Vec.

    Parameters
    ----------
    doc_list : iterable of str
        Raw document texts; each is tokenised with ``str.split()``.
    tags_list : iterable or None
        One tag per document, aligned with ``doc_list``. When ``None``, the
        document's position (as a string) is used as its tag.

    Returns
    -------
    list of TaggedDocument
        One entry per document, each carrying a single tag.

    Raises
    ------
    ValueError
        If ``tags_list`` is given and its length differs from ``doc_list``.
    """
    docs = list(doc_list)
    if tags_list is None:
        tags = [str(i) for i in range(len(docs))]
    else:
        # Honour the caller's tags; previously this argument was ignored.
        tags = list(tags_list)
        if len(tags) != len(docs):
            raise ValueError(
                'tags_list has %d entries but doc_list has %d'
                % (len(tags), len(docs))
            )
    return [
        TaggedDocument(words=doc.split(), tags=[tag])
        for doc, tag in zip(docs, tags)
    ]


# Despite the name this is a plain list, so it can be iterated repeatedly
# (once for vocabulary building and again for training).
docs_generator = get_tagged_docs(texts, doc_tags)

2018-04-02 01:11:04,909 : INFO : 'pattern' package not found; tag filters are not available for English


In [4]:
from gensim.models import Doc2Vec

# Doc2Vec hyper-parameters gathered in one place so they are easy to tune.
doc2vec_params = {
    'vector_size': 300,
    'window': 10,
    'min_count': 5,
    'workers': 11,
    'alpha': 0.025,
    'min_alpha': 0.005,
}
model = Doc2Vec(**doc2vec_params)

# First pass over the corpus: collect the vocabulary and the document tags.
model.build_vocab(docs_generator)

# Training passes over the same tagged documents.
model.train(docs_generator, total_examples=len(texts), epochs=30)

2018-04-02 01:11:10,273 : INFO : collecting all words and their counts
2018-04-02 01:11:10,274 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-04-02 01:11:22,658 : INFO : collected 726981 word types and 8419 unique tags from a corpus of 8419 examples and 58601346 words
2018-04-02 01:11:22,659 : INFO : Loading a fresh vocabulary
2018-04-02 01:11:25,534 : INFO : min_count=5 retains 174584 unique words (24% of original 726981, drops 552397)
2018-04-02 01:11:25,535 : INFO : min_count=5 leaves 57765706 word corpus (98% of original 58601346, drops 835640)
2018-04-02 01:11:26,056 : INFO : deleting the raw counts dictionary of 726981 items
2018-04-02 01:11:26,092 : INFO : sample=0.001 downsamples 30 most-common words
2018-04-02 01:11:26,092 : INFO : downsampling leaves estimated 45524171 word corpus (78.8% of prior 57765706)
2018-04-02 01:11:26,773 : INFO : estimated required memory for 174584 words and 300 dimensions: 518080200 bytes
2018-04-02 01:11:26,77

2018-04-02 01:12:22,743 : INFO : EPOCH 2 - PROGRESS: at 71.15% examples, 1226925 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:12:23,749 : INFO : EPOCH 2 - PROGRESS: at 73.58% examples, 1226976 words/s, in_qsize 22, out_qsize 0
2018-04-02 01:12:24,762 : INFO : EPOCH 2 - PROGRESS: at 76.23% examples, 1227591 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:12:25,774 : INFO : EPOCH 2 - PROGRESS: at 78.77% examples, 1227954 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:12:26,781 : INFO : EPOCH 2 - PROGRESS: at 81.30% examples, 1228627 words/s, in_qsize 17, out_qsize 4
2018-04-02 01:12:27,786 : INFO : EPOCH 2 - PROGRESS: at 83.91% examples, 1229123 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:12:28,789 : INFO : EPOCH 2 - PROGRESS: at 86.36% examples, 1229866 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:12:29,796 : INFO : EPOCH 2 - PROGRESS: at 88.99% examples, 1230442 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:12:30,798 : INFO : EPOCH 2 - PROGRESS: at 91.55% examples, 1230736

2018-04-02 01:13:15,991 : INFO : EPOCH 4 - PROGRESS: at 38.29% examples, 1197229 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:13:16,999 : INFO : EPOCH 4 - PROGRESS: at 41.53% examples, 1196676 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:13:18,009 : INFO : EPOCH 4 - PROGRESS: at 44.33% examples, 1197378 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:13:19,014 : INFO : EPOCH 4 - PROGRESS: at 47.48% examples, 1205239 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:13:20,019 : INFO : EPOCH 4 - PROGRESS: at 50.49% examples, 1208171 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:13:21,019 : INFO : EPOCH 4 - PROGRESS: at 53.02% examples, 1211417 words/s, in_qsize 22, out_qsize 0
2018-04-02 01:13:22,020 : INFO : EPOCH 4 - PROGRESS: at 55.72% examples, 1215321 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:13:23,024 : INFO : EPOCH 4 - PROGRESS: at 58.39% examples, 1221346 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:13:24,029 : INFO : EPOCH 4 - PROGRESS: at 60.77% examples, 1223341

2018-04-02 01:14:12,052 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-02 01:14:12,055 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-04-02 01:14:12,056 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-04-02 01:14:12,063 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-02 01:14:12,064 : INFO : EPOCH - 5 : training on 58601346 raw words (40325970 effective words) took 32.6s, 1238813 effective words/s
2018-04-02 01:14:13,071 : INFO : EPOCH 6 - PROGRESS: at 3.52% examples, 1187641 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:14:14,076 : INFO : EPOCH 6 - PROGRESS: at 8.10% examples, 1225291 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:14:15,079 : INFO : EPOCH 6 - PROGRESS: at 12.83% examples, 1225674 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:14:16,080 : INFO : EPOCH 6 - PROGRESS: at 17.70% examples, 1224753 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:14:17,087 : INFO : EPOC

2018-04-02 01:15:11,079 : INFO : EPOCH 7 - PROGRESS: at 84.17% examples, 1232713 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:15:12,080 : INFO : EPOCH 7 - PROGRESS: at 86.66% examples, 1234770 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:15:13,087 : INFO : EPOCH 7 - PROGRESS: at 89.31% examples, 1235325 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:15:14,088 : INFO : EPOCH 7 - PROGRESS: at 91.85% examples, 1235883 words/s, in_qsize 22, out_qsize 0
2018-04-02 01:15:15,094 : INFO : EPOCH 7 - PROGRESS: at 94.26% examples, 1236642 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:15:16,106 : INFO : EPOCH 7 - PROGRESS: at 96.72% examples, 1238642 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:15:17,126 : INFO : EPOCH 7 - PROGRESS: at 99.12% examples, 1238847 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:15:17,351 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-04-02 01:15:17,361 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-04-02 0

2018-04-02 01:16:04,011 : INFO : EPOCH 9 - PROGRESS: at 53.39% examples, 1225115 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:16:05,019 : INFO : EPOCH 9 - PROGRESS: at 56.09% examples, 1227210 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:16:06,027 : INFO : EPOCH 9 - PROGRESS: at 58.68% examples, 1230003 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:16:07,045 : INFO : EPOCH 9 - PROGRESS: at 60.96% examples, 1229851 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:16:08,046 : INFO : EPOCH 9 - PROGRESS: at 63.51% examples, 1233379 words/s, in_qsize 18, out_qsize 3
2018-04-02 01:16:09,053 : INFO : EPOCH 9 - PROGRESS: at 66.21% examples, 1234491 words/s, in_qsize 18, out_qsize 3
2018-04-02 01:16:10,055 : INFO : EPOCH 9 - PROGRESS: at 68.76% examples, 1235224 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:16:11,062 : INFO : EPOCH 9 - PROGRESS: at 71.58% examples, 1237017 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:16:12,079 : INFO : EPOCH 9 - PROGRESS: at 74.02% examples, 1238850

2018-04-02 01:16:56,700 : INFO : EPOCH 11 - PROGRESS: at 8.12% examples, 1230929 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:16:57,705 : INFO : EPOCH 11 - PROGRESS: at 12.79% examples, 1220305 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:16:58,711 : INFO : EPOCH 11 - PROGRESS: at 17.60% examples, 1221640 words/s, in_qsize 22, out_qsize 1
2018-04-02 01:16:59,724 : INFO : EPOCH 11 - PROGRESS: at 21.70% examples, 1228393 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:17:00,724 : INFO : EPOCH 11 - PROGRESS: at 26.44% examples, 1229161 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:17:01,731 : INFO : EPOCH 11 - PROGRESS: at 30.57% examples, 1233153 words/s, in_qsize 22, out_qsize 0
2018-04-02 01:17:02,733 : INFO : EPOCH 11 - PROGRESS: at 35.41% examples, 1232657 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:17:03,746 : INFO : EPOCH 11 - PROGRESS: at 39.30% examples, 1234322 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:17:04,747 : INFO : EPOCH 11 - PROGRESS: at 42.40% examples,

2018-04-02 01:17:58,258 : INFO : EPOCH 12 - PROGRESS: at 97.21% examples, 1249182 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:17:59,261 : INFO : EPOCH 12 - PROGRESS: at 99.57% examples, 1249129 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:17:59,325 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-04-02 01:17:59,329 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-04-02 01:17:59,333 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-04-02 01:17:59,344 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-04-02 01:17:59,354 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-02 01:17:59,355 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-02 01:17:59,356 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-02 01:17:59,364 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-02 01:17:59,368 : INFO : worker

2018-04-02 01:18:49,979 : INFO : EPOCH 14 - PROGRESS: at 64.40% examples, 1251296 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:18:50,980 : INFO : EPOCH 14 - PROGRESS: at 66.94% examples, 1252234 words/s, in_qsize 18, out_qsize 3
2018-04-02 01:18:51,983 : INFO : EPOCH 14 - PROGRESS: at 69.74% examples, 1254235 words/s, in_qsize 18, out_qsize 3
2018-04-02 01:18:52,986 : INFO : EPOCH 14 - PROGRESS: at 72.36% examples, 1254433 words/s, in_qsize 22, out_qsize 2
2018-04-02 01:18:53,999 : INFO : EPOCH 14 - PROGRESS: at 75.00% examples, 1256238 words/s, in_qsize 22, out_qsize 0
2018-04-02 01:18:55,001 : INFO : EPOCH 14 - PROGRESS: at 77.50% examples, 1254542 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:18:56,005 : INFO : EPOCH 14 - PROGRESS: at 79.97% examples, 1254072 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:18:57,023 : INFO : EPOCH 14 - PROGRESS: at 82.52% examples, 1251417 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:18:58,028 : INFO : EPOCH 14 - PROGRESS: at 85.02% examples

2018-04-02 01:19:42,259 : INFO : EPOCH 16 - PROGRESS: at 25.85% examples, 1210518 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:19:43,263 : INFO : EPOCH 16 - PROGRESS: at 30.24% examples, 1213266 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:19:44,266 : INFO : EPOCH 16 - PROGRESS: at 35.06% examples, 1220098 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:19:45,269 : INFO : EPOCH 16 - PROGRESS: at 38.89% examples, 1217537 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:19:46,282 : INFO : EPOCH 16 - PROGRESS: at 41.96% examples, 1214819 words/s, in_qsize 22, out_qsize 0
2018-04-02 01:19:47,288 : INFO : EPOCH 16 - PROGRESS: at 44.87% examples, 1215718 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:19:48,292 : INFO : EPOCH 16 - PROGRESS: at 47.80% examples, 1219720 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:19:49,305 : INFO : EPOCH 16 - PROGRESS: at 50.85% examples, 1221987 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:19:50,312 : INFO : EPOCH 16 - PROGRESS: at 53.50% examples

2018-04-02 01:20:40,985 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-04-02 01:20:40,991 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-02 01:20:40,992 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-02 01:20:40,994 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-02 01:20:40,996 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-02 01:20:40,998 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-04-02 01:20:41,001 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-04-02 01:20:41,011 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-02 01:20:41,011 : INFO : EPOCH - 17 : training on 58601346 raw words (40323823 effective words) took 32.4s, 1245307 effective words/s
2018-04-02 01:20:42,015 : INFO : EPOCH 18 - PROGRESS: at 3.67% examples, 1242173 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:2

2018-04-02 01:21:36,604 : INFO : EPOCH 19 - PROGRESS: at 77.65% examples, 1259380 words/s, in_qsize 20, out_qsize 4
2018-04-02 01:21:37,607 : INFO : EPOCH 19 - PROGRESS: at 80.13% examples, 1260274 words/s, in_qsize 21, out_qsize 1
2018-04-02 01:21:38,615 : INFO : EPOCH 19 - PROGRESS: at 82.79% examples, 1261084 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:21:39,617 : INFO : EPOCH 19 - PROGRESS: at 85.44% examples, 1262200 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:21:40,623 : INFO : EPOCH 19 - PROGRESS: at 87.99% examples, 1261792 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:21:41,630 : INFO : EPOCH 19 - PROGRESS: at 90.52% examples, 1259890 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:21:42,642 : INFO : EPOCH 19 - PROGRESS: at 93.00% examples, 1258126 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:21:43,651 : INFO : EPOCH 19 - PROGRESS: at 95.23% examples, 1257080 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:21:44,665 : INFO : EPOCH 19 - PROGRESS: at 97.62% examples

2018-04-02 01:22:29,596 : INFO : EPOCH 21 - PROGRESS: at 48.13% examples, 1228936 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:22:30,598 : INFO : EPOCH 21 - PROGRESS: at 51.04% examples, 1230539 words/s, in_qsize 18, out_qsize 3
2018-04-02 01:22:31,602 : INFO : EPOCH 21 - PROGRESS: at 53.77% examples, 1236643 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:22:32,605 : INFO : EPOCH 21 - PROGRESS: at 56.47% examples, 1240620 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:22:33,631 : INFO : EPOCH 21 - PROGRESS: at 59.03% examples, 1241530 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:22:34,634 : INFO : EPOCH 21 - PROGRESS: at 61.54% examples, 1245916 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:22:35,637 : INFO : EPOCH 21 - PROGRESS: at 64.19% examples, 1247934 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:22:36,645 : INFO : EPOCH 21 - PROGRESS: at 66.74% examples, 1247725 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:22:37,647 : INFO : EPOCH 21 - PROGRESS: at 69.45% examples

2018-04-02 01:23:22,678 : INFO : EPOCH 23 - PROGRESS: at 3.74% examples, 1243317 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:23:23,682 : INFO : EPOCH 23 - PROGRESS: at 8.33% examples, 1249362 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:23:24,683 : INFO : EPOCH 23 - PROGRESS: at 13.15% examples, 1252009 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:23:25,687 : INFO : EPOCH 23 - PROGRESS: at 18.14% examples, 1253844 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:23:26,690 : INFO : EPOCH 23 - PROGRESS: at 22.08% examples, 1248174 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:23:27,690 : INFO : EPOCH 23 - PROGRESS: at 26.75% examples, 1245409 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:23:28,690 : INFO : EPOCH 23 - PROGRESS: at 30.88% examples, 1243103 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:23:29,704 : INFO : EPOCH 23 - PROGRESS: at 35.53% examples, 1238496 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:23:30,713 : INFO : EPOCH 23 - PROGRESS: at 39.30% examples, 

2018-04-02 01:24:24,822 : INFO : EPOCH 24 - PROGRESS: at 97.80% examples, 1256714 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:24:25,610 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-04-02 01:24:25,616 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-04-02 01:24:25,621 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-04-02 01:24:25,635 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-04-02 01:24:25,638 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-02 01:24:25,644 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-02 01:24:25,645 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-02 01:24:25,648 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-02 01:24:25,651 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-04-02 01:24:25,655 : INFO : worker thread finished; awaitin

2018-04-02 01:25:17,269 : INFO : EPOCH 26 - PROGRESS: at 67.03% examples, 1256754 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:25:18,281 : INFO : EPOCH 26 - PROGRESS: at 69.69% examples, 1254444 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:25:19,294 : INFO : EPOCH 26 - PROGRESS: at 72.29% examples, 1252853 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:25:20,298 : INFO : EPOCH 26 - PROGRESS: at 74.82% examples, 1253702 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:25:21,298 : INFO : EPOCH 26 - PROGRESS: at 77.36% examples, 1252985 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:25:22,299 : INFO : EPOCH 26 - PROGRESS: at 79.72% examples, 1250285 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:25:23,301 : INFO : EPOCH 26 - PROGRESS: at 82.35% examples, 1250332 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:25:24,307 : INFO : EPOCH 26 - PROGRESS: at 84.92% examples, 1250700 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:25:25,312 : INFO : EPOCH 26 - PROGRESS: at 87.46% examples

2018-04-02 01:26:08,567 : INFO : EPOCH 28 - PROGRESS: at 26.35% examples, 1229140 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:09,573 : INFO : EPOCH 28 - PROGRESS: at 30.61% examples, 1233822 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:10,579 : INFO : EPOCH 28 - PROGRESS: at 35.37% examples, 1233068 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:11,581 : INFO : EPOCH 28 - PROGRESS: at 39.32% examples, 1238284 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:12,585 : INFO : EPOCH 28 - PROGRESS: at 42.55% examples, 1235599 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:13,599 : INFO : EPOCH 28 - PROGRESS: at 45.49% examples, 1241085 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:14,599 : INFO : EPOCH 28 - PROGRESS: at 48.54% examples, 1240793 words/s, in_qsize 21, out_qsize 0
2018-04-02 01:26:15,610 : INFO : EPOCH 28 - PROGRESS: at 51.23% examples, 1240700 words/s, in_qsize 19, out_qsize 2
2018-04-02 01:26:16,621 : INFO : EPOCH 28 - PROGRESS: at 54.01% examples

2018-04-02 01:27:06,841 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-02 01:27:06,847 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-02 01:27:06,848 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-02 01:27:06,853 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-02 01:27:06,855 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-04-02 01:27:06,858 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-04-02 01:27:06,861 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-02 01:27:06,861 : INFO : EPOCH - 29 : training on 58601346 raw words (40324558 effective words) took 32.2s, 1252877 effective words/s
2018-04-02 01:27:07,868 : INFO : EPOCH 30 - PROGRESS: at 3.62% examples, 1211993 words/s, in_qsize 20, out_qsize 1
2018-04-02 01:27:08,869 : INFO : EPOCH 30 - PROGRESS: at 8.05% examples, 1227795 words/s, in_qsize 20, out

In [5]:
import numpy as np

# Stack the learned document vectors into one feature row per document.
# Read them from `model.docvecs` explicitly: `model[tag]` consults the *word*
# vocabulary first, so a numeric string tag such as '12' that also occurs as
# a token in the corpus would return that word's vector, not the document's.
X = np.array([model.docvecs[str(i)] for i in range(len(texts))])

## Train test split

In [6]:
# Label vector for the fine-grained task; `labels` and `texts` were filled in
# the same loop, and X has one row per entry of `texts`.
y = np.array(labels)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the documents for evaluation; the fixed seed keeps the
# partition repeatable between runs.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [8]:
# Shapes of the training features and training labels, in that order.
for split in (X_train, y_train):
    print(split.shape)

(6735, 300)
(6735,)


In [9]:
# Shapes of the held-out features and held-out labels, in that order.
for split in (X_test, y_test):
    print(split.shape)

(1684, 300)
(1684,)


## Logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression
# Library-default logistic regression. NOTE(review): the defaults depend on
# the installed scikit-learn version; the recorded repr of the fitted model
# shows solver='liblinear', multi_class='ovr', C=1.0 for the run captured here.
logreg = LogisticRegression()

In [11]:
# Fit on the training split; as the last expression of the cell, the returned
# estimator's repr is displayed as the cell output.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Evaluate the fitted classifier on the held-out split (fine-grained labels).
score = logreg.score(X_test, y_test)
print(score)

0.2862232779097387


# Logistic Regression Classification with 15 Labels

## Prepare data

In [13]:
# Coarse label vocabulary: the dataset's issue-area codes. They are sorted
# before being turned into strings, so class ids follow the keys' native order.
issue_codes = [str(code) for code in sorted(sc.issue_area_codes.keys())]

# dictionary mapping label name (issue-area code string) to numeric class id
labels_index = {code: idx for idx, code in enumerate(issue_codes)}

labels = []  # list of label ids, one per record in iteration order

for record in sc.records(limit=-1):
    issue = record['issue']
    if issue is None:
        # Some cases have None as an issue.
        # NOTE(review): assumes '-1' is one of the issue-area codes - confirm.
        area = '-1'
    else:
        # Drop the last four characters of the issue code; presumably the
        # remaining prefix is the issue-area code - verify against the dataset.
        area = issue[:-4]
    labels.append(labels_index[area])

print('Found %s labels.' % len(labels_index))

Found 15 labels.


## Train test split

In [14]:
# Label vector for the coarse task. NOTE(review): rows of X were built during
# an earlier pass over sc.records(); this relies on the second pass yielding
# the records in the same order - confirm.
y = np.array(labels)

In [15]:
from sklearn.model_selection import train_test_split
# Same 80/20 partition parameters and seed as for the fine-grained task, now
# pairing the document vectors with the coarse labels.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [16]:
# Shapes of the training features and training labels, in that order.
for split in (X_train, y_train):
    print(split.shape)

(6735, 300)
(6735,)


In [17]:
# Shapes of the held-out features and held-out labels, in that order.
for split in (X_test, y_test):
    print(split.shape)

(1684, 300)
(1684,)


## Logistic regression

In [18]:
# Fresh library-default estimator for the coarse task; this rebinds `logreg`,
# so the classifier fitted on the fine-grained labels is no longer reachable
# under that name.
logreg = LogisticRegression()

In [19]:
# Fit on the training split of the coarse task; as the last expression of the
# cell, the returned estimator's repr is displayed as the cell output.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Evaluate the fitted classifier on the held-out split (coarse labels).
score = logreg.score(X_test, y_test)
print(score)

0.5409738717339667
