# 279 Labels

In [1]:
import numpy as np
from textacy.datasets.supreme_court import SupremeCourt

print('Processing text dataset')

sc = SupremeCourt()
print(sc.info)

# Label vocabulary: every issue code exposed by the dataset plus a '-1'
# sentinel used for cases that carry no issue. The codes are sorted so that
# the numeric ids below follow the sorted order of the code strings.
issue_codes = list(sc.issue_codes.keys())
issue_codes.append('-1')
issue_codes.sort()

# dictionary mapping label name to numeric id
labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

texts = []  # list of text samples
labels = []  # list of label ids, aligned index-for-index with `texts`

# NOTE(review): limit=-1 presumably means "no limit" -- confirm against the
# textacy SupremeCourt.records documentation.
for record in sc.records(limit=-1):
    issue = record['issue']
    if issue is None:  # some cases have None as an issue
        issue = '-1'
    labels.append(labels_index[issue])
    texts.append(record['text'])

print('Found %s documents.' % len(texts))
print('Found %s labels.' % len(labels_index))

Processing text dataset
{'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.', 'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court', 'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court'}
Found 8419 documents.
Found 279 labels.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Hold out 10% of the corpus as a test split, then take a further 10% of the
# remaining training data as a validation split (same seed for both splits).
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Raw token counts fed directly into an SVC; only random_state is set.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf-svm', SVC(random_state=42)),
])
_ = text_clf_svm.fit(X_train, y_train)  # return value of fit is discarded

# Print accuracy on the test split first and the validation split second.
# The validation split must come last: `predicted_svm` is left holding the
# validation predictions once the loop finishes.
for split_texts, split_labels in ((X_test, y_test), (X_val, y_val)):
    predicted_svm = text_clf_svm.predict(split_texts)
    print(np.mean(predicted_svm == split_labels))



0.29334916864608074
0.30474934036939316


In [3]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the current `predicted_svm` against
# `y_val`. Passing every label id explicitly keeps a row in the report for
# each entry of `issue_codes`, including ids with zero support in `y_val`.
all_label_ids = np.arange(len(issue_codes))
report = classification_report(y_val, predicted_svm, labels=all_label_ids)
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.00      0.00      0.00         0
          2       0.00      0.00      0.00         8
          3       0.17      0.27      0.21        11
          4       1.00      1.00      1.00         1
          5       0.00      0.00      0.00         2
          6       0.00      0.00      0.00         2
          7       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.50      0.50      0.50         4
         11       0.00      0.00      0.00         2
         12       0.00      0.00      0.00         1
         13       0.00      0.00      0.00         3
         14       0.00      0.00      0.00         2
         15       0.28      0.79      0.41        24
         16       0.75      0.75      0.75         4
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# 15 Labels

In [4]:
sc = SupremeCourt()
print(sc.info)

# Label vocabulary: the dataset's issue-area codes. They are sorted in their
# native key type first and only then converted to strings, so the numeric
# ids below follow that pre-conversion ordering.
issue_codes = list(sc.issue_area_codes.keys())
issue_codes.sort()
issue_codes = [str(ic) for ic in issue_codes]

# dictionary mapping label name to numeric id
labels_index = dict(zip(issue_codes, np.arange(len(issue_codes))))

texts = []  # list of text samples
labels = []  # list of label ids, aligned index-for-index with `texts`

# NOTE(review): limit=-1 presumably means "no limit" -- confirm against the
# textacy SupremeCourt.records documentation.
for record in sc.records(limit=-1):
    issue = record['issue']
    if issue is None:  # some cases have None as an issue
        area = '-1'
    else:
        # Drop the last four characters of the issue code; the remaining
        # prefix is looked up as the issue-area label.
        # NOTE(review): assumes every issue code is "<area><4 chars>" --
        # confirm against the dataset's issue / issue-area code tables.
        area = issue[:-4]
    labels.append(labels_index[area])
    texts.append(record['text'])

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels_index))

{'description': 'Collection of ~8.4k decisions issued by the U.S. Supreme Court between November 1946 and June 2016.', 'data_dir': '/usr/local/lib/python3.5/dist-packages/textacy/data/supreme_court', 'name': 'supreme_court', 'site_url': 'http://caselaw.findlaw.com/court/us-supreme-court'}
Found 8419 texts.
Found 15 labels.


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Hold out 10% of the corpus as a test split, then take a further 10% of the
# remaining training data as a validation split (same seed for both splits).
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Raw token counts fed directly into an SVC; only random_state is set.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf-svm', SVC(random_state=42)),
])
_ = text_clf_svm.fit(X_train, y_train)  # return value of fit is discarded

# Print accuracy on the test split first and the validation split second.
# The validation split must come last: `predicted_svm` is left holding the
# validation predictions once the loop finishes.
for split_texts, split_labels in ((X_test, y_test), (X_val, y_val)):
    predicted_svm = text_clf_svm.predict(split_texts)
    print(np.mean(predicted_svm == split_labels))



0.6258907363420427
0.6398416886543535


In [6]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the current `predicted_svm` against
# `y_val`. Passing every label id explicitly keeps a row in the report for
# each entry of `issue_codes`, including ids with zero support in `y_val`.
all_label_ids = np.arange(len(issue_codes))
report = classification_report(y_val, predicted_svm, labels=all_label_ids)
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.73      0.83      0.78       183
          2       0.74      0.52      0.61       121
          3       0.89      0.61      0.72        56
          4       0.86      0.18      0.30        33
          5       1.00      0.11      0.20         9
          6       1.00      0.64      0.78        11
          7       0.88      0.67      0.76        33
          8       0.57      0.77      0.65       145
          9       0.41      0.69      0.52       102
         10       0.56      0.15      0.24        33
         11       0.00      0.00      0.00         5
         12       0.93      0.56      0.70        25
         13       0.00      0.00      0.00         1
         14       0.00      0.00      0.00         0

avg / total       0.68      0.64      0.63       758



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
