In [43]:
# general imports
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from time import time, sleep
import json
import requests
import random
random.seed(11)
%load_ext autoreload
%autoreload 2

# model-specific
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Download opinions

## Specific judge

In [3]:
import requests
import json

def download_and_save_data(api_url, output_file):
    all_results = []

    while api_url:
        response = requests.get(api_url)
        data = response.json()

        # Assuming that the results are stored under a key named 'results'
        results = data.get('results', [])

        # Append the current page of results to the list
        all_results.extend(results)

        # Check if there's a 'next' key in the response indicating more pages
        api_url = data.get('next')

    # Save all results to a JSON file
    with open(output_file, 'w') as json_file:
        json.dump(all_results, json_file, indent=2)

# Example usage:
api_url = 'https://www.courtlistener.com/api/rest/v3/opinions/?cluster__docket__court=scotus&author=1713'
output_file = 'Kavanaugh_data.json'
download_and_save_data(api_url, output_file)


In [23]:
token = "a3d4ee79cbf433d55ee8b18cb3cd3a22aba8ae5f"
# url = "https://www.courtlistener.com/api/rest/v3/opinions/?cluster__docket__court__id=scotus&cluster__date_filed__gte=1994-01-01"
# https://www.courtlistener.com/api/rest/v3/clusters/?date_filed__gte=1994-01-01&docket__court__id=scotus

# to avoide deep paganation we use year filter
url = "https://www.courtlistener.com/api/rest/v3/opinions/?cluster__docket__court__id=scotus&cluster__date_filed__year=1994"
head = {'Authorization': 'Token {}'.format(token)}
response = requests.get(url, headers=head)

In [None]:
for year in range(1994,2023):
  url = f"""https://www.courtlistener.com/api/rest/v3/opinions/?cluster__docket__court__id=scotus&cluster__date_filed__year={year}"""
  headers = {"Authorization": f"Bearer {token}"}
  response = requests.get(url, headers=head)
  while response.status_code == 200:
    url = response.json()['next']
    for opinion in response.json()['results']:
      if not opinion['per_curiam'] and opinion['author_str'] is None:
        continue
      # get metadata from cluster
      clus_url = opinion['cluster']
      response_clus = requests.get(clus_url, headers=head)
      opinion['date_filed'] = response_clus.json()['date_filed']
      opinion['scdb_id'] = response_clus.json()['scdb_id']
      opinion['scdb_decision_direction'] = response_clus.json()['scdb_decision_direction']
      opinion['scdb_votes_majority'] = response_clus.json()['scdb_votes_majority']
      opinion['scdb_votes_minority'] = response_clus.json()['scdb_votes_minority']
      opinion['source'] = response_clus.json()['source']
      if (opinion['scdb_decision_direction'] is None) and (opinion['scdb_votes_majority'] is None):
        continue
      with open(f"""opinions/{opinion['id']}.json""", 'w') as f:
        json.dump(opinion, f)
    if url is None:
      break
    response = requests.get(url, headers=headers)


In [42]:
!ls opinions | wc -l

961


## Importing Jsons

In [None]:
# IMPORT JSONS
import os
import glob
from lxml import html

start = time()
jsons_as_series = []
file_list = glob.glob('scotus_opinions/*.json')

for filename in file_list:
    with open(filename) as json_data:
        json_1 = json.load(json_data)
        jsons_as_series.append(pd.Series(json_1))

opinions_df = pd.DataFrame(jsons_as_series)
print("Elapsed opinion loading time:", round((time()-start)/60, 1), 'minutes')


# REMOVE DISMISSALS (coextensive with non-per-curiam, short texts with no majority opinion)
# -- mostly denial of certiorari, but some misc. dismissals
opinions_df['per_curiam'] = opinions_df.per_curiam.astype(bool)
dismissals_index = opinions_df[
    (~opinions_df.per_curiam)
    & (opinions_df.html_with_citations.map(lambda x: len(x) < 5000))
    & (opinions_df.html_with_citations.map(lambda x: x.lower().find('delivered the opinion of the court.') == -1))
].index
opinions_df = opinions_df.drop(dismissals_index)

# LOAD AND LINK CLUSTERS
# first, convert all http URLs to https (we'll need this for consistency of merging, and user convenience)
def to_https(url):
    if url[:5] != 'https':
        url = 'https' + url[4:]
    if url[:32] == 'https://www.courtlistener.com:80': # fix erroneous :80 urls
        url = 'https://www.courtlistener.com' + url[32:]
    return url

opinions_df['cluster'] = opinions_df['cluster'].map(to_https)

start = time()
jsons_as_series = []
file_list = glob.glob('scotus_clusters/*.json')

for filename in file_list:
    with open(filename) as json_data:
        json_1 = json.load(json_data)
        jsons_as_series.append(pd.Series(json_1))

clusters_df = pd.DataFrame(jsons_as_series)
clusters_df['resource_uri'] = clusters_df.resource_uri.map(to_https)
print("Elapsed cluster loading time:", round((time()-start)/60, 1), 'minutes')

# merge info from clusters_df into opinions_df
cases_df = pd.merge(opinions_df,
                       clusters_df[['case_name',
                                    'date_filed',
                                    'federal_cite_one',
                                    'resource_uri',
                                    'scdb_id',
                                    'scdb_decision_direction',
                                    'scdb_votes_majority',
                                    'scdb_votes_minority'
                                   ]],
                       how='left',
                       left_on='cluster',
                       right_on='resource_uri')

del opinions_df, clusters_df
print('1', cases_df.shape)

# winnow down to the relevant columns (note: we'll drop the few cases of plain_text for consistency's sake)
cases_df = cases_df[[
    'case_name',
    'author_str',
    'date_filed',
    'federal_cite_one',
    'per_curiam',
    'author',
    'cluster',
    'absolute_url',
    'html_with_citations',
    'scdb_id',
    'scdb_decision_direction',
    'scdb_votes_majority',
    'scdb_votes_minority'
]]

# PARSE HTML
start = time()
cases_df['html_with_citations'] = cases_df.html_with_citations.astype(str)
cases_df = cases_df[cases_df.html_with_citations.map(lambda x: len(x) > 1)] # eliminate one empty string
cases_df['absolute_url'] = 'https://www.courtlistener.com' + cases_df.absolute_url
def extract_text(raw_html):
    return html.fromstring(raw_html).text_content().strip()
cases_df['plain_text'] = cases_df.html_with_citations.map(lambda x: extract_text(x))
is_empty_now = cases_df.plain_text.isnull()
print('Total html parsing time:', round((time()-start)/60, 1), 'minutes')
print("After parsing html, there are {} empty opinions remaining".format(sum(is_empty_now)))
cases_df = cases_df[~cases_df.per_curiam.isnull()]
print('2', cases_df.shape)

# remove remaining certiorari and misc. non-decisions: no listed decision direction, and no majority opinion
non_decision_index = cases_df[(~cases_df.per_curiam)
         & (cases_df.scdb_decision_direction.isnull())
         & (cases_df.plain_text.map(
             lambda x: x.lower().find('delivered the opinion of the court.')==-1))
        ].index
cases_df = cases_df.drop(non_decision_index)
print('3', cases_df.shape)

# remove duplicate cases
# cases_df = cases_df.drop_duplicates(subset='federal_cite_one')

# convert dates to datetime
import datetime
cases_df['date_filed'] = pd.to_datetime(cases_df.date_filed)
cases_df['year_filed'] = cases_df.date_filed.map(lambda x: x.year)
cases_df['year_filed'] = cases_df.year_filed.astype(int)
# filter by date here if desired:
# cases_df = cases_df[cases_df.year_filed >= 1970]

# SANITY CHECK: do dates and titles match texts?
checks = [83,1065,4508]
for c in checks:
    i = cases_df.index[c]
    print(
        '\n\n***SANITY CHECK {}***: \n',
        'CASE NAME:', cases_df.case_name[i], '\n',
        'CASE DATE:', cases_df.date_filed[i], '\n', '\n',
        'CASE TEXT:\n', cases_df.plain_text[i][:500])

1 (30407, 32)
Total html parsing time: 1.1 minutes
After parsing html, there are 0 empty opinions remaining
2 (30407, 14)
3 (28904, 14)


***SANITY CHECK {}***: 
 CASE NAME: Williams Co. v. Shoe MacH. Corp. 
 CASE DATE: 1942-05-25 00:00:00 
 
 CASE TEXT:
 316 U.S. 364 (1942)
WILLIAMS MANUFACTURING CO.
v.
UNITED SHOE MACHINERY CORP.
No. 332.
Supreme Court of United States.
Argued February 13, 1942.
Decided May 25, 1942.
CERTIORARI TO THE CIRCUIT COURT OF APPEALS FOR THE SIXTH DISTRICT.
Mr. H.A. Toulmin, Jr. for petitioner.
Mr. Harrison F. Lyman, with whom Messrs. Charles E. Hammett, Jr. and Thomas J. Ryan were on the brief, for respondent.
MR. JUSTICE ROBERTS delivered the opinion of the Court.
The suit was for the infringement of Claims 6, 23, 42


***SANITY CHECK {}***: 
 CASE NAME: Jacobs v. Baker 
 CASE DATE: 1869-03-22 00:00:00 
 
 CASE TEXT:
 74 U.S. 295
    19 L. Ed. 200
    7 Wall. 295
    JACOBSv.BAKER.
    December Term, 1868
    
      JACOBS filed a bill in the Circuit Court

In [None]:
cases_df.date_filed.max()

Timestamp('2020-07-09 00:00:00')

## Parsing HTML into distinct opinions

In [None]:
# PARSE plain text into separate opinions
def find_author_listed_before(text, index):
    '''
    Returns first justice name preceding INDEX in the same sentence of TEXT.  If no justice named
    between INDEX and the end of the previous sentence, returns None.
    '''
    text = text[:index].lower().replace('mr.','mr ')
    start_index = text.rfind(".")
    sentence = text[start_index:]

    justice_index = sentence.find("justice ")
    if justice_index == -1:
        justice_index = sentence.find("justice\n")
        if justice_index == -1:
            # catch rare format "Smith, Justice, delivered the opinion of the court."
            justice_index = sentence.find("justice, delivered")
            if justice_index != -1:
                return "justice " + sentence[:justice_index].split()[-1][:-1] # name is prev word sans comma
    if justice_index == -1:
        return None

    name_words = sentence[justice_index:].split()[:2]
    name_words[-1] = name_words[-1].replace(',','') # remove trailing comma if present
    name = " ".join(name_words)
    if name == 'justice dissentin': # catch rare false flag (actually a citation)
        return None
    return name

def get_index_from_keyphrase(text, start_index, keyphrase, alternate_keyphrase=None):
    '''
    returns first index of KEYPHRASE(str) in TEXT[START_INDEX:] that has an author name
    preceding it in the same sentence; returns None if none found
    '''
    search_text = text[start_index:]
    index = search_text.find(keyphrase)
    # if there isn't a justice preceding the keyphrase in the same sentence (rare),
    # then this is a false flag.  Move on to the next occurrence of the keyphrase and repeat until true flag or end.
    while index != -1 and find_author_listed_before(search_text, index + len(keyphrase)-2) is None:
        new_index = search_text[(index + len(keyphrase)):].find(keyphrase)
        index = new_index if new_index == -1 else new_index + (index + len(keyphrase))
        # because the search started with the index of the prev find as 0
    if index != -1:
        index += len(keyphrase) + start_index
    elif alternate_keyphrase is not None:
        index = get_index_from_keyphrase(text, start_index, alternate_keyphrase, None)
    return index

def get_indices(text, per_curiam=False):
    '''
    returns dictionary of beginning indices of majority / concurring / dissenting opinions in TEXT
    '''
    text = text.lower()
    indices = {}
    bookmark = 0  # keeps track of where to start our next search

    if per_curiam:
        indices['majority'] = text.find("per curiam.")
        if indices['majority'] != -1:
            indices['majority'] += len("per curiam.")
    else:
        indices['majority'] = get_index_from_keyphrase(text, 0, 'delivered the opinion of the court.', 'join.')

    if indices['majority'] == -1: # short-circuit if there is no majority opinion: it's a dismissal (or an anomaly)
        return indices

    bookmark = indices['majority']

    indices['first_concurring'] = get_index_from_keyphrase(
        text,
        bookmark,
        'concurring.',
        'concurring in the judgment.'
    )
    bookmark = max(bookmark, indices['first_concurring'])

    if indices['first_concurring'] == -1:
        indices['second_concurring'] = -1
    else:
        indices['second_concurring'] = get_index_from_keyphrase(
            text,
            bookmark,
            'concurring.'
        )
        bookmark = max(bookmark, indices['second_concurring'])

    indices['first_dissenting'] = get_index_from_keyphrase(
        text,
        bookmark,
        'dissenting.'
    )
    bookmark = max(bookmark, indices['first_dissenting'])


    if indices['first_dissenting'] == -1:
        indices['second_dissenting'] = -1
    else:
        indices['second_dissenting'] = get_index_from_keyphrase(
            text,
            bookmark,
            'dissenting.'
        )

    return indices

def remove_next_intro(text):
    '''removes last sentence of text if it's introducing the next opinion '''
    if text[-11:] in ['concurring.', 'dissenting.']:
        end_of_prev_sentence = text[:-1].replace('Mr.','Mr ').rfind('.')
        text = text[:end_of_prev_sentence + 2] # +2 to include last char and period
    return text

def split_and_label(text, per_curiam=False, include_concurring=True, include_second_dissent=True):
    ''' returns a list of tuples formatted as (author, majority/concurring/dissenting, text)'''
    opinions = []
    indices = get_indices(text, per_curiam)

    if indices['majority'] == -1: # indicates empty / dismissal / haywire
        return [None]

    majority_endpoint = indices['first_concurring'] if indices['first_concurring'] != -1 \
                            else indices['first_dissenting']
    if per_curiam:
        majority = (
            'per_curiam',
            'per_curiam',
            remove_next_intro( text[indices['majority']:majority_endpoint] ).strip()
        )
    else:
        majority = (
            find_author_listed_before(text, indices['majority']-1), # -1 to avoid including final period (find_author)
            'majority',
            remove_next_intro( text[indices['majority']:majority_endpoint] ).strip()
        )
    opinions.append(majority)

    concurring_endpoint = indices['second_concurring'] if indices['second_concurring'] != -1 \
                            else indices['first_dissenting']
    if include_concurring and indices['first_concurring'] != -1:
        first_concurring = (
            find_author_listed_before(text, indices['first_concurring']-1),
            'concurring',
            remove_next_intro( text[indices['first_concurring']:concurring_endpoint] ).strip()
        )
        opinions.append(first_concurring)

    if indices['first_dissenting'] != -1:
        first_dissenting = (
            find_author_listed_before(text, indices['first_dissenting']-1),
            'dissenting',
            remove_next_intro( text[indices['first_dissenting']:indices['second_dissenting']] ).strip()
        )
        opinions.append(first_dissenting)

    if include_second_dissent and indices['second_dissenting'] != -1:
        second_dissenting = (
            find_author_listed_before(text, indices['second_dissenting']-1),
            'second_dissenting',
            remove_next_intro( text[indices['second_dissenting']:] ).strip()
        )
        opinions.append(second_dissenting)

    # clip "notes" section from end of the text of the last opinion in the case file
    notes_index = opinions[-1][2].find('NOTES')
    if notes_index == -1:
        notes_index = opinions[-1][2].find('APPENDIXES')
    if notes_index != -1:
        opinions[-1] = (opinions[-1][0],
                        opinions[-1][1],
                        opinions[-1][2][:notes_index])

    return opinions

columns = [
    'author_name',
    'category',
    'per_curiam',
    'case_name',
    'date_filed',
    'federal_cite_one',
    'absolute_url',
    'cluster',
    'year_filed',
    'scdb_id',
    'scdb_decision_direction',
    'scdb_votes_majority',
    'scdb_votes_minority',
    'text'
]
opinions_df = pd.DataFrame(columns=columns)
counter = 0
start = time()

# .drop_duplicates(subset='federal_cite_one')
for i in cases_df.index:
    counter += 1
    print("Processing row {} of {}".format(counter, cases_df.shape[0]), end='\r')
    text = cases_df.plain_text[i]
    per_curiam = cases_df.per_curiam[i]
    opinions = split_and_label(text, per_curiam)
    if opinions[0] is None: # if no majority opinion, either empty or something is haywire
        continue
    for opinion in opinions:
        new_row = pd.Series(
            [
                opinion[0], # author
                opinion[1], # majority/concurring/dissenting
                per_curiam,
                cases_df.case_name[i],
                cases_df.date_filed[i],
                cases_df.federal_cite_one[i],
                cases_df.absolute_url[i],
                cases_df.cluster[i],
                cases_df.year_filed[i],
                cases_df.scdb_id[i],
                cases_df.scdb_decision_direction[i],
                cases_df.scdb_votes_majority[i],
                cases_df.scdb_votes_minority[i],
                opinion[2] # text
            ],
        index=columns)

#         print(new_row[:-1])
        opinions_df.loc[opinions_df.shape[0]] = new_row # append without creating new object each time

print("Elapsed opinion parsing time:", round((time()-start)/60, 1), 'minutes     ')

# retyping as necessary
opinions_df.per_curiam = opinions_df.per_curiam.astype(bool)
opinions_df.year_filed = opinions_df.year_filed.astype(int)

# drop any blank opinions that got read in (very few - about 7)
opinions_df = opinions_df[opinions_df.text.map(lambda x: len(x) > 1)]

# resolve apostrophe format discrepancies
opinions_df.author_name = opinions_df.author_name.map(lambda x: x.replace('’','\''))
opinions_df.author_name = opinions_df.author_name.map(lambda x: x.replace('`','\''))

import string
def format_name(name):
    ''' strips punctuation and capitalizes first letter of each part of the name'''
    if name == 'per_curiam':
        return name
    name = name.translate(str.maketrans('', '', string.punctuation.replace("'","")))
    name = ' '.join([s[0].upper() + s[1:] for s in name.split()])
    name = name\
        .replace('Homes','Holmes')\
        .replace('Mkinley','McKinley')\
        .replace('Mckinley','McKinley')\
        .replace('Duvall','Duval')\
        .replace('Duval','Duvall')\
        .replace('Brandies','Brandeis')\
        .replace('Branders','Brandeis')\
        .replace('Wilso','Wilson')\
        .replace('Bruger','Burger')\
        .replace('Authorginsburgauthor','Ginsburg')\
        .replace('Strongdelivered','Strong')\
        .replace('Conner',"O'Connor")\
        .replace("O'connor","O'Connor")\
        .replace('Millier','Miller')\
        .replace("M'kinley",'McKinley')\
        .replace("Mcreynolds",'McReynolds')

    return name

opinions_df['author_name'] = opinions_df.author_name.apply(format_name)

# remove very rare (mostly erroneous) author_name values if desired:
# rare_authors = list(opinions_df.author_name.value_counts()[opinions_df.author_name.value_counts() <= 5].index)
# opinions_df = opinions_df[~opinions_df.author_name.isin(rare_authors)]

Elapsed opinion parsing time: 26.6 minutes     


In [None]:
print('shape:', opinions_df.shape)
print('earliest date:', opinions_df.date_filed.min())
print('latest date:', opinions_df.date_filed.max())

shape: (35781, 14)
earliest date: 1797-02-13
latest date: 2020-07-09


## Save

In [None]:
opinions_df.to_csv('all_opinions.csv', index=False)

In [None]:
opinions_df[opinions_df.year_filed.astype(str) >= '1970'].to_csv('opinions_since_1970.csv', index=False)

## Checks

In [None]:
opinions_df.shape

(35781, 14)

In [None]:
opinions_df.isnull().sum()

author_name                    0
category                       0
per_curiam                     0
case_name                      0
date_filed                     0
federal_cite_one           16586
absolute_url                   0
cluster                        0
year_filed                     0
scdb_id                     1763
scdb_decision_direction     1768
scdb_votes_majority         1763
scdb_votes_minority         1763
text                           0
dtype: int64

In [None]:
opinions_df.head(3)

Unnamed: 0,author_name,category,per_curiam,case_name,date_filed,federal_cite_one,absolute_url,cluster,year_filed,scdb_id,scdb_decision_direction,scdb_votes_majority,scdb_votes_minority,text
0,Justice Roberts,majority,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,There is no right more basic in our democracy ...
1,Justice Thomas,concurring,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,I adhere to the view that this Court’s decisio...
2,Justice Breyer,dissenting,False,McCutcheon v. Federal Election Comm'n,2014-04-02,,https://www.courtlistener.com/opinion/2659301/...,https://www.courtlistener.com/api/rest/v3/clus...,2014,2013-033,1.0,5.0,4.0,"Nearly 40 years ago in Buckley v. Valeo, 424 U..."


In [None]:
print(round(opinions_df[opinions_df.category=='dissenting'].text.map(lambda x: len(x)).median()))
print(round(opinions_df[opinions_df.category=='second_dissenting'].text.map(lambda x: len(x)).median()))

10022
8685


In [None]:
opinions_df.author_name.value_counts()[opinions_df.author_name.value_counts() < 50]

Justice Gorsuch       47
Justice Baldwin       42
Justice Woodbury      42
Justice Washington    29
Justice Kavanaugh     20
Justice McKinley      17
Justice Barbour       15
Justice Byrnes        15
Justice Trimble       13
Justice Livingston     9
Justice Duvall         7
Justice Todd           5
Justice Chase          5
Justice Stated         3
Justice Johnston       2
Justice O2122          2
Justice Chie           2
Justice 458            1
Justice Harean         1
Justice Parsons        1
Justice Iiunt          1
Justice Thomson        1
Justice Cushing        1
Justice Daniels        1
Justice Pearson        1
Justice Wilson         1
Justice Or             1
Justice With           1
Justice Breese         1
Justice Now            1
Justice Concurring     1
Justice Connor         1
Justice Paterson       1
Justice Mokenna        1
Justice And            1
Name: author_name, dtype: int64

In [None]:
print(cases_df.iloc[855].absolute_url)

https://www.courtlistener.com/opinion/108553/united-states-v-midwest-video-corp/


In [None]:
get_indices(cases_df.loc[5467, 'plain_text'].lower())

{'first_concurring': -1,
 'first_dissenting': -1,
 'majority': 875,
 'second_concurring': -1,
 'second_dissenting': -1}

# Appendix

## Investigate federal_cite_one duplicates
spoiler: they're by and large not actual duplicate rows (a rare few are).  Usually they're two events pertaining to the same (but sometimes differently titled) case, e.g., a motion to proceed a certain way passes, or it's postponed because the plaintiff didn't show, and then there's a separate event for a real hearing of the case.  Or something like that.  At any rate, different texts.

In [None]:
feds = cases_df[
    (~cases_df.federal_cite_one.isnull())
    & ~(cases_df.federal_cite_one == '')
    & (cases_df.federal_cite_one.duplicated())
].federal_cite_one

In [None]:
i = -5
df = cases_df[cases_df.federal_cite_one == feds.iloc[i]]
for j, row in df.iterrows():
    print('\n', j)
    print(row.case_name)
    print(row.date_filed)
    print('ID:', row.scdb_id)
    print('Author:', row.author_str)

for j, row in df.iterrows():
    print('\n', j)
    print('Text:\n', row.plain_text[:1200])


 24731
Hamburg-American Line Terminal & Navigation Co. v. United States (Two Cases). Atlas Line S. S. Co. v. Same
1928-05-14 00:00:00
ID: 
Author: 

 30153
Hamburg-American Co. v. United States
2005-03-04 00:00:00
ID: 
Author: 

 24731
Text:
 277 U.S. 138
    48 S. Ct. 470
    72 L. Ed. 822
    HAMBURG-AMERICAN LINE TERMINAL & NAVIGATION CO.v.UNITED STATES (two cases).  ATLAS LINE S. S. CO.  v.  SAME.
    Nos. 3-5.
    Argued and Submitted April 25, 1928.
    Decided May 14, 1928.
    
      Under Trading with the Enemy Act, § 2(a), 50 USCA Appendix, § 2(a), Comp. St. § 3115 1/2 aa, property of domestic corporations, seized during war with Germany cannot be treated as owned by enemy, so as to preclude recovery of compensation from United States for use thereof, because their entire capital stock belonged to German corporation.
      Congress, having power to direct forfeiture of all property beneficially owned by enemy subjects during war, could provide for seizure thereof, followed b

## check clusters-opinions match rate

In [None]:
opinions_df.shape

(30407, 24)

In [None]:
clusters_df.shape

(64163, 51)

In [None]:
cases_df.date_filed[~cases_df.date_filed.isnull()].astype(str).max()

'2020-07-24'

In [None]:
[col for col in opinions_df.columns if col in clusters_df.columns]

['resource_uri', 'id', 'absolute_url', 'date_created', 'date_modified']

In [None]:
for col in ['id','resource_uri']:
    print('\n', col)
    print('opinion nulls', opinions_df[col].isnull().sum())
    print('cluster nulls', clusters_df[col].isnull().sum())
    print('opinion dupes', opinions_df[col].duplicated().sum())
    print('cluster dupes', clusters_df[col].duplicated().sum())


 id
opinion nulls 0
cluster nulls 0
opinion dupes 0
cluster dupes 0

 resource_uri
opinion nulls 0
cluster nulls 0
opinion dupes 0
cluster dupes 0


In [None]:
test = pd.merge(
    left = clusters_df[['id', 'date_filed']],
    right = opinions_df[['id','author']],
    left_on='id',
    right_on='id',
    how='inner'
)
test.shape

(29922, 3)

In [None]:
test = pd.merge(
    left = clusters_df[['resource_uri', 'date_filed']],
    right = opinions_df[['cluster','author']],
    left_on='resource_uri',
    right_on='cluster',
    how='inner'
)
test.shape

(30406, 4)

# incorporate uWash dataset if desired

In [None]:
uw_df = pd.read_csv('scdb_uwash_data.csv', engine='python')
uw_df.shape

(8966, 53)

In [None]:
uw_df.columns

Index(['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
       'naturalCourt', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitioner', 'petitionerState', 'respondent',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
       'lcDispositionDirection', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea', 'decisionDirection',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes'],
      dtype='object')

In [None]:
uw_df.dateArgument[~uw_df.dateArgument.isnull()].min()

'1/10/1947'

In [None]:
uw_df.issueArea.value_counts()

1.0     2029
8.0     1743
2.0     1450
9.0     1232
3.0      680
10.0     406
7.0      361
4.0      348
12.0     312
5.0      116
6.0      102
11.0      99
13.0      24
14.0       4
Name: issueArea, dtype: int64

In [None]:
test = pd.merge(
    left=cases_df[['scdb_id','federal_cite_one','case_name','date_filed']],
    right=uw_df[['caseId','usCite','sctCite']],
    left_on='scdb_id',
    right_on='caseId',
    how='inner'
)
test.shape

(8730, 7)

In [None]:
test = pd.merge(
    left=opinions_df[['scdb_id','federal_cite_one','case_name','date_filed']],
    right=uw_df[['caseId','usCite','sctCite']],
    left_on='scdb_id',
    right_on='caseId',
    how='inner'
)
test.shape

(15297, 7)