In [2]:
import ujson
import os
import sys
import heapq
import pandas as pd
from collections import defaultdict

In [3]:
# Make the parent directory importable so the `from modules import constants`
# in the next cell can resolve.
# NOTE(review): presumably the `modules` package lives one level above this
# notebook, and "../" is resolved against the kernel's working directory -- confirm.
if "../" not in sys.path:
    sys.path.append ("../")

In [4]:
from modules import constants

In [7]:
# Root of the corpus; every other path used in this notebook hangs off it.
DATA_DIR = "/hg191/corpora/legaldata/data/"

# Sub-directories: FILES_DIR holds the per-jurisdiction JSON-lines inputs,
# STATS_DIR receives the files written by the cells below.
FILES_DIR, STATS_DIR = (os.path.join (DATA_DIR, sub) for sub in ("files", "stats"))

# Individual inputs: the jurisdiction-name list and the citation edge list.
JURS_FILE, CITES_FILE = (os.path.join (DATA_DIR, leaf) for leaf in ("jurs.names", "cites.csv"))

In [8]:
# Create the output directory for the stats files; exist_ok=True makes this a
# no-op (rather than an error) when the directory is already there.
os.makedirs (STATS_DIR, exist_ok=True)

In [9]:
# Load the jurisdiction names, one per line.  Blank / whitespace-only lines
# (e.g. a trailing empty line) are dropped: an empty name would otherwise be
# joined into a bogus "<FILES_DIR>/<JSONL_EXT>" path by the merge functions.
with open (JURS_FILE) as fin:
    jurs = [name for name in map (str.strip, fin) if name]

In [10]:
def json_iterator (filename):
    """Lazily yield one decoded JSON value per non-blank line of `filename`.

    Parameters
    ----------
    filename : str
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        Whatever `ujson.loads` returns for each line, in file order.

    Notes
    -----
    Empty or whitespace-only lines (for example a trailing blank line) are
    skipped instead of being handed to the parser, which would reject them.
    The file is opened on the first `next()` and stays open until the
    generator is exhausted or closed.
    """
    with open (filename) as fin:
        for line in fin:
            # A blank line is not a JSON document; skip it rather than crash.
            if not line.strip ():
                continue
            yield ujson.loads (line)

In [15]:
def mergeAndWrite (jurs):
    """Merge the per-jurisdiction opinion files and write the general stats files.

    For each name in `jurs` the file `<FILES_DIR>/<jur><JSONL_EXT>` is streamed
    through `json_iterator`; the streams are interleaved with `heapq.merge`
    keyed on each record's "date" field.  For every record, one line is
    appended to each of these files under STATS_DIR:

    - ops.list   : "<opid>"
    - ops.dates  : "<opid>,<date>"
    - ops.pubs   : "<opid>,<isapub>"
    - ops.courts : "<opid>,<court>"
    - ops.types  : "<opid>,<type>"

    Existing files are truncated.  Fields are written verbatim, without any
    CSV quoting.

    NOTE(review): heapq.merge only produces a globally date-ordered stream if
    each input file is already sorted by "date" -- confirm upstream.
    """
    def stats_path (name):
        # All outputs live directly under STATS_DIR.
        return os.path.join (STATS_DIR, name)

    # One lazy record stream per jurisdiction; nothing is opened until merged.
    sources = []
    for jur in jurs:
        sources.append (json_iterator (os.path.join (FILES_DIR, jur + constants.JSONL_EXT)))

    with open (stats_path ("ops.list"), "w") as ops_out, \
         open (stats_path ("ops.dates"), "w") as dates_out, \
         open (stats_path ("ops.pubs"), "w") as status_out, \
         open (stats_path ("ops.courts"), "w") as courts_out, \
         open (stats_path ("ops.types"), "w") as types_out:
        # (record field, destination file), in the order the lines are written.
        field_sinks = (
            ("date", dates_out),
            ("isapub", status_out),
            ("court", courts_out),
            ("type", types_out),
        )
        for record in heapq.merge (*sources, key=lambda rec: rec["date"]):
            opid = record["opid"]
            ops_out.write ("{0}\n".format (opid))
            for field, sink in field_sinks:
                sink.write ("{0},{1}\n".format (opid, record[field]))
            

I. General statistics

- date of the document
- court of the document
- precedential status
- if opinion has a dissent

In [None]:
# Writes ops.list, ops.dates, ops.pubs, ops.courts and ops.types under STATS_DIR.
mergeAndWrite(jurs)

In [19]:
def readCitationNetAsDicts (filename):
    """Count the in-degree and out-degree of every id in a citation edge list.

    Parameters
    ----------
    filename : str
        Path to a comma-separated file that has (at least) the columns
        `citing_opinion_id` and `cited_opinion_id`, one row per citation.

    Returns
    -------
    (indict, outdict) : tuple of collections.defaultdict(int)
        `indict[x]` is the number of rows whose `cited_opinion_id` is x and
        `outdict[x]` the number of rows whose `citing_opinion_id` is x.
        Ids that never occur read as 0 (a plain `d[x]` lookup on a
        defaultdict also inserts the missing key).

    Raises
    ------
    KeyError
        If either expected column is missing from the file.

    Notes
    -----
    The tally is done by `Series.value_counts` instead of a per-row Python
    loop.  Keys and counts are converted to plain Python scalars; rows whose
    id is missing (NaN) are not counted.
    """
    df = pd.read_csv (filename, sep=",")

    def degree_table (column):
        # sort=False: only the counts matter, not their order.
        counts = df[column].value_counts (sort=False)
        return defaultdict (int, zip (counts.index.tolist (), counts.tolist ()))

    # Same column access order as before: citing (out-degree) first.
    outdict = degree_table ("citing_opinion_id")
    indict = degree_table ("cited_opinion_id")

    return indict, outdict

In [20]:
# Input: the opinion-id list (one id per line) that mergeAndWrite writes to ops.list.
OPS_FILE = os.path.join (STATS_DIR, "ops.list")
# In-/out-degree tables keyed by opinion id; ids absent from the edge list read as 0.
ind_dict, outd_dict = readCitationNetAsDicts(CITES_FILE)

# Outputs: "<opid>,<in-degree>" and "<opid>,<out-degree>", one line per opinion.
IND_FILE = os.path.join (STATS_DIR, "ops.ind")
OUTD_FILE = os.path.join (STATS_DIR, "ops.outd")

with open (OPS_FILE) as fin, open(IND_FILE, "w") as indout, open (OUTD_FILE, "w") as outdout:
    for line in fin:
        # Ids are parsed as ints before being used as lookup keys.
        op = int(line.strip())
        # Lines are emitted in the same order as ops.list.
        indout.write ("{0},{1}\n".format (op, ind_dict[op]))
        outdout.write ("{0},{1}\n".format (op, outd_dict[op]))

II. Document statistics

- number of pages
- number of statutes mentioned
- document length (number of unique types and number of tokens)
- bag of words

In [26]:
def mergeAndWriteDocs (jurs):
    """Write the text of every tokenized opinion, one per line, to ops.texts.

    For each name in `jurs` the file `<FILES_DIR>/<jur>.tokenized<JSONL_EXT>`
    is streamed through `json_iterator`; the streams are interleaved with
    `heapq.merge` keyed on each record's "date" field.  Every record's "text"
    has all carriage returns and newlines removed (not replaced by a space),
    so it occupies exactly one line of `<STATS_DIR>/ops.texts`.  An existing
    file is truncated.

    Only the text is written: page counts, length statistics and bag-of-words
    output are not produced by this function.

    NOTE(review): heapq.merge only produces a globally date-ordered stream if
    each input file is already sorted by "date" -- confirm upstream.
    """
    # One lazy record stream per jurisdiction; nothing is opened until merged.
    sources = []
    for jur in jurs:
        path = os.path.join (FILES_DIR, jur + ".tokenized" + constants.JSONL_EXT)
        sources.append (json_iterator (path))

    texts_path = os.path.join (STATS_DIR, "ops.texts")
    with open (texts_path, "w") as texts_out:
        for doc in heapq.merge (*sources, key=lambda rec: rec["date"]):
            # Strip line breaks so each document stays on a single line.
            flat = doc["text"].replace ("\r", "").replace ("\n", "")
            texts_out.write ("{0}\n".format (flat))

In [27]:
# Writes ops.texts under STATS_DIR (one opinion text per line).
mergeAndWriteDocs (jurs)

III. Create sub-opinions file

In [11]:
def subopscreation (jurs):
    """Write each opinion id with its sub-opinion ids to ops.subops.

    For each name in `jurs` the file `<FILES_DIR>/<jur>.tokenized<JSONL_EXT>`
    is streamed through `json_iterator`; the streams are interleaved with
    `heapq.merge` keyed on each record's "date" field.  Every record yields
    one line of `<STATS_DIR>/ops.subops` in the form

        <opid>,<subop>$<subop>$...

    where the entries of the record's "subops" sequence are stringified and
    joined with "$".  An existing file is truncated.

    NOTE(review): heapq.merge only produces a globally date-ordered stream if
    each input file is already sorted by "date" -- confirm upstream.
    """
    # One lazy record stream per jurisdiction; nothing is opened until merged.
    sources = []
    for jur in jurs:
        path = os.path.join (FILES_DIR, jur + ".tokenized" + constants.JSONL_EXT)
        sources.append (json_iterator (path))

    out_path = os.path.join (STATS_DIR, "ops.subops")
    with open (out_path, "w") as fout:
        for record in heapq.merge (*sources, key=lambda rec: rec["date"]):
            joined = "$".join (map (str, record["subops"]))
            fout.write ("{0},{1}\n".format (record["opid"], joined))

In [12]:
# Writes ops.subops under STATS_DIR ("<opid>,<subop>$<subop>$..." per line).
subopscreation(jurs)

IV. Network statistics

- outdegree
- indegree

In [13]:
def readCitationNetAsDicts (filename):
    """Count the in-degree and out-degree of every id in a citation edge list.

    Parameters
    ----------
    filename : str
        Path to a comma-separated file that has (at least) the columns
        `citing_opinion_id` and `cited_opinion_id`, one row per citation.

    Returns
    -------
    (indict, outdict) : tuple of collections.defaultdict(int)
        `indict[x]` is the number of rows whose `cited_opinion_id` is x and
        `outdict[x]` the number of rows whose `citing_opinion_id` is x.
        Ids that never occur read as 0 (a plain `d[x]` lookup on a
        defaultdict also inserts the missing key).

    Raises
    ------
    KeyError
        If either expected column is missing from the file.

    Notes
    -----
    The tally is done by `Series.value_counts` instead of a per-row Python
    loop.  Keys and counts are converted to plain Python scalars; rows whose
    id is missing (NaN) are not counted.
    """
    df = pd.read_csv (filename, sep=",")

    def degree_table (column):
        # sort=False: only the counts matter, not their order.
        counts = df[column].value_counts (sort=False)
        return defaultdict (int, zip (counts.index.tolist (), counts.tolist ()))

    # Same column access order as before: citing (out-degree) first.
    outdict = degree_table ("citing_opinion_id")
    indict = degree_table ("cited_opinion_id")

    return indict, outdict

In [14]:
import numpy as np

In [16]:
# OPS_FILE is re-pointed here: the earlier degree cell read ops.list, whereas
# from now on it names the "<opid>,<subop>$..." file written by subopscreation.
OPS_FILE = os.path.join (STATS_DIR, "ops.subops")
# Rebuild the in-/out-degree tables from the citation edge list.
ind_dict, outd_dict = readCitationNetAsDicts(CITES_FILE)
print ("Read citation network")

Read citation network


AssertionError: 

In [25]:
# Output paths are the same ops.ind / ops.outd used by the earlier ops.list-based
# cell; both are opened with "w" below, so that cell's output is overwritten.
IND_FILE = os.path.join (STATS_DIR, "ops.ind")
OUTD_FILE = os.path.join (STATS_DIR, "ops.outd")

# NOTE(review): these two counters are initialised here but never incremented
# anywhere in this cell.
ind_exceptions = 0
outd_exceptions = 0
# Per-opinion index into that opinion's sub-opinion list.
# NOTE(review): entries are created with value 0 below and never advanced, so
# in_index / out_index are always 0 and only the FIRST sub-opinion of every
# opinion is looked up -- confirm whether an increment (or a sum over all
# sub-opinions) was intended.
pointerin_dict = {}
pointerout_dict = {}

# NOTE(review): assumes OPS_FILE currently points at ops.subops (set in an
# earlier cell); with the ops.list value, parts[1] below would not exist.
with open (OPS_FILE) as fin, open(IND_FILE, "w") as indout, open (OUTD_FILE, "w") as outdout:
    for line in fin:
        # Each line has the form "<opid>,<subop>$<subop>$...".
        parts = line.strip().split(",")
        op = int(parts[0])
        # An empty sub-opinion field would make int("") raise ValueError here.
        subops = list (map(int, parts[1].split("$")))
        if len(subops) > 1:
            if op not in pointerin_dict:
                pointerin_dict[op] = 0
            if op not in pointerout_dict:
                pointerout_dict[op] = 0
            
            in_index = pointerin_dict[op]
            out_index = pointerout_dict[op]
        else:
            in_index = out_index = 0
        # The degree is looked up by sub-opinion id but written against the
        # parent opinion id.
        indout.write ("{0},{1}\n".format (op, ind_dict[subops[in_index]]))
        outdout.write ("{0},{1}\n".format (op, outd_dict[subops[out_index]]))

In [23]:
# NOTE(review): both counters are set to 0 in the degree-writing cell and never
# incremented there, so a fresh top-to-bottom run prints 0 twice; any non-zero
# output stored with this cell presumably comes from an earlier version of
# that cell -- confirm.
print (ind_exceptions)
print (outd_exceptions)

52953
63416
