In [1]:
import glob
import os
import random
import logging
import pandas as pd
import numpy as np
import pickle as pk
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from random import sample
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# import doc2vec model

In [2]:
# Location of the trained Doc2Vec model on disk.
DOC2VEC_MODEL_PATH = '/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v'

# Load the model and collect every document tag it holds a vector for, so that
# later cells can restrict the metadata to opinions the model knows about.
model = Doc2Vec.load(DOC2VEC_MODEL_PATH)
set_of_doc_names = set(model.docvecs.doctags)

# import master dataframe

In [3]:
# Location of the per-opinion metadata table.
MASTER_DATAFRAME_PATH = "/home/jcai/geometry_of_law/Encyclopedia Entry/master_dataframe.csv"

# Read the metadata, then keep only the rows whose docname has a vector in the
# loaded Doc2Vec model.
master_dataframe_raw = pd.read_csv(MASTER_DATAFRAME_PATH)
docname_has_vector = master_dataframe_raw['docname'].isin(set_of_doc_names)
master_dataframe = master_dataframe_raw[docname_has_vector]

# helper

In [4]:
def return_average_vector(list_of_docname):
    '''Return the element-wise mean of the document vectors of the given docnames.

    Parameters
    ----------
    list_of_docname : iterable of str
        Document tags to look up in the module-level Doc2Vec ``model``.

    Returns
    -------
    numpy.ndarray
        Mean of the looked-up vectors, taken along axis 0.
    '''
    # Look tags up in model.docvecs rather than via model[...]: in gensim 3.x
    # Doc2Vec.__getitem__ checks the *word* vocabulary first, so a docname that
    # happens to also be a vocabulary token would yield a word vector instead
    # of the document vector.
    list_of_vectors = [model.docvecs[docname] for docname in list_of_docname]
    return np.mean(list_of_vectors, axis=0)

# define the intruder_sample function

In [5]:
# Display the metadata columns; these names are the values accepted in
# intruder_sample's opinion_restriction argument.
master_dataframe.columns

Index(['docname', 'judge_name', 'party', 'year', 'circuit', 'circuit-year',
       'big-issue', 'detailed-issue', 'circuit-big-issue', 'year-big-issue',
       'circuit-big-issue-year', 'judge-year', 'decade', 'judge-decade',
       'court-decade'],
      dtype='object')

In [6]:
# Judges with strictly more than 10 opinions in the filtered metadata; this is
# the pool the unrestricted intruder test samples its judge from.
documents_per_judge = master_dataframe.groupby("judge_name")["docname"].count()
judge_with_over_10_documents = documents_per_judge[documents_per_judge > 10].index.tolist()

In [17]:
def _judges_over_10_documents(frame):
    '''Return the judges that have strictly more than 10 docnames in ``frame``.'''
    documents_per_judge = frame.groupby("judge_name")["docname"].count()
    return list(documents_per_judge[documents_per_judge > 10].index)


def _draw_judge_and_intruder(frame, eligible_judges, size):
    '''Sample one judge, ``size`` of their opinions and one opinion by someone else.

    Raises ValueError (from ``random.sample``) when ``eligible_judges`` is empty,
    the judge has fewer than ``size`` opinions in ``frame``, or ``frame`` holds
    no opinion by a different judge.
    '''
    judge = sample(eligible_judges, 1)[0]
    written_by_judge = (frame['judge_name'] == judge).values
    docnames = frame['docname'].values
    judge_docs = sample(list(docnames[written_by_judge]), size)
    intruder_doc = sample(list(docnames[~written_by_judge]), 1)[0]
    return judge_docs, intruder_doc


def _intruder_detected(judge_docs, intruder_doc):
    '''Return True unless the intruder is closer to the centroid than the judge's least similar opinion.'''
    judge_avg_vec = return_average_vector(judge_docs)
    # model.docvecs[...] always resolves a document tag; model[...] in gensim 3.x
    # would prefer a word-vocabulary entry of the same name.
    judge_similarities = [1 - cosine(judge_avg_vec, model.docvecs[doc]) for doc in judge_docs]
    intruder_similarity = 1 - cosine(judge_avg_vec, model.docvecs[intruder_doc])
    # `not (a > b)` rather than `a <= b` so a NaN similarity still counts as
    # detected, exactly like the if/else this replaces.
    return not (intruder_similarity > min(judge_similarities))


def intruder_sample(size, opinion_restriction=None, max_attempts=None):
    '''Run one intruder-detection trial and report whether the intruder was detected.

    A judge with more than 10 opinions is sampled, ``size`` of that judge's
    opinions are averaged into a centroid, and one "intruder" opinion written by
    a different judge is drawn. The intruder counts as detected when its cosine
    similarity to the centroid does not exceed the lowest similarity among the
    judge's own sampled opinions.

    Parameters
    ----------
    size : int
        Number of the judge's opinions to sample.
    opinion_restriction : list of str, str or None, optional
        Columns of ``master_dataframe`` on which the judge's opinions and the
        intruder must agree (e.g. ``['circuit', 'big-issue']``). One value per
        column is sampled at random. Empty or None means no restriction, in
        which case the judge comes from ``judge_with_over_10_documents``.
    max_attempts : int or None, optional
        Upper bound on the number of random restricted groups to try. None
        (the default) keeps retrying until a usable group is found.

    Returns
    -------
    bool
        True if the intruder was detected, False otherwise.

    Raises
    ------
    KeyError
        If ``opinion_restriction`` names a column missing from ``master_dataframe``.
    ValueError
        If ``max_attempts`` is given but smaller than 1, if a restriction column
        has no non-null values, or - in the unrestricted case - if the sampled
        judge has fewer than ``size`` opinions.
    RuntimeError
        If no usable restricted group was found within ``max_attempts`` tries.
    '''
    if opinion_restriction is None:
        restriction = []
    elif isinstance(opinion_restriction, str):
        # A bare column name would otherwise be iterated character by character.
        restriction = [opinion_restriction]
    else:
        restriction = list(opinion_restriction)

    if max_attempts is not None and max_attempts < 1:
        raise ValueError("max_attempts must be None or a positive integer")

    if not restriction:
        # The intruder is drawn from opinions by *other* judges, matching the
        # restricted case; sampling from every opinion could hand back one of
        # the judge's own.
        judge_docs, intruder_doc = _draw_judge_and_intruder(
            master_dataframe, judge_with_over_10_documents, size)
        return _intruder_detected(judge_docs, intruder_doc)

    unknown_columns = [col for col in restriction if col not in master_dataframe.columns]
    if unknown_columns:
        # Fail fast: retrying can never fix a misspelt column name.
        raise KeyError("unknown restriction column(s): {}".format(unknown_columns))

    # The candidate values do not change between attempts, so compute them once.
    candidate_keys = {col: list(master_dataframe[col].dropna().unique()) for col in restriction}

    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1

        # Pick one value per column independently and keep the rows matching
        # all of them. A boolean mask selects the same rows as
        # groupby(...).get_group(...) without depending on how a given pandas
        # version expects single-column group keys to be spelled.
        in_group = np.ones(len(master_dataframe), dtype=bool)
        for col in restriction:
            key = sample(candidate_keys[col], 1)[0]
            in_group &= (master_dataframe[col] == key).values
        restricted_group = master_dataframe[in_group]

        try:
            judge_docs, intruder_doc = _draw_judge_and_intruder(
                restricted_group, _judges_over_10_documents(restricted_group), size)
        except ValueError:
            # This group has no eligible judge, too few opinions by the judge,
            # or nobody else to act as intruder: draw another group.
            continue

        return _intruder_detected(judge_docs, intruder_doc)

    raise RuntimeError(
        "no usable group for restriction {} after {} attempts".format(restriction, max_attempts))

# intruder analysis: randomly sample a judge and an opinion from another judge

In [18]:
# 10 batches of 100 unrestricted trials; each entry is the fraction of trials in
# which the intruder was detected. The count is summed first and divided once,
# which avoids accumulating rounding error from 100 separate divisions and
# matches how the other result cells compute their batch fractions.
results = [
    sum(intruder_sample(9) for _ in range(100)) / 100
    for _ in range(10)
]

In [19]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.9780000000000006  stdev is  0.016613247725836163


# intruder analysis: randomly sample a judge and an opinion from another judge | the opinions are from the same circuit

In [20]:
# 10 batches of 100 trials with judge and intruder drawn from the same circuit;
# each entry is the fraction of trials in which the intruder was detected.
results = [
    sum(intruder_sample(9, ['circuit']) for _ in range(100)) / 100
    for _ in range(10)
]

In [21]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.924  stdev is  0.022449944320643633


# intruder analysis: randomly sample a judge and an opinion from another judge | the opinions are from the same issue

In [None]:
# 10 batches of 10 trials with judge and intruder drawn from the same big issue;
# each entry is the fraction of trials in which the intruder was detected.
results = [
    sum(intruder_sample(9, ['big-issue']) for _ in range(10)) / 10
    for _ in range(10)
]

In [None]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.8800000000000001  stdev is  0.08717797887081348


# intruder analysis: randomly sample a judge and an opinion from another judge | the opinions are from the same year

In [None]:
# 10 batches of 10 trials with judge and intruder drawn from the same year;
# each entry is the fraction of trials in which the intruder was detected.
results = [
    sum(intruder_sample(9, ['year']) for _ in range(10)) / 10
    for _ in range(10)
]

In [None]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.8899999999999999  stdev is  0.06999999999999999


# intruder analysis: randomly sample a judge and an opinion from another judge | the opinions are from the same circuit and big issue

In [18]:
# 10 batches of 10 trials with judge and intruder sharing circuit and big issue;
# each entry is the fraction of trials in which the intruder was detected.
results = [
    sum(intruder_sample(9, ['circuit', 'big-issue']) for _ in range(10)) / 10
    for _ in range(10)
]

In [19]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.9  stdev is  0.07745966692414835


# intruder analysis: randomly sample a judge and an opinion from another judge | the opinions are from the same year and big issue

In [20]:
# 10 batches of 10 trials with judge and intruder sharing year and big issue;
# each entry is the fraction of trials in which the intruder was detected.
results = [
    sum(intruder_sample(9, ['year', 'big-issue']) for _ in range(10)) / 10
    for _ in range(10)
]

In [21]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.8900000000000002  stdev is  0.08306623862918074


# intruder analysis: randomly sample a judge and an opinion from another judge | the opinions are from the same year, big issue, circuit

In [22]:
# 10 batches of 10 trials with judge and intruder sharing year, big issue and
# circuit; each entry is the fraction of trials in which the intruder was detected.
results = [
    sum(intruder_sample(9, ['year', 'big-issue', 'circuit']) for _ in range(10)) / 10
    for _ in range(10)
]

In [23]:
# Report the mean and spread of the per-batch detection fractions.
detected_mean = np.mean(results)
detected_std = np.std(results)
print('The precent detected is ', detected_mean, ' stdev is ', detected_std)

The precent detected is  0.78  stdev is  0.0871779788708135
