In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pickle as pk
import os
import pandas as pd
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [98]:
## Change the working directory so that the relative pickle paths used by the
## following cells (e.g. "Y_average_dict.p") resolve inside this folder.
## NOTE(review): hard-coded absolute path; the notebook only runs on a machine with this layout.
os.chdir('/home/jcai/geometry_of_law/data_and_dictionary')

In [99]:
## Loading the "average" dictionaries from pickle files in the working directory.
## YB_average_dict is later indexed with the values of YB_dict and its entries are
## subtracted from document vectors; the other dictionaries are presumably the
## analogous averages for the attribute combination in their name -- TODO confirm.
def _load_pickle(path):
    '''Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (the previous `pk.load(open(...))` form never closed it).

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    '''
    with open(path, "rb") as handle:
        return pk.load(handle)


Y_average_dict = _load_pickle("Y_average_dict.p")
CBY_average_dict = _load_pickle("CBY_average_dict.p")
JY_average_dict = _load_pickle("JY_average_dict.p")
YB_average_dict = _load_pickle("YB_average_dict.p")
CB_average_dict = _load_pickle("CB_average_dict.p")
CY_average_dict = _load_pickle("CY_average_dict.p")
JD_average_dict = _load_pickle("JD_average_dict.p")
CD_average_dict = _load_pickle("CD_average_dict.p")
C_average_dict = _load_pickle("C_average_dict.p")
judge_average_dict = _load_pickle("judge_average_dict.p")

In [100]:
## Loading the attribute dictionaries from pickle files in the working directory.
## Later cells index YB_dict and JY_dict by docname; the other dictionaries are
## presumably keyed the same way -- TODO confirm.
def _load_pickle(path):
    '''Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (the previous `pk.load(open(...))` form never closed it).

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    '''
    with open(path, "rb") as handle:
        return pk.load(handle)


Y_dict = _load_pickle("Y_dict.p")
CBY_dict = _load_pickle("CBY_dict.p")
JY_dict = _load_pickle("JY_dict.p")
YB_dict = _load_pickle("YB_dict.p")
CB_dict = _load_pickle("CB_dict.p")
CY_dict = _load_pickle("CY_dict.p")
JD_dict = _load_pickle("JD_dict.p")
CD_dict = _load_pickle("CD_dict.p")
C_dict = _load_pickle("C_dict.p")

In [101]:
## Importing master_dataframe from CSV.
## Later cells read its 'docname', 'judge_name', 'big-issue' and 'year-big-issue' columns.
## NOTE(review): hard-coded absolute path.
master_dataframe = pd.read_csv("/home/jcai/geometry_of_law/Encyclopedia Entry/master_dataframe.csv")

In [102]:
# Load the saved Doc2Vec model and collect the document tags stored in model.docvecs.
# NOTE(review): hard-coded absolute path.
model = Doc2Vec.load('/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v')
set_of_doc_names = set(model.docvecs.doctags)

In [103]:
# Keep only the rows whose docname is one of the model's document tags.
master_dataframe = master_dataframe[master_dataframe['docname'].isin(set_of_doc_names)]

In [104]:
def return_list_of_docname(column, value, dataframe=None):
    '''Return the docnames of all rows whose `column` equals `value`.

    Parameters
    ----------
    column : str
        Name of the column to filter on.
    value : object
        Value compared with `==` against every entry of `column`.
    dataframe : pandas.DataFrame, optional
        Frame to search; it must contain `column` and a 'docname' column.
        Defaults to the notebook-level `master_dataframe`, so existing
        two-argument calls behave as before.

    Returns
    -------
    list
        The 'docname' entries of the matching rows, in frame order
        (an empty list when nothing matches).
    '''
    if dataframe is None:
        dataframe = master_dataframe
    matches = dataframe[column] == value
    # Select rows and column in a single .loc call instead of chained indexing.
    return dataframe.loc[matches, 'docname'].tolist()

In [105]:
def return_list_of_docname_2_values(column_1, value_1, column_2, value_2):
    '''Return the docnames that satisfy both column/value conditions.

    Parameters
    ----------
    column_1, value_1 : first condition, passed to return_list_of_docname.
    column_2, value_2 : second condition, passed to return_list_of_docname.

    Returns
    -------
    list
        A single list (not two) of the distinct docnames matching both
        conditions, in the order they appear in the first condition's matches.
    '''
    first_matches = return_list_of_docname(column_1, value_1)
    second_matches = set(return_list_of_docname(column_2, value_2))
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set & set) returned the same elements, but in set iteration order,
    # which for strings changes from one Python process to the next.
    return list(dict.fromkeys(name for name in first_matches if name in second_matches))

In [106]:
def return_average_vector(list_of_docname, vectors=None):
    '''Return the element-wise mean of the vectors of the given docnames.

    Parameters
    ----------
    list_of_docname : iterable
        Docnames to average over (a list or a NumPy array of names).
    vectors : mapping, optional
        Anything that returns a vector for `vectors[docname]`.
        Defaults to the notebook-level Doc2Vec `model`, so existing
        one-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        Mean over the looked-up vectors along axis 0. For an empty input
        NumPy returns nan (and emits a RuntimeWarning) rather than raising.
    '''
    if vectors is None:
        vectors = model
    return np.mean([vectors[name] for name in list_of_docname], axis=0)

In [107]:
## Rank one judge's opinions on one issue by similarity to their average vector.
## Depends on the d2v model being loaded as "model"
## and on return_list_of_docname_2_values / return_average_vector defined earlier.
def return_vector_dissimilar_to_average(judge, attr_1, number_of_return, ascending_or_not):
    '''Rank a judge's opinions on one issue by similarity to their mean vector.

    Selects the docnames whose "judge_name" is `judge` and whose "big-issue"
    is `attr_1`, averages their vectors, and scores every selected opinion
    with 1 - cosine distance to that average.

    Parameters
    ----------
    judge : value matched against the "judge_name" column.
    attr_1 : value matched against the "big-issue" column.
    number_of_return : number of leading rows to keep after sorting.
    ascending_or_not : bool
        True puts the least similar opinions first, False the most similar.

    Returns
    -------
    pandas.DataFrame
        Columns "docname" and "similarity", sorted by "similarity" and cut
        to the first `number_of_return` rows.
    '''
    docnames = return_list_of_docname_2_values("judge_name", judge, "big-issue", attr_1)
    centroid = return_average_vector(docnames)

    similarities = []
    for docname in docnames:
        similarities.append(1 - cosine(model[docname], centroid))

    ranking = pd.DataFrame()
    ranking["docname"] = docnames
    ranking["similarity"] = similarities

    ordered = ranking.sort_values(by="similarity", ascending=ascending_or_not)
    return ordered.iloc[:number_of_return]

In [108]:
# Smoke test: the 5 opinions of this judge on issue '4' that come first when
# sorting by similarity in descending order (ascending_or_not=False).
return_vector_dissimilar_to_average('SCHALL, ALVIN ANTHONY','4',5,False)

Unnamed: 0,docname,similarity
69,X12N0L2003_contentMajOp_SCHALL.txt,0.605013
35,X1BN634003_contentMajOp_SCHALL.txt,0.595868
22,X361C11_contentMajOp_Schall.txt,0.595516
43,X12LP0I003_contentMajOp_SCHALL.txt,0.572147
61,XI5530N_contentMajOp_SCHALL.txt,0.571097


In [109]:
# Load the docname -> vector dictionary; the `with` block closes the file once
# it has been read (the previous pk.load(open(...)) form never closed it).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("docname_vector_dict.p", "rb") as handle:
    docname_vector_dict = pk.load(handle)

In [110]:
# Will map each docname to its vector minus the average vector of its
# year/topic group (filled by the loop over in_both below).
opinion_vec_demeaned_topic_year = dict()

In [111]:
# Docnames that have both a stored vector and a row in master_dataframe.
# Built by scanning the frame (de-duplicated, first occurrence wins) rather than
# intersecting two sets: the elements are the same, but the order is now the
# frame order on every run, whereas set iteration order for strings changes
# from one Python process to the next and leaked into the saved CSV row order.
in_both = [
    name
    for name in dict.fromkeys(master_dataframe["docname"])
    if name in docname_vector_dict
]

## here we calculate for each opinion:
## its difference from the average opinion in its year and topic

In [112]:
# Demean every opinion vector: subtract the average vector of the opinion's
# year/topic group, whose label is looked up in YB_dict.
for docname in in_both:
    opinion_vector = docname_vector_dict[docname]
    group_average = YB_average_dict[YB_dict[docname]]
    opinion_vec_demeaned_topic_year[docname] = opinion_vector - group_average

In [113]:
# Persist the demeaned vectors. The `with` block guarantees the file is flushed
# and closed even if pickling fails part-way; the previous form left that to
# garbage collection.
with open("opinion_vec_demeaned_topic_year.p", "wb") as handle:
    pk.dump(opinion_vec_demeaned_topic_year, handle)

## here we calculate for each judge:
## the difference between the judge's average opinion in a given year and topic and the overall average opinion in that year and topic

In [114]:
# Will map "<judge_name>-<year-big-issue>" keys to the judge's average vector
# for that group minus the group's overall average vector (filled by the loop
# over grouped_by_judge_topic_year below).
judge_topic_year_vec_demeaned_topic_year = dict()

In [115]:
# NOTE(review): nothing in the visible notebook writes to or reads from this
# dict after it is created -- candidate for removal.
judge_topic_year_vec_demeaned_topic_year_more_than_one = dict()

In [116]:
# Distinct judge names in the (already filtered) master_dataframe.
# NOTE(review): not referenced again in the visible part of the notebook.
list_of_judges = list(master_dataframe['judge_name'].unique())

In [117]:
# Group the rows by judge and by the combined 'year-big-issue' column; iterating
# the result yields ((judge_name, year-big-issue), sub-frame) pairs.
grouped_by_judge_topic_year = master_dataframe.groupby(["judge_name",'year-big-issue'])

In [120]:
# For every (judge, year/topic) group: average the group's document vectors and
# subtract the overall average vector of that year/topic.
for group_key, group_rows in grouped_by_judge_topic_year:
    judge_name, year_issue = group_key[0], group_key[1]
    judge_average = return_average_vector(group_rows["docname"].values)
    judge_topic_year_vec_demeaned_topic_year[judge_name + '-' + year_issue] = (
        judge_average - YB_average_dict[year_issue]
    )

In [121]:
def calculating_opinion_YB_judge_YB_dis(docname):
    '''Cosine distance between an opinion's demeaned vector and its judge-group average.

    The judge-group key is JY_dict[docname] + '-' + the second '-'-separated
    field of YB_dict[docname]; it is looked up in
    judge_topic_year_vec_demeaned_topic_year. The opinion vector comes from
    opinion_vec_demeaned_topic_year.
    NOTE(review): for the key to match that dictionary's
    "<judge_name>-<year-big-issue>" keys, JY_dict values presumably look like
    "<judge_name>-<year>" -- confirm.

    Parameters
    ----------
    docname : key identifying the opinion in the dictionaries above.

    Returns
    -------
    float
        scipy cosine distance, or np.nan when a lookup or the computation fails.
    '''
    try:
        judge_YB = JY_dict[docname] + '-' + YB_dict[docname].split("-")[1]
        return cosine(judge_topic_year_vec_demeaned_topic_year[judge_YB],
                      opinion_vec_demeaned_topic_year[docname])
    except Exception:
        # Best-effort: a missing key or malformed entry yields nan. Catching
        # Exception instead of a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate, so a long run over all opinions can be stopped.
        return np.nan

In [122]:
#calculating_opinion_YB_judge_YB_dis_distance('X3FE6A_contentMajOp_WILLIAMS.txt')

In [123]:
# One row per demeaned opinion: its docname and its cosine distance to the
# judge-group average (nan where the distance could not be computed).
docnames_with_vector = list(opinion_vec_demeaned_topic_year.keys())
judge_distances = [calculating_opinion_YB_judge_YB_dis(name) for name in docnames_with_vector]

opinion_YB_judge_YB_dis = pd.DataFrame()
opinion_YB_judge_YB_dis["docname"] = docnames_with_vector
opinion_YB_judge_YB_dis["dis"] = judge_distances

In [124]:
opinion_YB_judge_YB_dis.shape

(292747, 2)

In [125]:
# Attach the master_dataframe columns to each row by matching on docname.
# merge defaults to an inner join, so rows without a match on either side are dropped.
opinion_YB_judge_YB_dis = opinion_YB_judge_YB_dis.merge(master_dataframe, on = "docname")

In [126]:
# Discard the rows for which no distance could be computed (dis is missing).
opinion_YB_judge_YB_dis = opinion_YB_judge_YB_dis.dropna(subset=["dis"])

In [127]:
# Case id = the part of the docname before the first "_".
opinion_YB_judge_YB_dis = opinion_YB_judge_YB_dis.assign(
    caseid=opinion_YB_judge_YB_dis["docname"].map(lambda name: name.split("_")[0])
)

In [128]:
# Save the per-opinion judge-group distances together with the merged metadata.
# NOTE(review): hard-coded absolute path; to_csv also writes the row index as an
# extra unnamed first column by default.
opinion_YB_judge_YB_dis.to_csv('/home/jcai/geometry_of_law/data_and_dictionary/opinion_YB_judge_YB_dis.csv')

In [139]:
def calculating_opinion_judge_YB_dis(docname):
    '''Cosine distance between an opinion's demeaned vector and its judge-group average.

    The judge-group key is JY_dict[docname] + '-' + the second '-'-separated
    field of YB_dict[docname]; it is looked up in
    judge_topic_year_vec_demeaned_topic_year. The opinion vector comes from
    opinion_vec_demeaned_topic_year.
    NOTE(review): this body is identical to calculating_opinion_YB_judge_YB_dis
    and nothing in the visible notebook calls this function -- presumably an
    unfinished variant; confirm the intended computation or remove it.

    Parameters
    ----------
    docname : key identifying the opinion in the dictionaries above.

    Returns
    -------
    float
        scipy cosine distance, or np.nan when a lookup or the computation fails.
    '''
    try:
        judge_YB = JY_dict[docname] + '-' + YB_dict[docname].split("-")[1]
        return cosine(judge_topic_year_vec_demeaned_topic_year[judge_YB],
                      opinion_vec_demeaned_topic_year[docname])
    except Exception:
        # Best-effort: a missing key or malformed entry yields nan. Catching
        # Exception instead of a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate.
        return np.nan

In [None]:
# FIXME(review): this cell cannot run as written (it was never executed).
# judge_topic_year_vec_demeaned_judge_topic_year is not defined anywhere in the
# visible notebook, and it is used both as the iterable and as the mapping being
# filled. The loop body matches the earlier loop over grouped_by_judge_topic_year,
# so presumably that groupby object was meant to be iterated and the dict created
# beforehand -- confirm the intended computation before fixing.
for name, group in judge_topic_year_vec_demeaned_judge_topic_year:
    judge_topic_year_vec_demeaned_judge_topic_year[name[0]+'-'+name[1]] = return_average_vector(group["docname"].values) - YB_average_dict[name[1]]

# here we calculate the cosine distance between each opinion and the average opinion in its year and topic

In [134]:
def calculating_opinion_YB_dis(docname):
    '''Cosine distance between an opinion's vector and its year/topic average.

    The average is YB_average_dict[YB_dict[docname]]; the opinion vector is
    docname_vector_dict[docname] (the raw vector, not the demeaned one).

    Parameters
    ----------
    docname : key identifying the opinion in the dictionaries above.

    Returns
    -------
    float
        scipy cosine distance, or np.nan when a lookup or the computation fails.
    '''
    try:
        return cosine(YB_average_dict[YB_dict[docname]], docname_vector_dict[docname])
    except Exception:
        # Best-effort: a missing key yields nan. Catching Exception instead of
        # a bare `except:` lets KeyboardInterrupt and SystemExit propagate.
        return np.nan

In [135]:
# One row per demeaned opinion: its docname and its cosine distance to the
# average vector of its year/topic (nan where it could not be computed).
docnames_with_vector = list(opinion_vec_demeaned_topic_year.keys())
year_topic_distances = [calculating_opinion_YB_dis(name) for name in docnames_with_vector]

opinion_YB_dis = pd.DataFrame()
opinion_YB_dis["docname"] = docnames_with_vector
opinion_YB_dis["dis"] = year_topic_distances

In [136]:
# Attach the master_dataframe columns to each row by matching on docname
# (merge defaults to an inner join).
# NOTE(review): unlike opinion_YB_judge_YB_dis, rows with a missing dis are not dropped here.
opinion_YB_dis = opinion_YB_dis.merge(master_dataframe, on = "docname")

In [137]:
# Case id = the part of the docname before the first "_".
opinion_YB_dis = opinion_YB_dis.assign(
    caseid=opinion_YB_dis["docname"].map(lambda name: name.split("_")[0])
)

In [138]:
# Save the per-opinion year/topic distances together with the merged metadata.
# NOTE(review): hard-coded absolute path; to_csv also writes the row index as an
# extra unnamed first column by default.
opinion_YB_dis.to_csv('/home/jcai/geometry_of_law/data_and_dictionary/opinion_YB_dis.csv')

# loading metadata

In [4]:
# Load the outcome data that is left-joined onto the distances via 'caseid' below.
# (The file name suggests a circuit-case to Supreme Court decision map -- TODO confirm.)
# NOTE(review): hard-coded absolute path.
outcome_data = pd.read_csv('/home/jcai/geometry_of_law/data_and_dictionary/circuit_case_sc_decision_map_full.csv')

In [173]:
outcome_data.columns

In [174]:
# Left-join the outcome data onto the judge-group distances by caseid, keeping
# every distance row even when no outcome is recorded for its case.
# merge always returns a new frame and never modifies its inputs, so the
# previous .copy() of the whole left frame was redundant work and memory.
merged = opinion_YB_judge_YB_dis.merge(outcome_data, how="left", on="caseid")

In [175]:
merged.shape

(292733, 34)

In [44]:
import pandas as pd
import statsmodels.formula.api as sm
import statsmodels.stats.sandwich_covariance as sw
import numpy as np
import statsmodels as statsmodels

In [45]:
# Regression sample: the rows of `merged` where if_affirmed is not missing.
merged_affirmed = merged.copy().dropna(subset=["if_affirmed"])

In [46]:
# Rows of `merged_all` where if_affirmed is not missing.
# FIXME(review): `merged_all` is never defined in the visible notebook, so this
# cell fails on a fresh kernel. Presumably it is opinion_YB_dis left-joined with
# outcome_data on caseid (mirroring how `merged` is built) -- confirm and add
# the defining cell.
merged_all_affirmed = merged_all.copy()
merged_all_affirmed = merged_all_affirmed[merged_all_affirmed.if_affirmed.notna()]

In [47]:
# OLS of if_affirmed on dis, with standard errors clustered on the 'Circuit'
# column and t-based inference.
affirmed_ols_spec = sm.ols(formula='if_affirmed ~ dis', data=merged_affirmed)
cluster_court_affirmed_ols = affirmed_ols_spec.fit(
    cov_type='cluster',
    cov_kwds={'groups': merged_affirmed['Circuit']},
    use_t=True,
)

In [48]:
cluster_court_affirmed_ols.summary()

0,1,2,3
Dep. Variable:,if_affirmed,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,5.756
Date:,"Thu, 24 Jan 2019",Prob (F-statistic):,0.0336
Time:,02:28:51,Log-Likelihood:,-179700.0
No. Observations:,264324,AIC:,359400.0
Df Residuals:,264322,BIC:,359400.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6814,0.020,33.458,0.000,0.637,0.726
dis,-0.0750,0.031,-2.399,0.034,-0.143,-0.007

0,1,2,3
Omnibus:,14313.011,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45548.812
Skew:,-0.617,Prob(JB):,0.0
Kurtosis:,1.384,Cond. No.,7.29


In [51]:
# Regression sample: the rows of `merged` where Dissenting1 is not missing.
merged_Dissenting1 = merged.copy().dropna(subset=["Dissenting1"])

In [52]:
# Rows of `merged_all` where Dissenting1 is not missing.
# FIXME(review): `merged_all` is never defined in the visible notebook, so this
# cell fails on a fresh kernel -- add the cell that builds it.
# NOTE(review): merged_all_Dissenting1 is not used again in the visible notebook.
merged_all_Dissenting1 = merged_all.copy()
merged_all_Dissenting1 = merged_all_Dissenting1[merged_all_Dissenting1.Dissenting1.notna()]

In [53]:
# OLS of Dissenting1 on dis, with standard errors clustered on the 'Circuit'
# column and t-based inference.
dissenting_ols_spec = sm.ols(formula='Dissenting1 ~ dis', data=merged_Dissenting1)
cluster_court_Dissenting1_ols = dissenting_ols_spec.fit(
    cov_type='cluster',
    cov_kwds={'groups': merged_Dissenting1['Circuit']},
    use_t=True,
)

In [54]:
cluster_court_Dissenting1_ols.summary()

0,1,2,3
Dep. Variable:,Dissenting1,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,40.76
Date:,"Thu, 24 Jan 2019",Prob (F-statistic):,3.48e-05
Time:,02:28:52,Log-Likelihood:,-253460.0
No. Observations:,264019,AIC:,506900.0
Df Residuals:,264017,BIC:,506900.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1113,0.010,10.887,0.000,0.089,0.134
dis,0.1614,0.025,6.385,0.000,0.106,0.216

0,1,2,3
Omnibus:,187678.017,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2003318.785
Skew:,3.532,Prob(JB):,0.0
Kurtosis:,14.499,Cond. No.,7.29


In [57]:
from statsmodels.discrete.discrete_model import Logit

In [64]:
# Logit of if_affirmed on dis, fitted on merged_all_affirmed.
# NOTE(review): despite the "cluster_court" prefix, fit() is called without
# cov_type/cov_kwds, so these standard errors are not clustered on 'Circuit'
# (unlike the OLS fits earlier in the notebook) -- confirm whether that is intended.
cluster_court_affirmed_logit_all = Logit.from_formula(formula='if_affirmed ~ dis', data=merged_all_affirmed).fit()

Optimization terminated successfully.
         Current function value: 0.649245
         Iterations 4


In [65]:
cluster_court_affirmed_logit_all.summary()

0,1,2,3
Dep. Variable:,if_affirmed,No. Observations:,288749.0
Model:,Logit,Df Residuals:,288747.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 24 Jan 2019",Pseudo R-squ.:,0.0003878
Time:,02:47:55,Log-Likelihood:,-187470.0
converged:,True,LL-Null:,-187540.0
,,LLR p-value:,1.713e-33

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.7103,0.010,73.812,0.000,0.691,0.729
dis,-0.2126,0.018,-12.076,0.000,-0.247,-0.178
