In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pickle as pk
import os
import pandas as pd
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
##switch the working directory to the data folder; relative paths used
##afterwards (e.g. the pickle files written further down) resolve against it
DATA_AND_DICTIONARY_DIR = '/home/jcai/geometry_of_law/data_and_dictionary'
os.chdir(DATA_AND_DICTIONARY_DIR)

##load the doc2vec model from disk
DOC2VEC_MODEL_PATH = '/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v'
model = Doc2Vec.load(DOC2VEC_MODEL_PATH)

In [3]:
##collect the document tags known to the model into a set; it is used below
##to keep only the dataframe rows whose docname is among these tags
set_of_doc_names = set(model.docvecs.doctags)

In [4]:
##read the master dataframe from its csv file
MASTER_DATAFRAME_CSV = "/home/jcai/geometry_of_law/Encyclopedia Entry/master_dataframe.csv"
master_dataframe = pd.read_csv(MASTER_DATAFRAME_CSV)

In [5]:
##(rows, columns) of the dataframe as read from the csv, before any filtering
master_dataframe.shape

(292765, 15)

In [6]:
##keep only the rows whose docname is one of the model's document tags
##.copy() makes the filtered result an independent dataframe, so the columns
##that are added to it further down do not trigger SettingWithCopyWarning
has_vector = master_dataframe['docname'].isin(set_of_doc_names)
master_dataframe = master_dataframe[has_vector].copy()

In [7]:
##defining function to return list of document name
def return_list_of_docname(column, value):
    '''Return the docnames of the rows of master_dataframe where `column` equals `value`.

    Reads the module-level `master_dataframe`.

    column -- label of the column to compare
    value  -- value the column is compared against with `==`; note that a NaN
              value never matches, because NaN == NaN is False

    Returns a plain list with the 'docname' entry of every matching row, in
    row order (an empty list when nothing matches).
    '''
    row_matches = master_dataframe[column] == value
    return master_dataframe.loc[row_matches, 'docname'].tolist()

In [8]:
##defining function to return the average vector
def return_average_vector(list_of_docname):
    '''Return the element-wise mean of the document vectors of the given docnames.

    Reads the module-level Doc2Vec `model`.

    list_of_docname -- iterable of document tags to look up in model.docvecs

    Returns the mean of the looked-up vectors taken along axis 0. For an empty
    list numpy returns nan (and emits a RuntimeWarning) instead of a vector.
    '''
    # Look the tags up in model.docvecs explicitly. Indexing the model itself
    # (model[tag]) falls back to the *word* vector when the tag also happens to
    # be a vocabulary word (gensim 3.x behaviour -- confirm for the installed
    # version), which would silently mix word vectors into the average.
    list_of_vectors = [model.docvecs[x] for x in list_of_docname]
    return np.mean(list_of_vectors, axis=0)

In [9]:
#the following is largely from
#3.1.2 application 1 calculate average vector for each party
#collect the docnames of the opinions whose 'party' is Republican / Democratic
list_of_republican_docname = return_list_of_docname(column='party', value="Republican")
list_of_democrat_docname = return_list_of_docname(column='party', value="Democratic")
#average the document vectors of each of the two lists
average_republican_vector = return_average_vector(list_of_republican_docname)
average_democrat_vector = return_average_vector(list_of_democrat_docname)


#3.1.3 application 2 calculate average vector for judge
#find the set of docnames (opinions) by judge
list_of_judges = list(master_dataframe['judge_name'].unique())
list_list_of_docnames_judges = [return_list_of_docname(column = 'judge_name', value = x) for x in list_of_judges]
#map each judge to the list of docnames and save it
judge_docname_list_dict = dict(zip(list_of_judges, list_list_of_docnames_judges))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("judge_docname_list_dict.p", "wb") as pickle_file:
    pk.dump(judge_docname_list_dict, pickle_file)
#find the average vector by judge
list_of_average_vectors_for_judges = [return_average_vector(docnames) for docnames in list_list_of_docnames_judges]
#make a dictionary {judge: average vector} and save it
judge_average_dict = dict(zip(list_of_judges, list_of_average_vectors_for_judges))
with open("judge_average_dict.p", "wb") as pickle_file:
    pk.dump(judge_average_dict, pickle_file)

#testing (numpy is imported as np in this notebook)
#for x in list(judge_average_dict.values()):
#    if np.isnan(x).any():
#        print(x)
#np.isnan(list(judge_average_dict.values())).any()


#3.1.4 application 3 calculate average vector for circuit-year
#find the set of docnames (opinions) by circuit-year
list_of_CY = list(master_dataframe['circuit-year'].unique())
list_list_of_docnames_CY = [return_list_of_docname(column = 'circuit-year', value = x) for x in list_of_CY]
#find the average vector by CY
list_of_average_vectors_for_CY = [return_average_vector(docnames) for docnames in list_list_of_docnames_CY]
#make a dictionary {circuit-year: average vector}
CY_average_dict = dict(zip(list_of_CY, list_of_average_vectors_for_CY))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("CY_average_dict.p", "wb") as pickle_file:
    pk.dump(CY_average_dict, pickle_file)

#np.isnan(list(CY_average_dict.values())).any()

#3.1.5 application 4 calculate average vector for circuit-big-issue
#build the circuit-big-issue key for every row with vectorised string
#concatenation instead of a per-row python loop
#NOTE: a row with a missing circuit or big-issue now gets a missing key,
#whereas the per-row loop raised a TypeError on such a row
master_dataframe['circuit-big-issue'] = master_dataframe['circuit'] + '-' + master_dataframe['big-issue']
#find the set of docnames (opinions) by circuit-big-issue
list_of_CB = list(master_dataframe['circuit-big-issue'].unique())
list_list_of_docnames_CB = [return_list_of_docname(column = 'circuit-big-issue', value = x) for x in list_of_CB]
#find the average vector by CB
list_of_average_vectors_for_CB = [return_average_vector(docnames) for docnames in list_list_of_docnames_CB]
#make a dictionary {circuit-big-issue: average vector}
CB_average_dict = dict(zip(list_of_CB, list_of_average_vectors_for_CB))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("CB_average_dict.p", "wb") as pickle_file:
    pk.dump(CB_average_dict, pickle_file)

#3.1.6 application 5 calculate average vector for year-big-issue
#build the year-big-issue key for every row with vectorised string
#concatenation instead of a per-row python loop
#NOTE: a row with a missing big-issue now gets a missing key, whereas the
#per-row loop raised a TypeError on such a row
master_dataframe['year-big-issue'] = master_dataframe['year'].astype(str) + '-' + master_dataframe['big-issue']
#find the set of docnames (opinions) by year-big-issue
list_of_YB = list(master_dataframe['year-big-issue'].unique())
list_list_of_docnames_YB = [return_list_of_docname(column = 'year-big-issue', value = x) for x in list_of_YB]
#find the average vector by YB
list_of_average_vectors_for_YB = [return_average_vector(docnames) for docnames in list_list_of_docnames_YB]
#make a dictionary {year-big-issue: average vector}
YB_average_dict = dict(zip(list_of_YB, list_of_average_vectors_for_YB))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("YB_average_dict.p", "wb") as pickle_file:
    pk.dump(YB_average_dict, pickle_file)

#3.1.6 application 6 calculate average vector for judge-year
#build the judge-year key for every row with vectorised string concatenation
#instead of a per-row python loop
#NOTE: a row with a missing judge_name now gets a missing key, whereas the
#per-row loop raised a TypeError on such a row
master_dataframe['judge-year'] = master_dataframe['judge_name'] + '-' + master_dataframe['year'].astype(str)
#find the set of docnames (opinions) by judge-year
list_of_JY = list(master_dataframe['judge-year'].unique())
list_list_of_docnames_JY = [return_list_of_docname(column = 'judge-year', value = x) for x in list_of_JY]
#find the average vector by JY
list_of_average_vectors_for_JY = [return_average_vector(docnames) for docnames in list_list_of_docnames_JY]
#make a dictionary {judge-year: average vector}
JY_average_dict = dict(zip(list_of_JY, list_of_average_vectors_for_JY))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("JY_average_dict.p", "wb") as pickle_file:
    pk.dump(JY_average_dict, pickle_file)

#3.1.7 application 7 calculate average vector for circuit-big-issue-year
#build the circuit-big-issue-year key for every row with vectorised string
#concatenation instead of a per-row python loop
#NOTE: a row with a missing circuit or big-issue now gets a missing key,
#whereas the per-row loop raised a TypeError on such a row
master_dataframe['circuit-big-issue-year'] = (
    master_dataframe['circuit'] + '-' + master_dataframe['big-issue'] + '-' + master_dataframe['year'].astype(str)
)
#find the set of docnames (opinions) by circuit-big-issue-year
list_of_CBY = list(master_dataframe['circuit-big-issue-year'].unique())
list_list_of_docnames_CBY = [return_list_of_docname(column = 'circuit-big-issue-year', value = x) for x in list_of_CBY]
#find the average vector by CBY
list_of_average_vectors_for_CBY = [return_average_vector(docnames) for docnames in list_list_of_docnames_CBY]
#make a dictionary {circuit-big-issue-year: average vector}
CBY_average_dict = dict(zip(list_of_CBY, list_of_average_vectors_for_CBY))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("CBY_average_dict.p", "wb") as pickle_file:
    pk.dump(CBY_average_dict, pickle_file)

#3.1.8 application 8 calculate average vector for year
#find the set of docnames (opinions) by year
list_of_Y = list(master_dataframe['year'].unique())
list_list_of_docnames_Y = [return_list_of_docname(column = 'year', value = x) for x in list_of_Y]
#find the average vector by Y
list_of_average_vectors_for_Y = [return_average_vector(docnames) for docnames in list_list_of_docnames_Y]
#make a dictionary {year: average vector}; the keys are converted to plain python ints
Y_average_dict = dict(zip([int(year) for year in list_of_Y], list_of_average_vectors_for_Y))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("Y_average_dict.p", "wb") as pickle_file:
    pk.dump(Y_average_dict, pickle_file)

#3.1.9 application 9 calculate average vector for judge-decade
#create the decade, judge-decade and court-decade columns with vectorised
#operations instead of per-row python loops
#decade = year with its last digit dropped (floor division; same as int(year/10) for non-negative years)
master_dataframe['decade'] = (master_dataframe['year'] // 10).astype(int)
#judge_name goes through str(), so a missing judge_name becomes the text 'nan'
master_dataframe['judge-decade'] = master_dataframe['judge_name'].astype(str) + '-' + master_dataframe['decade'].astype(str)
#NOTE: a row with a missing circuit now gets a missing court-decade key,
#whereas the per-row loop raised a TypeError on such a row
master_dataframe['court-decade'] = master_dataframe['circuit'] + '-' + master_dataframe['decade'].astype(str)

#find the set of docnames (opinions) by judge-decade
list_of_JD = list(master_dataframe['judge-decade'].unique())
list_list_of_docnames_JD = [return_list_of_docname(column = 'judge-decade', value = x) for x in list_of_JD]
#find the average vector by JD
list_of_average_vectors_for_JD = [return_average_vector(docnames) for docnames in list_list_of_docnames_JD]
#make a dictionary {judge-decade: average vector}
JD_average_dict = dict(zip(list_of_JD, list_of_average_vectors_for_JD))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("JD_average_dict.p", "wb") as pickle_file:
    pk.dump(JD_average_dict, pickle_file)

#3.1.10 application 10 calculate average vector for court-decade
#find the set of docnames (opinions) by court-decade
list_of_CD = list(master_dataframe['court-decade'].unique())
list_list_of_docnames_CD = [return_list_of_docname(column = 'court-decade', value = x) for x in list_of_CD]
#find the average vector by CD
list_of_average_vectors_for_CD = [return_average_vector(docnames) for docnames in list_list_of_docnames_CD]
#make a dictionary {court-decade: average vector}
CD_average_dict = dict(zip(list_of_CD, list_of_average_vectors_for_CD))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("CD_average_dict.p", "wb") as pickle_file:
    pk.dump(CD_average_dict, pickle_file)

#3.1.11 application 11 calculate average vector for court
#find the set of docnames (opinions) by court (the 'circuit' column)
list_of_C = list(master_dataframe['circuit'].unique())
list_list_of_docnames_C = [return_list_of_docname(column = 'circuit', value = x) for x in list_of_C]
#find the average vector by C
list_of_average_vectors_for_C = [return_average_vector(docnames) for docnames in list_list_of_docnames_C]
#make a dictionary {circuit: average vector}
C_average_dict = dict(zip(list_of_C, list_of_average_vectors_for_C))
#write inside a with-block: a file object passed straight to pk.dump is never
#explicitly closed, so the pickle may not be flushed to disk right away
with open("C_average_dict.p", "wb") as pickle_file:
    pk.dump(C_average_dict, pickle_file)