In [1]:
import os

import pandas as pd
import json

In [2]:
from convokit import Corpus
from convokit.text_processing import TextProcessor, TextToArcs
from convokit import download

In [11]:
from convokit.convokitPipeline import ConvokitPipeline

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
DEMO_CORPUS_NAME = 'oyez_2019'

In [4]:
DATA_DIR = '/kitchen/experimental_justine/scotus/demo_data'

In [6]:
demo_corpus = Corpus(download(DEMO_CORPUS_NAME, data_dir=DATA_DIR))
# demo_corpus = Corpus(os.path.join(DATA_DIR, DEMO_CORPUS_NAME))

Downloading oyez_2019 to /kitchen/experimental_justine/scotus/demo_data/oyez_2019
Downloading oyez_2019 from http://zissou.infosci.cornell.edu/convokit/datasets/oyez-corpus/oyez_2019.zip (14.2MB)... Done


In [7]:
demo_corpus.print_summary_stats()

Number of Speakers: 113
Number of Utterances: 13707
Number of Conversations: 58


In [10]:
demo_corpus.load_info('utterance',['parsed'])

In [12]:
def _join_sentences(sents):
    """Merge the per-sentence strings into a single newline-separated string."""
    return '\n'.join(sents)


def _count_words(sents):
    """Count tokens over all parsed sentences, ignoring tokens tagged '_SP'."""
    # NOTE(review): '_SP' is presumably the parser's whitespace tag -- confirm.
    return sum(1 for sent in sents for tok in sent['toks'] if tok['tag'] != '_SP')


def _tokens_by_line(sents):
    """Render one line per sentence, with that sentence's tokens space-separated."""
    lines = []
    for sent in sents:
        lines.append(' '.join(tok['tok'] for tok in sent['toks']).strip())
    return '\n'.join(lines)


# Text preparation pipeline; every step writes a new field that later cells read:
#   arcs_per_sent -> output of TextToArcs (presumably one arc string per sentence)
#   arcs          -> the arcs_per_sent entries joined with newlines
#   wordcount     -> token count computed from the 'parsed' field
#   tokens        -> tokenized text rebuilt from the 'parsed' field
text_prep_pipe = ConvokitPipeline([
    ('arcs_per_sent', TextToArcs(output_field='arcs_per_sent')),
    ('arcs', TextProcessor(input_field='arcs_per_sent', output_field='arcs',
                           proc_fn=_join_sentences)),
    ('wordcount', TextProcessor(input_field='parsed', output_field='wordcount',
                                proc_fn=_count_words)),
    ('tokens', TextProcessor(input_field='parsed', output_field='tokens',
                             proc_fn=_tokens_by_line)),
])

In [13]:
demo_corpus = text_prep_pipe.transform(demo_corpus)

In [20]:
utt = demo_corpus.get_utterance('24929__0_000')

In [21]:
print(utt.text)

We'll hear argument next in Case 18-877, Allen versus Cooper. Mr. Shaffer.


In [22]:
utt.retrieve_meta('wordcount')

18

In [23]:
utt.retrieve_meta('arcs')

"'ll_* allen_* allen_versus argument_* case_* cooper_* hear_'ll hear_* hear_allen hear_argument hear_next hear_we in_* in_case next_* next_in versus_* versus_cooper we>'ll we>* we_*\nshaffer_*"

In [25]:
print(utt.retrieve_meta('tokens'))

We 'll hear argument next in Case 18 - 877 , Allen versus Cooper .
Mr. Shaffer .


In [26]:
def get_context_id_df(corpus):
    """Build a table linking each utterance to its predecessor and successor.

    Parameters
    ----------
    corpus : object exposing ``iter_utterances()``; each yielded utterance
        must have ``id`` and ``reply_to`` attributes.

    Returns
    -------
    pandas.DataFrame with columns ``id``, ``prev_id`` (the utterance's
    ``reply_to``) and ``next_id`` (the first utterance, in iteration order,
    that replies to ``id``; missing when nothing replies to it).
    """
    records = []
    for utterance in corpus.iter_utterances():
        records.append({'id': utterance.id, 'prev_id': utterance.reply_to})
    id_frame = pd.DataFrame(records)

    # Keep only the first reply per parent so the lookup below has a unique
    # index and the join cannot multiply rows.
    first_replies = id_frame.drop_duplicates('prev_id')
    next_lookup = first_replies.set_index('prev_id')['id'].rename('next_id')

    return id_frame.join(next_lookup, on='id')

In [27]:
context_id_df = get_context_id_df(demo_corpus)

In [28]:
context_id_df.head()

Unnamed: 0,id,prev_id,next_id
0,24929__0_000,,24929__0_001
1,24929__0_001,24929__0_000,24929__0_002
2,24929__0_002,24929__0_001,24929__0_003
3,24929__0_003,24929__0_002,24929__0_004
4,24929__0_004,24929__0_003,24929__0_005


In [29]:
def _has_role_and_arcs(utt, role):
    """True when the utterance's 'speaker_type' equals `role` and its 'arcs' are non-empty."""
    if utt.retrieve_meta('speaker_type') != role:
        return False
    return utt.retrieve_meta('arcs') != ''


def source_filter(utt):
    """Select utterances with speaker_type 'J' that have a non-empty 'arcs' field."""
    return _has_role_and_arcs(utt, 'J')


def target_filter(utt):
    """Select utterances with speaker_type 'A' that have a non-empty 'arcs' field."""
    return _has_role_and_arcs(utt, 'A')

In [30]:
# Evaluate both filters once per utterance and store the results as utterance
# info under 'source_filter' / 'target_filter', so they can be read back as
# columns of an attribute table afterwards.
for utt in demo_corpus.iter_utterances():
    utt.set_info('source_filter',source_filter(utt))
    utt.set_info('target_filter',target_filter(utt))

In [31]:
utt_df = demo_corpus.get_attribute_table('utterance', ['wordcount', 'source_filter','target_filter'])

In [32]:
utt_df.head()

Unnamed: 0_level_0,source_filter,target_filter,wordcount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24929__0_000,True,False,18
24929__0_001,False,True,390
24929__0_002,True,False,45
24929__0_003,False,True,161
24929__0_004,True,False,4


In [33]:
# Attach the utterance attributes three times: for the utterance itself, for
# its predecessor (columns suffixed '_prev') and for its successor (columns
# suffixed '_next').
full_context_df = (
    context_id_df
    .join(utt_df, on='id')
    .join(utt_df, on='prev_id', rsuffix='_prev')
    .join(utt_df, on='next_id', rsuffix='_next')
)

In [34]:
full_context_df.head()

Unnamed: 0,id,prev_id,next_id,source_filter,target_filter,wordcount,source_filter_prev,target_filter_prev,wordcount_prev,source_filter_next,target_filter_next,wordcount_next
0,24929__0_000,,24929__0_001,True,False,18,,,,False,True,390.0
1,24929__0_001,24929__0_000,24929__0_002,False,True,390,True,False,18.0,True,False,45.0
2,24929__0_002,24929__0_001,24929__0_003,True,False,45,False,True,390.0,False,True,161.0
3,24929__0_003,24929__0_002,24929__0_004,False,True,161,True,False,45.0,True,False,4.0
4,24929__0_004,24929__0_003,24929__0_005,True,False,4,False,True,161.0,False,True,7.0


In [36]:
min_wc_source = 10
max_wc_source = 50
min_wc_target = 10
max_wc_target = 75

In [37]:
# Candidate source utterances: they pass the source filter, their own word
# count lies within the source bounds, and both neighbouring utterances have
# word counts within the target bounds.
source_df = full_context_df.loc[
    full_context_df.source_filter
    & full_context_df.wordcount.between(min_wc_source, max_wc_source)
    & full_context_df.wordcount_prev.between(min_wc_target, max_wc_target)
    & full_context_df.wordcount_next.between(min_wc_target, max_wc_target)
].set_index('id')

In [39]:
# Target utterances: they pass the target filter and have a word count within
# the target bounds.
target_df = full_context_df.loc[
    full_context_df.target_filter
    & full_context_df.wordcount.between(min_wc_target, max_wc_target)
].set_index('id')

# Keep only the sources whose previous AND next utterances both survived the
# target selection above.
source_df = source_df.loc[
    source_df.prev_id.isin(target_df.index)
    & source_df.next_id.isin(target_df.index)
]

In [40]:
len(source_df)

353

In [41]:
len(target_df)

2087

In [42]:
text_cols = ['arcs','tokens']
text_df = demo_corpus.get_attribute_table('utterance',text_cols)

In [43]:
source_df = source_df[['prev_id','next_id']].join(text_df)
target_df = target_df[[]].join(text_df)

In [44]:
source_df.head()

Unnamed: 0_level_0,prev_id,next_id,arcs,tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24929__0_014,24929__0_013,24929__0_015,but>* but>how could_* going_* going_rules goin...,But how -- how could -- how could we have the ...
24929__0_016,24929__0_015,24929__0_017,'re_* asking_'re asking_* asking_basically ask...,"So , basically , you 're asking us to overrule..."
24929__0_054,24929__0_053,24929__0_055,a_* by_* by_government by_state constitutional...,Every -- every infringement is a violate -- ev...
24929__0_082,24929__0_081,24929__0_083,all>* all_* california_* over_* over_all over_...,All over California .\nWhy does n't California...
24929__0_086,24929__0_085,24929__0_087,'m_* about_* about_copyright copyright_* i>'m ...,I 'm not talking about copyright .\nI 'm talki...


In [45]:
target_df.head()

Unnamed: 0_level_0,arcs,tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1
24929__0_013,argument_* argument_kagan argument_that be_* b...,It would be certainly open to folks in patent ...
24929__0_015,be_* be_prediction be_that be_would my_* predi...,That would be my prediction .\nMy prediction i...
24929__0_017,'m_* alito_* alito_justice alito_think asking_...,"I 'm asking this Court to follow Katz , Justic..."
24929__0_019,basis_* basis_for basis_the florida_* for_* fo...,I think it -- it overruled -- it overruled the...
24929__0_025,court_* court_the decide_* decide_court decide...,"Well , Justice Kavanaugh , obviously , the Cou..."


In [46]:
MIN_YEAR = 1955
MAX_YEAR = 2019

In [47]:
source_dfs = []
target_dfs = []

In [48]:
# Load the per-year source/target tables for MIN_YEAR..MAX_YEAR (inclusive).
# The lists are reset here, in the same cell that fills them, so re-running
# this cell does not append every year's frames a second time (which would
# silently duplicate rows in the concatenated tables).
# NOTE(review): the 'oyez_<year>.source.tsv' / '.target.tsv' files are not
# written anywhere in this notebook -- they must already exist under DATA_DIR.
source_dfs, target_dfs = [], []
for year in range(MIN_YEAR, MAX_YEAR + 1):
    year_stem = os.path.join(DATA_DIR, 'oyez_' + str(year))
    source_dfs.append(pd.read_csv(year_stem + '.source.tsv', sep='\t', index_col=0))
    target_dfs.append(pd.read_csv(year_stem + '.target.tsv', sep='\t', index_col=0))

In [50]:
source_df = pd.concat(source_dfs)
target_df = pd.concat(target_dfs)

In [51]:
len(source_df)

91924

In [52]:
len(target_df)

372268

In [54]:
import numpy as np

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.preprocessing import normalize

from sklearn.metrics.pairwise import cosine_distances
from scipy import sparse

In [55]:
source_df['mtx_idx'] = np.arange(len(source_df))
target_df['mtx_idx'] = np.arange(len(target_df))

In [56]:
MIN_DF = 100
MAX_DF = 1.
MAX_FEATURES = 2000

TEXT_COL = 'arcs'

In [57]:
class ColNormedTfidf(TransformerMixin):
    """TF-IDF vectorizer wrapper that returns a term-by-document matrix.

    Documents are vectorized with a ``TfidfVectorizer`` that treats every
    whitespace-delimited chunk as a token. The result is transposed, so rows
    are vocabulary terms and columns are documents. When ``norm_cols`` is
    true, each term's row is divided by that term's norm as measured on the
    data passed to ``fit``.

    Parameters
    ----------
    norm_cols : bool, default True
        Whether to scale each term by its fitted norm.
    **kwargs
        Forwarded unchanged to ``TfidfVectorizer``.
    """

    def __init__(self, norm_cols=True, **kwargs):
        self.tfidf_model = TfidfVectorizer(token_pattern=r'(?u)(\S+)', **kwargs)
        self.norm_cols = norm_cols

    def _fit_raw(self, X):
        """Fit the vectorizer and the per-term norms; return the raw doc-by-term matrix."""
        # Imported explicitly: `from scipy import sparse` alone does not
        # guarantee that the `sparse.linalg` submodule has been loaded.
        from scipy.sparse.linalg import norm as sparse_norm

        tfidf_vects_raw = self.tfidf_model.fit_transform(X)
        # One norm per vocabulary term (column of the doc-by-term matrix).
        self.tfidf_norms = sparse_norm(tfidf_vects_raw, axis=0)
        return tfidf_vects_raw

    def _scale(self, tfidf_vects_raw):
        """Transpose a raw doc-by-term matrix to term-by-doc and apply the scaling."""
        # NOTE(review): dividing the sparse matrix by a dense array produces a
        # dense result (an np.matrix with current scipy, it seems); downstream
        # cells appear to rely on that type, so it is deliberately unchanged.
        if self.norm_cols:
            divisor = self.tfidf_norms[:, np.newaxis]
        else:
            # Dividing by ones keeps the output type identical to the
            # normalized branch without changing any value.
            divisor = np.ones_like(self.tfidf_norms[:, np.newaxis])
        return tfidf_vects_raw.T / divisor

    def fit(self, X, y=None):
        """Fit the vocabulary, IDF weights and per-term norms on ``X``.

        Returns ``self`` so that calls can be chained
        (``obj.fit(X).transform(X)``) and the object can be used where the
        scikit-learn estimator convention is expected.
        """
        self._fit_raw(X)
        return self

    def transform(self, X):
        """Vectorize ``X`` and return the (optionally scaled) term-by-document matrix."""
        return self._scale(self.tfidf_model.transform(X))

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its term-by-document matrix.

        The matrix produced while fitting is reused, so the corpus is
        vectorized once rather than twice.
        """
        return self._scale(self._fit_raw(X))

    def get_feature_names(self):
        """Return the vocabulary terms as a list, in row order of the output matrix.

        Uses ``get_feature_names_out`` when the wrapped vectorizer provides it
        (``get_feature_names`` was removed in scikit-learn 1.2) and falls back
        to the legacy method on older versions.
        """
        names_out = getattr(self.tfidf_model, 'get_feature_names_out', None)
        if names_out is not None:
            return list(names_out())
        return self.tfidf_model.get_feature_names()

    def get_params(self, deep=True):
        """Return the wrapped vectorizer's parameters (``norm_cols`` is not included)."""
        return self.tfidf_model.get_params(deep=deep)

    def set_params(self, **params):
        """Set parameters on the wrapped vectorizer and return that vectorizer."""
        return self.tfidf_model.set_params(**params)

In [58]:
source_tfidf_obj = ColNormedTfidf(max_features=MAX_FEATURES, binary=True,
                                 min_df=MIN_DF, max_df=MAX_DF)
source_tfidf_vect = source_tfidf_obj.fit_transform(source_df[TEXT_COL].values)

In [59]:
source_tfidf_vect.shape

(2000, 91924)

In [60]:
target_tfidf_obj = ColNormedTfidf(max_features=MAX_FEATURES, binary=True,
                                 min_df=MIN_DF, max_df=MAX_DF)
target_tfidf_vect = target_tfidf_obj.fit_transform(target_df[TEXT_COL].values)

In [61]:
target_tfidf_vect.shape

(2000, 372268)

In [62]:
frequency = np.array(source_tfidf_vect > 0).sum(axis=1)

In [63]:
SVD_DIMS = 15
RANDOM_STATE = 2019

In [64]:
def get_svd_obj(vect, svd_dims, random_state=RANDOM_STATE):
    """Run a truncated randomized SVD on `vect` and package the factors.

    Parameters
    ----------
    vect : matrix to decompose.
    svd_dims : number of components to keep.
    random_state : seed forwarded to ``randomized_svd``.

    Returns
    -------
    dict with keys ``'U'`` (first factor), ``'s'`` (singular values) and
    ``'V'`` (the third factor returned by ``randomized_svd``, transposed).
    """
    left_factor, singular_values, right_factor_t = randomized_svd(
        vect,
        n_components=svd_dims,
        random_state=random_state,
    )
    return dict(U=left_factor, s=singular_values, V=right_factor_t.T)

In [65]:
target_svd_obj = get_svd_obj(target_tfidf_vect, SVD_DIMS)

In [68]:
target_svd_obj['s']

array([4.96525955, 2.22446003, 2.11388959, 2.05197851, 2.00981703,
       1.98249565, 1.9364843 , 1.92670078, 1.91136899, 1.89870573,
       1.88872119, 1.84319579, 1.83926558, 1.82746449, 1.82167165])

In [69]:
def snip(vects, dim=None, snip_first_dim=True):
    """Select a range of columns from `vects` and normalize the result.

    Parameters
    ----------
    vects : 2-D array-like of row vectors.
    dim : exclusive upper column bound; ``None`` keeps columns up to the last.
    snip_first_dim : when true, the first column is dropped.

    Returns
    -------
    The selected columns after being passed through ``normalize``.
    """
    first_col = int(snip_first_dim)
    last_col = vects.shape[1] if dim is None else dim
    return normalize(vects[:, first_col:last_col])

In [66]:
# Look up the target-matrix position of each source utterance's previous and
# next utterance; the new columns are 'mtx_idx_prev' and 'mtx_idx_next'.
source_df = (
    source_df
    .join(target_df.mtx_idx, on='prev_id', rsuffix='_prev')
    .join(target_df.mtx_idx, on='next_id', rsuffix='_next')
)

In [67]:
fw_idx_mapping = source_df[['mtx_idx','mtx_idx_next']].values
bk_idx_mapping = source_df[['mtx_idx','mtx_idx_prev']].values

In [70]:
class CrossEmbed:
    """Embed source terms into the space of target embeddings via paired examples.

    Parameters
    ----------
    source_vects : term-by-document matrix for the source side (columns are
        selected by ``idx_mapping[:, 0]``).
    target_embeddings : per-document embeddings for the target side (rows are
        selected by ``idx_mapping[:, 1]``).
    target_s : scaling values that the embeddings are divided by; the call
        sites in this notebook pass the singular values of the target SVD.
    idx_mapping : 2-column integer array pairing a source column index with a
        target row index.
    snip_first_dim : forwarded to ``snip`` when computing distances.

    Attributes set by ``__init__``: ``term_embeddings`` and ``term_ranges``.

    NOTE(review): the ``*`` operators below act as matrix products only if the
    left operand is an ``np.matrix`` (which the tf-idf step in this notebook
    appears to produce); with plain ndarrays they would be elementwise --
    confirm before passing other input types.
    """
    
    def __init__(self, source_vects, target_embeddings, target_s, idx_mapping, snip_first_dim=True):
        
        self.source_vects = source_vects
        self.target_embeddings = target_embeddings
        self.target_s = target_s
        
        # Align the two sides: column i of source_subset is paired with row i
        # of target_subset.
        source_subset = self.source_vects[:, idx_mapping[:,0]]
        target_subset = self.target_embeddings[idx_mapping[:, 1]]
        
        # One embedding row per source term (assuming `*` is a matrix product
        # here -- see the class-level note), scaled by target_s.
        self.term_embeddings = source_subset * target_subset / target_s
        
        # Cosine distance from every term embedding to every paired target
        # embedding, both passed through `snip` first.
        full_dists = cosine_distances(
            snip(self.term_embeddings, snip_first_dim=snip_first_dim),
            snip(target_subset, snip_first_dim=snip_first_dim)
        )
        # Indicator of which pairs each term occurs in, L1-normalized so the
        # weighted sum below is an average over those pairs.
        weights = normalize(np.array(source_subset > 0), norm='l1')
        # Cap distances at 1 before averaging.
        clipped_dists = np.clip(full_dists, None, 1)
        
        # Per-term weighted average of the clipped distances.
        self.term_ranges = (clipped_dists * weights).sum(axis=1)
    
    def embed_docs(self, doc_vect):
        """Combine `doc_vect` (transposed) with the term embeddings, scaled by target_s."""
        # NOTE(review): presumably a matrix product, i.e. doc_vect is an
        # np.matrix of shape (terms, docs) -- confirm.
        return doc_vect.T * self.term_embeddings / self.target_s
    
    def compute_docs_range(self, doc_vect):
        """Return, per document, the L1-weighted average of its terms' ``term_ranges``."""
        return np.dot(normalize(doc_vect.T, norm='l1'), self.term_ranges)

In [71]:
fw_obj = CrossEmbed(source_tfidf_vect, target_svd_obj['V'], target_svd_obj['s'], fw_idx_mapping)

In [72]:
bk_obj = CrossEmbed(source_tfidf_vect, target_svd_obj['V'], target_svd_obj['s'], bk_idx_mapping)

In [73]:
orientation = bk_obj.term_ranges - fw_obj.term_ranges

In [74]:
orientation_df = pd.DataFrame({
    'index': source_tfidf_obj.get_feature_names(),
    'orientation': orientation,
    'fw_range': fw_obj.term_ranges,
    'bk_range': bk_obj.term_ranges,
    'n': frequency
}).set_index('index')

In [75]:
np.sign(orientation_df.orientation).value_counts(normalize=True)

 1.0    0.639
-1.0    0.361
Name: orientation, dtype: float64

In [76]:
orientation_df[orientation_df.n >= 300].sort_values('orientation').head(25)

Unnamed: 0_level_0,orientation,fw_range,bk_range,n
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
available_*,-0.068312,0.875046,0.806734,317
commission_*,-0.067559,0.85292,0.785361,656
specific_*,-0.060206,0.875184,0.814978,342
and>it,-0.059189,0.855133,0.795944,329
employees_*,-0.054333,0.767615,0.713282,318
laughter_*,-0.053807,0.832771,0.778964,683
in_order,-0.053699,0.873568,0.819868,410
understand_that,-0.053405,0.880014,0.826609,392
agency_*,-0.05231,0.798216,0.745906,398
all>*,-0.051825,0.882862,0.831037,697


In [77]:
orientation_df[orientation_df.n >= 300].sort_values('orientation', ascending=False).head(25)

Unnamed: 0_level_0,orientation,fw_range,bk_range,n
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
in_brief,0.145374,0.654769,0.800142,313
brief_your,0.141113,0.676582,0.817695,338
be_what,0.11334,0.723905,0.837245,363
be_you,0.102827,0.749427,0.852254,368
of_appeals,0.097252,0.614031,0.711283,1094
where>*,0.091886,0.725915,0.817801,556
is_where,0.091748,0.728747,0.820495,433
suppose>*,0.090441,0.721628,0.812069,591
brief_*,0.088758,0.70296,0.791717,663
raised_*,0.088452,0.689256,0.777707,372


In [None]:
def get_cross_embed_neighbors(source_term_embeds, target_term_embeds, source_terms, target_terms,
                             snip_first_dim=True):
    """Tabulate cosine distances between source and target term embeddings.

    Both embedding sets are passed through ``snip`` (with `snip_first_dim`)
    before distances are computed.

    Returns
    -------
    pandas.DataFrame indexed by `source_terms` with `target_terms` as columns;
    each cell holds the cosine distance for that pair (smaller means closer).
    """
    snipped_source = snip(source_term_embeds, snip_first_dim=snip_first_dim)
    snipped_target = snip(target_term_embeds, snip_first_dim=snip_first_dim)
    distance_table = cosine_distances(snipped_source, snipped_target)
    return pd.DataFrame(data=distance_table, index=source_terms, columns=target_terms)