In [1]:
import pandas as pd
import nltk
import time

In [2]:
df = pd.read_csv('./report-data/dialog.csv')

In [3]:
df[:10]

Unnamed: 0,role,name,text,was interrupted,turn,sentiment,idx,docket
0,justice,ROBERTS,We will hear argument first this morning in Ca...,0,0,0,0,12-1226_4d46
1,other,BAGENSTOS,"Thank you, Mr. Chief Justice, and may it pleas...",0,1,0,1,12-1226_4d46
2,justice,KENNEDY,"Well, what you make it sound as if the only co...",0,2,-1,2,12-1226_4d46
3,other,BAGENSTOS,"Well, I -- so I think on the summary judgment ...",1,3,0,3,12-1226_4d46
4,justice,SOTOMAYOR,I'm sorry. I'm confused.,0,4,0,4,12-1226_4d46
5,justice,KENNEDY,"Well, I mean, I think that's a necessary start...",0,5,0,5,12-1226_4d46
6,other,BAGENSTOS,"Well, I -- Your Honor, I would submit that tha...",1,6,0,6,12-1226_4d46
7,justice,GINSBURG,"Mr. Bagenstos, what would your case be if -- l...",0,7,0,7,12-1226_4d46
8,other,BAGENSTOS,"Yes. So in that case, our position would be, a...",1,8,0,8,12-1226_4d46
9,justice,SCALIA,Most favored nations treatment.,0,9,0,9,12-1226_4d46


In [4]:
df['text'][0]

'We will hear argument first this morning in Case 12-1226, Young V. United Parcel Service. Mr. Bagenstos. ORAL ARGUMENT OF SAMUEL BAGENSTOS ON BEHALF OF THE PETITIONER'

In [5]:
def strip_interpunction(word):
    if len(word) == 0:
        return word
    start = 0
    end = len(word)
    if word[0] in '.,/;\'"!?-':
        start = 1
    if word[end-1] in '.,/;\'"!?-':
        end -= 1
        
    return word[start:end].lower()


def extract_words(row):
    row['text'] = str(row['text']) if str(row['text']) != 'nan' else ''
    words = row['text'].split(' ')
    final_words = [strip_interpunction(word.strip()) for word in words]
    row['words'] = final_words
    return row

In [6]:
df_test = df[:10].apply(lambda row: extract_words(row), axis=1)
print(df_test['words'].apply(lambda cell: len(cell)))

0     27
1    119
2     58
3     71
4      4
5     26
6     99
7     48
8     38
9      4
Name: words, dtype: int64


In [7]:
df = df.apply(lambda row: extract_words(row), axis=1)

In [8]:
all_words = []
df.apply(lambda row: all_words.extend(row['words']), axis=1)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
29293    None
29294    None
29295    None
29296    None
29297    None
29298    None
29299    None
29300    None
29301    None
29302    None
29303    None
29304    None
29305    None
29306    None
29307    None
29308    None
29309    None
29310    None
29311    None
29312    None
29313    None
29314    None
29315    None
29316    None
29317    None
29318    None
29319    None
29320    None
29321    None
29322    None
dtype: object

In [9]:
len(all_words)

1414257

In [10]:
freq = nltk.FreqDist(all_words)

In [11]:
idx = 0
freq_dict = dict(freq)

In [12]:
from tools.synonyms.lin import Lin

In [13]:
syns_reader = Lin()

In [14]:
syns_reader.SetSimilarity(0.03)
test = syns_reader.GetSynonyms('argument')

In [15]:
sorted(test, key= lambda cell: cell[1], reverse=True)

['assertion', 'argument', 'appeal', 'claim', 'debate', 'testimony']

In [16]:
all_words_syns = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader.GetSynonyms(w)
    all_words_syns.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_set = set(all_words_syns)
len(all_words_syns_set)

Timed: 124.229000092


28229

In [17]:
syns_reader_all = Lin()
syns_reader_all.SetSimilarity(0.03)
syns_reader_all.SetMaxWords(0)


In [22]:
all_words_syns_all = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader_all.GetSynonyms(w)
    all_words_syns_all.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_all_set = set(all_words_syns_all)
len(all_words_syns_all)

Timed: 124.463000059


6214861

In [23]:
syns_reader_lemma = Lin()
syns_reader_lemma.SetSimilarity(0.03)
syns_reader_lemma.SetUseLemma(False)

In [24]:
all_words_syns_not_lemma = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader_all.GetSynonyms(w)
    all_words_syns_not_lemma.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_not_lemma_set = set(all_words_syns_not_lemma)
len(all_words_syns_not_lemma_set)

Timed: 124.208999872


34824

In [25]:
syns_reader_lemma.SetMaxWords(0)
all_words_syns_not_lemma_all_words = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader_all.GetSynonyms(w)
    all_words_syns_not_lemma_all_words.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_not_lemma_all_words_set = set(all_words_syns_not_lemma_all_words)
len(all_words_syns_not_lemma_all_words_set)

Timed: 126.514000177


34824

In [27]:
testing = [[0.03, 0, True], [0.03, 0, False], [0.03, 5, True], [0.03, 5, False],
            [0.02, 0, True], [0.02, 0, False], [0.02, 5, True], [0.02, 5, False],
          [0.04, 0, True], [0.04, 0, False], [0.04, 5, True], [0.04, 5, False],
          [0.03, 0, True], [0.03, 0, False], [0.03, 6, True], [0.03, 6, False],
          [0.04, 0, True], [0.04, 0, False], [0.04, 6, True], [0.04, 6, False]]
syns_reader_test = Lin()

In [30]:
words_list = {}
words_list_set = {}
for test in testing:
    test_key = '{}-{}-{}'.format(test[0], test[1], test[2])
    words_list[test_key] = []
    syns_reader_test.SetSimilarity(test[0])
    syns_reader_test.SetMaxWords(test[1])
    syns_reader_test.SetUseLemma(test[2])
    tStart = time.time()
    for w in all_words:
        current_words = syns_reader_test.GetSynonyms(w)
        words_list[test_key].extend(current_words)

    tEnd = time.time()
    words_list_set[test_key] = set(words_list[test_key])
    print("Test sim = {}; words = {}; Lemma = {} ===> {}    {}".format(test[0], test[1], test[2], len(words_list_set[test_key]), tEnd-tStart))


Test sim = 0.03; words = 0; Lemma = True ===> 34824    ()
Test sim = 0.03; words = 0; Lemma = False ===> 34742    ()
Test sim = 0.03; words = 5; Lemma = True ===> 28229    ()
Test sim = 0.03; words = 5; Lemma = False ===> 28177    ()
Test sim = 0.02; words = 0; Lemma = True ===> 29925    ()
Test sim = 0.02; words = 0; Lemma = False ===> 29891    ()
Test sim = 0.02; words = 5; Lemma = True ===> 27354    ()
Test sim = 0.02; words = 5; Lemma = False ===> 27297    ()
Test sim = 0.04; words = 0; Lemma = True ===> 39664    ()
Test sim = 0.04; words = 0; Lemma = False ===> 39376    ()
Test sim = 0.04; words = 5; Lemma = True ===> 28786    ()
Test sim = 0.04; words = 5; Lemma = False ===> 28732    ()
Test sim = 0.03; words = 0; Lemma = True ===> 34824    ()
Test sim = 0.03; words = 0; Lemma = False ===> 34742    ()
Test sim = 0.03; words = 6; Lemma = True ===> 28755    ()
Test sim = 0.03; words = 6; Lemma = False ===> 28704    ()
Test sim = 0.04; words = 0; Lemma = True ===> 39664    ()
Test s