In [2]:
import pandas as pd
import nltk
import time
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [53]:
df = pd.read_csv('./parsed-data/dialog.csv')

In [54]:
df[:10]

Unnamed: 0,role,name,text,was interrupted,turn,sentiment,idx,docket
0,justice,ROBERTS,We'll hear argument first this morning in Case...,0.0,0.0,0.0,0.0,11-681
1,other,MESSENGER,"Mr Chief Justice, and may it please the Court:...",1.0,1.0,0.0,1.0,11-681
2,justice,GINSBURG,I thought it was to negotiate what's typically...,0.0,2.0,0.0,2.0,11-681
3,other,MESSENGER,The subjects of bargaining here are the reimbu...,0.0,3.0,1.0,3.0,11-681
4,justice,GINSBURG,But how does it differ from the typical bargai...,0.0,4.0,0.0,4.0,11-681
5,other,MESSENGER,"Yes. When -- in the public sector, when a grou...",0.0,5.0,0.0,5.0,11-681
6,justice,SOTOMAYOR,Is your argument dependent on this being sort ...,0.0,6.0,-1.0,6.0,11-681
7,other,MESSENGER,That is our position for why Abood is distingu...,0.0,7.0,0.0,7.0,11-681
8,justice,KAGAN,"But your argument, of course, isn't limited to...",0.0,8.0,0.0,8.0,11-681
9,other,MESSENGER,"Yes. And that the -- the actual bargaining, ev...",0.0,9.0,0.0,9.0,11-681


In [55]:
df['text'][0]

"We'll hear argument first this morning in Case 11-681, Harris v. Quinn. Mr Messenger. ORAL ARGUMENT OF WILLIAM L. MESSENGER ON BEHALF OF THE PETITIONERS"

In [56]:
def strip_interpunction(word):
    if len(word) == 0:
        return word
    start = 0
    end = len(word)
    if word[0] in '.,/;\'"!?-':
        start = 1
    if word[end-1] in '.,/;\'"!?-':
        end -= 1
        
    return word[start:end].lower()


def extract_words(row):
    row['text'] = str(row['text']) if str(row['text']) != 'nan' else ''
    words = row['text'].split(' ')
    final_words = [strip_interpunction(word.strip()) for word in words]
    row['words'] = final_words
    return row

In [58]:
df = df.apply(lambda row: extract_words(row), axis=1)

In [59]:
all_words = []
df.apply(lambda row: all_words.extend(row['words']), axis=1)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
42863    None
42864    None
42865    None
42866    None
42867    None
42868    None
42869    None
42870    None
42871    None
42872    None
42873    None
42874    None
42875    None
42876    None
42877    None
42878    None
42879    None
42880    None
42881    None
42882    None
42883    None
42884    None
42885    None
42886    None
42887    None
42888    None
42889    None
42890    None
42891    None
42892    None
dtype: object

In [62]:
len(all_words)

2114109

In [63]:
unique_words = set(all_words)
len(unique_words)

27836

In [64]:
freq = nltk.FreqDist(all_words)

In [84]:
idx = 0
freq_dict_tmp = dict(freq)
freq_dict = [(key, freq_dict_tmp[key]) for key in freq_dict_tmp.keys() if len(key) > 2]
freq_dict.sort(key= lambda rec: rec[1], reverse=True)

In [90]:
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

pos_tagger = PerceptronTagger()
freq_dict_clean = []
for key in range(len(freq_dict)):
    tmp = TextBlob(freq_dict[key][0], pos_tagger= pos_tagger)
    if tmp.tags[0][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD', 'PP', 'PP', 'NN',
                          'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS']:
        freq_dict_clean.append(freq_dict[key])

In [110]:
all_words_relevant = [w for w in all_words if len(w) > 2]

In [112]:
all_words_clean = []
for w in all_words_relevant:
    tmp = TextBlob(w, pos_tagger= pos_tagger)
    if tmp.tags[0][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD', 'PP', 'PP', 'NN',
                          'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS']:
        all_words_clean.append(w)

In [113]:
len(all_words_clean)

967870

In [86]:
len(freq_dict)

27461

In [96]:
len(freq_dict_clean)

23230

In [97]:
words_clean = [w[0] for w in freq_dict_clean]

In [92]:
from tools.synonyms.lin import Lin

In [93]:
syns_reader = Lin()

In [132]:
syns_reader.SetSimilarity(0.03)
syns_reader.SetUseLemma(True)

In [133]:
all_words_syns = []
tStart = time.time()
for w in all_words_clean:
    current_words = syns_reader.GetSynonyms(w)
    if not any([w in all_words_syns for w in current_words]):
        all_words_syns.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_set = set(all_words_syns)
len(all_words_syns_set)

KeyboardInterrupt: 

In [120]:
all_words_syns.sort()
all_words_syns[:10]

['$loo',
 "'inaccurate",
 "'wages",
 '(a)',
 '(a)(2)',
 '(a)(2)(a)',
 '(a)(2)(b)',
 '(a)(2)(d)',
 '(a)(3)',
 '(a)(3)(c)']

In [127]:
count_groupped = 0
for w in words_clean:
    found = False
    start = 0
    end = len(all_words_syns) - 1
    while start <= end:
        middle = (end+start) / 2
        if all_words_syns[middle] == w:
            found = True
            break
        else:
            if all_words_syns[middle] > w:
                end = middle - 1
            else:
                start = middle + 1
    if not found:
        count_groupped += freq[w]

In [128]:
count_groupped

219107

In [94]:
syns_reader_all = Lin()
syns_reader_all.SetSimilarity(0.03)
syns_reader_all.SetMaxWords(0)


In [95]:
all_words_syns_all = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader_all.GetSynonyms(w)
    all_words_syns_all.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_all_set = set(all_words_syns_all)
len(all_words_syns_all)

Timed: 14.8919999599


9271525

In [23]:
syns_reader_lemma = Lin()
syns_reader_lemma.SetSimilarity(0.03)
syns_reader_lemma.SetUseLemma(False)

In [24]:
all_words_syns_not_lemma = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader_all.GetSynonyms(w)
    all_words_syns_not_lemma.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_not_lemma_set = set(all_words_syns_not_lemma)
len(all_words_syns_not_lemma_set)

Timed: 124.208999872


34824

In [25]:
syns_reader_lemma.SetMaxWords(0)
all_words_syns_not_lemma_all_words = []
tStart = time.time()
for w in all_words:
    current_words = syns_reader_all.GetSynonyms(w)
    all_words_syns_not_lemma_all_words.extend(current_words)

tEnd = time.time()
print("Timed: {}".format(tEnd-tStart))
all_words_syns_not_lemma_all_words_set = set(all_words_syns_not_lemma_all_words)
len(all_words_syns_not_lemma_all_words_set)

Timed: 126.514000177


34824

In [129]:
testing = [[0.03, 0, True], [0.03, 0, False],
          [0.04, 0, True], [0.04, 0, False],
          [0.05, 0, True], [0.05, 0, False],
          [0.06, 0, True], [0.06, 0, False],
          [0.07, 0, True], [0.07, 0, False],
          [0.08, 0, True], [0.08, 0, False],
          [0.09, 0, True], [0.09, 0, False],
          [0.10, 0, True], [0.10, 0, False]]
syns_reader_test = Lin()

In [134]:
def findInArray(needle, haystack):
    found = False
    start = 0
    end = len(haystack) - 1
    while start <= end:
        middle = (end+start) / 2
        if haystack[middle] == needle:
            found = True
            break
        else:
            if haystack[middle] > needle:
                end = middle - 1
            else:
                start = middle + 1
    return found

dtData = []
for test in testing:
    found_syns_words = []
    syns_reader_test.SetSimilarity(test[0])
    syns_reader_test.SetMaxWords(test[1])
    syns_reader_test.SetUseLemma(test[2])
    count_groupped = 0
    tStart = time.time()
    for w in all_words_clean:
        current_words = syns_reader_test.GetSynonyms(w)
        if not any([findInArray(word, found_syns_words) for word in current_words]):
            found_syns_words.extend(current_words)
            found_syns_words.sort()
    tEnd = time.time()

    for w in words_clean:
        if not findInArray(w, found_syns_words):
            count_groupped += freq[w]

    dtData.append([test[0], test[1], test[2], len(found_syns_words), count_groupped])
    print("Test sim = {}; words = {}; Lemma = {} ===> {}({})    {}".format(test[0], test[1], test[2], count_groupped, len(found_syns_words), tEnd-tStart))


Test sim = 0.03; words = 0; Lemma = True ===> 215483(21948)    43.4600000381
Test sim = 0.03; words = 0; Lemma = False ===> 145915(23896)    42.1890001297
Test sim = 0.04; words = 0; Lemma = True ===> 241581(21414)    57.2880001068
Test sim = 0.04; words = 0; Lemma = False ===> 171630(23554)    56.6160001755
Test sim = 0.05; words = 0; Lemma = True ===> 305742(20944)    79.6809999943
Test sim = 0.05; words = 0; Lemma = False ===> 231934(23154)    76.1429998875
Test sim = 0.06; words = 0; Lemma = True ===> 324353(20440)    119.142999887
Test sim = 0.06; words = 0; Lemma = False ===> 235378(22851)    102.949999809
Test sim = 0.07; words = 0; Lemma = True ===> 326920(20380)    144.30099988
Test sim = 0.07; words = 0; Lemma = False ===> 244702(22697)    121.103999853
Test sim = 0.08; words = 0; Lemma = True ===> 339198(20180)    160.532999992
Test sim = 0.08; words = 0; Lemma = False ===> 245589(22608)    154.386999846
Test sim = 0.09; words = 0; Lemma = True ===> 344435(20120)    214.9779

In [135]:
dtData

[[0.03, 0, True, 21948, 215483],
 [0.03, 0, False, 23896, 145915],
 [0.04, 0, True, 21414, 241581],
 [0.04, 0, False, 23554, 171630],
 [0.05, 0, True, 20944, 305742],
 [0.05, 0, False, 23154, 231934],
 [0.06, 0, True, 20440, 324353],
 [0.06, 0, False, 22851, 235378],
 [0.07, 0, True, 20380, 326920],
 [0.07, 0, False, 22697, 244702],
 [0.08, 0, True, 20180, 339198],
 [0.08, 0, False, 22608, 245589],
 [0.09, 0, True, 20120, 344435],
 [0.09, 0, False, 22606, 259674],
 [0.1, 0, True, 20136, 359866],
 [0.1, 0, False, 22525, 276031]]

In [136]:
dfSim = pd.DataFrame(dtData, columns=['Sim', 'MaxWords', 'Lemma', 'UniqueWords', 'CondensatedWords'])

In [159]:
def calcCondensate(row):
    row['Condensated'] = (row['CondensatedWords'] / float(len(all_words_clean))) * 100
    return row

dfSim = dfSim.apply(lambda row: calcCondensate(row), axis=1)

In [160]:
dfSim

Unnamed: 0,Sim,MaxWords,Lemma,UniqueWords,CondensatedWords,Condensated
0,0.03,0,True,21948,215483,22.26363
1,0.03,0,False,23896,145915,15.075888
2,0.04,0,True,21414,241581,24.960067
3,0.04,0,False,23554,171630,17.732753
4,0.05,0,True,20944,305742,31.58916
5,0.05,0,False,23154,231934,23.963342
6,0.06,0,True,20440,324353,33.512042
7,0.06,0,False,22851,235378,24.319175
8,0.07,0,True,20380,326920,33.777263
9,0.07,0,False,22697,244702,25.282528


In [166]:
fig = plt.figure()
ax = fig.add_subplot(111)
dataVariants = [[0, True, 'orange'], [0, False, '#25e6e9']]
plt.xlabel('Similarity Rate', fontsize=16)

barWidth = 0.6 / len(dataVariants)
idx = 0
for variant in dataVariants:
    dtTemp = dfSim[dfSim['Lemma'] == variant[1]].reset_index(drop=True)
    dtTemp.index = dtTemp['Sim'].values
    dtTemp['UniqueWords'].plot.bar(rot=45, ax=ax, position=idx,
                                   color=variant[2], width=barWidth,
                                   fontsize=14,
                             label='Lemma={}'.format(variant[1]))
    idx += 1
    
plt.ylabel('Condensation Rate', fontsize=18)
plt.legend(loc='upper left', fontsize=16)
#ax2.set_ylabel('Turns per Docket', fontsize=18)
plt.show()