-
Notifications
You must be signed in to change notification settings - Fork 10
/
sample_lexical_diversity.py
100 lines (80 loc) · 3.69 KB
/
sample_lexical_diversity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import re
import json
import string
import math
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from collections import OrderedDict
from collections import defaultdict
from random import choice
from typing import List, Set
from copy import deepcopy
# English stopword set; get_word_set_of_sample uses it to drop stopwords before stemming
stops = set(stopwords.words("english"))
# shared Porter stemmer instance; get_word_set_of_sample uses it to reduce tokens to stems
ps = PorterStemmer()
def re_order(sorted_l: List[str], top_p: float) -> List[str]:
    """Shuffle the head and the tail of a ranked list independently.

    The head is the first ``int(len(sorted_l) * top_p) + 1`` items and the
    tail is everything after it. Each part is shuffled on its own with
    ``np.random.shuffle`` and the result is the shuffled head followed by the
    shuffled tail, so head items always stay ahead of tail items.

    Args:
        sorted_l: samples in ranked order.
        top_p: fraction of the list that forms the head.

    Returns:
        A new list; for a ``list`` input the slices are copies, so the
        argument itself is not reordered.
    """
    split_at = int(len(sorted_l) * top_p) + 1
    head, tail = sorted_l[:split_at], sorted_l[split_at:]
    # shuffle the head first, then the tail, each in place on its own copy
    for segment in (head, tail):
        np.random.shuffle(segment)
    return head + tail
def get_word_set_of_sample(sample: str) -> Set[str]:
    """Return the set of stemmed, non-stopword tokens found in ``sample``.

    ASCII punctuation (``string.punctuation``) is removed first, the remaining
    text is tokenized with ``word_tokenize``, stopwords are dropped and every
    surviving token is stemmed with the module-level Porter stemmer.

    The stopword test is case-insensitive. The NLTK English stopword list is
    lowercase, so comparing the raw token would let capitalised stopwords
    such as "The" or "I" through.

    Args:
        sample: raw text of one sample.

    Returns:
        The distinct stems of the sample's non-stopword tokens.
    """
    sample_no_punc = sample.translate(str.maketrans('', '', string.punctuation))
    return {
        ps.stem(word)
        for word in word_tokenize(sample_no_punc)
        # lowercase only for the membership test; the stemmer gets the original token
        if word.lower() not in stops
    }
def _take_next_sample(pool: List[str], start: int, own_counts, other_counts, max_gap: int):
    """Scan ``pool`` from index ``start`` for the next lexically balanced sample.

    A sample is rejected when any of its words has already been used by this
    group at least ``max_gap`` more times than by the other group. Rejected
    samples are skipped for good (the pointer moves past them). When a sample
    is accepted, ``own_counts`` is incremented once per distinct word in it.

    Returns:
        ``(sample, next_start)``. ``sample`` is ``None`` when the pool ran out
        before an acceptable sample was found.
    """
    pointer = start
    while pointer < len(pool):
        sample = pool[pointer]
        pointer += 1
        word_set = get_word_set_of_sample(sample)
        # one over-represented word is enough to reject the whole sample
        if any(own_counts[word] - other_counts[word] >= max_gap for word in word_set):
            continue
        for word in word_set:
            own_counts[word] += 1
        return sample, pointer
    return None, pointer


def lexical_diversity(sorted_A: List[str], sorted_B: List[str], top_p: float = 0.2, num_samples: int = 4, max_gap=None):
    """Pick samples from two corpora while keeping their vocabulary balanced.

    Samples are taken alternately, one for group A and then one for group B,
    for ``num_samples`` rounds. A candidate is skipped if any of its words
    (as returned by ``get_word_set_of_sample``) is already used by its own
    group's picks at least ``max_gap`` more times than by the other group's.

    Args:
        sorted_A: samples of Corpus A, more representative samples first.
        sorted_B: samples of Corpus B, more representative samples first.
        top_p: fraction of each list to focus on; that head is shuffled and
            tried first, then the shuffled remainder is used if needed.
        num_samples: number of samples wanted for each group.
        max_gap: largest allowed per-word count lead of one group over the
            other before a sample is rejected; defaults to
            ``num_samples // 4 + 1``.

    Returns:
        ``(a_candidates, b_candidates)``. Either list may hold fewer than
        ``num_samples`` items when its corpus runs out of acceptable samples.
        The input lists are not modified.
    """
    if max_gap is None:
        max_gap = num_samples // 4 + 1

    # a shallow copy is enough to protect the caller's lists: the items are immutable strings
    reordered_A = re_order(list(sorted_A), top_p)
    reordered_B = re_order(list(sorted_B), top_p)

    a_candidates, b_candidates = [], []
    a_words_count, b_words_count = defaultdict(int), defaultdict(int)
    # how many samples of each reordered list have been examined so far
    cur_A_pointer, cur_B_pointer = 0, 0

    for _ in range(num_samples):
        # group A goes first, so B's check in the same round already sees A's new words
        sample_A, cur_A_pointer = _take_next_sample(
            reordered_A, cur_A_pointer, a_words_count, b_words_count, max_gap
        )
        if sample_A is not None:
            a_candidates.append(sample_A)

        sample_B, cur_B_pointer = _take_next_sample(
            reordered_B, cur_B_pointer, b_words_count, a_words_count, max_gap
        )
        if sample_B is not None:
            b_candidates.append(sample_B)

    return a_candidates, b_candidates