-
Notifications
You must be signed in to change notification settings - Fork 14
/
Analyzer.java
380 lines (304 loc) · 16.8 KB
/
Analyzer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
import java.io.*;
import java.util.*;
/**
 * Counts emotion / ngram frequencies over a list of extractions and derives association scores
 * (discounted PMI, chi-square) between emotions and the unigrams / bigrams found in the different
 * parts of an extraction.
 *
 * Typical usage: construct with a stop word file, call {@link #countFrequencies(List)} once, then
 * call {@link #calculatePMI} / {@link #calculateChiSquare} and optionally
 * {@link #calculcateEmotionOverlap} on the resulting score map.
 *
 * Created by sebastian on 20/05/15.
 */
public class Analyzer {

    /**
     * Key: emotion. Value: number of extractions carrying that emotion.
     */
    private final Map<String, Double> emotionFreqs = new HashMap<String, Double>();

    /**
     * Key: ngram source. Value: map with key: unigram seen in that source; value: frequency of appearance.
     */
    private final Map<Enums.NgramSource, Map<String, Double>> ngramTypeUnigramFreqs =
            new EnumMap<Enums.NgramSource, Map<String, Double>>(Enums.NgramSource.class);

    /**
     * Key: ngram source. Value: map with key: emotion tab unigram; value: frequency of appearing together.
     */
    private final Map<Enums.NgramSource, Map<String, Double>> emotionNgramTypeUnigramFreqs =
            new EnumMap<Enums.NgramSource, Map<String, Double>>(Enums.NgramSource.class);

    /**
     * Key: ngram source. Value: map with key: bigram seen in that source; value: frequency of appearance.
     */
    private final Map<Enums.NgramSource, Map<String, Double>> ngramTypeBigramFreqs =
            new EnumMap<Enums.NgramSource, Map<String, Double>>(Enums.NgramSource.class);

    /**
     * Key: ngram source. Value: map with key: emotion tab bigram; value: frequency of appearing together.
     */
    private final Map<Enums.NgramSource, Map<String, Double>> emotionNgramTypeBigramFreqs =
            new EnumMap<Enums.NgramSource, Map<String, Double>>(Enums.NgramSource.class);

    /**
     * Key: ngram source name ({@code NgramSource.toString()}). Value: number of items counted for that
     * source: non-stop-word tokens for np_cause / s_cause / emotion_holder, and recorded bigrams for
     * s_cause_subj_pred / s_cause_pred_dobj.
     */
    private final Map<String, Double> causeUnigramCount = new HashMap<String, Double>();

    /**
     * Number of emotions that appeared in total (one per extraction).
     */
    private double emotionCount = 0;

    /**
     * Stop words, one entry per line of the stop word file. Kept in a set because it is probed for
     * every token of every extraction.
     */
    private final Set<String> stopWords = new HashSet<String>();

    /**
     * Initializes an <code>Analyzer</code>.
     * Reads the stop word file line by line; reading stops at the first empty line or at end of file.
     * @param stopWordsFile the path to the stop words file
     * @throws IOException in case stop word file wasn't found or couldn't be read.
     */
    public Analyzer(String stopWordsFile) throws IOException {
        File file = new File(stopWordsFile);
        if (!file.exists()) {
            // String.format uses %s placeholders; "{0}" is MessageFormat syntax and was never substituted.
            throw new FileNotFoundException(String.format("Couldn't find %s", stopWordsFile));
        }

        BufferedReader reader = new BufferedReader(new FileReader(stopWordsFile));
        try {
            String line = reader.readLine();
            while (line != null && !line.equals("")) {
                this.stopWords.add(line);
                line = reader.readLine();
            }
        } finally {
            // always release the file handle, even if reading fails half-way
            reader.close();
        }

        // initialize one frequency map per ngram source so later lookups never return null
        for (Enums.NgramSource ngramSource : Enums.NgramSource.values()) {
            this.ngramTypeUnigramFreqs.put(ngramSource, new HashMap<String, Double>());
            this.ngramTypeBigramFreqs.put(ngramSource, new HashMap<String, Double>());
            this.emotionNgramTypeUnigramFreqs.put(ngramSource, new HashMap<String, Double>());
            this.emotionNgramTypeBigramFreqs.put(ngramSource, new HashMap<String, Double>());
        }
    }

    /**
     * Counts independent as well as co-occurrence frequencies of emotions, unigrams, and bigrams in cause given a list
     * of <code>Extraction</code> and stores them in fields. Counts accumulate across calls. Prints the
     * per-source totals to standard output when done.
     * @param extractions the list of <code>Extraction</code>
     */
    public void countFrequencies(List<Extraction> extractions) {
        for (Extraction extraction : extractions) {
            // one extraction contains exactly one emotion
            this.emotionCount++;
            String emotion = extraction.getEmotion();
            Extensions.updateMap(this.emotionFreqs, emotion);

            // token sequences of NP cause, full S cause (subj pred dobj), and emotion holder
            String[] npCauseTokens = extraction.getNPCause().split(" ");
            String[] sCauseTokens = String.format("%s %s %s", extraction.getSubjSCause(),
                    extraction.getPredSCause(), extraction.getDobjSCause()).split(" ");
            String[] emotionHolderTokens = extraction.getEmotionHolder().split(" ");

            countTokenNgrams(Enums.NgramSource.np_cause, emotion, npCauseTokens);
            countTokenNgrams(Enums.NgramSource.s_cause, emotion, sCauseTokens);
            countTokenNgrams(Enums.NgramSource.emotion_holder, emotion, emotionHolderTokens);

            // S cause subj + pred
            String[] subjTokens = extraction.getSubjSCause().split(" ");
            String predSCause = ngramToLowerCase(extraction.getPredSCause());
            countSubjPredBigrams(emotion, subjTokens, predSCause);

            // S cause pred + dobj
            String[] dobjTokens = extraction.getDobjSCause().split(" ");
            countPredDobjBigram(emotion, predSCause, dobjTokens);
        }
        printFrequencies();
    }

    /**
     * Counts unigrams and adjacent-token bigrams (plus their co-occurrence with the emotion) of one
     * token sequence for the given ngram source.
     * @param ngramSource the source the tokens were taken from
     * @param emotion the emotion of the current extraction
     * @param tokens the whitespace-split tokens of the source
     */
    private void countTokenNgrams(Enums.NgramSource ngramSource, String emotion, String[] tokens) {
        Map<String, Double> unigramFreqs = this.ngramTypeUnigramFreqs.get(ngramSource);
        Map<String, Double> emotionUnigramFreqs = this.emotionNgramTypeUnigramFreqs.get(ngramSource);
        Map<String, Double> bigramFreqs = this.ngramTypeBigramFreqs.get(ngramSource);
        Map<String, Double> emotionBigramFreqs = this.emotionNgramTypeBigramFreqs.get(ngramSource);

        for (int i = 0; i < tokens.length; i++) {
            String unigram = ngramToLowerCase(tokens[i]);
            if (unigram.equals("") || this.stopWords.contains(unigram)) {
                continue;
            }
            Extensions.updateMap(this.causeUnigramCount, ngramSource.toString());
            Extensions.updateMap(unigramFreqs, unigram);
            Extensions.updateMap(emotionUnigramFreqs, emotion + "\t" + unigram);

            if (i + 1 < tokens.length) {
                // filter the second token on its normalized form, exactly like the first one,
                // so that capitalized stop words are not let through
                String next = ngramToLowerCase(tokens[i + 1]);
                if (next.equals("") || this.stopWords.contains(next)) {
                    continue;
                }
                String bigram = unigram + " " + next;
                Extensions.updateMap(bigramFreqs, bigram);
                Extensions.updateMap(emotionBigramFreqs, emotion + "\t" + bigram);
            }
        }
    }

    /**
     * Records one "subject-token predicate" bigram per non-stop-word subject token of the S cause.
     * @param emotion the emotion of the current extraction
     * @param subjTokens the whitespace-split subject of the S cause
     * @param predSCause the normalized predicate of the S cause
     */
    private void countSubjPredBigrams(String emotion, String[] subjTokens, String predSCause) {
        Enums.NgramSource ngramSource = Enums.NgramSource.s_cause_subj_pred;
        for (String token : subjTokens) {
            String normalized = ngramToLowerCase(token);
            // don't consider empty tokens or stop words
            if (normalized.equals("") || this.stopWords.contains(normalized)) {
                continue;
            }
            String bigram = normalized + " " + predSCause;
            Extensions.updateMap(this.ngramTypeBigramFreqs.get(ngramSource), bigram);
            Extensions.updateMap(this.emotionNgramTypeBigramFreqs.get(ngramSource), emotion + "\t" + bigram);
            Extensions.updateMap(this.causeUnigramCount, ngramSource.toString());
        }
    }

    /**
     * Records a single "predicate object-token" bigram using the last direct-object token that is
     * neither empty, a stop word, nor contains a colon. Records nothing if no such token exists.
     * @param emotion the emotion of the current extraction
     * @param predSCause the normalized predicate of the S cause
     * @param dobjTokens the whitespace-split direct object of the S cause
     */
    private void countPredDobjBigram(String emotion, String predSCause, String[] dobjTokens) {
        Enums.NgramSource ngramSource = Enums.NgramSource.s_cause_pred_dobj;
        // scan right-to-left down to index 0 inclusive, so single-token objects are considered too
        for (int i = dobjTokens.length - 1; i >= 0; i--) {
            String token = dobjTokens[i];
            String normalized = ngramToLowerCase(token);
            if (token.equals("") || this.stopWords.contains(normalized) || token.contains(":")) {
                continue;
            }
            String bigram = predSCause + " " + normalized;
            Extensions.updateMap(this.ngramTypeBigramFreqs.get(ngramSource), bigram);
            Extensions.updateMap(this.emotionNgramTypeBigramFreqs.get(ngramSource), emotion + "\t" + bigram);
            Extensions.updateMap(this.causeUnigramCount, ngramSource.toString());
            return;
        }
    }

    /**
     * Prints out the total unigram and bigram frequencies for all ngram types.
     */
    private void printFrequencies() {
        System.out.println("Ngram type\t# unigrams");
        printTotal(this.ngramTypeUnigramFreqs);
        System.out.println("\nNgram type\t# bigrams");
        printTotal(this.ngramTypeBigramFreqs);
    }

    /**
     * Prints out the total frequency of an ngram type given an ngram type map.
     * @param ngramTypeMap map with key: ngram type; value: map with key: ngram, value: frequency
     */
    private void printTotal(Map<Enums.NgramSource, Map<String, Double>> ngramTypeMap) {
        for (Map.Entry<Enums.NgramSource, Map<String, Double>> sourceEntry : ngramTypeMap.entrySet()) {
            double total = 0;
            for (Double frequency : sourceEntry.getValue().values()) {
                total += frequency;
            }
            System.out.printf("%s\t%f\n", sourceEntry.getKey().toString(), total);
        }
    }

    /**
     * Convert ngram to lower if it isn't a named entity. Shorten named entity tags.
     * Tokens containing a slash keep their case and only get their tag shortened; the literal
     * token NUMBER becomes NUM.
     * @param ngram the ngram to be converted to lower
     * @return the ngram to lower
     */
    private String ngramToLowerCase(String ngram) {
        if (ngram.contains("/")) {
            return ngram.replace("/PERSON", "/PERS").replace("/ORGANIZATION", "/ORG").replace("/LOCATION", "/LOC");
        }
        if (ngram.equals("NUMBER")) {
            return "NUM";
        }
        // Locale.ROOT keeps lower-casing independent of the JVM's default locale
        return ngram.toLowerCase(Locale.ROOT);
    }

    /**
     * Selects the plain ngram frequency map of a source for the requested ngram order.
     * @param ngramSource the ngram source
     * @param ngramEnum unigram or bigram
     * @return the unigram map if <code>ngramEnum</code> is unigram, otherwise the bigram map
     */
    private Map<String, Double> selectNgramFreqs(Enums.NgramSource ngramSource, Enums.Ngram ngramEnum) {
        return ngramEnum.equals(Enums.Ngram.unigram)
                ? this.ngramTypeUnigramFreqs.get(ngramSource)
                : this.ngramTypeBigramFreqs.get(ngramSource);
    }

    /**
     * Selects the emotion-ngram co-occurrence map of a source for the requested ngram order.
     * @param ngramSource the ngram source
     * @param ngramEnum unigram or bigram
     * @return the unigram map if <code>ngramEnum</code> is unigram, otherwise the bigram map
     */
    private Map<String, Double> selectEmotionNgramFreqs(Enums.NgramSource ngramSource, Enums.Ngram ngramEnum) {
        return ngramEnum.equals(Enums.Ngram.unigram)
                ? this.emotionNgramTypeUnigramFreqs.get(ngramSource)
                : this.emotionNgramTypeBigramFreqs.get(ngramSource);
    }

    /**
     * Calculates discounted point-wise mutual information. Discount filters out expressions that occur very rarely.
     * Stores them in a map with the emotion tab ngram as key and the PMI score as value. Only
     * emotion / ngram pairs that were observed together get a score.
     *
     * With c(e) the emotion frequency, c(w) the ngram frequency, c(e,w) their joint frequency and N the
     * number of extractions, the score is
     * <code>log( [c(e,w)/N * c(e,w)/(c(e,w)+1)] / [c(e)*c(w) / (N^2 * c(w)/(c(w)+1))] )</code>.
     *
     * NOTE(review): the independence term normalizes both marginals by N and divides (rather than
     * multiplies) by the ngram discount factor; the formula is kept exactly as it was - confirm it
     * matches the intended definition.
     * @param ngramSource ngram source that the PMI score should be calculated from
     * @param ngramEnum the ngram that should be used for calculation; unigram or bigram
     * @return the map of ngrams and their PMI score
     */
    public Map<String, Double> calculatePMI(Enums.NgramSource ngramSource, Enums.Ngram ngramEnum) {
        Map<String, Double> ngramFreqs = selectNgramFreqs(ngramSource, ngramEnum);
        Map<String, Double> emotionNgramFreqs = selectEmotionNgramFreqs(ngramSource, ngramEnum);
        Map<String, Double> pmiMap = new HashMap<String, Double>();

        // iterate over all emotions and all tokens (i.e. unigrams or bigrams)
        for (Map.Entry<String, Double> emotionEntry : this.emotionFreqs.entrySet()) {
            for (Map.Entry<String, Double> ngramEntry : ngramFreqs.entrySet()) {
                String emotionNgram = emotionEntry.getKey() + "\t" + ngramEntry.getKey();
                Double jointFreq = emotionNgramFreqs.get(emotionNgram);
                if (jointFreq == null) {
                    // pair never observed together
                    continue;
                }
                double emotionFreq = emotionEntry.getValue();
                double ngramFreq = ngramEntry.getValue();
                double emotionNgramFreq = jointFreq;
                // continue if ngram never appeared; shouldn't happen but just to be safe
                if (ngramFreq == 0) {
                    continue;
                }
                // discounted probabilities (discount factor is freq / (freq + 1))
                double pEmotionNgramDiscount = (emotionFreq * ngramFreq)
                        / (Math.pow(this.emotionCount, 2) * (ngramFreq / (ngramFreq + 1)));
                double pJointDiscount = (emotionNgramFreq / this.emotionCount)
                        * (emotionNgramFreq / (emotionNgramFreq + 1));
                pmiMap.put(emotionNgram, Math.log(pJointDiscount / pEmotionNgramDiscount));
            }
        }
        return pmiMap;
    }

    /**
     * Calculates chi-square. Stores the scores in a map with the emotion tab ngram as key and the chi-square score as value.
     * Only emotion / ngram pairs that were observed together get a score. Returns an empty map if
     * nothing has been counted for the given source yet.
     *
     * NOTE(review): when the denominator <code>Fw * (1 - Fw) * P_i * (1 - P_i)</code> is zero (e.g. only
     * one emotion was seen) the floating-point division yields NaN or Infinity instead of throwing;
     * callers may want to filter such scores.
     * @param ngramSource ngram source that chi-square should be calculated from
     * @param ngramEnum the ngram used for calculation
     * @return the map of ngrams and their chi-square score
     */
    public Map<String, Double> calculateChiSquare(Enums.NgramSource ngramSource, Enums.Ngram ngramEnum) {
        Map<String, Double> ngramFreqs = selectNgramFreqs(ngramSource, ngramEnum);
        Map<String, Double> emotionNgramFreqs = selectEmotionNgramFreqs(ngramSource, ngramEnum);
        Map<String, Double> chiSquareMap = new HashMap<String, Double>();

        // no count recorded for this source: avoid unboxing null, there is nothing to score
        Double countedForSource = this.causeUnigramCount.get(ngramSource.toString());
        if (countedForSource == null) {
            return chiSquareMap;
        }
        double ngramCount = countedForSource;

        // iterate over all emotions and ngrams
        for (Map.Entry<String, Double> emotionEntry : this.emotionFreqs.entrySet()) {
            for (Map.Entry<String, Double> ngramEntry : ngramFreqs.entrySet()) {
                String emotionNgram = emotionEntry.getKey() + "\t" + ngramEntry.getKey();
                Double jointFreq = emotionNgramFreqs.get(emotionNgram);
                if (jointFreq == null) {
                    continue;
                }
                double emotionFreq = emotionEntry.getValue();
                double ngramFreq = ngramEntry.getValue();
                double emotionNgramFreq = jointFreq;
                // continue if token never appeared; shouldn't happen but just to be safe
                if (ngramFreq == 0) {
                    continue;
                }
                // joint probability of emotion and ngram: P(i, w)
                double pJoint = emotionNgramFreq / this.emotionCount;
                // probability of the ngram among everything counted for this source: P(w)
                double pNgram = ngramFreq / ngramCount;
                // ngram frequency relative to the number of extractions: F(w)
                double fractionWithNgram = ngramFreq / this.emotionCount;
                // fraction of extractions which carry the emotion: P_i
                double fractionWithEmotion = emotionFreq / this.emotionCount;
                // conditional prob of class i for extractions which contain w: p(w, i) / p(w)
                double pEmotionGivenNgram = pJoint / pNgram;

                double chiSquare = (this.emotionCount * Math.pow(fractionWithNgram, 2)
                        * Math.pow((pEmotionGivenNgram - fractionWithEmotion), 2))
                        / (fractionWithNgram * (1 - fractionWithNgram)
                        * fractionWithEmotion * (1 - fractionWithEmotion));
                chiSquareMap.put(emotionNgram, chiSquare);
            }
        }
        return chiSquareMap;
    }

    /**
     * Puts the ngrams in a map that lists them along with their non-negative scores for each emotion,
     * i.e. shows which unigrams/bigrams are shared between emotions. Ngrams without any non-negative
     * score are left out.
     * @param ngramSource the source where the ngrams should be taken from
     * @param ngramEnum the ngrams that should be used
     * @param metricMap a map with key: emotion tab ngram; value: their association score
     * @return a map with key: ngram; value: a map with key: emotion, value: the association score for that emotion
     */
    public Map<String, Map<String, Double>> calculcateEmotionOverlap(Enums.NgramSource ngramSource, Enums.Ngram ngramEnum, Map<String, Double> metricMap) {
        Map<String, Double> ngramFreqs = selectNgramFreqs(ngramSource, ngramEnum);
        Map<String, Map<String, Double>> overlapMap = new HashMap<String, Map<String, Double>>();

        // iterate through all ngrams and all emotions
        for (String ngram : ngramFreqs.keySet()) {
            for (String emotion : this.emotionFreqs.keySet()) {
                Double score = metricMap.get(emotion + "\t" + ngram);
                // skip emotion if metric map doesn't contain a score for that emotion or score is smaller than 0
                if (score == null || score < 0) {
                    continue;
                }
                Map<String, Double> scoresByEmotion = overlapMap.get(ngram);
                if (scoresByEmotion == null) {
                    // first qualifying emotion for this ngram
                    scoresByEmotion = new HashMap<String, Double>();
                    overlapMap.put(ngram, scoresByEmotion);
                }
                scoresByEmotion.put(emotion, score);
            }
        }
        return overlapMap;
    }
}