In [85]:
"""
Program to find the oxymoronic pairs from the adverbly adjectives data which
contains adverb-adjective pairs from the COCA, CORE and TIME corpora.

The NRC VAD dictionary or SentiWordNet 3.0 can be used to generate the polarity
differences between the adverbs and the adjectives. Functions exist to do this
which take a pandas DataFrame (along with the name of the adverb and adjective
columns) as input.

@author Vasundhara Gautam
"""

# Imports and pre-processing


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from difflib import get_close_matches as gcm
from itertools import chain
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError

# You will likely need to replace the file paths for COCA, CORE and TIME data, as well as the VAD dict

coca_file = '~/sfuvault/Discourse-Lab/Data/Adverbly_adjectives/COCA/all_years/all_years.csv'
core_file = '~/sfuvault/Discourse-Lab/Data/Adverbly_adjectives/CORE/data/CORE_allgenres.xlsx'
time_file = '~/sfuvault/Discourse-Lab/Data/Adverbly_adjectives/TIME/time_all.csv'
socc_file = '~/sfuvault/Discourse-Lab/Data/Adverbly_adjectives/SOCC/socc_all.csv'

VAD_dict_file = '~/Documents/University/Semester6/USRA/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/v-scores.txt'
VAD_dict = pd.read_table(VAD_dict_file, header=None, names=['VAD_word', 'VAD_valence'])


# Function definitions


"""
Function to use WordNet to find adjectives from input adverb
E.g., if 'terribly' is not found in the VAD dictionary, look for the valence of 'terrible'
"""
def advToAdj(adv):
    try:
        possible_adjectives = [k.name() for k in chain(*[j.pertainyms() for j in chain(*[i.lemmas() for i in wn.synsets(adv)])])]
    except WordNetError:
        return None
    if len(possible_adjectives) == 0:
        return None
    closest_matches = gcm(adv,possible_adjectives)
    if len(closest_matches) == 0:
        return None
    return closest_matches[0]

"""
SWN polarity of an input term with a certain part of speech is calculated by the
best performing method outlined in Guerini et al for regression using SWN 3.0 -
harmonic weighted sums of all the subjective senses of a word.
"""
def calculateSWNPolarity(term, pos, how):
    # Get synsets for all the senses given a specified part of speech
    try:
        synsets = list(swn.senti_synsets(term, pos=pos))
    except WordNetError:
        return None
    # Throw out entirely objective synsets
    for i in synsets:
        if i.obj_score() == 1:
            synsets.remove(i)
    if len(synsets) == 0:
        return None
    # Sort positive and negative scores in descending order
    #   "word’s prior polarity might be more related to its posterior polarities score,
    #   rather than to sense frequencies"
    # So giving more relevance to more "valenced" senses
    pos = sorted([i.pos_score() for i in synsets], reverse=True)
    neg = sorted([i.neg_score() for i in synsets], reverse=True)
    obj = sorted([i.obj_score() for i in synsets], reverse=True)
    
    pos_harmonic = 0
    neg_harmonic = 0
    obj_harmonic = 0

    if how == 'harmonic':
        # Weighting them with a harmonic series
        for i in range(len(pos)):
            pos_harmonic += pos[i]/(i+1)
            neg_harmonic += neg[i]/(i+1)
            obj_harmonic += obj[i]/(i+1)
    else:
        # Weighting them with a geometric series
        for i in range(len(pos)):
            pos_harmonic += pos[i]*(0.5**i)
            neg_harmonic += neg[i]*(0.5**i)
            obj_harmonic += obj[i]*(0.5**i)
            
    pos_harmonic /= len(pos)
    neg_harmonic /= len(neg)
    obj_harmonic /= len(obj)
    
#     # Absolute maximum of the scores, assigning a negative to indicate negativity
#     return pos_harmonic if pos_harmonic >= neg_harmonic else -neg_harmonic

    return ','.join([str(pos_harmonic), str(neg_harmonic), str(obj_harmonic)])

"""
Merging on adverbs and then adjectives in the pairs.
The polarity of a word is calculated using the calculateSWNPolarity function.
"""
def oxymoronsBySWN(df, adv_col, adj_col, weighting_type):
    df = df.dropna(axis=1)
    adv = pd.DataFrame(df[adv_col].drop_duplicates())
    adj = pd.DataFrame(df[adj_col].drop_duplicates())

    adv['SWN_polarity'] = adv[adv_col].apply(calculateSWNPolarity, pos='r', how=weighting_type)
    adj['SWN_polarity'] = adj[adj_col].apply(calculateSWNPolarity, pos='a', how=weighting_type)

    df_adv = df.merge(adv, how='left', on=adv_col)
    df_adj = df_adv.merge(adj, how='left', on=adj_col)
    df_SWN = pd.DataFrame(df_adj.dropna())
    df_SWN['diff'] = (df_SWN['SWN_polarity_x'].sub(df_SWN['SWN_polarity_y'])).abs()
    df_SWN.sort_values('diff', ascending=False, inplace=True)
    return df_SWN



In [7]:
coca_raw = pd.read_csv(coca_file)
split = coca_raw['pair'].str.split(n=1, expand=True)
coca_raw['Adv'] = split[0].str.lower()
coca_raw['Adj'] = split[1].str.lower()

In [93]:
coca_raw = coca_raw.dropna(axis=1)
adv = pd.DataFrame(coca_raw['Adv'].drop_duplicates())
adj = pd.DataFrame(coca_raw['Adj'].drop_duplicates())

In [97]:
df = adv['Adv'].apply(calculateSWNPolarity, pos='r', how='harmonic').str.split(',', expand=True).astype('float').rename(columns={0 : 'pos_harmonic', 1 : 'neg_harmonic', 2 : 'obj_harmonic'})

In [98]:
adv_scores = pd.concat([adv['Adv'], df], axis=1).dropna()

## Problem

The average objective score of all the adverbs is quite high, 0.712.
We want to be able to reduce this by getting rid of the really objective adverbs right off the bat.
Note that we start with 1827 adverbs for which we actually have SWN entries.

In [123]:
adv_scores.describe()

Unnamed: 0,pos_harmonic,neg_harmonic,obj_harmonic
count,1827.0,1827.0,1827.0
mean,0.210118,0.049453,0.712072
std,0.139021,0.126321,0.162946
min,0.0,0.0,0.125
25%,0.125,0.0,0.625
50%,0.25,0.0,0.75
75%,0.25,0.0,0.875
max,0.75,0.875,1.0


For instance, there are many adverbs with an objective score of 1.0 (and positive and negative scores of 0).
Here are some examples

In [131]:
adv_scores[adv_scores['obj_harmonic'] == 1]

Unnamed: 0,Adv,pos_harmonic,neg_harmonic,obj_harmonic
9955,avowedly,0.0,0.0,1.0
12576,bimonthly,0.0,0.0,1.0
12859,biweekly,0.0,0.0,1.0
14003,briefly,0.0,0.0,1.0
17678,chronically,0.0,0.0,1.0
20502,compactly,0.0,0.0,1.0
23536,consequently,0.0,0.0,1.0
25099,continuously,0.0,0.0,1.0
29801,decisively,0.0,0.0,1.0
29861,deeply,0.0,0.0,1.0


## Solutions?

This is a bit extreme but we can try keeping only those adverbs with an objective score of 0.33 or lower.
Why not 0.5? Because the 0-1 scale of scores for any given word is shared between the positive, negative and objective scores. So having 0.66 left over could still mean that a word is 0.33 positive and 0.33 negative, which would overall not give a clear polarity in any direction.
This approach ends up giving us only 48 adverbs total. So perhaps we overdid it.

In [133]:
adv_scores[adv_scores['obj_harmonic'] < 0.33].describe()

Unnamed: 0,pos_harmonic,neg_harmonic,obj_harmonic
count,48.0,48.0,48.0
mean,0.320517,0.392773,0.225386
std,0.258704,0.275924,0.067286
min,0.0,0.0,0.125
25%,0.0,0.238453,0.125
50%,0.34375,0.284722,0.25
75%,0.5,0.625,0.25
max,0.75,0.875,0.326389


We could try using 0.5. One could argue that it allows 0.5 polarity (and assume that it is not typical for a word to be split 50/50 between positive and negative).
This is slightly better because it gives us 155 adverbs. Still a far cry from the 1827 we started with, but perhaps justified.

In [134]:
adv_scores[adv_scores['obj_harmonic'] < 0.5].describe()

Unnamed: 0,pos_harmonic,neg_harmonic,obj_harmonic
count,155.0,155.0,155.0
mean,0.293797,0.231501,0.351142
std,0.224052,0.247359,0.099196
min,0.0,0.0,0.125
25%,0.083333,0.0,0.265625
50%,0.28125,0.125,0.375
75%,0.5,0.375,0.421441
max,0.75,0.875,0.494792


Before we arbitrarily increase the threshold, let's look at the words that are between 0.5 and, say, 0.66.
It's already a bit of a mixture. 'desolately' sounds good, 'yearly' souds meh.
'naturally' sounds meh, 'cruelly' sounds great. So at this point, our cutoff seems somewhat arbitrary.

In [138]:
adv_scores[(adv_scores['obj_harmonic'] < 0.66) & (adv_scores['obj_harmonic'] >= 0.5)].sort_values('obj_harmonic')

Unnamed: 0,Adv,pos_harmonic,neg_harmonic,obj_harmonic
195781,yearly,0.12500,0.3750,0.50000
33187,desolately,0.00000,0.5000,0.50000
73286,hopefully,0.50000,0.0000,0.50000
161449,steadfastly,0.50000,0.0000,0.50000
161799,straightforwardly,0.37500,0.1250,0.50000
30868,deferentially,0.31250,0.0000,0.50000
29826,decoratively,0.50000,0.0000,0.50000
76063,imperfectly,0.00000,0.5000,0.50000
76145,implausibly,0.00000,0.5000,0.50000
76754,improbably,0.00000,0.5000,0.50000


In [105]:
adv_scores[adv_scores['obj_harmonic'] < 0.66].describe()

Unnamed: 0,pos_harmonic,neg_harmonic,obj_harmonic
count,651.0,651.0,651.0
mean,0.276994,0.111331,0.534829
std,0.180343,0.184224,0.122433
min,0.0,0.0,0.125
25%,0.125,0.0,0.5
50%,0.34375,0.0,0.59375
75%,0.375,0.1875,0.625
max,0.75,0.875,0.65625


In [122]:
adv_scores[adv_scores['obj_harmonic'] > 0.66].sort_values('obj_harmonic')

Unnamed: 0,Adv,pos_harmonic,neg_harmonic,obj_harmonic
127480,practically,0.0625,0.0625,0.68750
113220,originally,0.0625,0.0000,0.71875
40886,endlessly,0.0000,0.0625,0.71875
116402,particularly,0.0625,0.0000,0.71875
113161,organically,0.0625,0.0000,0.71875
142445,regularly,0.0625,0.0000,0.71875
164480,stupidly,0.2500,0.0000,0.75000
167102,suitably,0.2500,0.0000,0.75000
167075,suggestively,0.2500,0.0000,0.75000
85457,intermittently,0.2500,0.0000,0.75000


In [111]:
adv_scores[adv_scores['Adv'] == 'hugely']

Unnamed: 0,Adv,pos_harmonic,neg_harmonic,obj_harmonic
74161,hugely,0.0,0.25,0.75


In [139]:
core_file = '/Users/vasundhara/sfuvault/Discourse-Lab/Data/Adverbly_adjectives/CORE/data/newly_extracted_counts/allpairs.csv'

In [154]:
core_raw = pd.read_csv(core_file, header=None, names=['pair', 'count'])

In [156]:
split = core_raw['pair'].str.split(n=1, expand=True)
core_raw['Adv'] = split[0].str.lower()
core_raw['Adj'] = split[1].str.lower()

In [157]:
core_raw

Unnamed: 0,pair,count,Adv,Adj
0,questionably human,1,questionably,human
1,mentally strong,11,mentally,strong
2,strictly essential,2,strictly,essential
3,ironically parallel,1,ironically,parallel
4,basically sinew,1,basically,sinew
5,supposedly incurable,1,supposedly,incurable
6,increasingly worse,1,increasingly,worse
7,particularly tight,3,particularly,tight
8,initially several,1,initially,several
9,previously silent,1,previously,silent
