In [1]:
import numpy as np
import pandas as pd

## Load label words from domain knowledge

In [2]:
# Read the tab-separated description table, then replace whatever header
# labels the file supplies with the short names used by the rest of the notebook.
column_names = ['word', 'A', 'E', 'C', 'N', 'O', 'nnon']
df = pd.read_csv('big_five_description.tsv', delimiter='\t')
df.columns = column_names
df.head()

Unnamed: 0,word,A,E,C,N,O,nnon
0,Naive,0.26*,-0.2,-0.21,0.12,-0.18,
1,Altruistic,0.25*,0.0,-0.03,-0.06,0.22,
2,Compliant,0.24*,-0.19,-0.02,0.01,-0.04,
3,Natural,0.23*,0.09,0.03,-0.17,0.14,
4,Suggestible,0.17*,0.0,-0.06,0.07,-0.16,


In [3]:
import re
def numeric(input_):
    """Convert one raw trait-loading cell into a number.

    The cell is stringified, spaces are removed and a single trailing ``*``
    marker is dropped. Entries that look like they lost their decimal point
    are then repaired before parsing:

    * ``2…`` / ``3…``  -> ``0.2…`` / ``0.3…``
    * ``-3…``          -> ``-0.3…``
    * ``03…``          -> ``0.3…``
    * ``-03…``         -> ``-0.3…``

    Parameters
    ----------
    input_ : object
        Raw cell value (string or number) from the table.

    Returns
    -------
    float or str
        The parsed value. If the cleaned text is not a valid number, the
        original input is printed and the cleaned string is returned
        unchanged so the offending row can be inspected.

    Notes
    -----
    Parsing uses ``float`` rather than ``eval`` so that text from the data
    file is never executed as Python code. A side effect is that the text
    ``nan`` (e.g. a stringified missing value) parses to a float NaN instead
    of being reported as a failure.
    """
    x = str(input_).replace(' ', '')
    if x.endswith('*'):
        x = x[:-1]

    # Only these specific prefixes are repaired; other digits are left alone.
    if x.startswith(('2', '3')):
        x = "0." + x
    elif x.startswith('-3'):
        x = "-0." + x[1:]
    elif x.startswith('03'):
        x = '0.3' + x[2:]
    elif x.startswith('-03'):
        x = '-0.3' + x[3:]

    try:
        return float(x)
    except ValueError:
        # Surface the raw value so the bad row is visible in the cell output.
        print(input_)
        return x

# Run every trait column through numeric(), one column at a time and in the
# same order as before (A, C, E, O, N).
for trait_column in ['A', 'C', 'E', 'O', 'N']:
    df[trait_column] = df[trait_column].apply(numeric)

## Find synonyms and antonyms for each label word from ConceptNet

In [4]:
import requests

words = df['word']

# The lookup keeps collecting English labels until it holds 11 of them
# (the count must exceed 10 before it stops).
MAX_RELATED_PER_WORD = 11
# Without a timeout a single stalled connection would hang the whole cell.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_english_labels(word, relation):
    """Return English end-node labels of ConceptNet edges starting at `word`.

    Parameters
    ----------
    word : str
        Lower-cased label word, inserted into the ``/c/en/<word>`` start node.
    relation : str
        Relation name appended to ``/r/``, e.g. ``'Synonym'`` or ``'Antonym'``.

    Returns
    -------
    list of str
        At most MAX_RELATED_PER_WORD labels, in the order the API returned them.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a non-success HTTP status.
    ValueError
        If the response body is not valid JSON.
    """
    url = ('http://api.conceptnet.io/query?start=/c/en/' + word
           + '&rel=/r/' + relation + '&limit=100')
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    labels = []
    # A payload without 'edges' is treated as "no results" instead of
    # raising KeyError.
    for edge in response.json().get('edges', []):
        end_node = edge.get('end', {})
        if end_node.get('language') == 'en':
            labels.append(end_node['label'])
            if len(labels) >= MAX_RELATED_PER_WORD:
                break
    return labels


# Maps each lower-cased label word to [synonyms, antonyms].
words_dict = {}
for w in [i.lower() for i in words]:
    if w in words_dict:
        # Repeated label word: reuse the earlier lookup instead of re-querying.
        continue
    try:
        synonyms = _fetch_english_labels(w, 'Synonym')
        antonyms = _fetch_english_labels(w, 'Antonym')
    except (requests.RequestException, ValueError) as err:
        # Report the failure and carry on, so one bad request does not
        # discard the results already gathered for the other words.
        print('ConceptNet lookup failed for', w, ':', err)
        synonyms, antonyms = [], []
    words_dict[w] = [synonyms, antonyms]

---

## Create verbalizer for each big-five trait

In [5]:
def _split_label_words(scored_words, related_words):
    """Build the positive and negative label-word lists for one trait.

    Parameters
    ----------
    scored_words : iterable of (word, score)
        Label word and its score on the trait being processed.
    related_words : dict
        Maps a lower-cased label word to ``[synonyms, antonyms]``.

    Returns
    -------
    tuple
        ``(words_pos, weights_pos, words_neg, weights_neg, skipped)``.
        ``skipped`` counts rows whose word or score could not be used.

    Notes
    -----
    * A word with a positive score goes to the positive list together with
      its synonyms (same weight); its antonyms go to the negative list with
      the negated weight. Negative scores mirror this.
    * Scores that are neither > 0 nor < 0 (zero, NaN) contribute nothing.
    * Synonyms/antonyms are added only if not already present in either list.
      The label word itself is always appended without that check.
      NOTE(review): this can put a word in a list twice, or in both lists;
      kept as-is so the generated files do not change. Confirm whether
      that is intended.
    """
    words_pos, weights_pos = [], []
    words_neg, weights_neg = [], []
    seen = set()  # every word already placed in either list (O(1) lookup)
    skipped = 0

    for word, score in scored_words:
        try:
            if score > 0:
                same_words, same_weights = words_pos, weights_pos
                opposite_words, opposite_weights = words_neg, weights_neg
            elif score < 0:
                same_words, same_weights = words_neg, weights_neg
                opposite_words, opposite_weights = words_pos, weights_pos
            else:
                continue
            label = word.lower()
        except (AttributeError, TypeError):
            # Non-numeric score (e.g. an unparsed string) or non-string word.
            skipped += 1
            continue

        same_words.append(label)
        same_weights.append(score)
        seen.add(label)

        # A word with no lookup entry simply has no synonyms/antonyms.
        entry = related_words.get(label)
        synonyms, antonyms = entry if entry else ([], [])

        for related in synonyms:
            if related not in seen:
                same_words.append(related)
                same_weights.append(score)
                seen.add(related)
        for related in antonyms:
            if related not in seen:
                opposite_words.append(related)
                opposite_weights.append(-score)
                seen.add(related)

    return words_pos, weights_pos, words_neg, weights_neg, skipped


# For each trait write two files:
#   <trait>_words.txt   - line 1: positive label words, line 2: negative ones
#   <trait>_weights.txt - line 1: positive weights,     line 2: negative ones
for trait in ['A','C','E', 'N', 'O']:
    scored_words = ((r['word'], r[trait]) for _, r in df.iterrows())
    words_pos, weights_pos, words_neg, weights_neg, skipped = _split_label_words(
        scored_words, words_dict)

    if skipped:
        print('Skipped', skipped, 'rows with an unusable word or score for', trait)
    print('Total label words for ', trait, 'is', str(len(words_pos)), ',', str(len(words_neg)))

    # Files are opened only once the data is ready and are closed even if a
    # write fails.
    with open(trait+'_words.txt', 'w') as f_word:
        f_word.write(','.join(words_pos)+'\n')
        f_word.write(','.join(words_neg))
    with open(trait+'_weights.txt', 'w') as f_weight:
        f_weight.write(str(weights_pos)+'\n')
        f_weight.write(str(weights_neg))

Total label words for  A is 765 , 775
Total label words for  C is 788 , 742
Total label words for  E is 791 , 718
Total label words for  N is 724 , 790
Total label words for  O is 868 , 649
