In [103]:
import pandas as pd
import re
import numpy as np

## Building out a lexicon for news sentiment analysis
#### Data from (https://sraf.nd.edu/textual-analysis/resources/#LM%20Sentiment%20Word%20Lists)

In [11]:
df = pd.read_csv("data/loughran_dictionary.csv")

In [18]:
df.head()

Unnamed: 0,Word,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
0,AARDVARK,1,277,1.48e-08,1.24e-08,3.56e-06,84,0,0,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.6e-10,9.73e-12,9.86e-09,1,0,0,0,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,8,4.28e-10,1.39e-10,6.23e-08,7,0,0,0,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,12,6.41e-10,3.16e-10,9.38e-08,12,0,0,0,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,7250,3.87e-07,3.68e-07,3.37e-05,914,0,0,0,0,0,0,0,0,0,0,3,12of12inf


In [13]:
df.describe()

Unnamed: 0,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables
count,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0,86486.0
mean,43243.5,216353.7,1.156255e-05,1.156183e-05,1.904424e-05,18285.29,54.706588,8.223192,6.899417,21.004035,4.274819,1.300835,1.579585,0.00148,0.001827,0.073665,2.840934
std,24966.502028,7204730.0,0.0003850416,0.0003893376,0.00012677,90995.8,326.982665,128.269555,117.53376,204.367726,92.580984,51.104943,56.310995,0.06062,0.042703,0.344561,1.225666
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21622.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,43243.5,63.0,3.37e-09,2.87e-09,6.26e-07,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
75%,64864.75,1905.0,1.02e-07,8.64e-08,5.83e-06,834.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
max,86486.0,1386049000.0,0.07407449,0.07539358,0.01152622,1028262.0,2014.0,2012.0,2012.0,2018.0,2011.0,2009.0,2009.0,3.0,1.0,2.0,9.0


In [25]:
df.astype(bool).sum(axis=0)

Word                  86486
Sequence Number       86486
Word Count            63114
Word Proportion       63114
Average Proportion    63114
Std Dev               63114
Doc Count             63114
Negative               2355
Positive                354
Uncertainty             297
Litigious               904
Constraining            184
Superfluous              56
Interesting              68
Modal                    60
Irr_Verb                158
Harvard_IV             4188
Syllables             85970
Source                86486
dtype: int64

Appears to be only 2355 words labeled as negative and 354 labeled as positive, but their values seem to all be 2009?

In [216]:
df.loc[df['Positive'] != 0].shape

(354, 19)

In [217]:
df.loc[df['Negative'] != 0].shape

(2355, 19)

### Loughran Dictionary above gives weird pos/neg sentiment values... (Why 2009???) Going to check out the other sentiment dictionary by loughran

In [195]:
xls = pd.ExcelFile("data/loughran_sentiment_dictionary.xlsx")

In [196]:
pos = pd.read_excel(xls, "Positive")
positive_words = list(pos["ABLE"])
print(positive_words[:10])
print(len(positive_words))

['ABUNDANCE', 'ABUNDANT', 'ACCLAIMED', 'ACCOMPLISH', 'ACCOMPLISHED', 'ACCOMPLISHES', 'ACCOMPLISHING', 'ACCOMPLISHMENT', 'ACCOMPLISHMENTS', 'ACHIEVE']
353


In [197]:
neg = pd.read_excel(xls, "Negative")
negative_words = list(neg["ABANDON"])
print(negative_words[:10])
print(len(negative_words))

['ABANDONED', 'ABANDONING', 'ABANDONMENT', 'ABANDONMENTS', 'ABANDONS', 'ABDICATED', 'ABDICATES', 'ABDICATING', 'ABDICATION', 'ABDICATIONS']
2354


Data also contains strong/weak modals and uncertain words.. not going to use for now, but could look into this later

In [198]:
unc = pd.read_excel(xls, "Uncertainty")
strong = pd.read_excel(xls, "StrongModal")
weak = pd.read_excel(xls, "WeakModal")
print(unc.size, strong.size, weak.size)

296 18 26


#### Following data taken from (http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/) 

In [199]:
subj = pd.read_csv("data/subj-clues.csv")
subj.drop(inplace=True, columns=["Unnamed: 0", "Unnamed: 2", "Unnamed: 3",	"Unnamed: 4", "Unnamed: 5",	"Unnamed: 6"])
subj.rename(inplace=True, index=str, columns={"values": "subj"})
print(subj.shape)

(8222, 1)


In [200]:
# Functions to parse the values for word and sentiment polarity
def word(x):
    return x.split(" ")[2].split("=")[1]
def sent(x):
    if len(x.split(" ")) > 5:
        if len(x.split(" ")[5].split("=")) > 1:
            return x.split(" ")[5].split("=")[1]
        else:
            return "NaN"
    else:
        return "NaN"
    
# Transform dataframe to only have word and associated polarity
word_vect = np.vectorize(word)
sent_vect = np.vectorize(sent)
subj = subj.assign(word=lambda x: word_vect(x.subj))
subj = subj.assign(sent=lambda x: sent_vect(x.subj))
subj.drop(columns=["subj"], inplace=True)
subj.shape

(8222, 2)

In [201]:
subj.astype(bool).sum(axis=0)

word    8222
sent    8222
dtype: int64

In [202]:
subj.head()

Unnamed: 0,word,sent
0,abandoned,negative
1,abandonment,negative
2,abandon,negative
3,abase,negative
4,abasement,negative


In [203]:
subj[subj['sent'].str.contains('negative', regex=False)].shape

(4911, 2)

In [204]:
subj[subj['sent'].str.contains('positive', regex=False)].shape

(2718, 2)

In [205]:
subj[~(subj['sent'].str.contains('positive', regex=False) | subj['sent'].str.contains('negative', regex=False))].shape

(593, 2)

In [206]:
subj.shape

(8222, 2)

Now, adding these new words to our list of positive/negative words

Removing duplicates

In [229]:
print(len(set(positive_words)))
print(len(set(negative_words)))
positive_words = list(set(positive_words))
negative_words = list(set(negative_words))

2485
5763


Turn all words to lower case

In [219]:
positive_words = [x.lower() for x in positive_words]
negative_words = [x.lower() for x in negative_words]

Take top 2500 from each set of words, to even odds between pos/neg

In [222]:
pos_words = positive_words[:2500]
neg_words = negative_words[:2500]

### Testing
I am simply checking whether how many good/bad words are contained in the test files

In [228]:
fb = "fb-bad.txt"
fb2 = "fb-sure-bad.txt"
goog = "goog-bad.txt"
pix = "pixel-bad.txt"
uber = "uber-bad.txt"

test = "test/"

file = test + fb

hehe = ["The", "When"]

with open(file) as f:
    data = f.read()
    words = re.sub("[^\w]", " ",  data).split()
    print(words[:100])
    print("Number of positive words:", len([c for c in words if c.lower() in pos_words]))    
    print("Number of negative words:", len([c for c in words if c.lower() in neg_words]))

['The', 'panic', 'attacks', 'started', 'after', 'Chloe', 'watched', 'a', 'man', 'die', 'She', 'spent', 'the', 'past', 'three', 'and', 'a', 'half', 'weeks', 'in', 'training', 'trying', 'to', 'harden', 'herself', 'against', 'the', 'daily', 'onslaught', 'of', 'disturbing', 'posts', 'the', 'hate', 'speech', 'the', 'violent', 'attacks', 'the', 'graphic', 'pornography', 'In', 'a', 'few', 'more', 'days', 'she', 'will', 'become', 'a', 'full', 'time', 'Facebook', 'content', 'moderator', 'or', 'what', 'the', 'company', 'she', 'works', 'for', 'a', 'professional', 'services', 'vendor', 'named', 'Cognizant', 'opaquely', 'calls', 'a', 'process', 'executive', 'For', 'this', 'portion', 'of', 'her', 'education', 'Chloe', 'will', 'have', 'to', 'moderate', 'a', 'Facebook', 'post', 'in', 'front', 'of', 'her', 'fellow', 'trainees', 'When', 'it', 's', 'her', 'turn', 'she', 'walks']
Number of positive words: 304
Number of negative words: 189
