In [103]:
import pandas as pd
import numpy as np
import glob
import docx
import time
import os

In [143]:
def neg(bucket):
    if bucket < 4:
        return bucket + 4
    else:
        return bucket - 4
def remove_nan(l):
    return [x for x in l if not pd.isnull(x)]

def get_text(doc_names):
    documents = []
    titles = []
    for i in range(len(doc_names)):
        doc_name = doc_names[i]
        paragraphs = docx.Document(doc_name).paragraphs
        par_text = [paragraph.text.replace('\xa0', ' ') for paragraph in paragraphs]
        text = par_text[0]
        titles.append(text)
        for i in range(1, len(par_text)):
            par = par_text[i]
            if par:
                text += ' ' + par
        documents.append(text)
    return titles, documents

def bin_of_words():
    df = pd.read_excel('Bin of words TOFU software.xlsx')
    return [remove_nan(df.D.to_list()), remove_nan(df.ECD.to_list()), remove_nan(df.HED.to_list()), \
            remove_nan(df.END.to_list()), remove_nan(df.N.to_list()), remove_nan(df.ECN.to_list()), \
            remove_nan(df.HEN.to_list()), remove_nan(df.ENN.to_list())]

1: Dream 

2: Economic dream

3: Health dream

4: Environmental dream

5: Nightmare

6: Economic nightmare

7: Health nightmare

8: Environmental nightmare

In [22]:
bins_of_words = bin_of_words()

In [72]:
doc_names = sorted(glob.glob('webpages dream nightmare analysis/*.doc*'))
titles, documents = get_text(doc_names)

In [73]:
len(documents)

32

In [137]:
word_matrix = np.empty((len(documents), len(bins_of_words)), dtype=object)

In [138]:
for i in range(len(documents)):
    document = documents[i]
    for j in range(len(bins_of_words)):
        if str(word_matrix[i][j]) == 'None':
            word_matrix[i][j] = []
        bucket = bins_of_words[j]
        for word in bucket:
            if word in document:
                for _ in range(document.count(word)):
                    tmp = word_matrix[i][j].copy()
                    tmp.append(word)
                    word_matrix[i][j] = tmp

In [77]:
from stanfordnlp.server import CoreNLPClient
CoreNLP = "/home/roguehydra/Documents/Jaar 3/TOFU/SA/stanford-corenlp"
os.environ["CORENLP_HOME"] = CoreNLP
# text = "Barack Obama was not born in Hawaii."

In [140]:
begin = time.time()
ann_doc = []
# set up the client
with CoreNLPClient(annotators=['tokenize','ssplit','pos','depparse'], timeout=60000, memory='16G') as client:
    # annotate documents on the server
    for text in documents:
        ann_doc.append(client.annotate(text))
        
negations = []
for ann in ann_doc:
    negated=[]
    token_dict = {}
    for i in range(len(ann.sentence)):
        # get the sentence
        sentence = ann.sentence[i]

        # get the dependency parse of the sentence
        dependency_parse = sentence.basicDependencies

        #get a dictionary associating each token/node with its label
        for j in range(0, len(sentence.token)) :
            token_dict[sentence.token[j].tokenEndIndex] = sentence.token[j].word

        #take previous sentences into consideration
        offset = sentence.token[0].tokenBeginIndex        

        #get a list of the dependencies with the words they connect
        for item in dependency_parse.edge:
            dep = item.dep
            if dep == 'neg':
                source_node = item.source
                source_name = token_dict[offset + source_node]
                negated.append(source_name)
    negations.append(negated)
end = time.time()
print("Elapsed time: {}s".format(round(end - begin,1)))

Starting server with command: java -Xmx16G -cp /home/roguehydra/Documents/Jaar 3/TOFU/SA/stanford-corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-e8575e09f3584428.props -preload tokenize,ssplit,pos,depparse
Elapsed time: 44.6s


In [149]:
for i in range(len(negations)):
    negated = negations[i]
    for word in negated:
        for j in range(len(bins_of_words)):
            
            if word in word_matrix[i][j]:
                tmp = word_matrix[i][j].copy()
                tmp.pop(tmp.index(word))
                word_matrix[i][j] = tmp
                
                j2 = neg(j)
                tmp = word_matrix[i][j2].copy()
                tmp.append(word)
                word_matrix[i][j2] = tmp                

good
mix
droplets
overdosing
differ
needed
correct
interest
have
demonstrate
conform
place
consider
date
started
done
start
possible
unique
judge
harm
only
provide
subject
subject
consumed
present
disadvantageous
misleading
released
contact
over
longer
tracked
meet
lend
able
match
built
decided
relied
wind
power
north
bonds
zoning
protection
allow
interested
candidates
group
north
agricultural
group
community
fight
target
like
only
stage
impact
hampered
harmful
stop
undertake
reveal
show
gasses
gases
regulated
regulated
carry
use
oversight
know
research
formula
carry
use
great
carbonate
perform
interfere
prescriptive
asked
connected
blowing
mean
generate
harmful
come
seen
have
water


In [150]:
score_matrix = np.zeros((len(documents), len(bins_of_words)))

In [151]:
for i in range(len(documents)):
    for j in range(len(bins_of_words)):
        score_matrix[i][j] = len(word_matrix[i][j])

In [152]:
for i in range(len(documents)):
    print("Document {} \n\n{}\n".format(i+1, titles[i]))
    positive = score_matrix[i][0] + score_matrix[i][1] + score_matrix[i][2] + score_matrix[i][3]
    negative = score_matrix[i][4] + score_matrix[i][5] + score_matrix[i][6] + score_matrix[i][7]
    economic = score_matrix[i][1] + score_matrix[i][5]
    health = score_matrix[i][2] + score_matrix[i][6]
    environment = score_matrix[i][3] + score_matrix[i][7]
    print("Positive    : {}".format(int(positive)))
    print("Negative    : {}".format(int(negative)))
    print("Average     : {} \n".format(int(positive-negative)))
    print("Economic    : {}".format(int(economic)))
    print("Health      : {}".format(int(health)))
    print("Evironmental: {}".format(int(environment)))
    print("- - - - - - - - - - - - - - - -")

Document 1 

Accurately mixing oil and water is not good.

Positive    : 58
Negative    : 4
Average     : 54 

Economic    : 31
Health      : 11
Evironmental: 0
- - - - - - - - - - - - - - - -
Document 2 

Wind energy is a clean, renewable form of energy that has many pros. Learn more below.

Positive    : 12
Negative    : 0
Average     : 12 

Economic    : 6
Health      : 3
Evironmental: 0
- - - - - - - - - - - - - - - -
Document 3 

Interior Letter Canceling Wind Project Notes High Value Area for Bighorn Sheep, Eagles

Positive    : 70
Negative    : 51
Average     : 19 

Economic    : 25
Health      : 42
Evironmental: 2
- - - - - - - - - - - - - - - -
Document 4 

Considerations for the Development of Shale Gas in the United Kingdom _ PSE _ Physicians, Scientists, and Engineers for Healthy Energy

Positive    : 19
Negative    : 7
Average     : 12 

Economic    : 9
Health      : 5
Evironmental: 0
- - - - - - - - - - - - - - - -
Document 5 

Hydraulic Fracturing for Oil and Gas: Impact