In [1]:
import pandas as pd
import numpy as np
import glob
import docx
import time
import os
from stanfordnlp.server import CoreNLPClient

In [70]:
def neg(bucket):
    """Return the bin index paired with ``bucket`` for negation.

    Indices below 4 are shifted up by 4; all other indices are shifted
    down by 4.
    """
    shift = 4 if bucket < 4 else -4
    return bucket + shift
    
def remove_nan(l):
    """Return the items of ``l`` that ``pd.isnull`` does not flag, in order."""
    kept = []
    for item in l:
        if pd.isnull(item):
            continue
        kept.append(item)
    return kept

def get_text(doc_names):
    """Read Word documents and return their titles and lower-cased text.

    Parameters
    ----------
    doc_names : sequence of str
        Paths passed one by one to ``docx.Document``.

    Returns
    -------
    titles : list of str
        The first paragraph of every document (original casing).
    documents : list of str
        The title followed by every non-empty later paragraph, joined with
        single spaces and lower-cased.

    Both lists have one entry per name in ``doc_names``, in the same order.
    A document without any paragraph yields an empty title and empty text
    instead of raising ``IndexError``, so the outputs stay aligned with the
    input list.
    """
    documents = []
    titles = []
    for doc_name in doc_names:
        paragraphs = docx.Document(doc_name).paragraphs
        # Non-breaking spaces are normalised to plain spaces.
        par_text = [paragraph.text.replace('\xa0', ' ') for paragraph in paragraphs]

        # Guard against documents that contain no paragraphs at all.
        title = par_text[0] if par_text else ''
        titles.append(title)

        # Empty paragraphs are skipped so they do not add extra spaces.
        body = [par for par in par_text[1:] if par]
        documents.append(' '.join([title] + body).lower())
    return titles, documents

def bin_of_words(filename):
    """Load the word bins from an Excel sheet.

    Every column of the sheet is one bin. For each column the missing
    cells are dropped (via ``remove_nan``), duplicates are removed and the
    remaining values are sorted. Returns a list with one sorted list per
    column, in column order.
    """
    df = pd.read_excel(filename)
    bins = []
    for col in df.columns:
        unique_words = set(remove_nan(df[col].values))
        bins.append(sorted(unique_words))
    return bins

In [71]:
# Point the CoreNLP client at the local CoreNLP installation through the
# CORENLP_HOME environment variable.
# NOTE(review): both paths below are absolute and machine-specific; the cell
# only runs on a machine with this exact directory layout.
CoreNLP = "/home/roguehydra/Documents/Jaar 3/TOFU/SA/stanford-corenlp"
os.environ["CORENLP_HOME"] = CoreNLP
# Collect every file matching *.doc* in the TRAIN folder, in sorted order, and
# read its title and lower-cased text.
doc_names = sorted(glob.glob('/home/roguehydra/Documents/Jaar 3/TOFU/SA/Webpages/TRAIN/*.doc*'))
titles, documents = get_text(doc_names)
print("Amount of docs: {}\n".format(len(documents)))
# Time the annotation plus the negation extraction below.
begin = time.time()
# ann_doc[k] holds the annotation returned by the server for documents[k].
ann_doc = []
# set up the client
with CoreNLPClient(annotators=['tokenize','ssplit','pos','depparse'], timeout=60000, memory='16G') as client:
    # annotate documents on the server
    for text in documents:
        ann_doc.append(client.annotate(text))
        
# negations[k] is the list of words in document k that are the source (head)
# of a 'neg' dependency edge, one entry per such edge.
negations = []
for ann in ann_doc:
    negated=[]
    # Maps a token's tokenEndIndex to its word; shared by all sentences of the
    # current document.
    token_dict = {}
    for i in range(len(ann.sentence)):
        # get the sentence
        sentence = ann.sentence[i]

        # get the dependency parse of the sentence
        dependency_parse = sentence.basicDependencies

        # register every token of this sentence under its tokenEndIndex
        for j in range(0, len(sentence.token)) :
            token_dict[sentence.token[j].tokenEndIndex] = sentence.token[j].word

        # take previous sentences into consideration
        # presumably tokenBeginIndex/tokenEndIndex are document-level token
        # positions and edge.source is 1-based within the sentence, so that
        # offset + source equals the head token's tokenEndIndex -- TODO confirm
        offset = sentence.token[0].tokenBeginIndex        

        # walk the dependency edges and keep the head word of every 'neg' edge
        for item in dependency_parse.edge:
            dep = item.dep
            if dep == 'neg':
                source_node = item.source
                source_name = token_dict[offset + source_node]
                negated.append(source_name)
    negations.append(negated)
end = time.time()
print("\nElapsed time: {}s".format(round(end - begin,1)))

Amount of docs: 32

Starting server with command: java -Xmx16G -cp /home/roguehydra/Documents/Jaar 3/TOFU/SA/stanford-corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-99170f1d914b4c5c.props -preload tokenize,ssplit,pos,depparse

Elapsed time: 45.2s


1: Dream 

2: Economic

3: Health

4: Environment

5: Nightmare

In [153]:
# Load the word bins: one sorted, de-duplicated list of values per column of
# the spreadsheet (see bin_of_words).
bins_of_words = bin_of_words('BINS - version 2.xlsx')

In [154]:
# One cell per (document, bin) pair. dtype=object lets each cell hold a Python
# list of matched words; the cells are filled in by the next cell.
word_matrix = np.empty((len(documents), len(bins_of_words)), dtype=object)

In [155]:
# For every document and every bin, store each bin word once per occurrence in
# the document text.
#
# Each cell is rebuilt from scratch, so running this cell again (without
# re-creating word_matrix first) gives the same result instead of appending
# every match a second time.
#
# Matching is plain substring matching on the lower-cased text
# (`str.count`, non-overlapping), so a bin word is also counted when it
# appears inside a longer word.
for i, document in enumerate(documents):
    for j, bucket in enumerate(bins_of_words):
        word_matrix[i][j] = [
            word
            for word in bucket
            for _ in range(document.count(word))
        ]

In [156]:
# Bins whose words change side when negated. `rows` holds bin (column) indices
# of word_matrix; neg() maps 0 -> 4 and 4 -> 0.
# presumably 0 is the "Dream" bin and 4 the "Nightmare" bin -- TODO confirm
rows = [0,4]
for i, negated in enumerate(negations):
    for word in negated:
        for j in rows:
            if word not in word_matrix[i][j]:
                continue

            # Take a single occurrence of the negated word out of its bin ...
            remaining = list(word_matrix[i][j])
            remaining.remove(word)
            word_matrix[i][j] = remaining

            # ... and add it to the paired bin instead.
            j2 = neg(j)
            word_matrix[i][j2] = word_matrix[i][j2] + [word]

            # Each negation moves at most one occurrence.
            break


In [157]:
# Numeric (documents x bins) matrix, initialised to zero; the next cell stores
# the number of matched words per document and bin in it.
score_matrix = np.zeros((len(documents), len(bins_of_words)))

In [158]:
# The score of a (document, bin) pair is the number of words collected for it.
for i in range(len(documents)):
    score_matrix[i] = [
        len(word_matrix[i][j]) for j in range(len(bins_of_words))
    ]

In [159]:
# Print a per-document report: a sentiment judgement with a normalised score,
# followed by the share of each topic among all topic words.
for i in range(len(documents)):
    print("Document {} \n\n{}\n".format(i+1, titles[i]))
    positive = score_matrix[i][0]
    negative = score_matrix[i][4]
    total_sentiment = positive - negative
    total = positive + negative  # not used in this cell; kept for later inspection

    economic = score_matrix[i][1]
    health = score_matrix[i][2]
    environment = score_matrix[i][3]
    total_topic = economic + health + environment

    # A surplus of more than 3 words is needed for a clear verdict; smaller
    # surpluses are reported as "neutral (leaning ...)". The score is the
    # surplus relative to the count of the dominant side, so it is negative
    # for negative documents. The dominant count is always > 0 in the
    # non-neutral branches, so these divisions are safe.
    if total_sentiment == 0:
        judgement = 'neutral'
        sen_score = 0
    elif total_sentiment > 0:
        if total_sentiment > 3:
            judgement = 'positive'
        else:
            judgement = 'neutral (leaning towards positive)'
        sen_score = total_sentiment/positive
    else:
        if total_sentiment < -3:
            judgement = 'negative'
        else:
            judgement = 'neutral (leaning towards negative)'
        sen_score = total_sentiment/negative

    # A document without any topic word would otherwise divide 0 by 0, which
    # prints "nan" and raises a NumPy RuntimeWarning; report 0.0 shares instead.
    if total_topic > 0:
        economic_share = economic/total_topic
        health_share = health/total_topic
        environment_share = environment/total_topic
    else:
        economic_share = health_share = environment_share = 0.0

    print("Sentiment (score) : {}, ({})\n".format(judgement,round(sen_score,2)))
    print("Economic score    : {}".format(round(economic_share,2)))
    print("Health score      : {}".format(round(health_share,2)))
    print("Evironmental score: {}".format(round(environment_share,2)))
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ")

Document 1 

Accurately mixing oil and water.

Sentiment (score) : positive, (0.84)

Economic score    : 0.75
Health score      : 0.18
Evironmental score: 0.07
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 2 

Wind energy is a clean, renewable form of energy that has many pros. Learn more below.

Sentiment (score) : positive, (1.0)

Economic score    : 0.44
Health score      : 0.08
Evironmental score: 0.47
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 3 

Interior Letter Canceling Wind Project Notes High Value Area for Bighorn Sheep, Eagles

Sentiment (score) : neutral (leaning towards positive), (0.04)

Economic score    : 0.16
Health score      : 0.18
Evironmental score: 0.67
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Document 4 

Considerations for the Development of Shale Gas in the United Kingdom _ PSE _ Physicians, Scientists, and Engineers for Healthy Energy

Sentiment (score) : positive, (0.68)

Economic sc