In [1]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import pandas as pd
import glob
import docx

In [2]:
def SentimentScore(paragraph, weight=1):
    """Score the sentiment of a text with VADER, one sentence at a time.

    The text is split with ``tokenize.sent_tokenize`` and the ``compound``
    polarity score of every sentence is accumulated. Negative compound
    scores are multiplied by ``weight`` before being added; scores that are
    zero or positive are added unchanged.

    Args:
        paragraph: Text to analyse.
        weight: Multiplier applied to negative sentence scores only.

    Returns:
        A ``(total, average)`` tuple, both rounded to two decimals, where
        ``average`` is ``total`` divided by the number of sentences. When the
        tokenizer yields no sentences the result is ``(0.0, 0.0)``.
    """
    sentences = tokenize.sent_tokenize(paragraph)
    if not sentences:
        # Nothing to score: bail out before dividing by the sentence count,
        # which would otherwise raise ZeroDivisionError.
        return 0.0, 0.0

    sid = SentimentIntensityAnalyzer()
    sentiment = 0
    for sentence in sentences:
        score = sid.polarity_scores(sentence)['compound']
        # Only negative sentences are amplified by the weight.
        sentiment += score * weight if score < 0 else score
    return round(sentiment, 2), round(sentiment / len(sentences), 2)

def remove_nan(l):
    """Return a new list with the items of ``l`` that ``pd.isnull`` does not flag as missing."""
    kept = []
    for item in l:
        if pd.isnull(item):
            continue
        kept.append(item)
    return kept

def get_text(doc_names):
    """Read Word documents and return their titles and full texts.

    For each path, every paragraph's text is read through ``docx.Document``
    and non-breaking spaces (``\\xa0``) are replaced by regular spaces. The
    first paragraph is used as the title. The document text is the title
    followed by every later non-empty paragraph, separated by single spaces.

    Args:
        doc_names: Sequence of paths accepted by ``docx.Document``.

    Returns:
        A ``(titles, documents)`` tuple of two lists, in the same order as
        ``doc_names``. A document without any paragraphs contributes an empty
        title and an empty text.
    """
    titles = []
    documents = []
    for doc_name in doc_names:
        par_text = [paragraph.text.replace('\xa0', ' ')
                    for paragraph in docx.Document(doc_name).paragraphs]
        # Guard against paragraph-less documents, where indexing [0] would
        # raise IndexError.
        title = par_text[0] if par_text else ''
        titles.append(title)
        # Blank paragraphs after the title are skipped so they do not add
        # stray separators to the joined text.
        body = [par for par in par_text[1:] if par]
        documents.append(' '.join([title] + body))
    return titles, documents

def bin_of_words():
    """Load the word bins from the 'Bin of words TOFU software.xlsx' workbook.

    Returns:
        A list of eight lists, one per spreadsheet column in the order
        D, ECD, HED, END, N, ECN, HEN, ENN, each with missing cells removed
        by ``remove_nan``.
    """
    sheet = pd.read_excel('Bin of words TOFU software.xlsx')
    column_names = ('D', 'ECD', 'HED', 'END', 'N', 'ECN', 'HEN', 'ENN')
    bins = []
    for name in column_names:
        # Attribute-style column access, as in `sheet.D`.
        column = getattr(sheet, name)
        bins.append(remove_nan(column.to_list()))
    return bins

In [7]:
# Load the word bins from the spreadsheet.
bins_of_words = bin_of_words()

# Collect the training documents in a stable, alphabetical order and read them.
doc_names = glob.glob('Webpages/TRAIN/*.doc*')
doc_names.sort()
titles, documents = get_text(doc_names)

In [14]:
# Print each document's title together with a sentiment verdict derived from
# its average per-sentence score (negative sentences weighted 2.5x).
for i, doc in enumerate(documents):
    print("Document {} \n\n{}\n".format(i+1, titles[i]))
    average_sen = SentimentScore(doc, weight=2.5)[1]

    # Bands: exactly 0 is neutral, beyond +/-0.1 is a clear verdict, and
    # anything in between only leans one way.
    if average_sen == 0:
        judgement = 'neutral'
    elif average_sen > 0.1:
        judgement = 'positive'
    elif average_sen < -0.1:
        judgement = 'negative'
    elif average_sen < 0:
        judgement = 'neutral (leaning towards negative)'
    else:
        judgement = 'neutral (leaning towards positive)'

    print("Sentiment (score): {}, ({})".format(judgement, average_sen))
    print("- - - - - - - - - - - - - - -")

Document 1 

Accurately mixing oil and water.

Sentiment (score): positive, (0.19)
- - - - - - - - - - - - - - -
Document 2 

Wind energy is a clean, renewable form of energy that has many pros. Learn more below.

Sentiment (score): positive, (0.32)
- - - - - - - - - - - - - - -
Document 3 

Interior Letter Canceling Wind Project Notes High Value Area for Bighorn Sheep, Eagles

Sentiment (score): neutral (leaning towards positive), (0.02)
- - - - - - - - - - - - - - -
Document 4 

Considerations for the Development of Shale Gas in the United Kingdom _ PSE _ Physicians, Scientists, and Engineers for Healthy Energy

Sentiment (score): positive, (0.25)
- - - - - - - - - - - - - - -
Document 5 

Hydraulic Fracturing for Oil and Gas: Impacts from the Hydraulic Fracturing Water Cycle on Drinking Water Resources in the United States (Final Report)

Sentiment (score): neutral (leaning towards negative), (-0.01)
- - - - - - - - - - - - - - -
Document 6 

FDA’s Approach to Regulation of Nanotech