In [18]:
import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import os
import csv

# Fetch the VADER lexicon BEFORE building the analyzer: the constructor loads
# the lexicon file, so on a machine where it has not been downloaded yet,
# instantiating first fails with a LookupError.
nltk.download('vader_lexicon', quiet=False)
vader_model = SentimentIntensityAnalyzer()

# spaCy pipeline used by run_vader for sentence splitting, lemmas and POS tags.
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/soniadias/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
def run_vader(textual_unit,
              lemmatize=False,
              parts_of_speech_to_consider=None,
              verbose=0):
    """
    Run VADER on a textual unit after tokenizing it with spaCy.

    The text is parsed with the module-level spaCy pipeline ``nlp``; the
    selected tokens of every sentence are joined with single spaces and the
    resulting string is scored with the module-level ``vader_model``.

    :param str textual_unit: a textual unit, e.g., one or more sentences
    (one string)
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only tokens whose ``pos_`` is in the set are provided
    :param int verbose: if >= 1, information is printed
    about input and output

    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)

    input_to_vader = []

    for sent in doc.sents:
        for token in sent:

            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                # '-PRON-' is a placeholder lemma for pronouns (emitted by
                # older spaCy releases, presumably 2.x); VADER cannot score
                # it, so fall back to the surface form.
                if to_add == '-PRON-':
                    to_add = token.text

            # An empty/None filter means "keep everything".
            if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
                continue

            input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the full input rather than the leaked loop variable `sent`:
        # that name is undefined when the text yields no sentences (e.g. an
        # empty string) and otherwise only holds the LAST sentence.
        print('INPUT SENTENCE', textual_unit)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores


In [20]:
def vader_output_to_label(vader_output):
    """
    Turn a VADER score dict into a categorical sentiment label.

    Only the 'compound' entry is inspected, e.g. for
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}:
    a) compound above zero -> 'positive'
    b) compound exactly 0.0 -> 'neutral'
    c) compound below zero -> 'negative'

    :param dict vader_output: output dict from vader

    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    """
    score = vader_output['compound']

    # Independent guards instead of an if/elif ladder; a value matching none
    # of them falls through and yields the implicit None.
    if score > 0.0:
        return 'positive'
    if score == 0.0:
        return 'neutral'
    if score < 0:
        return 'negative'
    
# Sanity checks for vader_output_to_label: one compound value per label.
assert all(
    vader_output_to_label({'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': compound_value}) == expected_label
    for compound_value, expected_label in (
        (0.0, 'neutral'),
        (0.01, 'positive'),
        (-0.01, 'negative'),
    )
)

In [22]:
# Score every row of the test TSV with VADER and collect, in parallel lists,
# the raw text, the predicted label and the gold label.
sentences = []
all_vader_output = []
gold = []

# NOTE(review): absolute, machine-specific path - point this at your own copy
# of the lab data (ideally relative to the notebook) before re-running.
test_set_path = "/Users/soniadias/Desktop/classes/text mining for ai /ba-text-mining/lab_sessions/lab3/sentiment-topic-final-test.tsv"

# newline="" is what the csv module expects so that newlines embedded in
# quoted fields are parsed correctly; the explicit encoding avoids depending
# on the platform default (assumes the file is UTF-8 - TODO confirm).
with open(test_set_path, newline="", encoding="utf-8") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    for line in tsv_file:
        # Blank or truncated rows have no text/sentiment columns to read.
        if len(line) < 3:
            continue

        text = line[1]
        # Skip the header row, recognised by its literal column name.
        if text == "text":
            continue
        sentiment = line[2]

        vader_output = run_vader(text, lemmatize=True)  # run vader
        vader_label = vader_output_to_label(vader_output)  # convert vader output to category

        sentences.append(text)
        all_vader_output.append(vader_label)
        gold.append(sentiment)
        
        
        

In [23]:
from sklearn.metrics import classification_report

# Compare VADER's predicted labels against the gold labels collected above.
y_pred = all_vader_output
y_true = gold

# zero_division=0 reports 0.0 for a class that is never predicted (as the
# default already does) but without emitting an UndefinedMetricWarning for
# every affected average.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
     neutral       0.00      0.00      0.00         3
    positive       0.50      1.00      0.67         4

    accuracy                           0.50        10
   macro avg       0.33      0.44      0.36        10
weighted avg       0.35      0.50      0.39        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Score every line of every movie-review file with VADER; the gold label of a
# line is the polarity of the directory its file lives in.
movie_review_sentences = []
all_vader_output = []
gold = []

# NOTE(review): absolute, machine-specific paths - point these at your own
# copy of the lab data before re-running.
neg_reviews_dir = '/Users/soniadias/Desktop/classes/text mining for ai /ba-text-mining/lab_sessions/lab3/movie_reviews/neg'
pos_reviews_dir = '/Users/soniadias/Desktop/classes/text mining for ai /ba-text-mining/lab_sessions/lab3/movie_reviews/pos'

# One loop for both polarities instead of two copy-pasted blocks.
for gold_label, reviews_dir in (("negative", neg_reviews_dir),
                                ("positive", pos_reviews_dir)):
    # sorted(): os.listdir returns entries in arbitrary order; sorting makes
    # the order of the collected lists reproducible across runs.
    for filename in sorted(os.listdir(reviews_dir)):
        f = os.path.join(reviews_dir, filename)
        if not (os.path.isfile(f) and filename.endswith('.txt')):
            continue

        with open(f, "r") as a_file:
            for line in a_file:
                # A blank line carries no sentiment; scoring it would yield
                # 'neutral' and be counted as a misclassification.
                if not line.strip():
                    continue

                vader_output = run_vader(line, lemmatize=True)  # run vader
                vader_label = vader_output_to_label(vader_output)  # convert vader output to category

                movie_review_sentences.append(line)
                all_vader_output.append(vader_label)
                gold.append(gold_label)

from sklearn.metrics import classification_report
y_pred = all_vader_output
y_true = gold

# The gold labels are only 'negative'/'positive' while VADER may also predict
# 'neutral'; zero_division=0 keeps the resulting undefined metrics at 0.0
# without an UndefinedMetricWarning.
print(classification_report(y_true, y_pred, zero_division=0))

KeyboardInterrupt: 

['neg', 'pos']

neg 1000
pos 1000
