<a href="https://colab.research.google.com/github/subhobrata/Natural-Language-Processing-From-Scratch/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import ast
import gzip
import string
from collections import Counter

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Mount Google Drive at /content/gdrive; the data files read by later cells
# live under '/content/gdrive/My Drive/data'. This cell only works on Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Read the positive and negative word lists as arrays of strings.
# Lines starting with ';' are treated as comments and skipped.
pos, neg = (
    np.loadtxt(lexicon_path, dtype='str', comments=';')
    for lexicon_path in (
        '/content/gdrive/My Drive/data/positive-words.txt',
        '/content/gdrive/My Drive/data/negative-words.txt',
    )
)

In [0]:
# Map every lower-cased lexicon word to its valence: +1 for positive words,
# -1 for negative ones. The negative pass runs last, so a word present in
# both lists ends up with -1.
valence = {term.lower(): 1 for term in pos}
valence.update((term.lower(), -1) for term in neg)

In [0]:
def extract_words(text):
    """Split ``text`` into lower-cased words without surrounding punctuation.

    The text is split on whitespace. Characters from ``string.punctuation``
    are removed from both ends of every token; punctuation inside a token
    (such as the apostrophe in "I'm") is kept. Tokens that consist only of
    punctuation (e.g. "--" or "...") are dropped instead of raising
    ``IndexError``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased words in their original order. Empty when the
        text contains no word characters.
    """
    text_words = []

    for token in text.split():
        # str.strip with a character set removes leading and trailing
        # punctuation in one step and is safe on all-punctuation tokens.
        word = token.strip(string.punctuation)

        # Nothing left after stripping means the token was pure punctuation.
        if word:
            text_words.append(word.lower())

    return text_words

In [0]:
def sentiment(text, valence):
    """Return the mean valence of the lexicon words found in ``text``.

    The text is tokenized with ``extract_words``; every token that is a key
    of ``valence`` contributes its value, and the result is the average over
    those tokens only (words missing from the lexicon are ignored).

    Args:
        text: The string to score.
        valence: Mapping from lower-cased word to its numeric valence.

    Returns:
        The average valence of the matched words, or ``0.0`` (neutral) when
        no word of the text is in the lexicon.
    """
    words = extract_words(text.lower())

    matched = [valence[word] for word in words if word in valence]

    # Without any lexicon hit there is nothing to average; report neutral
    # instead of dividing by zero.
    if not matched:
        return 0.0

    return sum(matched) / len(matched)

In [7]:
# Sample sentences used by the demo cells.
texts = [
    "I'm very happy",
    "The product is pretty annoying, and I hate it",
    "I'm sad",
]

# Score each sample with the plain lexicon average (no modifiers).
for sample in texts:
    print(sentiment(sample, valence))

1.0
-0.3333333333333333
-1.0


In [8]:
# List every lexicon word found in the second sample with its valence.
words = extract_words(texts[1].lower())
for token in words:
    if token not in valence:
        continue
    print(token, valence[token])

pretty 1
annoying -1
hate -1


In [0]:
# Multipliers that sentiment_modified applies to the valence of the n-gram a
# word belongs to: values above 1 amplify it, -1 flips its sign.
modifiers = {
    "very": 1.5,
    "much": 1.3,
    "not": -1,  # negation
    "pretty": 1.5,
    "somewhat": 1.2,
}

In [0]:
def sentiment_modified(text, valence, modifiers, verbose=False):
    """Score ``text`` by averaging the values of its modifier/valence n-grams.

    The text is tokenized with ``extract_words``. Consecutive words that are
    keys of ``modifiers`` or ``valence`` are grouped into one n-gram; any
    other word closes the current n-gram. The value of an n-gram is the
    product of the entries of its words, and the result is the mean of the
    n-gram values.

    A word that is in both mappings is treated as a modifier. An n-gram that
    contains only modifiers still counts, with the product of its modifiers
    as its value.

    Args:
        text: The string to score.
        valence: Mapping from lower-cased word to its numeric valence.
        modifiers: Mapping from lower-cased word to a multiplier.
        verbose: When true, print every n-gram together with its value.

    Returns:
        The mean n-gram value, or ``0.0`` (neutral) when the text contains no
        modifier or lexicon word at all.
    """
    words = extract_words(text.lower())

    # Build the n-grams: scorable words extend the current one, any other
    # word starts a fresh one (unless the current one is still empty).
    ngrams = [[]]
    for word in words:
        if word in modifiers or word in valence:
            ngrams[-1].append(word)
        elif ngrams[-1]:
            ngrams.append([])

    # Remove the trailing empty ngram if necessary
    if not ngrams[-1]:
        ngrams.pop()

    # Nothing scorable in the text: report neutral instead of dividing by zero.
    if not ngrams:
        return 0.0

    score = 0
    for ngram in ngrams:
        value = 1

        for word in ngram:
            # Modifiers take precedence over the lexicon for shared words.
            if word in modifiers:
                value *= modifiers[word]
            elif word in valence:
                value *= valence[word]

        if verbose:
            print(ngram, value)

        score += value

    return score / len(ngrams)

In [11]:
# verbose=True prints each n-gram with its value before the mean is returned.
sentiment_modified(texts[1], valence, modifiers, True)

['pretty', 'annoying'] -1.5
['hate'] -1


-1.25

In [12]:
# Negation plus an intensifier: "not" (-1) and "very" (1.5) both multiply
# the valence of "good".
sentiment_modified("It was not very good", valence, modifiers, True)

['not', 'very', 'good'] -1.5


-1.5

In [13]:
# Repeated modifiers stack multiplicatively: the two "not"s cancel out and
# the two "very"s compound (1.5 * 1.5).
sentiment_modified("It was not not very very good", valence, modifiers, True)

['not', 'not', 'very', 'very', 'good'] 2.25


2.25

In [0]:
# Load the VADER lexicon file: tab-separated with no header row, so the
# columns get the integer labels 0..3. Column 0 is the token, column 1 the
# score later used as valence, and column 3 a list of integers stored as text.
# NOTE(review): column 2 is presumably the standard deviation of the ratings
# in column 3 - confirm against the lexicon documentation.
vader = pd.read_csv("/content/gdrive/My Drive/data/vader_lexicon.txt", sep='\t', header=None)

In [15]:
# Preview the first rows of the lexicon.
vader.head()

Unnamed: 0,0,1,2,3
0,$:,-1.5,0.80623,"[-1, -1, -1, -1, -3, -1, -3, -1, -2, -1]"
1,%),-0.4,1.0198,"[-1, 0, -1, 0, 0, -2, -1, 2, -1, 0]"
2,%-),-1.5,1.43178,"[-2, 0, -2, -2, -1, 2, -2, -3, -2, -3]"
3,&-:,-0.4,1.42829,"[-3, -1, 0, 0, -1, -1, -1, 2, -1, 2]"
4,&:,-0.7,0.64031,"[0, -1, -1, -1, 1, -1, -1, -1, -1, -1]"


In [16]:
# Column 3 holds a list of integers stored as text. Parse it with
# ast.literal_eval, which only accepts Python literals, rather than eval,
# which would execute arbitrary code coming from the data file.
scores = ast.literal_eval(vader.iloc[4446][3])
print(scores)

[3, 3, 3, 3, 3, 3, 3, 4, 4, 3]


In [17]:
# Inspect one complete lexicon row, selected by position.
vader.iloc[4447]

0                             loved
1                               2.9
2                               0.7
3    [3, 3, 4, 2, 2, 4, 3, 2, 3, 3]
Name: 4447, dtype: object

In [0]:
# Build a token -> score lookup from the first two lexicon columns.
valence_vader = dict(zip(vader[0], vader[1]))

In [19]:
# The lexicon also has entries for emoticons, not just words.
valence_vader[':)']

2.0

In [20]:
# Same sentence as in the earlier demo, now scored with the graded VADER
# valences instead of the +1/-1 lexicon.
sentiment_modified("It was not not very very good", valence_vader, modifiers, True)

['not', 'not', 'very', 'very', 'good'] 4.2749999999999995


4.2749999999999995