## 1.5. Sentiment Analysis

### Project Flow
1. Data Loading
2. Create counters for positive, negative and total words
3. Process data to create a predictive attribute with respect to sentiment: pos_neg_ratio
4. Create vocabulary, map word to index
5. Process input layer, label targets
6. Combine above - Build network
7. Reconstruct network to increase efficiency - remove multiplications by 0
8. Modify input processing to reduce noise - add polarity cutoff
9. Check word similarity and visualization

In [1]:
from collections import Counter
import numpy as np
import time
import sys

In [2]:
# 1. data loading

def load_file(filepath):
    """Read a text file and return its lines, lower-cased, without newlines.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one lower-cased string per line of the file.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(filepath, 'r') as g:
        # Strip only the line terminator: slicing off the last character
        # would corrupt a final line that has no trailing newline.
        return [line.rstrip('\n').lower() for line in g]

# Load the raw corpus. load_file lower-cases every line, so the labels are
# turned back into upper case ('POSITIVE' / 'NEGATIVE') right away.
reviews = load_file("..\\ClassSampleData\\Data1.5_reviews.txt")
labels = [
    label.upper()
    for label in load_file("..\\ClassSampleData\\Data1.5_labels.txt")
]


In [3]:
# 2. Tally word occurrences: in positive reviews, in negative reviews, overall.
pos_cnt = Counter()
neg_cnt = Counter()
total_cnt = Counter()
for i, review in enumerate(reviews):
    for word in review.split(" "):
        # Any label other than 'POSITIVE' is counted as negative.
        label_cnt = pos_cnt if labels[i] == 'POSITIVE' else neg_cnt
        label_cnt[word] += 1
        total_cnt[word] += 1

In [4]:
# 3. Log of the positive-to-negative usage ratio, per word.
# Neutral words land near 0, positive words get positive values and
# negative words get negative values.
pos_neg_ratio = Counter()
for word, cnt in total_cnt.most_common():
    if cnt <= 100:
        continue  # only words seen more than 100 times are ranked
    # The +1 keeps the denominator non-zero for words never seen in a
    # negative review.
    pos_neg_ratio[word] = np.log(pos_cnt[word] / float(neg_cnt[word] + 1))

# Inspect both extremes of the ranking.
ranked_words = pos_neg_ratio.most_common()
print('Most Positive Words:\n', ranked_words[:3])
print('Most Negative Words:\n', ranked_words[::-1][:3])

Most Positive Words:
 [('edie', 4.6913478822291435), ('paulie', 4.07753744390572), ('felix', 3.152736022363656)]
Most Negative Words:
 [('boll', -4.969813299576001), ('uwe', -4.624972813284271), ('seagal', -3.644143560272545)]


In [5]:
# 4. Vocabulary: every distinct word counted so far gets an integer index.
vocab = set(total_cnt)
vocab_size = len(vocab)

word2index = {word: i for i, word in enumerate(vocab)}


In [6]:
# 5. input layer: count how many times a word is in the review
def input_layer(review):
    """Store the per-word occurrence counts of ``review`` in ``layer_0``.

    ``layer_0`` is a module-level array of shape (1, len(word2index)) that
    is reused between calls; column ``word2index[word]`` holds the number
    of times ``word`` occurs in the review.

    Args:
        review: Review text; words are separated by single spaces.
    """
    global layer_0

    try:
        layer_0 *= 0  # reuse the existing buffer
    except NameError:
        # First call: nothing has created the buffer yet.
        layer_0 = np.zeros((1, len(word2index)))

    for word in review.split(" "):
        index = word2index.get(word)
        # Skip words missing from the vocabulary instead of raising KeyError.
        if index is not None:
            layer_0[0][index] += 1

def target_label(label):
    """Map a label string to its numeric target: 1 for 'POSITIVE', else 0."""
    if label == 'POSITIVE':
        return 1
    return 0

In [7]:
# 6. building neural network
class sentimentNetwork:
    '''
    Two-layer network that labels a review 'POSITIVE' or 'NEGATIVE'.

    hidden layer_1: activation = None
    output layer_2: activation = sigmoid

    The input layer has one node per vocabulary word; a node is set to 1
    when the word occurs in the review and to 0 otherwise.
    '''
    def __init__(self, reviews, labels, hidden_nodes=10, learn_rate=0.1):
        '''
        Build the vocabulary from `reviews` and allocate the weights.

        reviews: list of review strings, words separated by single spaces
        labels: list of label strings, one per review
        hidden_nodes: number of nodes in the hidden layer
        learn_rate: step size of the weight updates
        '''
        np.random.seed(1)  # reproducible weight initialisation
        self.process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learn_rate)

    def process_data(self, reviews, labels):
        '''Collect the word and label vocabularies and their index maps.'''
        self.review_vocab = list(
            {word for review in reviews for word in review.split(" ")})
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        self.word2index = {
            word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {
            label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learn_rate):
        '''Store the layer sizes and create the weights and the input buffer.'''
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.lr = learn_rate

        self.weight_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weight_1_2 = np.random.normal(
            0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))

        # Reused for every review to avoid re-allocating the input vector.
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        '''Mark in `self.layer_0` which vocabulary words occur in `review`.'''
        self.layer_0 *= 0
        for word in review.split(" "):
            index = self.word2index.get(word)
            if index is not None:  # words outside the vocabulary are ignored
                # Assigning 1 instead of adding 1 reduces the noise caused
                # by frequent words (like 'the', 'a').
                self.layer_0[0][index] = 1

    def get_target_label(self, label):
        '''Return 1 for 'POSITIVE' and 0 for any other label.'''
        return 1 if label == 'POSITIVE' else 0

    def sigmoid(self, x):
        '''Logistic function.'''
        return 1/(1+np.exp(-x))

    def sigmoid_prime(self, output):
        '''Derivative of the sigmoid, expressed through the sigmoid's output.'''
        return output * (1-output)

    def train(self, train_reviews, train_labels):
        '''
        Run one pass of per-review gradient updates and print progress.

        train_reviews: list of review strings
        train_labels: list of label strings, same length as train_reviews
        '''
        assert(len(train_reviews) == len(train_labels))
        correct_so_far = 0
        # Report about ten times per pass. The interval must be an integer:
        # a float interval only matches `i` when the length is a multiple
        # of 10.
        report_every = max(1, len(train_reviews) // 10)

        start = time.time()
        for i, (review, label) in enumerate(zip(train_reviews, train_labels)):
            # Forward pass.
            self.update_input_layer(review)
            layer_1 = self.layer_0.dot(self.weight_0_1)
            layer_2 = self.sigmoid(layer_1.dot(self.weight_1_2))

            # Backward pass. The hidden layer has no activation, so its
            # delta equals the error propagated back from the output.
            layer_2_error = layer_2 - self.get_target_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_prime(layer_2)
            layer_1_delta = layer_2_delta.dot(self.weight_1_2.T)

            self.weight_1_2 -= layer_1.T.dot(layer_2_delta) * self.lr
            self.weight_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.lr

            predicted = 'POSITIVE' if layer_2[0, 0] >= 0.5 else 'NEGATIVE'
            if predicted == label:
                correct_so_far += 1

            if i % report_every == 0:
                elapsed_time = float(time.time()-start)
                reviews_per_second = i/elapsed_time if elapsed_time > 0 else 0
                print('Trained: ', i, '.... Reviews per Second: ', reviews_per_second, '.... Accuracy: ', correct_so_far/(i+1))

    def test(self, test_reviews, test_labels):
        '''Print the share of `test_reviews` whose prediction matches its label.'''
        correct = 0
        for i, review in enumerate(test_reviews):
            if self.run(review) == test_labels[i]:
                correct += 1
        total = len(test_reviews)
        # An empty test set reports 0.0 instead of failing.
        print('Test Accuracy: ', correct/float(total) if total else 0.0)

    def run(self, review):
        '''Return 'POSITIVE' or 'NEGATIVE' for a single review string.'''
        self.update_input_layer(review.lower())
        layer_1 = self.layer_0.dot(self.weight_0_1)
        layer_2 = self.sigmoid(layer_1.dot(self.weight_1_2))
        if layer_2[0, 0] >= 0.5:
            return 'POSITIVE'
        return 'NEGATIVE'
    

In [8]:
# Hold out the last 1000 reviews for testing; train on everything before.
train_reviews, train_labels = reviews[:-1000], labels[:-1000]
test_reviews, test_labels = reviews[-1000:], labels[-1000:]

mlp = sentimentNetwork(train_reviews, train_labels, learn_rate=0.1)
mlp.train(train_reviews, train_labels)
mlp.test(test_reviews, test_labels)

Trained:  0 .... Reviews per Second:  0.0 .... Accuracy:  1.0
Trained:  2400 .... Reviews per Second:  146.56147335437026 .... Accuracy:  0.7271970012494794
Trained:  4800 .... Reviews per Second:  146.56452485580007 .... Accuracy:  0.7631743386794417
Trained:  7200 .... Reviews per Second:  146.5924039631661 .... Accuracy:  0.7871129009859742
Trained:  9600 .... Reviews per Second:  145.96278079660183 .... Accuracy:  0.8014790126028539
Trained:  12000 .... Reviews per Second:  146.10287295056173 .... Accuracy:  0.8106824431297391
Trained:  14400 .... Reviews per Second:  146.23802823395448 .... Accuracy:  0.818832025553781
Trained:  16800 .... Reviews per Second:  146.34619573171292 .... Accuracy:  0.8225105648473305
Trained:  19200 .... Reviews per Second:  146.42407578965145 .... Accuracy:  0.8279777094942972
Trained:  21600 .... Reviews per Second:  146.41816284844631 .... Accuracy:  0.8332021665663627
Test Accuracy:  0.85


In [9]:
# 7. reconstruct network to be more efficient
class sentimentNetwork2:
    '''
    Two-layer network that labels a review 'POSITIVE' or 'NEGATIVE'.

    hidden layer_1: activation = None
    output layer_2: activation = sigmoid

    Inputs are binary (word present or absent), so the hidden layer is
    computed by summing the weight rows of the words present instead of
    multiplying a mostly-zero input vector by the weight matrix.
    '''
    def __init__(self, reviews, labels, hidden_nodes=10, learn_rate=0.1):
        '''
        Build the vocabulary from `reviews` and allocate the weights.

        reviews: list of review strings, words separated by single spaces
        labels: list of label strings, one per review
        hidden_nodes: number of nodes in the hidden layer
        learn_rate: step size of the weight updates
        '''
        np.random.seed(1)  # reproducible weight initialisation
        self.process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learn_rate)

    def process_data(self, reviews, labels):
        '''Collect the word and label vocabularies and their index maps.'''
        self.review_vocab = list(
            {word for review in reviews for word in review.split(" ")})
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        self.word2index = {
            word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {
            label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learn_rate):
        '''Store the layer sizes and create the weights and the hidden buffer.'''
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.lr = learn_rate

        self.weight_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weight_1_2 = np.random.normal(
            0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))

        # Reused for every review to avoid re-allocating the hidden vector.
        self.layer_1 = np.zeros((1, hidden_nodes))

    def get_target_label(self, label):
        '''Return 1 for 'POSITIVE' and 0 for any other label.'''
        return 1 if label == 'POSITIVE' else 0

    def sigmoid(self, x):
        '''Logistic function.'''
        return 1/(1+np.exp(-x))

    def sigmoid_prime(self, output):
        '''Derivative of the sigmoid, expressed through the sigmoid's output.'''
        return output * (1-output)

    def train(self, train_reviews_raw, train_labels):
        '''
        Run one pass of per-review gradient updates and print progress.

        train_reviews_raw: list of review strings
        train_labels: list of label strings, same length as the reviews
        '''
        assert(len(train_reviews_raw) == len(train_labels))

        # Translate each review once into the distinct indices of its
        # known words; only those weight rows are read and updated.
        train_reviews = [
            list({self.word2index[word]
                  for word in review.split(" ")
                  if word in self.word2index})
            for review in train_reviews_raw
        ]

        correct_so_far = 0
        # Report about ten times per pass. The interval must be an integer:
        # a float interval only matches `i` when the length is a multiple
        # of 10.
        report_every = max(1, len(train_reviews) // 10)

        start = time.time()
        for i, (review, label) in enumerate(zip(train_reviews, train_labels)):
            # Forward pass: the hidden layer is the sum of the active rows.
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weight_0_1[index]
            layer_2 = self.sigmoid(self.layer_1.dot(self.weight_1_2))

            # Backward pass. The hidden layer has no activation, so its
            # delta equals the error propagated back from the output.
            layer_2_error = layer_2 - self.get_target_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_prime(layer_2)
            layer_1_delta = layer_2_delta.dot(self.weight_1_2.T)

            self.weight_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.lr
            for index in review:
                self.weight_0_1[index] -= layer_1_delta[0] * self.lr

            predicted = 'POSITIVE' if layer_2[0, 0] >= 0.5 else 'NEGATIVE'
            if predicted == label:
                correct_so_far += 1

            if i % report_every == 0:
                elapsed_time = float(time.time()-start)
                reviews_per_second = i/elapsed_time if elapsed_time > 0 else 0
                print('Trained: ', i, '.... Reviews per Second: ', reviews_per_second, '.... Accuracy: ', correct_so_far/(i+1))

    def test(self, test_reviews, test_labels):
        '''Print the share of `test_reviews` whose prediction matches its label.'''
        correct = 0
        for i, review in enumerate(test_reviews):
            if self.run(review) == test_labels[i]:
                correct += 1
        total = len(test_reviews)
        # An empty test set reports 0.0 instead of failing.
        print('Test Accuracy: ', correct/float(total) if total else 0.0)

    def run(self, review):
        '''Return 'POSITIVE' or 'NEGATIVE' for a single review string.'''
        unique_indices = {
            self.word2index[word]
            for word in review.lower().split(" ")
            if word in self.word2index
        }
        self.layer_1 *= 0
        for index in unique_indices:
            self.layer_1 += self.weight_0_1[index]
        layer_2 = self.sigmoid(self.layer_1.dot(self.weight_1_2))
        if layer_2[0, 0] >= 0.5:
            return 'POSITIVE'
        return 'NEGATIVE'
    

In [10]:
# Hold out the last 1000 reviews for testing; train on everything before.
train_reviews, train_labels = reviews[:-1000], labels[:-1000]
test_reviews, test_labels = reviews[-1000:], labels[-1000:]

mlp = sentimentNetwork2(train_reviews, train_labels, learn_rate=0.1)
mlp.train(train_reviews, train_labels)
mlp.test(test_reviews, test_labels)

Trained:  0 .... Reviews per Second:  0.0 .... Accuracy:  1.0
Trained:  2400 .... Reviews per Second:  1493.1297071315894 .... Accuracy:  0.7259475218658892
Trained:  4800 .... Reviews per Second:  1457.308646373833 .... Accuracy:  0.7585919600083316
Trained:  7200 .... Reviews per Second:  1449.5317255760247 .... Accuracy:  0.7823913345368699
Trained:  9600 .... Reviews per Second:  1458.4155781095164 .... Accuracy:  0.7995000520779085
Trained:  12000 .... Reviews per Second:  1456.6890007721386 .... Accuracy:  0.8110990750770769
Trained:  14400 .... Reviews per Second:  1454.3639410460098 .... Accuracy:  0.8189709047982779
Trained:  16800 .... Reviews per Second:  1447.6992809035523 .... Accuracy:  0.8233438485804416
Trained:  19200 .... Reviews per Second:  1448.6200200078154 .... Accuracy:  0.8289151606687152
Trained:  21600 .... Reviews per Second:  1441.693150576111 .... Accuracy:  0.833433637331605
Test Accuracy:  0.851


In [11]:
# 8. reconstruct network to reduce noise
class sentimentNetwork3:
    '''
    Two-layer network that labels a review 'POSITIVE' or 'NEGATIVE'.

    hidden layer_1: activation = None
    output layer_2: activation = sigmoid

    The vocabulary is restricted to frequent words whose log
    positive-to-negative ratio is at least `polarity_cutoff` in magnitude.
    '''
    def __init__(self, reviews, labels, min_cnt=10, polarity_cutoff=0.1, hidden_nodes=10, learn_rate=0.1):
        '''
        Build the filtered vocabulary and allocate the weights.

        reviews: list of review strings, words separated by single spaces
        labels: list of label strings, one per review
        min_cnt: a word must occur more than this many times to be kept
        polarity_cutoff: minimum absolute log ratio for a word to be kept
        hidden_nodes: number of nodes in the hidden layer
        learn_rate: step size of the weight updates
        '''
        np.random.seed(1)  # reproducible weight initialisation
        self.process_data(reviews, labels, polarity_cutoff, min_cnt)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learn_rate)

    def process_data(self, reviews, labels, polarity_cutoff, min_cnt):
        '''Compute word polarity, then build the vocabularies and index maps.'''
        pos_cnt, neg_cnt, total_cnt = Counter(), Counter(), Counter()
        for i, review in enumerate(reviews):
            # Any label other than 'POSITIVE' is counted as negative.
            label_cnt = pos_cnt if labels[i] == 'POSITIVE' else neg_cnt
            for word in review.split(" "):
                label_cnt[word] += 1
                total_cnt[word] += 1

        # Only words seen more than 100 times get a ratio, so `min_cnt`
        # values below 100 have no additional effect on the vocabulary.
        self.pos_neg_ratio = Counter()
        for word, cnt in total_cnt.most_common():
            if cnt > 100:
                # +1 keeps the denominator non-zero. A word never seen in a
                # positive review has ratio 0, for which np.log gives -inf.
                ratio = pos_cnt[word]/float(neg_cnt[word]+1)
                self.pos_neg_ratio[word] = np.log(ratio)

        # Every candidate must have a ratio, so filtering the ranked words
        # selects the same set as rescanning every review would.
        self.review_vocab = [
            word for word, ratio in self.pos_neg_ratio.items()
            if total_cnt[word] > min_cnt and abs(ratio) >= polarity_cutoff
        ]
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        self.word2index = {
            word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {
            label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learn_rate):
        '''Store the layer sizes and create the weights and the hidden buffer.'''
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.lr = learn_rate

        self.weight_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weight_1_2 = np.random.normal(
            0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))

        # Reused for every review to avoid re-allocating the hidden vector.
        self.layer_1 = np.zeros((1, hidden_nodes))

    def get_target_label(self, label):
        '''Return 1 for 'POSITIVE' and 0 for any other label.'''
        return 1 if label == 'POSITIVE' else 0

    def sigmoid(self, x):
        '''Logistic function.'''
        return 1/(1+np.exp(-x))

    def sigmoid_prime(self, output):
        '''Derivative of the sigmoid, expressed through the sigmoid's output.'''
        return output * (1-output)

    def train(self, train_reviews_raw, train_labels):
        '''
        Run one pass of per-review gradient updates and print progress.

        train_reviews_raw: list of review strings
        train_labels: list of label strings, same length as the reviews
        '''
        assert(len(train_reviews_raw) == len(train_labels))

        # Translate each review once into the distinct indices of its
        # vocabulary words; only those weight rows are read and updated.
        train_reviews = [
            list({self.word2index[word]
                  for word in review.split(" ")
                  if word in self.word2index})
            for review in train_reviews_raw
        ]

        correct_so_far = 0
        # Report about ten times per pass. The interval must be an integer:
        # a float interval only matches `i` when the length is a multiple
        # of 10.
        report_every = max(1, len(train_reviews) // 10)

        start = time.time()
        for i, (review, label) in enumerate(zip(train_reviews, train_labels)):
            # Forward pass: the hidden layer is the sum of the active rows.
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weight_0_1[index]
            layer_2 = self.sigmoid(self.layer_1.dot(self.weight_1_2))

            # Backward pass. The hidden layer has no activation, so its
            # delta equals the error propagated back from the output.
            layer_2_error = layer_2 - self.get_target_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_prime(layer_2)
            layer_1_delta = layer_2_delta.dot(self.weight_1_2.T)

            self.weight_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.lr
            for index in review:
                self.weight_0_1[index] -= layer_1_delta[0] * self.lr

            predicted = 'POSITIVE' if layer_2[0, 0] >= 0.5 else 'NEGATIVE'
            if predicted == label:
                correct_so_far += 1

            if i % report_every == 0:
                elapsed_time = float(time.time()-start)
                reviews_per_second = i/elapsed_time if elapsed_time > 0 else 0
                print('Trained: ', i, '.... Reviews per Second: ', reviews_per_second, '.... Accuracy: ', correct_so_far/(i+1))

    def test(self, test_reviews, test_labels):
        '''Print the share of `test_reviews` whose prediction matches its label.'''
        correct = 0
        for i, review in enumerate(test_reviews):
            if self.run(review) == test_labels[i]:
                correct += 1
        total = len(test_reviews)
        # An empty test set reports 0.0 instead of failing.
        print('Test Accuracy: ', correct/float(total) if total else 0.0)

    def run(self, review):
        '''Return 'POSITIVE' or 'NEGATIVE' for a single review string.'''
        unique_indices = {
            self.word2index[word]
            for word in review.lower().split(" ")
            if word in self.word2index
        }
        self.layer_1 *= 0
        for index in unique_indices:
            self.layer_1 += self.weight_0_1[index]
        layer_2 = self.sigmoid(self.layer_1.dot(self.weight_1_2))
        if layer_2[0, 0] >= 0.5:
            return 'POSITIVE'
        return 'NEGATIVE'
    

In [12]:
# Hold out the last 1000 reviews for testing; train on everything before.
train_reviews, train_labels = reviews[:-1000], labels[:-1000]
test_reviews, test_labels = reviews[-1000:], labels[-1000:]

mlp = sentimentNetwork3(
    train_reviews,
    train_labels,
    min_cnt=20,
    polarity_cutoff=0.05,
    learn_rate=0.01,
)
mlp.train(train_reviews, train_labels)
mlp.test(test_reviews, test_labels)

Trained:  0 .... Reviews per Second:  0.0 .... Accuracy:  1.0
Trained:  2400 .... Reviews per Second:  2033.4384427272312 .... Accuracy:  0.7838400666389005
Trained:  4800 .... Reviews per Second:  1990.4251937738866 .... Accuracy:  0.8031660070818579
Trained:  7200 .... Reviews per Second:  1979.2061570839733 .... Accuracy:  0.8162755172892654
Trained:  9600 .... Reviews per Second:  1985.8951218929976 .... Accuracy:  0.8267888761587334
Trained:  12000 .... Reviews per Second:  1983.0261201258427 .... Accuracy:  0.8333472210649112
Trained:  14400 .... Reviews per Second:  1984.358969152764 .... Accuracy:  0.8370252065828762
Trained:  16800 .... Reviews per Second:  1977.1870579327256 .... Accuracy:  0.8389381584429498
Trained:  19200 .... Reviews per Second:  1975.8787861806172 .... Accuracy:  0.8416749127649602
Trained:  21600 .... Reviews per Second:  1966.589572037546 .... Accuracy:  0.8445905282162863
Test Accuracy:  0.859


In [15]:
# 9. check word similarity and visualization
def get_most_similar_words(mlp, focus):
    """Rank every vocabulary word of ``mlp`` by similarity to ``focus``.

    Similarity is the dot product between the two words' rows of
    ``mlp.weight_0_1``.

    Returns:
        A list of (word, score) pairs, highest score first.
    """
    weights = mlp.weight_0_1
    scores = Counter({
        word: np.dot(weights[index], weights[mlp.word2index[focus]])
        for word, index in mlp.word2index.items()
    })
    return scores.most_common()

# Ten words whose weight rows are most similar to the row of 'worst'.
similar_to_worst = get_most_similar_words(mlp, 'worst')
similar_to_worst[:10]



[('worst', 0.2808058120010506),
 ('awful', 0.22262472226916488),
 ('waste', 0.20486370634924148),
 ('terrible', 0.16873217903325333),
 ('poor', 0.16166298061841913),
 ('dull', 0.15730899124270206),
 ('disappointment', 0.14814052399370076),
 ('disappointing', 0.1428794709315688),
 ('fails', 0.13998772693746095),
 ('poorly', 0.139409864294293)]

In [14]:
import matplotlib.colors as colors

# Take up to 500 words from each end of the polarity ranking kept by the
# trained network, so that word selection and colouring use the same
# statistics.
ranked_words = mlp.pos_neg_ratio.most_common()
candidates = [word for word, ratio in ranked_words[:500]]
candidates += [word for word, ratio in ranked_words[:-501:-1]]

# dict.fromkeys drops duplicates while keeping order; the two slices
# overlap when fewer than 1000 words are ranked.
words_to_visualize = [
    word for word in dict.fromkeys(candidates) if word in mlp.word2index
]

pos = 0
neg = 0

# One colour and one weight vector per plotted word, in the same order as
# words_to_visualize, so that all plot columns have equal length.
colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    vectors_list.append(mlp.weight_0_1[mlp.word2index[word]])
    if(mlp.pos_neg_ratio[word] > 0):
        pos+=1
        colors_list.append("#00ff00")
    else:
        neg+=1
        colors_list.append("#000000")

# Project the hidden-layer weight vectors down to two dimensions.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)