In [64]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
from utils.utils import build_freqs, process_tweet

from tqdm.notebook import tqdm
import time

import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import warnings
from sys import getsizeof
warnings.filterwarnings('ignore')


#### TODO: optimize later with a generator function

In [81]:
class SentimentAnalysis:
    """Turn a raw tweet DataFrame into train/dev/test tensors of 3 features.

    Typical use (see the driver cell): construct with a training percentage and
    the raw frame, call ``getCleanDataFrame``, then ``performTrainDevTestSplit``
    and finally ``getTrainDevTestArray``.
    """

    def __init__(self, trainingSplit, tweets_df):
        """Store the split percentage and the frame to work on.

        Args:
            trainingSplit: Percentage (0-100) of each class used for training;
                the remainder is divided between dev and test.
            tweets_df: DataFrame of tweets. It is replaced by the cleaned frame
                once ``getCleanDataFrame`` has been called.
        """
        self.trainingSplit = trainingSplit
        self.tweets_df = tweets_df

    def getCleanDataFrame(self, df):
        """Keep the label and text columns and binarise the label.

        Columns ``0`` and ``5`` of ``df`` are kept and renamed to ``sentiment``
        and ``tweet``. Labels are mapped ``0 -> 0`` and ``4 -> 1``; any other
        label value becomes NaN (that is how ``Series.map`` treats keys missing
        from the dict).

        The input frame is not modified: the work happens on an explicit copy,
        which avoids pandas' chained-assignment pitfalls.

        Args:
            df: Raw DataFrame with integer column labels (read with
                ``header=None``).

        Returns:
            The cleaned two-column DataFrame, also stored on ``self.tweets_df``.
        """
        cleaned = df[[0, 5]].copy().rename(columns={0: 'sentiment', 5: 'tweet'})
        cleaned['sentiment'] = cleaned['sentiment'].map({0: 0, 4: 1})
        self.tweets_df = cleaned
        return cleaned

    def extract_features(self, tweet, freqs):
        """Build the ``[bias, positive count, negative count]`` row for one tweet.

        Args:
            tweet: Raw tweet text; tokenised with ``process_tweet``.
            freqs: Mapping keyed by ``(word, label)`` tuples, looked up with
                labels ``1.0`` and ``0.0``; missing keys count as 0.

        Returns:
            ``numpy.ndarray`` of shape ``(1, 3)``.
        """
        word_l = process_tweet(tweet)

        x = np.zeros((1, 3))

        # Bias term.
        x[0, 0] = 1

        for word in word_l:
            x[0, 1] += freqs.get((word, 1.0), 0)
            x[0, 2] += freqs.get((word, 0.0), 0)

        assert(x.shape == (1, 3))
        return x

    def _splitOneClass(self, class_df):
        """Cut one class's rows into contiguous, non-overlapping train/dev/test frames.

        Train takes the first ``floor(n * split / 100)`` rows, dev the next
        ``floor(n * (100 - split) / 200)`` rows, and test every remaining row,
        so the three parts always partition ``class_df`` exactly.
        """
        total = class_df.shape[0]
        # Divide by 100 rather than multiplying by 0.01 to avoid avoidable
        # floating-point error before flooring.
        train_len = math.floor(total * self.trainingSplit / 100)
        dev_len = math.floor(total * (100 - self.trainingSplit) / 200)
        dev_end = train_len + dev_len
        return (
            class_df.iloc[:train_len],
            class_df.iloc[train_len:dev_end],
            class_df.iloc[dev_end:],
        )

    def performTrainDevTestSplit(self):
        """Split ``self.tweets_df`` into train/dev/test lists, class by class.

        Each class (``sentiment == 1`` and ``sentiment == 0``) is split
        separately in row order, and each resulting list holds the positive
        examples followed by the negative ones. The head of each class frame is
        printed as a sanity check.

        Returns:
            Tuple ``(train_x, train_y, dev_x, dev_y, test_x, test_y)`` of lists.

        Raises:
            ValueError: If ``trainingSplit`` is outside ``[0, 100]``.
        """
        if not 0 <= self.trainingSplit <= 100:
            raise ValueError(
                f'trainingSplit must be a percentage in [0, 100], got {self.trainingSplit!r}'
            )

        positive = self.tweets_df.query('sentiment == 1')
        negative = self.tweets_df.query('sentiment == 0')

        print(positive.head())
        print(negative.head())

        pos_parts = self._splitOneClass(positive)
        neg_parts = self._splitOneClass(negative)

        result = []
        for pos_part, neg_part in zip(pos_parts, neg_parts):
            result.append(list(pos_part['tweet']) + list(neg_part['tweet']))
            result.append(list(pos_part['sentiment']) + list(neg_part['sentiment']))

        return tuple(result)

    def getTrainDevTestArray(self, datasets):
        """Convert ``(tweets, labels)`` pairs into feature and label tensors.

        The word-frequency table is built once, from the FIRST pair, which is
        treated as the training split, and reused for every pair. Building it
        per split would derive dev/test features from dev/test labels and leak
        the answers into the evaluation.

        Args:
            datasets: Sequence of ``(tweets, labels)`` pairs, training pair
                first.

        Returns:
            Flat tuple ``(X_0, Y_0, X_1, Y_1, ...)`` of float tensors, where
            ``X_i`` has shape ``(n_i, 3)`` and ``Y_i`` has shape ``(n_i, 1)``.
            An empty ``datasets`` yields an empty tuple.
        """
        datasets = list(datasets)
        if not datasets:
            return ()

        train_tweets, train_labels = datasets[0]
        freqs = build_freqs(train_tweets, train_labels)

        dataset_arrays = []
        for tweets, labels in datasets:
            print(f'X len: {len(tweets)}')
            print(f'Y len: {len(labels)}')

            features = np.zeros((len(tweets), 3))
            for row, tweet in enumerate(tweets):
                # Use the instance itself, not a notebook-level global.
                features[row, :] = self.extract_features(tweet, freqs)

            label_tensor = torch.from_numpy(np.array(labels)).float()

            dataset_arrays.append(torch.from_numpy(features).float())
            dataset_arrays.append(label_tensor.reshape(len(tweets), 1))

        return tuple(dataset_arrays)
        

In [82]:
if __name__ == '__main__':
    # Headerless CSV, so columns are addressed by integer position.
    # For a quick experiment, subsample here before building the pipeline,
    # e.g. pd.concat([tweets_df.iloc[:100], tweets_df.iloc[-100:]]).
    tweets_df = pd.read_csv("data/tweets.csv", header=None, encoding="ISO-8859-1")

    print('Total number of tweets: ', tweets_df.shape[0])
    print('\n *************** \n')

    # 97% of each class goes to training; the rest is shared by dev and test.
    sentiment = SentimentAnalysis(97, tweets_df)
    tweets_df = sentiment.getCleanDataFrame(tweets_df)

    # Raw text / label lists per split.
    train_x, train_y, dev_x, dev_y, test_x, test_y = sentiment.performTrainDevTestSplit()

    # Feature / label tensors per split, in the same order.
    train_X, train_Y, dev_X, dev_Y, test_X, test_Y = sentiment.getTrainDevTestArray(
        [
            (train_x, train_y),
            (dev_x, dev_y),
            (test_x, test_y),
        ]
    )
    

Total number of tweets:  200

 *************** 

         sentiment                                              tweet
1599900          1  @cris1015 My goal on stocks like $MTXX is to h...
1599901          1  @wolfpack40 That's what I'm thinking! Knock he...
1599902          1  @Jchawes yeah what about remotes for your car ...
1599903          1  Post your Le Mans 2009 pics: .. I didn't reall...
1599904          1                 @say_my_name TRAITOR!!!! love you 
   sentiment                                              tweet
0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          0  is upset that he can't update his Facebook by ...
2          0  @Kenichan I dived many times for the ball. Man...
3          0    my whole body feels itchy and like its on fire 
4          0  @nationwideclass no, it's not behaving at all....
X len: 160
Y len: 160
X len: 20
Y len: 20
X len: 20
Y len: 20


1. Design Model (input, output, forward pass) <br>
2. Construct loss and optimizer <br>
3. Training Loop:
 - forward pass : compute prediction and loss
 - backward pass: gradients
 - update weights


In [83]:
# NOTE(review): scratch alias of the training labels; no visible cell reads
# this module-level `x` afterwards -- candidate for removal (TODO confirm).
x = train_Y

In [86]:
# model
class LogisticRegression(nn.Module):
    """Binary logistic regression: one linear unit followed by a sigmoid."""

    def __init__(self, n_input_features):
        """Create the single linear layer mapping ``n_input_features`` to 1 output."""
        super().__init__()
        self.linear = nn.Linear(in_features=n_input_features, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for input ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)
    

# Seed PyTorch's RNG before the model is built so the weight initialisation,
# and therefore the logged losses and final accuracy, are reproducible after a
# kernel restart.
torch.manual_seed(0)

model = LogisticRegression(train_X.shape[1])

# Loss and Optimizer

learning_rate = 0.01
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

# training loop (full-batch: every epoch uses the whole training tensor)

num_epochs = 100

model.train()
for epoch in range(num_epochs):
    # clear gradients left over from the previous step
    optimizer.zero_grad()
    # forward pass and loss; call the module itself rather than .forward()
    # so that nn.Module's __call__ machinery (hooks) is not bypassed
    y_pred = model(train_X)
    loss = criterion(y_pred, train_Y)
    # backward
    loss.backward()
    # updates
    optimizer.step()

    if (epoch+1)%10 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')

# evaluation on the held-out test tensors; no gradients needed
model.eval()
with torch.no_grad():
    y_pred = model(test_X)
    # threshold the predicted probabilities at 0.5
    y_pred_cls = y_pred.round()
    acc = y_pred_cls.eq(test_Y).sum() / float(test_Y.shape[0])
    print(f'Accuracy {acc:.4f}')

epoch: 10, loss = 6.1094
epoch: 20, loss = 4.6659
epoch: 30, loss = 3.2259
epoch: 40, loss = 1.8554
epoch: 50, loss = 0.8788
epoch: 60, loss = 0.7878
epoch: 70, loss = 0.5524
epoch: 80, loss = 0.4798
epoch: 90, loss = 0.4136
epoch: 100, loss = 0.3760
epoch: 110, loss = 0.3477
epoch: 120, loss = 0.3264
epoch: 130, loss = 0.3098
epoch: 140, loss = 0.2961
epoch: 150, loss = 0.2847
epoch: 160, loss = 0.2751
epoch: 170, loss = 0.2668
epoch: 180, loss = 0.2597
epoch: 190, loss = 0.2534
epoch: 200, loss = 0.2479
epoch: 210, loss = 0.2430
epoch: 220, loss = 0.2386
epoch: 230, loss = 0.2347
epoch: 240, loss = 0.2312
epoch: 250, loss = 0.2280
epoch: 260, loss = 0.2252
epoch: 270, loss = 0.2226
epoch: 280, loss = 0.2202
epoch: 290, loss = 0.2181
epoch: 300, loss = 0.2162
epoch: 310, loss = 0.2144
epoch: 320, loss = 0.2127
epoch: 330, loss = 0.2113
epoch: 340, loss = 0.2099
epoch: 350, loss = 0.2086
epoch: 360, loss = 0.2075
epoch: 370, loss = 0.2064
epoch: 380, loss = 0.2054
epoch: 390, loss = 0.