In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
from utils.utils import build_freqs, process_tweet

from tqdm.notebook import tqdm
import time

import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import warnings
from sys import getsizeof
warnings.filterwarnings('ignore')


#### To be optimized later with generator function

In [2]:
class SentimentAnalysis:
    
    def __init__(self, trainingSplit, tweets_df):
        self.trainingSplit = trainingSplit
        self.tweets_df = tweets_df
    
    def getCleanDataFrame(self, df):
        df = df[[0, 5]]
        df[0] = df[0].map({0:0, 4:1})
        df.rename(columns = {0:'sentiment', 5:'tweet'}, inplace = True)
        self.tweets_df = df
        return df
    
    def extract_features(self, tweet, freqs):

        word_l = process_tweet(tweet)
        
        x = np.zeros((1, 3)) 
        
        x[0,0] = 1 

        for word in word_l:

            x[0,1] += freqs.get((word, 1.0),0)
            x[0,2] += freqs.get((word, 0.0),0)

        assert(x.shape == (1, 3))
        return x
    
    
    def performTrainDevTestSplit(self):
        positive = self.tweets_df.query('sentiment == 1')
        negative = self.tweets_df.query('sentiment == 0')
        
        print(positive.head())
        print(negative.head())
        
        pos_observations = positive.shape[0]
        neg_observations = negative.shape[0]
        
        devLen = testLen = math.floor(((100 - self.trainingSplit)/2) * pos_observations * 0.01)
        trainLen = math.floor(pos_observations * self.trainingSplit * 0.01)
        
        train_pos_x = list(positive['tweet'].iloc[0:trainLen])
        train_pos_y = list(positive['sentiment'].iloc[0:trainLen])
        dev_pos_x = list(positive['tweet'].iloc[trainLen:devLen])
        dev_pos_y = list(positive['sentiment'].iloc[trainLen:devLen])
        test_pos_x = list(positive['tweet'].iloc[testLen:])
        test_pos_y = list(positive['sentiment'].iloc[testLen:])
        
        devLen = testLen = math.floor(((100 - self.trainingSplit)/2) * neg_observations * 0.01)
        trainLen = math.floor(neg_observations * self.trainingSplit * 0.01)
        
        train_neg_x = list(negative['tweet'].iloc[0:trainLen])
        train_neg_y = list(negative['sentiment'].iloc[0:trainLen])
        dev_neg_x = list(negative['tweet'].iloc[trainLen:devLen])
        dev_neg_y = list(negative['sentiment'].iloc[trainLen:devLen])
        test_neg_x = list(negative['tweet'].iloc[testLen:])
        test_neg_y = list(negative['sentiment'].iloc[testLen:])
        
        train_x = train_pos_x + train_neg_x
        train_y = train_pos_y + train_neg_y
        dev_x = dev_pos_x + dev_neg_x
        dev_y = dev_pos_y + dev_neg_y
        test_x = test_pos_x + test_neg_x
        test_y = test_pos_y + test_neg_y
        
        return (train_x, train_y, dev_x, dev_y, test_x, test_y)
    
    def getTrainDevTestArray(self, datasets):
        dataset_arrays = []
        for X, Y in datasets:
            print(f'X len: {len(X)}')
            print(f'Y len: {len(Y)}')
            freqs = build_freqs(X, Y)
            X = np.zeros((len(X), 3))
            for i in range(len(X)):
                X[i, :]= sentiment.extract_features(X[i], freqs)

            # training labels corresponding to X
            Y = np.array(Y)
            dataset_arrays.append(X)
            dataset_arrays.append(Y)
        
        return tuple(dataset_arrays)
        

In [3]:
if __name__ == '__main__':
    tweets_df = pd.read_csv("data/tweets.csv", encoding = "ISO-8859-1", header=None)
#     tweets_df = pd.read_csv("data/tweets.csv", encoding = "utf-8", header=None)
    
    tweets_df_1 = tweets_df.iloc[:10000]
    tweets_df_2 = tweets_df.iloc[len(tweets_df)-10000:]
    tweets_df = tweets_df_1.append(tweets_df_2)
    
    print('Total number of tweets: ',tweets_df.shape[0])
    print('\n *************** \n')
    sentiment = SentimentAnalysis(80, tweets_df)
    tweets_df = sentiment.getCleanDataFrame(tweets_df)
    train_x, train_y, dev_x, dev_y, test_x, test_y = sentiment.performTrainDevTestSplit()
    train_X, train_Y, dev_X, dev_Y, test_X, test_Y = sentiment.getTrainDevTestArray([(train_x, train_y), (dev_x, dev_y), (test_x, test_y)])
    

Total number of tweets:  20000

 *************** 

         sentiment                                              tweet
1590000          1                                @michaelahills YEP 
1590001          1  need to do homework.. can't wait for LVATT.. i...
1590002          1  @AnneSudworth Gosh, that's hectic... Tea? I'm ...
1590003          1                @hana77  Daily:  8am - 12 midnight 
1590004          1  watching the staff in the posh Soho face cream...
   sentiment                                              tweet
0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          0  is upset that he can't update his Facebook by ...
2          0  @Kenichan I dived many times for the ball. Man...
3          0    my whole body feels itchy and like its on fire 
4          0  @nationwideclass no, it's not behaving at all....
X len: 16000
Y len: 16000
X len: 0
Y len: 0
X len: 18000
Y len: 18000


1. Design Model (input, output, forward pass) <br>
2. Construct loss and optimizer <br>
3. Training Loop:
 - forward pass : compute prediction and loss
 - backward pass: gradients
 - update weights


In [52]:
a = 'sdsd'
a.decode('utf-8')

AttributeError: 'str' object has no attribute 'decode'

In [None]:
# model
class LogisticRegression(nn.Module):
    
    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)
        
    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred
    

model = LogisticRegression()

# Loss and Optimizer

learning_rate = 0.01
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

# training loop

num_epochs = 10
for epoch in range(num_epochs):
    # forward pass and loss
    y_pred = model.forward(train_x)
    loss = criterion(y_pred, train_y)
    # backward 
    loss.backward()
    # updates
    optimizer.step()
    # reset the gradients
    optimizer.zero_grad()
    
    if (epoch+1)%10 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')
        
with torch.no_grad():
    y_pred = model.forward(dev_x)
    y_pred_cls = y_pred.round()
    acc = y_pred_cls.eq(dev_y).sum() / float(dev_y.shape[0])
    print(f'Accuracy {acc:.4f}')