In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
from utils.utils import build_freqs, process_tweet

import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import warnings
from sys import getsizeof
warnings.filterwarnings('ignore')


#### To be optimized later with generator function

In [2]:
class SentimentAnalysis:
    """Data-preparation helpers for a tweet sentiment classifier.

    The class cleans the raw tweet frame (sentiment recoded to -1 / 1),
    turns a tweet into a 3-element feature row using a frequency table,
    and splits the cleaned data per class into train / dev / test lists.
    """

    def __init__(self, trainingSplit, tweets_df):
        """Store the split percentage and the tweet frame.

        Args:
            trainingSplit: Percentage (0-100) of each class used for
                training; the rest is shared between dev and test.
            tweets_df: DataFrame of tweets. It is replaced by the cleaned
                frame when ``getCleanDataFrame`` is called.
        """
        self.trainingSplit = trainingSplit
        self.tweets_df = tweets_df

    def getCleanDataFrame(self, df):
        """Keep columns 0 and 5 of ``df`` as ``sentiment`` and ``tweet``.

        Raw sentiment codes are recoded ``0 -> -1`` and ``4 -> 1``; any
        other code becomes NaN (behaviour of ``Series.map`` with a dict).
        The result is built as a new frame, so ``df`` itself is never
        modified and no chained assignment on a slice takes place.

        Args:
            df: DataFrame with integer column labels that include 0 and 5.

        Returns:
            The cleaned two-column DataFrame, also stored on
            ``self.tweets_df``.
        """
        cleaned = (
            df[[0, 5]]
            .rename(columns={0: 'sentiment', 5: 'tweet'})
            .assign(sentiment=lambda frame: frame['sentiment'].map({0: -1, 4: 1}))
        )
        self.tweets_df = cleaned
        return cleaned

    def extract_features(self, tweet, freqs, pos_label=1, neg_label=-1):
        """Build the ``[bias, positive count, negative count]`` row of a tweet.

        Args:
            tweet: Raw tweet text; tokenised with ``process_tweet``.
            freqs: Mapping looked up with ``(word, label)`` keys; missing
                keys count as 0.
            pos_label: Label used in ``freqs`` keys for positive tweets.
            neg_label: Label used in ``freqs`` keys for negative tweets.
                Defaults to -1, the encoding produced by
                ``getCleanDataFrame``; pass 0 for a table built from
                0 / 1 labels.

        Returns:
            A ``numpy`` array of shape ``(1, 3)``.
        """
        word_l = process_tweet(tweet)

        x = np.zeros((1, 3))
        x[0, 0] = 1  # bias term

        for word in word_l:
            x[0, 1] += freqs.get((word, pos_label), 0)
            x[0, 2] += freqs.get((word, neg_label), 0)

        return x

    def _split_class(self, subset):
        """Cut one sentiment class into consecutive train / dev / test parts.

        The training part is the first ``trainingSplit`` percent of the
        rows, the dev part is the next half of the remaining percentage,
        and the test part is every row after that, so the three parts
        never overlap and no row is dropped.

        Returns:
            ``(train_x, train_y, dev_x, dev_y, test_x, test_y)`` as lists.
        """
        n_obs = subset.shape[0]
        dev_len = math.floor(((100 - self.trainingSplit) / 2) * n_obs * 0.01)
        train_len = math.floor(n_obs * self.trainingSplit * 0.01)
        # dev_len is a length, not an end position: the dev slice has to
        # start where the training slice stops.
        dev_end = train_len + dev_len

        tweets = list(subset['tweet'])
        labels = list(subset['sentiment'])

        return (
            tweets[:train_len], labels[:train_len],
            tweets[train_len:dev_end], labels[train_len:dev_end],
            tweets[dev_end:], labels[dev_end:],
        )

    def performTrainDevTestSplit(self):
        """Split ``self.tweets_df`` into train / dev / test lists.

        Positive (``sentiment == 1``) and negative (``sentiment == -1``)
        rows are split separately with the same percentages, then
        concatenated positives first. The head of each class is printed.

        Returns:
            ``(train_x, train_y, dev_x, dev_y, test_x, test_y)`` where the
            ``*_x`` lists hold tweets and the ``*_y`` lists hold labels.
        """
        positive = self.tweets_df.query('sentiment == 1')
        negative = self.tweets_df.query('sentiment == -1')

        print(positive.head())
        print(negative.head())

        pos_parts = self._split_class(positive)
        neg_parts = self._split_class(negative)

        # Element-wise concatenation keeps the original ordering:
        # all positives of a partition, then all negatives.
        return tuple(pos + neg for pos, neg in zip(pos_parts, neg_parts))
        

In [3]:
if __name__ == '__main__':
    # The CSV is read without a header row, so columns get integer labels.
    tweets_df = pd.read_csv("data/tweets.csv", header=None, encoding="ISO-8859-1")
    print('Total number of tweets: ', len(tweets_df))
    print('\n *************** \n')

    # 95 is the trainingSplit percentage handed to the helper class.
    sentiment = SentimentAnalysis(95, tweets_df)

    # Replace the raw frame with the cleaned sentiment/tweet frame,
    # then cut it into the six train / dev / test lists.
    tweets_df = sentiment.getCleanDataFrame(tweets_df)
    (train_x, train_y,
     dev_x, dev_y,
     test_x, test_y) = sentiment.performTrainDevTestSplit()
    

Total number of tweets:  1600000

 *************** 

        sentiment                                              tweet
800000          1       I LOVE @Health4UandPets u guys r the best!! 
800001          1  im meeting up with one of my besties tonight! ...
800002          1  @DaRealSunisaKim Thanks for the Twitter add, S...
800003          1  Being sick can be really cheap when it hurts t...
800004          1    @LovesBrooklyn2 he has that effect on everyone 
   sentiment                                              tweet
0         -1  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         -1  is upset that he can't update his Facebook by ...
2         -1  @Kenichan I dived many times for the ball. Man...
3         -1    my whole body feels itchy and like its on fire 
4         -1  @nationwideclass no, it's not behaving at all....


In [4]:
# Build the frequency table from the training tweets and labels only.
# NOTE(review): build_freqs comes from utils.utils and is not visible here;
# presumably it maps (word, label) -> count, since extract_features looks
# the table up with such keys — confirm against the helper.
freqs = build_freqs(train_x, train_y)


array([[1.e+00, 5.e+03, 0.e+00]])

In [6]:
# Feature matrix: one 3-column row per training tweet, filled row by row
# from the (1, 3) array returned by extract_features.
X = np.zeros((len(train_x), 3))
for row, tweet in enumerate(train_x):
    X[row, :] = sentiment.extract_features(tweet, freqs)

# Training labels, in the same order as the rows of X.
Y = train_y

1. Design Model (input, output, forward pass) <br>
2. Construct loss and optimizer <br>
3. Training Loop:
 - forward pass : compute prediction and loss
 - backward pass: gradients
 - update weights
