In [45]:
import pandas as pd
import numpy as np
import nltk

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

In [46]:
# load google news word2vec
import gensim.downloader as api
# w2v = api.load('word2vec-google-news-300')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [47]:
dataset = pd.read_csv('Data/sentiment140.csv')
dataset

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
...,...,...,...,...,...
1599995,Just woke up. Having no school is the best fee...,Tue Jun 16 08:40:49 PDT 2009,AmandaMarie1028,4,NO_QUERY
1599996,TheWDB.com - Very cool to hear old Walt interv...,Tue Jun 16 08:40:49 PDT 2009,TheWDBoards,4,NO_QUERY
1599997,Are you ready for your MoJo Makeover? Ask me f...,Tue Jun 16 08:40:49 PDT 2009,bpbabe,4,NO_QUERY
1599998,Happy 38th Birthday to my boo of alll time!!! ...,Tue Jun 16 08:40:49 PDT 2009,tinydiamondz,4,NO_QUERY


In [48]:
#remove web addresses, signs
#change to lowercase

import re
import string

def clean_en_text(text):

    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '<URL>', text)
    text = re.sub('@\S+', '<MENTION>', text)
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)

    # removing sign
    text = ''.join([i for i in text if ord(i) not in [33, 34, 35, 36, 37, 38,
                                                      39, 40, 41, 42, 43, 44,
                                                      45, 46, 47, 58, 59, 60,
                                                      61, 62, 63, 64, 91, 92,
                                                      93, 94, 95, 96, 123, 124,
                                                      125, 126, 1548, 1567]])
    return text

dataset['text'] = dataset['text'].apply(clean_en_text)

In [49]:
# Tokenizing

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
dataset['text'] = dataset['text'].apply(lambda x: tokenizer.tokenize(x))

In [50]:
#Lemmatizing
from nltk.stem import WordNetLemmatizer

def lemmatizer(text_list):
    lemm = WordNetLemmatizer()
    words = [lemm.lemmatize(word) for word in text_list]
    return words


dataset['text'] = dataset['text'].apply(lambda x: lemmatizer(x))

In [51]:
#Showing 20 first twits
dataset['text'][0:20]

0     [MENTION, URL, awww, that, s, a, bummer, you, ...
1     [is, upset, that, he, can, t, update, his, fac...
2     [MENTION, i, dived, many, time, for, the, ball...
3     [my, whole, body, feel, itchy, and, like, it, ...
4     [MENTION, no, it, s, not, behaving, at, all, i...
5                      [MENTION, not, the, whole, crew]
6                                        [need, a, hug]
7     [MENTION, hey, long, time, no, see, yes, rain,...
8              [MENTION, nope, they, didn, t, have, it]
9                             [MENTION, que, me, muera]
10     [spring, break, in, plain, city, it, s, snowing]
11                      [i, just, re, pierced, my, ear]
12    [MENTION, i, couldn, t, bear, to, watch, it, a...
13    [MENTION, it, it, count, idk, why, i, did, eit...
14    [MENTION, i, would, ve, been, the, first, but,...
15    [MENTION, i, wish, i, got, to, watch, it, with...
16    [hollis, death, scene, will, hurt, me, severel...
17                               [about, to, fil

In [52]:
def create_vocabulary(all_text):
    vocabulary = {'PAD': 0, 'END': 1, 'UNK': 2}

    for row in all_text:
        for word in row:
            if word not in vocabulary.keys():
                vocabulary[word] = len(vocabulary.keys())
    return vocabulary


def get_max_length(all_text: pd.DataFrame):
    max_length = 0
    for row in all_text:
        if len(row) > max_length:
            max_length = len(row)
    return max_length

def vectorization(all_text: pd.DataFrame, vocabulary: dict):
    new_embedding = all_text.copy(deep=True)
    for i in range(len(all_text)):
        row = all_text.iloc[i]
        new_embedding.iloc[i] = [vocabulary[token] for token in row]
    return new_embedding

def apply_padding(vectorized_dataset: pd.DataFrame, max_length: int, vocabulary: dict, pad_token: str):
    copy_data = vectorized_dataset.copy(deep=True)
    for i in range(len(vectorized_dataset)):
        row = copy_data.iloc[i]
        padding_token_id = vocabulary[pad_token]
        copy_data.iloc[i] = row + [padding_token_id for _ in range(max_length - len(row))]
    return copy_data

vocabulary = create_vocabulary(dataset['text'])
max_length = get_max_length(dataset['text'])
vectorized_data = vectorization(dataset['text'], vocabulary)
vectorized_padding = apply_padding(vectorized_data, max_length=max_length, vocabulary=vocabulary, pad_token='PAD')
vectorized_padding[:10]

0    [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
1    [22, 23, 6, 24, 25, 26, 27, 28, 29, 30, 31, 20...
2    [3, 40, 41, 42, 43, 44, 45, 46, 47, 18, 48, 45...
3    [53, 54, 55, 56, 57, 32, 58, 20, 59, 60, 0, 0,...
4    [3, 61, 20, 7, 62, 63, 64, 65, 40, 66, 67, 68,...
5    [3, 62, 45, 54, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
6    [76, 8, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
7    [3, 78, 79, 43, 61, 72, 80, 81, 8, 82, 83, 8, ...
8    [3, 88, 89, 90, 26, 91, 20, 0, 0, 0, 0, 0, 0, ...
9    [3, 92, 93, 94, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: text, dtype: object

In [60]:
#train set, validation set creation
from sklearn.model_selection import train_test_split

labeled_data = pd.concat([vectorized_padding, dataset['sentiment']], axis=1)
x_train, x_valid, y_train, y_valid = train_test_split(labeled_data['text'], labeled_data['sentiment'], train_size=0.8, random_state=42)

In [54]:
 #training Word2Vec with x_train
from gensim.models import Word2Vec
model = Word2Vec(vector_size=128, window=5, sg=0)

In [69]:
# create Tensor datasets
train_data = TensorDataset(torch.Tensor(x_train), torch.Tensor(y_train))
# valid_data = TensorDataset(torch.Tensor(x_valid), torch.Tensor(y_valid))

# dataloaders
batch_size = 50

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [72]:
class SentimentRNN(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.Softmax()


    def forward(self, x, hidden):
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        rnn_out, hidden = self.rnn(embeds, hidden)

        rnn_out = rnn_out[:, -1, :] # getting the last time step output

        # dropout and fully-connected layer
        out = self.dropout(rnn_out)
        out = self.fc(out)
        # sigmoid function
        softmax_out = self.softmax(out)

        # return last sigmoid output and hidden state
        return softmax_out, hidden

In [73]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocabulary)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 300
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
net

SentimentRNN(
  (embedding): Embedding(255282, 300)
  (rnn): RNN(300, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (softmax): Softmax(dim=None)
)