In [1]:
import pandas as pd
import numpy as np
import nltk

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# load google news word2vec
import gensim.downloader as api
# w2v = api.load('word2vec-google-news-300')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
dataset = pd.read_csv('Data/sentiment140.csv')
dataset[:10]

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
5,@Kwesidei not the whole crew,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,0,NO_QUERY
6,Need a hug,Mon Apr 06 22:20:03 PDT 2009,mybirch,0,NO_QUERY
7,@LOLTrish hey long time no see! Yes.. Rains a...,Mon Apr 06 22:20:03 PDT 2009,coZZ,0,NO_QUERY
8,@Tatiana_K nope they didn't have it,Mon Apr 06 22:20:05 PDT 2009,2Hood4Hollywood,0,NO_QUERY
9,@twittera que me muera ?,Mon Apr 06 22:20:09 PDT 2009,mimismo,0,NO_QUERY


In [20]:
#remove web addresses, signs

import re
import string

def clean_en_text(text):

    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '<URL>', text)
    text = re.sub('@\S+', '<MENTION>', text)
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)

    # removing sign
    text = ''.join([i for i in text if ord(i) not in [33, 34, 35, 36, 37, 38,
                                                      39, 40, 41, 42, 43, 44,
                                                      45, 46, 47, 58, 59, 60,
                                                      61, 62, 63, 64, 91, 92,
                                                      93, 94, 95, 96, 123, 124,
                                                      125, 126, 1548, 1567]])
    return text

dataset['text'] = dataset['text'].apply(clean_en_text)
dataset['text'][:10]

0     MENTION   URL    awww  that s a bummer   you ...
1    is upset that he can t update his facebook by ...
2     MENTION  i dived many times for the ball  man...
3      my whole body feels itchy and like its on fire 
4     MENTION  no  it s not behaving at all  i m ma...
5                         MENTION  not the whole crew 
6                                          need a hug 
7     MENTION  hey  long time no see  yes   rains a...
8                   MENTION  nope they didn t have it 
9                             MENTION  que me muera   
Name: text, dtype: object

In [21]:
# Tokenizing
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
dataset['text'] = dataset['text'].apply(lambda x: tokenizer.tokenize(x))
dataset['text'][:10]

0    [MENTION, URL, awww, that, s, a, bummer, you, ...
1    [is, upset, that, he, can, t, update, his, fac...
2    [MENTION, i, dived, many, times, for, the, bal...
3    [my, whole, body, feels, itchy, and, like, its...
4    [MENTION, no, it, s, not, behaving, at, all, i...
5                     [MENTION, not, the, whole, crew]
6                                       [need, a, hug]
7    [MENTION, hey, long, time, no, see, yes, rains...
8             [MENTION, nope, they, didn, t, have, it]
9                            [MENTION, que, me, muera]
Name: text, dtype: object

In [6]:
#Lemmatizing
from nltk.stem import WordNetLemmatizer

def lemmatizer(text_list):
    lemm = WordNetLemmatizer()
    words = [lemm.lemmatize(word) for word in text_list]
    return words


dataset['text'] = dataset['text'].apply(lambda x: lemmatizer(x))

In [7]:
#Showing 20 first twits
dataset['text'][0:20]

0     [MENTION, URL, awww, that, s, a, bummer, you, ...
1     [is, upset, that, he, can, t, update, his, fac...
2     [MENTION, i, dived, many, time, for, the, ball...
3     [my, whole, body, feel, itchy, and, like, it, ...
4     [MENTION, no, it, s, not, behaving, at, all, i...
5                      [MENTION, not, the, whole, crew]
6                                        [need, a, hug]
7     [MENTION, hey, long, time, no, see, yes, rain,...
8              [MENTION, nope, they, didn, t, have, it]
9                             [MENTION, que, me, muera]
10     [spring, break, in, plain, city, it, s, snowing]
11                      [i, just, re, pierced, my, ear]
12    [MENTION, i, couldn, t, bear, to, watch, it, a...
13    [MENTION, it, it, count, idk, why, i, did, eit...
14    [MENTION, i, would, ve, been, the, first, but,...
15    [MENTION, i, wish, i, got, to, watch, it, with...
16    [hollis, death, scene, will, hurt, me, severel...
17                               [about, to, fil

In [8]:
from collections import Counter

def encode(tokens, word2idx):
    return [word2idx[token] for token in tokens if token in word2idx]


# Build vocabulary
dataset['tokens'] = dataset['text']
vocab = Counter()
for tokens in dataset['tokens']:
    vocab.update(tokens)


word2idx = {word: idx + 1 for idx, (word, _) in enumerate(vocab.most_common())}
word2idx['<PAD>'] = 0  # Padding token

dataset['encoded'] = dataset['tokens'].apply(lambda x: encode(x, word2idx))
dataset['encoded'][:10]

0    [2, 41, 499, 17, 13, 5, 1149, 9, 3306, 54, 822...
1    [10, 757, 17, 85, 32, 15, 391, 195, 541, 130, ...
2    [2, 1, 47210, 320, 50, 12, 4, 944, 1641, 3, 85...
3          [7, 434, 771, 95, 2791, 8, 38, 6, 16, 1093]
4    [2, 42, 6, 13, 27, 9230, 26, 36, 1, 21, 597, 1...
5                                [2, 27, 4, 434, 2098]
6                                         [81, 5, 503]
7    [2, 158, 176, 50, 42, 71, 153, 252, 5, 249, 11...
8                         [2, 780, 78, 151, 15, 20, 6]
9                                 [2, 2400, 18, 99172]
Name: encoded, dtype: object

In [9]:
#label Encoding
max_len = max(dataset['encoded'].apply(len))

def pad_sequence(seq, max_len):
    return seq + [word2idx['<PAD>']] * (max_len - len(seq))

dataset['padded'] = dataset['encoded'].apply(lambda x: pad_sequence(x, max_len))
dataset['padded'][:10]

0    [2, 41, 499, 17, 13, 5, 1149, 9, 3306, 54, 822...
1    [10, 757, 17, 85, 32, 15, 391, 195, 541, 130, ...
2    [2, 1, 47210, 320, 50, 12, 4, 944, 1641, 3, 85...
3    [7, 434, 771, 95, 2791, 8, 38, 6, 16, 1093, 0,...
4    [2, 42, 6, 13, 27, 9230, 26, 36, 1, 21, 597, 1...
5    [2, 27, 4, 434, 2098, 0, 0, 0, 0, 0, 0, 0, 0, ...
6    [81, 5, 503, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7    [2, 158, 176, 50, 42, 71, 153, 252, 5, 249, 11...
8    [2, 780, 78, 151, 15, 20, 6, 0, 0, 0, 0, 0, 0,...
9    [2, 2400, 18, 99172, 0, 0, 0, 0, 0, 0, 0, 0, 0...
Name: padded, dtype: object

In [10]:
#label Encoding
label2idx = {label: idx for idx, label in enumerate(dataset['sentiment'].unique())}
dataset['label_idx'] = dataset['sentiment'].map(label2idx)

In [11]:
# Split the data
from sklearn.model_selection import train_test_split


train_df, temp_df = train_test_split(dataset, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = TextDataset(train_df['padded'].tolist(), train_df['label_idx'].tolist())
val_dataset = TextDataset(val_df['padded'].tolist(), val_df['label_idx'].tolist())
test_dataset = TextDataset(test_df['padded'].tolist(), test_df['label_idx'].tolist())

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [12]:
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")

In [13]:
embedding_dim = 300
embedding_matrix = np.random.normal(size=(len(word2idx), embedding_dim))

for word, idx in word2idx.items():
    if word in word2vec_model:
        embedding_matrix[idx] = word2vec_model[word]
    else:
        embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))

embedding_matrix[word2idx['<PAD>']] = np.zeros((embedding_dim,))  # Padding token

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class TextRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1, embedding_matrix=None):
        super(TextRNN, self).__init__()
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=word2idx['<PAD>'])
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [15]:
vocab_size = len(word2idx)
embed_size = embedding_dim  # 300
hidden_size = 128
output_size = len(label2idx)
num_layers = 2

model = TextRNN(vocab_size, embed_size, hidden_size, output_size, num_layers, embedding_matrix=embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(texts)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            output = model(texts)
            loss = criterion(output, labels)
            val_loss += loss.item()
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss/len(val_loader)}, Val Accuracy: {accuracy}%')


Epoch 1/5, Loss: 0.6688745021820068, Val Loss: 0.6991328447182973, Val Accuracy: 53.45875%
Epoch 2/5, Loss: 0.7004598379135132, Val Loss: 0.6955732303460439, Val Accuracy: 49.7725%
Epoch 3/5, Loss: 0.7044371962547302, Val Loss: 0.6986248676300049, Val Accuracy: 49.7725%
Epoch 4/5, Loss: 0.7030748128890991, Val Loss: 0.6977616627852122, Val Accuracy: 49.7725%
Epoch 5/5, Loss: 0.6946882605552673, Val Loss: 0.6934656785488129, Val Accuracy: 49.7725%


In [18]:
for x_test, y_test in test_loader:
    val_loss = 0
    total = 0
    correct = 0

    x_test, y_test = x_test.to(device), y_test.to(device)
    output = model(x_test)
    loss = criterion(output, y_test)
    val_loss += loss.item()
    _, predicted = torch.max(output, 1)
    total += y_test.size(0)
    correct += (predicted == y_test).sum().item()

    accuracy = 100 * correct / total
    print(f'Loss: {loss.item()}, Val Loss: {val_loss/len(val_loader)}, Val Accuracy: {accuracy}%')

Loss: 0.6933696866035461, Val Loss: 0.00018489858309427896, Val Accuracy: 50.0%
Loss: 0.6986439824104309, Val Loss: 0.0001863050619761149, Val Accuracy: 37.5%
Loss: 0.6953474283218384, Val Loss: 0.00018542598088582357, Val Accuracy: 45.3125%
Loss: 0.6940290331840515, Val Loss: 0.0001850744088490804, Val Accuracy: 48.4375%
Loss: 0.6894139647483826, Val Loss: 0.000183843723932902, Val Accuracy: 59.375%
Loss: 0.692051112651825, Val Loss: 0.00018454696337381999, Val Accuracy: 53.125%
Loss: 0.692051112651825, Val Loss: 0.00018454696337381999, Val Accuracy: 53.125%
Loss: 0.6966660022735596, Val Loss: 0.00018577760060628254, Val Accuracy: 42.1875%
Loss: 0.6940289735794067, Val Loss: 0.00018507439295450847, Val Accuracy: 48.4375%
Loss: 0.6907325387001038, Val Loss: 0.000184195343653361, Val Accuracy: 56.25%
Loss: 0.6973253488540649, Val Loss: 0.00018595342636108398, Val Accuracy: 40.625%
Loss: 0.6913918852806091, Val Loss: 0.00018437116940816243, Val Accuracy: 54.6875%
Loss: 0.6993032097816467