In [1]:
# Importing modules
import pandas as pd
from torchtext import data
import torch
from torch import nn, optim
from sklearn.metrics import classification_report

In [2]:
# Loading data | tweets dataframe
tdf = pd.read_csv("twitter_50K_training_data.csv", header=None, encoding="cp1252")

# Dataframe manipulation
tdf[0] = tdf[0].astype("category").cat.codes

# Saving Tweets dataframe
tdf.to_csv("training-processed.csv", header=None, index=None)

# Dataframe summary
tdf.head()

Unnamed: 0,0,1,2,3,4,5
0,1,2014432769,Wed Jun 03 01:30:41 PDT 2009,NO_QUERY,zxhoon,@chickpea981 happy bday
1,1,2178933826,Mon Jun 15 08:24:46 PDT 2009,NO_QUERY,kerimcinerney,@PamelaGlasner pleasure
2,1,1979723018,Sun May 31 04:14:43 PDT 2009,NO_QUERY,jessicaspence,"Watching perfum, eating cake, lots of cuddles...."
3,0,2069191661,Sun Jun 07 15:25:44 PDT 2009,NO_QUERY,nydia_nicole,And she say u have to press the break then pus...
4,1,1770778240,Mon May 11 22:13:18 PDT 2009,NO_QUERY,FireflyShop,vdoBug kids DVD navigator giveaway #2 winners ...


In [3]:
# Defining datatypes fields
LABEL = data.LabelField()
TWEET = data.Field(lower=True)

# Creating dataset
fields = [('label',LABEL), ('id',None),('date',None),('query',None),('name',None),('tweet', TWEET)]
tData = data.TabularDataset("training-processed.csv", format="CSV",fields=fields, skip_header=False)

# Splitting dataset into train, val and test
train, val, test = tData.split(split_ratio=[0.7, 0.2, 0.1])

In [4]:
# Building vocabulary
TWEET.build_vocab(train, max_size=20000)
LABEL.build_vocab(train)

In [5]:
# Defining device
device = torch.device("cpu")

In [6]:
# Dataloader
train_dataloader, val_dataloader, test_dataloader = data.BucketIterator.splits((train, val, test),
                                                                               batch_size=64, 
                                                                               device=device, 
                                                                               sort_key=lambda x: len(x.tweet), 
                                                                               sort_within_batch=True)

In [7]:
# LSTM model
class LSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)
    def forward(self, x):
        output, (hidden, _) = self.encoder(self.embedding(x))
        pred = self.predictor(hidden.squeeze(0))
        return pred

# Model
model = LSTM(100, 300, 20002)

# Loading model to device
model = model.to(device)

In [8]:
# Model architecture
model

LSTM(
  (embedding): Embedding(20002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

In [9]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

In [10]:
# Training
for epoch in range(10):

    # Training loop
    model.train()
    for batch_idx, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(batch.tweet)
        loss = criterion(outputs, batch.label)
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_dataloader):
            outputs = model(batch.tweet)
            loss = criterion(outputs, batch.label)
            val_loss += loss.data.item() * batch.tweet.size(0)

    average_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1} | Validation Loss: {average_val_loss}")

Epoch 1 | Validation Loss: 7.573331027468549
Epoch 2 | Validation Loss: 7.9871179280401785
Epoch 3 | Validation Loss: 8.496222077291224
Epoch 4 | Validation Loss: 9.717823199833497
Epoch 5 | Validation Loss: 10.349119643244562
Epoch 6 | Validation Loss: 11.471383602558811
Epoch 7 | Validation Loss: 11.39102231362198
Epoch 8 | Validation Loss: 11.706954636905767
Epoch 9 | Validation Loss: 12.906469008590602
Epoch 10 | Validation Loss: 13.366947120503534


In [11]:
y_pred = []
y_test = []

# Testing loop
model.eval()
with torch.no_grad():
    for batch_idx, batch in enumerate(test_dataloader):
        outputs = model(batch.tweet)
        pred = torch.argmax(outputs, dim=1)
        y_pred.extend(pred.cpu().numpy())
        y_test.extend(batch.label.cpu().numpy())

In [12]:
# Evaluation metric
report = classification_report(y_test, y_pred, target_names=LABEL.vocab.freqs.keys())
print(report)

              precision    recall  f1-score   support

           1       0.73      0.74      0.73      5051
           0       0.73      0.72      0.72      4949

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000

