In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import pandas as pd
import numpy as np
import random
import time
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Use cuda if present
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cuda


In [2]:
train_df = pd.read_csv("../Dataset/Split/Three_Label/train_posnegnet_tweetcleannew600-only.csv", header=None, sep=';')
valid_df = pd.read_csv("../Dataset/Split/Three_Label/valid_posnegnet_tweetcleannew600-only.csv", header=None, sep=';')
test_df = pd.read_csv("../Dataset/Split/Three_Label/test_posnegnet_tweetcleannew600-only.csv", header=None, sep=';')

In [3]:
frames = [train_df, valid_df, test_df]
df = pd.concat(frames, axis=0)
df.head()

Unnamed: 0,0,1
0,negara ngutang bngun infrastruktur udah dipake...,-1
1,tenang msih ngoceh aja ttp jokowi harga mati,-1
2,prabowo sandi sepakat ambil gaji negara susah,-1
3,gak nginti lu to si abdillahtoha gak ngacaa ga...,-1
4,ummat islam yg waras cerdas senang amal coblos...,-1


In [4]:
df[2] = [simple_preprocess(line, deacc=True) for line in df[0]]
train_df[2] = [simple_preprocess(line, deacc=True) for line in train_df[0]]
valid_df[2] = [simple_preprocess(line, deacc=True) for line in valid_df[0]]
test_df[2] = [simple_preprocess(line, deacc=True) for line in test_df[0]]

In [5]:
max_len = df[2].map(len).max()

In [6]:
model_path = '../Machine Learning/featureExtraction/word2vec/models_all/model_sg_hs_300.model'
model_sg = Word2Vec.load(model_path)

words = list(model_sg.wv.key_to_index.keys())

In [7]:
padding_idx = model_sg.wv.key_to_index.get('pad')

def use_w2v(text):
    vec = [padding_idx for i in range(max_len)]
    i = 0
    for word in text:
        if word not in model_sg.wv.key_to_index.keys():
            vec[i] = 0
            print(word)
        else:
            vec[i] = model_sg.wv.key_to_index.get(word)
        i += 1
    return torch.tensor(vec, dtype=torch.long, device=device).view(1, -1)

24931


In [8]:
def make_target(label, n_class):
    if n_class == 2:
        if label == 0:
            return torch.tensor([0], dtype=torch.int64, device=device)
        elif label == 1:
            return torch.tensor([1], dtype=torch.int64, device=device)  
    if n_class == 3:
        if label == -1:
            return torch.tensor([0], dtype=torch.int64, device=device)
        elif label == 0:
            return torch.tensor([1], dtype=torch.int64, device=device)
        elif label == 1:
            return torch.tensor([2], dtype=torch.int64, device=device)

In [9]:
model_sg.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f50668e35e0>

In [10]:
class LSTM(nn.Module):
    def __init__(self, pretrained_embedding, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate):
        super().__init__()
        w2v = Word2Vec.load(pretrained_embedding)
        weights = w2v.wv
        vocab_size = len(w2v.wv.key_to_index.keys())
        pad_index = model_sg.wv.key_to_index.get('pad')
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights.vectors), padding_idx=model_sg.wv.key_to_index.get('pad'), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                            dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids):
        embedded = self.dropout(self.embedding(ids))
        packed_output, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        prediction = self.fc(hidden)
        return prediction

In [11]:
lstm = LSTM(pretrained_embedding='../Machine Learning/featureExtraction/word2vec/models_all/model_sg_hs_300.model',
            embedding_dim = 300,
            hidden_dim = 300,
            output_dim = 3,
            n_layers = 2,
            bidirectional = False,
            dropout_rate =0.5)

lstm.to(device)
optimizer = optim.Adam(lstm.parameters(), lr=5e-4)

In [12]:
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=131221):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer, train, valid, num_class, epochs):
    # Tracking best validation accuracy
    best_accuracy = 0

    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)
    
    model.train()
    for epoch in range(epochs):
        # Tracking time and loss
        t0_epoch = time.time()
        train_loss = 0
        
        for index, row in train.iterrows():
            model.zero_grad()

            bow_vec = use_w2v(row[2])

            probs = model(bow_vec)

            target = make_target(train[1][index], num_class)

            loss = loss_fn(probs, target)
            train_loss += loss.item()

            loss.backward()

            optimizer.step()
        
        # Calculate the average loss over the entire training data
        avg_train_loss = train_loss / len(train)

        # After the completion of each training epoch, measure the model's
        # performance on our validation set.
        val_loss, val_accuracy = evaluate(model, valid, num_class)

        # Track the best accuracy
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy

        # Print performance over the entire training data
        time_elapsed = time.time() - t0_epoch
        print(f"{epoch + 1:^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.4f} | {time_elapsed:^9.2f}")
            
    print("\n")
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")

def evaluate(model, test, num_class):
    val_accuracy = []
    val_loss = []

    for index, row in test.iterrows():
        # Compute logits
        with torch.no_grad():
            bow_vec = use_w2v(row[2])
            logits = model(bow_vec)
        
        target = make_target(test[1][index], num_class)
        
        # Compute loss
        loss = loss_fn(logits, target)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        correct = (preds == target).cpu().numpy()[0]
        val_accuracy.append(correct)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [13]:
set_seed(131221)
train(lstm, optimizer, train_df, valid_df, epochs=10, num_class = 3)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.202551   |  7.583991  |  0.1443   |   8.50   
   2    |   0.226577   |  6.496914  |  0.1443   |   7.35   
   3    |   0.233914   |  5.091451  |  0.1443   |   7.45   
   4    |   0.258338   |  5.657870  |  0.1443   |   7.34   
   5    |   0.244876   |  5.166702  |  0.1443   |   7.41   
   6    |   0.258134   |  5.063616  |  0.1443   |   7.35   
   7    |   0.274952   |  5.549274  |  0.1443   |   7.49   
   8    |   0.262178   |  5.766788  |  0.1443   |   7.22   
   9    |   0.243232   |  6.174631  |  0.1443   |   7.20   
  10    |   0.235021   |  6.090920  |  0.1443   |   7.37   


Training complete! Best accuracy: 0.14%.


In [14]:
bow_cnn_predictions = []
original_lables_cnn_bow = []
lstm.eval()

with torch.no_grad():
    for index, row in test_df.iterrows():
        bow_vec = use_w2v(row[2])
        probs = lstm(bow_vec)
        _, predicted = torch.max(probs.data, 1)
        bow_cnn_predictions.append(predicted.cpu().numpy()[0])
        original_lables_cnn_bow.append(make_target(test_df[1][index], 3).cpu().numpy()[0])
print(classification_report(original_lables_cnn_bow,bow_cnn_predictions, digits=4))

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       107
           1     0.0000    0.0000    0.0000       127
           2     0.3536    1.0000    0.5224       128

    accuracy                         0.3536       362
   macro avg     0.1179    0.3333    0.1741       362
weighted avg     0.1250    0.3536    0.1847       362



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
