In [None]:
# import package
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [None]:
# ! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
!pip install transformers

In [None]:
# setting seed
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    The seed is applied, in order, to Python's ``random`` module, NumPy's
    global generator, ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed handed unchanged to each seeding function.

    Returns:
        The same ``seed`` value that was passed in.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Fix all RNG seeds before any stochastic step runs.
seed_everything(42)

# Number of tokens every sentence is truncated/padded to (read by trunc_padding).
MAX_LENGTH = 30

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [None]:
# import dataset
# Load the raw CSVs, then hold out 20% of the labelled rows for validation.
train = pd.read_csv('./train.csv')
test = pd.read_csv("./test.csv")
train, val = train_test_split(train, test_size=0.2, random_state=42)
# train_test_split keeps the original (now shuffled, gappy) row labels. Reset
# them to 0..n-1 so that integer lookups such as `self.x[idx]` in TweetDataset
# address the intended row instead of raising KeyError on a missing label.
# reset_index also returns fresh frames, so later in-place column writes do
# not operate on slices of the original DataFrame.
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [None]:
def remove_emoji(string):
    """Replace every run of emoji-like characters in ``string`` with one space.

    Args:
        string: Text to scrub.

    Returns:
        A copy of ``string`` in which each maximal run of characters from the
        code-point ranges listed below is substituted by a single ``' '``.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # One character class covering all ranges; "+" swallows a whole run at once.
    pattern_text = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(pattern_text, r' ', string, flags=re.UNICODE)

def clean_sentence(sentence):
    """Strip URLs, emoji and punctuation from ``sentence`` and tidy whitespace.

    Args:
        sentence: Raw text of one tweet.

    Returns:
        The text reduced to ASCII letters and digits separated by single
        spaces, with no leading or trailing whitespace.
    """
    # remove URLs
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emojis
    sentence = remove_emoji(sentence)
    # remove punctuation; whitespace (including tabs/newlines) is kept at this
    # step so the words on either side of it are not glued together
    sentence = re.sub(r"[^0-9A-Za-z\s]", "", sentence)
    # collapse every whitespace run into one space; replacing double spaces
    # with "" would delete the gap and fuse neighbouring words
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    """Drop every token that appears in ``stopwords``.

    Args:
        tokens: Iterable of word tokens.
        stopwords: Container of words to discard; membership is checked with ``in``.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    kept = []
    for word in tokens:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

# lemmatization
def lemmatize(tokens, lemma):
    """Lemmatize each token, treating it as a verb.

    Args:
        tokens: Iterable of word tokens.
        lemma: Object exposing ``lemmatize(token, pos=...)``.

    Returns:
        A list with one lemmatized entry per input token, in input order.
    """
    return list(map(lambda token: lemma.lemmatize(token, pos='v'), tokens))

In [None]:
# fix sentence length
def trunc_padding(sentence, max_length=None, pad_token="0"):
    """Return a copy of ``sentence`` forced to exactly ``max_length`` tokens.

    Longer inputs are cut after ``max_length`` tokens; shorter inputs are
    right-padded with ``pad_token``.

    Args:
        sentence: Sequence of tokens. It is never modified.
        max_length: Target length. ``None`` (the default) falls back to the
            module-level ``MAX_LENGTH``.
        pad_token: Token appended to short sentences. Defaults to ``"0"``.
            NOTE(review): ``"0"`` can also occur as a real token because
            clean_sentence keeps digits -- consider a dedicated pad token.

    Returns:
        A new list of length ``max_length``.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if max_length < 0:
        raise ValueError("max_length must be non-negative")
    # list(...) copies the input and accepts any sequence, not only lists.
    tokens = list(sentence)[:max_length]
    # The multiplier is 0 when the sentence was already long enough.
    tokens.extend([pad_token] * (max_length - len(tokens)))
    return tokens

In [None]:
# English stop-word list from NLTK; handed to processing() below.
stopwords = nltk.corpus.stopwords.words('english')
# Shared WordNet lemmatizer instance; handed to processing() below.
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    """Tokenize and normalise ``df['text']`` in place and record token counts.

    Each text is lower-cased, cleaned with ``clean_sentence``, split on
    whitespace, filtered with ``remove_stopwords``, lemmatized with
    ``lemmatize`` and finally truncated/padded with ``trunc_padding``.

    Args:
        df: DataFrame with a string ``'text'`` column. It is modified in
            place: ``'text'`` becomes a list of tokens, and the columns
            ``'length'`` (token count before padding) and
            ``'length_padding'`` (token count after padding) are written.
        stopwords: Words forwarded to ``remove_stopwords``.
        lemma: Lemmatizer forwarded to ``lemmatize``.

    Returns:
        None.
    """
    def _tokenize(raw_text):
        # lower-case -> clean -> whitespace tokenization
        words = clean_sentence(raw_text.lower()).split()
        # stop-word removal followed by lemmatization
        return lemmatize(remove_stopwords(words, stopwords), lemma)

    token_lists = df['text'].apply(_tokenize)
    # sentence length before padding
    df['length'] = token_lists.apply(len)
    # fix sentence length
    df['text'] = token_lists.apply(trunc_padding)
    # sentence length after padding
    df['length_padding'] = df['text'].apply(len)
    
# Apply the same preprocessing, in place, to every split.
for frame in (train, val, test):
    processing(frame, stopwords, lemma)

In [None]:
# Preview the first rows of the training frame after processing().
train.head()

In [None]:
# Preview the first rows of the test frame after processing().
test.head()

## Assign a unique index to each word (used for the word embedding)

In [None]:
# Build the vocabulary from the training sentences only; any token that is
# not in it maps to the "<unk>" index.
vocab_list = train['text'].tolist()
vocab = build_vocab_from_iterator(vocab_list, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

### Dataset and DataLoader

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset pairing token lists with integer class labels.

    Args:
        x: Iterable of token lists (for example a pandas Series).
        y: Iterable of integer labels, in the same order as ``x``.
    """

    def __init__(self, x, y):
        # Materialise both inputs as plain lists. A DataLoader requests items
        # by position (0..len-1), whereas a pandas Series indexed with an int
        # performs a *label* lookup, which raises KeyError or returns the
        # wrong row once the frame has been shuffled or split.
        self.x = list(x)
        self.y = list(y)

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token_indices, label)`` for position ``idx`` as long tensors."""
        sentence = self.x[idx]
        # `vocab` and `device` are module-level objects defined in earlier cells.
        x = torch.tensor(vocab(sentence), dtype=torch.long).to(device)
        y = torch.tensor(self.y[idx], dtype=torch.long).to(device)
        return x, y

In [None]:
# Hyper-parameters handed to the model constructor.
# NOTE(review): the LSTM class in this notebook reads only vocab_size,
# embedding_dim, hidden_dim and num_classes; n_layers and dropout are not
# consumed there.
model_config = dict(
    vocab_size=len(vocab),
    hidden_dim=256,
    embedding_dim=200,
    num_classes=2,
    n_layers=2,
    dropout=0.2,
)

In [None]:
# Training split: reshuffled every epoch.
train_data = TweetDataset(x=train['text'], y=train['target'])
train_dataloader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Validation split: fixed order.
val_data = TweetDataset(x=val['text'], y=val['target'])
val_dataloader = DataLoader(dataset=val_data, batch_size=64, shuffle=False)


In [None]:
# Sanity check: pull a single batch from the training loader and print it.
batch = next(iter(train_dataloader), None)
if batch is not None:
    print(batch)

# Model

# LSTM

In [None]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM sentence classifier.

    Args:
        config: Mapping with the keys ``'vocab_size'``, ``'embedding_dim'``,
            ``'hidden_dim'`` and ``'num_classes'``.
    """

    def __init__(self, config):
        super(LSTM, self).__init__()
        self.config = config
        self.embedding = nn.Embedding(config['vocab_size'], config['embedding_dim'])
        self.lstm = nn.LSTM(config['embedding_dim'], config['hidden_dim'], bidirectional=True)
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(config['hidden_dim'] * 2, config['num_classes'])
        # Kept so the attribute still exists for external code; forward() does
        # not apply it because the loss expects raw logits.
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        """Compute unnormalised class scores (logits).

        Args:
            X: Long tensor of token indices with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding raw logits. They are
            intentionally not squashed: ``nn.CrossEntropyLoss`` applies
            log-softmax itself, and argmax-based predictions are unaffected.
        """
        # nn.LSTM is built with the default batch_first=False, so time goes first.
        embeds = self.embedding(X).permute(1, 0, 2)
        # Leaving out (h_0, c_0) makes nn.LSTM start from zeros on the input's
        # own device, so no global `device` or deprecated Variable is needed.
        _, (final_hidden_state, _) = self.lstm(embeds)
        # [-2] is the forward direction's last state and [-1] the backward
        # direction's last state, so both have read the whole sequence.
        # Taking output[:, -1, :] instead would pair the forward state with a
        # backward state that has only seen the final token.
        sentence_repr = torch.cat((final_hidden_state[-2], final_hidden_state[-1]), dim=1)
        return self.fc(sentence_repr)


In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in one batch (8 of 10 right gives 0.8, not 8).

    Args:
        preds: Tensor of per-class scores; the predicted class is the argmax over dim 1.
        y: Tensor of true class indices.

    Returns:
        Scalar tensor holding the batch accuracy.
    """
    # 1.0 where the predicted class equals the label, 0.0 elsewhere
    hits = (preds.argmax(dim=1) == y).float()
    return hits.sum() / hits.size(0)

# Training

In [None]:
# Build the classifier, its loss and its optimiser on the selected device.
# (The training cell that follows constructs these same objects again.)
model = LSTM(model_config).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Start from a freshly initialised model so that re-running this cell
# restarts training from scratch.
model = LSTM(model_config)
model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

# Per-epoch history; each entry is the mean over that epoch's batches.
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []

for epoch in range(50):
    # Training loop
    model.train()
    train_epoch_loss = 0
    train_epoch_acc = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        inputs, target = batch
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        train_epoch_loss += loss.item()
        train_epoch_acc += binary_accuracy(outputs, target).item()

    train_loss.append(train_epoch_loss / len(train_dataloader))
    train_acc.append(train_epoch_acc / len(train_dataloader))

    # Validation loop
    model.eval()
    valid_epoch_loss = 0
    valid_epoch_acc = 0
    with torch.no_grad():
        # The validation loader is created under the name `val_dataloader`;
        # `valid_dataloader` does not exist and raised NameError here.
        for batch in tqdm(val_dataloader):
            inputs, target = batch
            outputs = model(inputs)
            loss = criterion(outputs, target)
            valid_epoch_loss += loss.item()
            valid_epoch_acc += binary_accuracy(outputs, target).item()

    valid_loss.append(valid_epoch_loss / len(val_dataloader))
    valid_acc.append(valid_epoch_acc / len(val_dataloader))

    # Print results
    print("Epoch {}:".format(epoch))
    print("\tTrain Loss: {:.4f} | Train Acc: {:.4f}".format(train_loss[-1], train_acc[-1]))
    print("\tVal. Loss: {:.4f} | Val. Acc: {:.4f}".format(valid_loss[-1], valid_acc[-1]))


In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [None]:
def generate_prediction(inputs, model):
    """Predict the class index for one tokenised sentence.

    Args:
        inputs: List of tokens for a single sentence; converted to indices
            with the module-level ``vocab``.
        model: Callable that maps a (1, seq_len) index tensor to per-class
            scores of shape (1, num_classes).
            NOTE(review): this function does not switch the model to eval
            mode; the caller is presumably expected to have done so.

    Returns:
        The argmax class index as a Python int.
    """
    # unsqueeze(0) adds the batch dimension: a batch holding just this sentence.
    batch = torch.tensor(vocab(inputs), dtype=torch.long).to(device).unsqueeze(0)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        prediction = model(batch).argmax(dim=1).item()
    return prediction

# Predict a class index for every test sentence and store it as the 'target' column.
test['target'] = test['text'].apply(lambda sentence: generate_prediction(sentence, model))

In [None]:
# Write the id/target pairs to disk, then print the sum of the predicted
# labels (with 0/1 labels this is the number of rows predicted as class 1).
submission = test.loc[:, ['id', 'target']]
submission.to_csv('submission_lstm.csv', index=False)
print(test['target'].sum())