In [1]:
# import package
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# ! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [4]:
# setting seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
# import dataset
train= pd.read_csv('./train.csv')
test= pd.read_csv("./test.csv")

In [6]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

In [7]:
# fix sentence length
def trunc_padding(sentence):
    modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

In [8]:
stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))
    
processing(train, stopwords, lemma)
processing(test, stopwords, lemma)

In [9]:
train.head()

Unnamed: 0,id,keyword,location,text,target,length,length_padding
0,1,,,"[deeds, reason, earthquake, may, allah, forgiv...",1,7,30
1,4,,,"[forest, fire, near, la, ronge, sask, canada, ...",1,7,30
2,5,,,"[residents, ask, shelter, place, notify, offic...",1,11,30
3,6,,,"[13000, people, receive, wildfires, evacuation...",1,7,30
4,7,,,"[get, send, photo, ruby, alaska, smoke, wildfi...",1,9,30


In [10]:
test.head()

Unnamed: 0,id,keyword,location,text,length,length_padding
0,0,,,"[happen, terrible, car, crash, 0, 0, 0, 0, 0, ...",4,30
1,2,,,"[hear, earthquake, different, cities, stay, sa...",7,30
2,3,,,"[forest, fire, spot, pond, geese, flee, across...",10,30
3,9,,,"[apocalypse, light, spokane, wildfires, 0, 0, ...",4,30
4,11,,,"[typhoon, soudelor, kill, 28, china, taiwan, 0...",6,30


## Assign unique index to each word, used for word embedding

In [11]:
# get all vocabulary
vocab_list = []
for sentence in train['text']:
    vocab_list.append(sentence)
vocab = build_vocab_from_iterator(vocab_list, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

### Dataset and DataLoader

In [12]:
class TweetDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y 
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        sentence = self.x[idx]
        x = torch.tensor(vocab(sentence), dtype=torch.long).to(device)
        y = torch.tensor(self.y[idx], dtype=torch.long).to(device)
        return x, y

In [13]:
model_config = {
    'vocab_size': len(vocab),
    'hidden_dim' : 256,
    'embedding_dim' : 200,
    'num_classes' : 2,
    'n_layers': 2,
    'dropout': 0.2
}

In [14]:
train_data = TweetDataset(train['text'], train['target'])
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

In [15]:
for batch in train_dataloader:
    print(batch)
    break

[tensor([[  191,  1122,  2990,  ...,     1,     1,     1],
        [    1,   274, 11380,  ...,     1,     1,     1],
        [   54,   518,  4206,  ...,     1,     1,     1],
        ...,
        [ 4056,   605,  1295,  ...,     1,     1,     1],
        [    3,    67,    24,  ...,     1,     1,     1],
        [  292,   483,  3151,  ...,     1,     1,     1]], device='cuda:0'), tensor([1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')]


# Model

# GRU

In [16]:
class GRU(nn.Module):
    def __init__(self, config):
        super(GRU, self).__init__()
        self.config = config
        self.embedding = nn.Embedding(config['vocab_size'], config['embedding_dim'])
        self.gru = nn.GRU(config['embedding_dim'], config['hidden_dim'], bidirectional=True)
        self.fc = nn.Linear(config['hidden_dim'] * 2, config['num_classes'])
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, X):
        embeds = self.embedding(X).permute(1, 0, 2)
        hidden_state = Variable(torch.zeros(2, len(X), self.config['hidden_dim'])).to(device)
        cell_state = Variable(torch.zeros(2, len(X), self.config['hidden_dim'])).to(device)
        output, (final_hidden_state, final_cell_state) = self.gru(embeds, hidden_state)
        output = output.permute(1, 0, 2)
        output = self.fc(output[:, -1, :])
        return self.sigmoid(output)


In [17]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    prediction = preds.argmax(dim=1)
    correct = (prediction == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

# Training

In [18]:
model = GRU(model_config)
model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [19]:


epoch_loss = 0
epoch_acc = 0
for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        inputs, target = batch
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss
        
        acc = binary_accuracy(outputs, target)
        epoch_acc += acc.item()
    
    print("The training loss at epoch {} is {}; The training accuracy is {}".format(epoch, epoch_loss / len(train_dataloader), 
                                                                                    round(epoch_acc/len(train_dataloader), 3)))
    epoch_loss = 0
    epoch_acc = 0


100%|██████████| 119/119 [00:05<00:00, 21.04it/s]


The training loss at epoch 0 is 0.6646230816841125; The training accuracy is 0.603


100%|██████████| 119/119 [00:01<00:00, 71.90it/s]


The training loss at epoch 1 is 0.5495632886886597; The training accuracy is 0.753


100%|██████████| 119/119 [00:01<00:00, 95.39it/s]


The training loss at epoch 2 is 0.4871671199798584; The training accuracy is 0.823


100%|██████████| 119/119 [00:01<00:00, 96.55it/s]


The training loss at epoch 3 is 0.4495519995689392; The training accuracy is 0.861


100%|██████████| 119/119 [00:01<00:00, 95.70it/s]


The training loss at epoch 4 is 0.4283429682254791; The training accuracy is 0.884


100%|██████████| 119/119 [00:01<00:00, 84.07it/s]


The training loss at epoch 5 is 0.41792744398117065; The training accuracy is 0.895


100%|██████████| 119/119 [00:01<00:00, 75.76it/s]


The training loss at epoch 6 is 0.40679749846458435; The training accuracy is 0.906


100%|██████████| 119/119 [00:01<00:00, 79.08it/s]


The training loss at epoch 7 is 0.3979574143886566; The training accuracy is 0.915


100%|██████████| 119/119 [00:01<00:00, 98.50it/s]


The training loss at epoch 8 is 0.3983617126941681; The training accuracy is 0.914


100%|██████████| 119/119 [00:01<00:00, 97.05it/s]


The training loss at epoch 9 is 0.3901844024658203; The training accuracy is 0.923


100%|██████████| 119/119 [00:01<00:00, 99.35it/s] 


The training loss at epoch 10 is 0.38898152112960815; The training accuracy is 0.924


100%|██████████| 119/119 [00:01<00:00, 98.39it/s]


The training loss at epoch 11 is 0.384949654340744; The training accuracy is 0.928


100%|██████████| 119/119 [00:01<00:00, 98.43it/s]


The training loss at epoch 12 is 0.3805049657821655; The training accuracy is 0.933


100%|██████████| 119/119 [00:01<00:00, 98.63it/s]


The training loss at epoch 13 is 0.3820948004722595; The training accuracy is 0.931


100%|██████████| 119/119 [00:01<00:00, 97.86it/s]


The training loss at epoch 14 is 0.38267531991004944; The training accuracy is 0.93


100%|██████████| 119/119 [00:01<00:00, 88.72it/s]


The training loss at epoch 15 is 0.3784236013889313; The training accuracy is 0.934


100%|██████████| 119/119 [00:01<00:00, 80.48it/s]


The training loss at epoch 16 is 0.3800906240940094; The training accuracy is 0.933


100%|██████████| 119/119 [00:01<00:00, 71.82it/s]


The training loss at epoch 17 is 0.3785455822944641; The training accuracy is 0.935


100%|██████████| 119/119 [00:01<00:00, 98.02it/s]


The training loss at epoch 18 is 0.3777284026145935; The training accuracy is 0.935


100%|██████████| 119/119 [00:01<00:00, 97.37it/s]

The training loss at epoch 19 is 0.3743835985660553; The training accuracy is 0.939





In [20]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [21]:
def generate_prediction(inputs, model):
    inputs = torch.tensor(vocab(inputs), dtype=torch.long).to(device).unsqueeze(0)
    outpus =  model(inputs).argmax(dim=1).item()
    return outpus

test['target'] = test['text'].apply(lambda sentence: generate_prediction(sentence, model))

In [24]:
submission = test[['id', 'target']]
submission.to_csv('submission_gru.csv', index=False)
print(sum(test['target']))

1355
