In [189]:
# import package
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [190]:
# ! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [191]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [192]:
# setting seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [193]:
# import dataset
train= pd.read_csv('./train.csv')
test= pd.read_csv("./test.csv")

In [194]:
# data cleaning
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

In [195]:
# fix sentence length
def trunc_padding(sentence):
    modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

In [196]:
# processing data cleaning
stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))
    
processing(train, stopwords, lemma)
processing(test, stopwords, lemma)

In [197]:
train.head()

Unnamed: 0,id,keyword,location,text,target,length,length_padding
0,1,,,"[deeds, reason, earthquake, may, allah, forgiv...",1,7,30
1,4,,,"[forest, fire, near, la, ronge, sask, canada, ...",1,7,30
2,5,,,"[residents, ask, shelter, place, notify, offic...",1,11,30
3,6,,,"[13000, people, receive, wildfires, evacuation...",1,7,30
4,7,,,"[get, send, photo, ruby, alaska, smoke, wildfi...",1,9,30


In [198]:
test.head()

Unnamed: 0,id,keyword,location,text,length,length_padding
0,0,,,"[happen, terrible, car, crash, 0, 0, 0, 0, 0, ...",4,30
1,2,,,"[hear, earthquake, different, cities, stay, sa...",7,30
2,3,,,"[forest, fire, spot, pond, geese, flee, across...",10,30
3,9,,,"[apocalypse, light, spokane, wildfires, 0, 0, ...",4,30
4,11,,,"[typhoon, soudelor, kill, 28, china, taiwan, 0...",6,30


## Assign unique index to each word, used for word embedding

In [199]:
# get all vocabulary
vocab_list = []
for sentence in train['text']:
    vocab_list.append(sentence)
vocab = build_vocab_from_iterator(vocab_list, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

### Dataset and DataLoader

In [200]:
class TweetDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y 
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        sentence = self.x[idx]
        x = torch.tensor(vocab(sentence), dtype=torch.long).to(device)
        y = torch.tensor(self.y[idx], dtype=torch.long).to(device)
        return x, y

In [201]:
# model configuration
model_config = {
    'vocab_size': len(vocab),
    'hidden_dim' : 256,
    'embedding_dim' : 200,
    'num_classes' : 2,
    'n_layers': 2,
    'dropout': 0.2
}

In [202]:
train_data = TweetDataset(train['text'], train['target'])
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

In [203]:
# batch test
for batch in train_dataloader:
    print(batch)
    break

[tensor([[  191,  1122,  2990,  ...,     1,     1,     1],
        [    1,   274, 11380,  ...,     1,     1,     1],
        [   54,   518,  4206,  ...,     1,     1,     1],
        ...,
        [ 4056,   605,  1295,  ...,     1,     1,     1],
        [    3,    67,    24,  ...,     1,     1,     1],
        [  292,   483,  3151,  ...,     1,     1,     1]], device='cuda:0'), tensor([1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')]


# Model

# LSTM

In [204]:
# LSTM
class LSTM(nn.Module):
    def __init__(self, config):
        super(LSTM, self).__init__()
        self.config = config
        self.embedding = nn.Embedding(config['vocab_size'], config['embedding_dim'])
        self.lstm = nn.LSTM(config['embedding_dim'], config['hidden_dim'], bidirectional=True)
        self.fc = nn.Linear(config['hidden_dim'] * 2, config['num_classes'])
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, X):
        embeds = self.embedding(X).permute(1, 0, 2)
        hidden_state = Variable(torch.zeros(2, len(X), self.config['hidden_dim'])).to(device)
        cell_state = Variable(torch.zeros(2, len(X), self.config['hidden_dim'])).to(device)
        output, (final_hidden_state, final_cell_state) = self.lstm(embeds, (hidden_state, cell_state))
        output = output.permute(1, 0, 2)
        output = self.fc(output[:, -1, :])
        return self.sigmoid(output)


In [205]:
# calculate accuracy
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    prediction = preds.argmax(dim=1)
    correct = (prediction == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

# Training

In [206]:
# make prediction
model = LSTM(model_config)
model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

In [207]:
# training model
epoch_loss = 0
epoch_acc = 0
for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        inputs, target = batch
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss
        
        acc = binary_accuracy(outputs, target)
        epoch_acc += acc.item()
    
    print("The training loss at epoch {} is {}; The training accuracy is {}".format(epoch, epoch_loss / len(train_dataloader), 
                                                                                    round(epoch_acc/len(train_dataloader), 3)))
    epoch_loss = 0
    epoch_acc = 0


100%|██████████| 119/119 [00:02<00:00, 57.48it/s]


The training loss at epoch 0 is 0.6837310791015625; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 71.90it/s]


The training loss at epoch 1 is 0.6843092441558838; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 70.28it/s]


The training loss at epoch 2 is 0.6833924651145935; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 89.83it/s]


The training loss at epoch 3 is 0.6833621859550476; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 76.34it/s]


The training loss at epoch 4 is 0.6833123564720154; The training accuracy is 0.57


100%|██████████| 119/119 [00:02<00:00, 49.54it/s]


The training loss at epoch 5 is 0.6834205389022827; The training accuracy is 0.57


100%|██████████| 119/119 [00:02<00:00, 54.45it/s]


The training loss at epoch 6 is 0.6836889982223511; The training accuracy is 0.57


100%|██████████| 119/119 [00:02<00:00, 44.27it/s]


The training loss at epoch 7 is 0.6834478378295898; The training accuracy is 0.57


100%|██████████| 119/119 [00:04<00:00, 28.89it/s]


The training loss at epoch 8 is 0.6834715604782104; The training accuracy is 0.57


100%|██████████| 119/119 [00:02<00:00, 55.24it/s]


The training loss at epoch 9 is 0.683359682559967; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 90.22it/s]


The training loss at epoch 10 is 0.683571457862854; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 90.61it/s]


The training loss at epoch 11 is 0.6836053133010864; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 90.49it/s]


The training loss at epoch 12 is 0.6835066080093384; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 88.52it/s]


The training loss at epoch 13 is 0.683297336101532; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 90.79it/s]


The training loss at epoch 14 is 0.683529794216156; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 84.34it/s]


The training loss at epoch 15 is 0.6833383440971375; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 75.31it/s]


The training loss at epoch 16 is 0.6833451390266418; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 70.87it/s]


The training loss at epoch 17 is 0.683358371257782; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 83.75it/s]


The training loss at epoch 18 is 0.6835150122642517; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 88.93it/s]


The training loss at epoch 19 is 0.6833348870277405; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 88.84it/s]


The training loss at epoch 20 is 0.6835203766822815; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 89.71it/s]


The training loss at epoch 21 is 0.6833205819129944; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 88.26it/s]


The training loss at epoch 22 is 0.6834391355514526; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 89.28it/s]


The training loss at epoch 23 is 0.6831712126731873; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 88.71it/s]


The training loss at epoch 24 is 0.683596670627594; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 83.77it/s]


The training loss at epoch 25 is 0.6831653118133545; The training accuracy is 0.57


100%|██████████| 119/119 [00:01<00:00, 74.27it/s]


The training loss at epoch 26 is 0.6389124989509583; The training accuracy is 0.641


100%|██████████| 119/119 [00:01<00:00, 69.20it/s]


The training loss at epoch 27 is 0.6010403633117676; The training accuracy is 0.711


100%|██████████| 119/119 [00:01<00:00, 84.97it/s]


The training loss at epoch 28 is 0.5904188752174377; The training accuracy is 0.716


100%|██████████| 119/119 [00:01<00:00, 89.56it/s]


The training loss at epoch 29 is 0.5624426007270813; The training accuracy is 0.737


100%|██████████| 119/119 [00:01<00:00, 89.46it/s]


The training loss at epoch 30 is 0.5403903126716614; The training accuracy is 0.768


100%|██████████| 119/119 [00:01<00:00, 89.56it/s]


The training loss at epoch 31 is 0.5246132016181946; The training accuracy is 0.782


100%|██████████| 119/119 [00:01<00:00, 89.59it/s]


The training loss at epoch 32 is 0.514898955821991; The training accuracy is 0.796


100%|██████████| 119/119 [00:01<00:00, 87.51it/s]


The training loss at epoch 33 is 0.500304102897644; The training accuracy is 0.813


100%|██████████| 119/119 [00:01<00:00, 88.97it/s]


The training loss at epoch 34 is 0.4919881820678711; The training accuracy is 0.821


100%|██████████| 119/119 [00:01<00:00, 82.01it/s]


The training loss at epoch 35 is 0.48344212770462036; The training accuracy is 0.829


100%|██████████| 119/119 [00:01<00:00, 75.42it/s]


The training loss at epoch 36 is 0.4818303883075714; The training accuracy is 0.831


100%|██████████| 119/119 [00:01<00:00, 69.67it/s]


The training loss at epoch 37 is 0.4815959632396698; The training accuracy is 0.831


100%|██████████| 119/119 [00:01<00:00, 88.33it/s]


The training loss at epoch 38 is 0.47745808959007263; The training accuracy is 0.835


100%|██████████| 119/119 [00:01<00:00, 89.10it/s]


The training loss at epoch 39 is 0.47385215759277344; The training accuracy is 0.84


100%|██████████| 119/119 [00:01<00:00, 89.64it/s]


The training loss at epoch 40 is 0.4665173590183258; The training accuracy is 0.846


100%|██████████| 119/119 [00:01<00:00, 89.88it/s]


The training loss at epoch 41 is 0.46056485176086426; The training accuracy is 0.853


100%|██████████| 119/119 [00:01<00:00, 89.75it/s]


The training loss at epoch 42 is 0.45922642946243286; The training accuracy is 0.854


100%|██████████| 119/119 [00:01<00:00, 88.89it/s]


The training loss at epoch 43 is 0.45935872197151184; The training accuracy is 0.854


100%|██████████| 119/119 [00:01<00:00, 88.63it/s]


The training loss at epoch 44 is 0.4595402181148529; The training accuracy is 0.854


100%|██████████| 119/119 [00:01<00:00, 80.42it/s]


The training loss at epoch 45 is 0.45841580629348755; The training accuracy is 0.854


100%|██████████| 119/119 [00:01<00:00, 74.68it/s]


The training loss at epoch 46 is 0.45642560720443726; The training accuracy is 0.857


100%|██████████| 119/119 [00:01<00:00, 68.74it/s]


The training loss at epoch 47 is 0.4601326584815979; The training accuracy is 0.853


100%|██████████| 119/119 [00:01<00:00, 88.41it/s]


The training loss at epoch 48 is 0.4955037236213684; The training accuracy is 0.817


100%|██████████| 119/119 [00:01<00:00, 89.40it/s]

The training loss at epoch 49 is 0.4859210252761841; The training accuracy is 0.827





In [208]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [209]:
# make prediction
def generate_prediction(inputs, model):
    inputs = torch.tensor(vocab(inputs), dtype=torch.long).to(device).unsqueeze(0)
    outpus =  model(inputs).argmax(dim=1).item()
    return outpus

test['target'] = test['text'].apply(lambda sentence: generate_prediction(sentence, model))

In [210]:
# store result
submission = test[['id', 'target']]
submission.to_csv('submission_lstm.csv', index=False)
print(sum(test['target']))

1242


In [1]:
# calculate test accuracy
import pandas as pd
# Load the target and submission dataframes
target_df = pd.read_csv('./check.csv')
submission_df = pd.read_csv('./submission_lstm.csv')
# Merge the dataframes on the 'id' column
merged_df = pd.merge(target_df, submission_df, on='id')
# Calculate the accuracy rate
accuracy_rate = (merged_df['target_x'] == merged_df['target_y']).mean()
print('Accuracy rate:', accuracy_rate)

Accuracy rate: 0.7225628448804414
