In [1]:
# import package
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# # unzip nltk_data, remember to change path
# ! unzip /Users/tom/nltk_data/corpora/wordnet.zip -d /Users/tom/nltk_data/corpora/

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [4]:
# setting seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
MAX_LENGTH =80
# selecting devices
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
# import dataset
train= pd.read_csv('./train.csv')
test= pd.read_csv("./test.csv")

In [6]:
# data cleaning
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

In [7]:
# processing data cleaning
stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    
    
processing(train, stopwords, lemma)
processing(test, stopwords, lemma)

In [8]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [9]:
from transformers import BertTokenizer
token = BertTokenizer.from_pretrained('bert-large-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [10]:
from transformers import BertModel

class TweetDataset(Dataset):
    def __init__(self, df):
        self.x = df['text']
        self.y = df['target']
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        sentence = self.x[idx]
        return sentence, self.y[idx]
    
train_dataset = TweetDataset(train)

In [11]:
def collate_fn(data):
    sentences = []
    labels = []
    for item in data:
        sentences.append(item[0])
        labels.append(item[1])
    
    data = token.batch_encode_plus(batch_text_or_text_pairs=sentences,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=80,
                                   return_tensors='pt')
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)
    return input_ids, attention_mask, token_type_ids, labels

In [12]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=32,
                          collate_fn=collate_fn,
                          shuffle=True,
                          drop_last=True)

In [13]:
from transformers import BertModel
pretrained = BertModel.from_pretrained('bert-large-uncased')
pretrained.to(device)

for param in pretrained.parameters():
    param.requires_grad_(False)

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# bert
class bert_model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(1024, 2)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids).last_hidden_state[:, 0]
        out = self.fc(out).softmax(dim=1)
        return out

In [15]:
# calculate accuracy
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    prediction = preds.argmax(dim=1)
    correct = (prediction == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [16]:
# model to device
model = bert_model()
model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [17]:
# training model
epoch_loss = 0
epoch_acc = 0
for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, token_type_ids, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss
        
        acc = binary_accuracy(outputs, labels)
        epoch_acc += acc.item()
    
    print("The training loss at epoch {} is {}; The training accuracy is {}".format(epoch, epoch_loss / len(train_loader), 
                                                                                    round(epoch_acc/len(train_loader), 3)))
    epoch_loss = 0
    epoch_acc = 0

100%|██████████| 237/237 [01:49<00:00,  2.16it/s]


The training loss at epoch 0 is 0.5596101880073547; The training accuracy is 0.75


100%|██████████| 237/237 [01:48<00:00,  2.18it/s]


The training loss at epoch 1 is 0.5164142847061157; The training accuracy is 0.792


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 2 is 0.5055453777313232; The training accuracy is 0.805


100%|██████████| 237/237 [01:49<00:00,  2.16it/s]


The training loss at epoch 3 is 0.5004920959472656; The training accuracy is 0.808


100%|██████████| 237/237 [01:48<00:00,  2.17it/s]


The training loss at epoch 4 is 0.49401524662971497; The training accuracy is 0.815


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 5 is 0.49154043197631836; The training accuracy is 0.819


100%|██████████| 237/237 [01:49<00:00,  2.16it/s]


The training loss at epoch 6 is 0.4875645041465759; The training accuracy is 0.824


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 7 is 0.48855310678482056; The training accuracy is 0.82


100%|██████████| 237/237 [01:49<00:00,  2.16it/s]


The training loss at epoch 8 is 0.4846612215042114; The training accuracy is 0.827


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 9 is 0.48334434628486633; The training accuracy is 0.828


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 10 is 0.4804660975933075; The training accuracy is 0.834


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 11 is 0.4774971604347229; The training accuracy is 0.835


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 12 is 0.4768003821372986; The training accuracy is 0.837


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 13 is 0.47495773434638977; The training accuracy is 0.838


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 14 is 0.47551217675209045; The training accuracy is 0.838


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 15 is 0.4714680314064026; The training accuracy is 0.842


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 16 is 0.4708549380302429; The training accuracy is 0.843


100%|██████████| 237/237 [01:49<00:00,  2.16it/s]


The training loss at epoch 17 is 0.47035136818885803; The training accuracy is 0.845


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]


The training loss at epoch 18 is 0.46965038776397705; The training accuracy is 0.845


100%|██████████| 237/237 [01:49<00:00,  2.17it/s]

The training loss at epoch 19 is 0.46813592314720154; The training accuracy is 0.848





# prediction

In [19]:
# make prediction
data = token.batch_encode_plus(batch_text_or_text_pairs=test['text'], truncation=True, padding='max_length', max_length=30, return_tensors='pt')
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
token_type_ids = data['token_type_ids'].to(device)
prediction = model(input_ids, attention_mask, token_type_ids).argmax(dim=1).cpu()

In [20]:
# store result
test['target'] = prediction
submission = test[['id', 'target']]
submission.to_csv('submission_bert.csv', index=False)
print(sum(test['target']))

1172


In [21]:
# calculate test accuracy
import pandas as pd
# Load the target and submission dataframes
target_df = pd.read_csv('./check.csv')
submission_df = pd.read_csv('./submission_bert.csv')
# Merge the dataframes on the 'id' column
merged_df = pd.merge(target_df, submission_df, on='id')
# Calculate the accuracy rate
accuracy_rate = (merged_df['target_x'] == merged_df['target_y']).mean()
print('Accuracy rate:', accuracy_rate)

Accuracy rate: 0.799202942979767
