In [None]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Pick the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using %s" % (device))

# Seed every random number generator used by the notebook with the same value.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
def clean_text(text):
    """Normalise a tweet before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. collapse runs of ``!``, ``?`` and ``.`` to a single character;
      3. drop apostrophes;
      4. replace the HTML entity ``&amp;`` (with or without ``;``) by ``and``;
      5. remove ``t.co`` short links;
      6. strip punctuation other than ``. ! # ?`` (brackets are also left alone);
      7. collapse whitespace and trim both ends;
      8. replace each run of emoji characters with the token ``EMOJI``.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned string.
    """
    text = text.lower()

    text = re.sub(r'[!]+', '!', text)
    text = re.sub(r'[?]+', '?', text)
    text = re.sub(r'[.]+', '.', text)
    text = re.sub(r"'", "", text)
    text = re.sub(r'&amp;?', 'and', text)  # replace & -> and
    text = re.sub(r"https?://t\.co/[A-Za-z0-9]+", "", text)  # Remove URLs

    # Remove some punctuation (except . ! # ?). The hyphen is escaped on
    # purpose: an unescaped ",-/" is a character *range* that also contains
    # "." and would silently delete every period.
    text = re.sub(r'["$%&*+,\-/:;<=>@\\^_`{|}~]+', '', text)

    # Collapse whitespace only after the removals above, because deleting a
    # URL or a punctuation run between two spaces leaves a double space.
    text = re.sub(r'\s+', ' ', text).strip()

    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # covers many non-emoji code points (e.g. CJK); kept as in the original.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('EMOJI', text)

    return text

In [None]:
# Load the competition data.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Clean both splits into a new column so the raw text stays available and
# training and inference see identically pre-processed input.
train['text_new'] = train['text'].apply(clean_text)
test['text_new'] = test['text'].apply(clean_text)

# Use the *cleaned* training text; feeding the raw column here would train the
# model on a different distribution than the cleaned test tweets.
train_texts = list(train["text_new"])
train_labels = list(train["target"])
train_keywords = list(train['keyword'].fillna(''))  # missing keyword -> empty string
res_texts = list(test["text_new"])
res_keywords = list(test['keyword'].fillna(''))

# Hold out 20% of the labelled data for validation; the explicit random_state
# makes the split reproducible regardless of prior global RNG usage.
x_train, x_test, train_label, test_label, train_keyword, test_keyword = train_test_split(
    train_texts, train_labels, train_keywords, test_size=0.2, random_state=seed_val)
print(len(res_texts))

In [None]:
# train[["text", "text_new"]]


In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def _encode(texts):
    """Tokenise a list of strings with truncation and padding enabled."""
    return tokenizer(texts, truncation=True, padding=True)


# Tweet text for the train split, the validation split and the competition test set.
train_encoding = _encode(x_train)
test_encoding = _encode(x_test)
res_encoding = _encode(res_texts)

# Keyword strings for the train and validation splits.
train_keyword_en = _encode(train_keyword)
test_keyword_en = _encode(test_keyword)

In [None]:
# train_keyword_en
# list(train_encoding.items())[1]

In [None]:
class TwitterDataset(Dataset):
    """Labelled dataset pairing text encodings with labels and keyword encodings.

    Args:
        encodings: mapping from field name to a per-example sequence of values.
        labels: sequence of labels, one per example.
        keywords: mapping with the same layout as ``encodings`` for the keywords.
    """

    def __init__(self, encodings, labels, keywords):
        self.encodings = encodings
        self.labels = labels
        self.keywords = keywords

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    @staticmethod
    def _tensor_row(fields, idx):
        """Return ``{name: tensor}`` for example ``idx`` of every field in ``fields``."""
        row = {}
        for name, values in fields.items():
            row[name] = torch.tensor(values[idx])
        return row

    def __getitem__(self, idx):
        # Text fields first, then the label, then the nested keyword fields.
        sample = self._tensor_row(self.encodings, idx)
        sample['labels'] = torch.tensor(self.labels[idx])
        sample['keyword'] = self._tensor_row(self.keywords, idx)
        return sample

# Wrap the train and validation splits (text encodings, labels, keyword encodings).
train_dataset, test_dataset = (
    TwitterDataset(split_encoding, split_label, split_keyword)
    for split_encoding, split_label, split_keyword in (
        (train_encoding, train_label, train_keyword_en),
        (test_encoding, test_label, test_keyword_en),
    )
)

In [None]:
# Sanity check: display the first training example (text tensors, label and nested keyword tensors).
train_dataset[0]

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of integer class labels.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:
# Smoke testing: repeatedly fit a single batch to check that the model, the
# optimiser and the scheduler are wired correctly (the loss should fall and
# the accuracy on that batch should approach 100%).
step_nums = 30

from transformers import DistilBertForSequenceClassification, get_linear_schedule_with_warmup
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

model.to(device)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
# transformers.AdamW is deprecated and no longer shipped by recent releases;
# torch.optim.AdamW is the supported replacement. eps and weight_decay are set
# to the values the transformers implementation used by default.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = step_nums)

# The same batch is reused for every step, so move it to the device only once.
first_batch = next(iter(train_loader))
input_ids = first_batch['input_ids'].to(device)
attention_mask = first_batch['attention_mask'].to(device)
labels = first_batch['labels'].to(device)

for batch_idx in range(step_nums):
    # One optimisation step on the fixed batch.
    model.train()
    optim.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optim.step()
    scheduler.step()

    # test to make sure model can overfit one batch.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    loss = outputs.loss
    logits = outputs.logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    print(f"step: {batch_idx:d}, loss: {loss.item():.2f}, accuracy: {flat_accuracy(logits, label_ids)*100:.2f}%")

In [None]:
# Summarise the model's parameters (name and shape) instead of dumping every
# weight value, which floods the cell output without being readable.
for param_name, param in model.named_parameters():
    print(f"{param_name}: {tuple(param.shape)}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Train and keep the best model
epoth_num=5

from transformers import DistilBertForSequenceClassification, get_linear_schedule_with_warmup
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Validation metrics do not depend on sample order, so no shuffling is needed.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
# transformers.AdamW is deprecated and no longer shipped by recent releases;
# torch.optim.AdamW is the supported replacement. eps and weight_decay are set
# to the values the transformers implementation used by default.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * epoth_num
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; otherwise the "restore" branch could try to load a missing file.
best_accuracy = float("-inf")

for epoch in range(epoth_num):
    print("------------Epoch: %d ----------------" % epoch)

    # ---- training pass ----
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print(f"epoth: {epoch}, iter_num: {iter_num}, loss: {loss.item():.4f}, {iter_num/total_iter*100:.2f}%")

    print(f"Epoch: {epoch}, Average training loss: {total_train_loss/len(train_loader):.4f}")


    print("")
    print("Running Validation...")

    # ---- validation pass ----
    model.eval()
    total_eval_correct = 0
    total_eval_examples = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        total_eval_loss += outputs.loss.item()
        # Count correct predictions per example rather than averaging per-batch
        # accuracies, which would over-weight a smaller final batch.
        predictions = outputs.logits.argmax(dim=1)
        total_eval_correct += (predictions == labels).sum().item()
        total_eval_examples += labels.size(0)

    avg_val_accuracy = total_eval_correct / total_eval_examples

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        print("Best model till now.")
        torch.save(model.state_dict(), './model.weights')
    else:
        # No improvement: roll the weights back to the best checkpoint so far.
        # NOTE(review): optimiser and scheduler state are not rewound here.
        model.load_state_dict(torch.load("./model.weights"))

    print(f"Accuracy: {avg_val_accuracy:.4f}")
    print(f"Average testing loss: {total_eval_loss/len(test_dataloader):.4f}")
    print("-------------------------------")
    

In [None]:
# Reload the checkpoint written by the training loop and switch to inference mode.
print("Restoring the best model weights.")
checkpoint_path = "./model.weights"
model.load_state_dict(torch.load(checkpoint_path))
model.eval()
class TwitterValDataset(Dataset):
    """Unlabelled dataset that serves text encodings only.

    Args:
        encodings: mapping from field name to a per-example sequence of values;
            it must contain an ``'input_ids'`` entry, which defines the length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Build one tensor per field for the requested example.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        return sample
# Batch the competition test set without shuffling so predictions keep the
# order of the encoded texts.
val_dataset = TwitterValDataset(res_encoding)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Collect the arg-max class for every example, batch by batch.
result = []
with torch.no_grad():
    for batch in val_loader:
        batch_logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        batch_preds = np.argmax(batch_logits.cpu().numpy(), axis=1).flatten()
        result.extend(batch_preds)

# Fill the sample submission with the predictions and write it out.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm.
df = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
df['target'] = result
df.to_csv("submission.csv", index=False)