In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlpprojinput/train.csv


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
import random
import time
import datetime
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
df = pd.read_csv('../input/nlpprojinput/train.csv')

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def tokenize_map(sentence,labels):       
    input_ids = []
    attention_masks = []
    
    for text in sentence:
        encoded_dict = tokenizer.encode_plus(
                            text,                      
                            add_special_tokens = True, 
                            truncation='longest_first', 
                            max_length = 84,           
                            pad_to_max_length = True, 
                            return_attention_mask = True,  
                            return_tensors = 'pt')
        input_ids.append(encoded_dict['input_ids'])        
        attention_masks.append(encoded_dict['attention_mask'])
            
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0) 
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

In [7]:
labels = df['label'].values
df = df.fillna(' ') 
text = df['text'].values
print(text.shape)
input_ids, attention_masks, labels = tokenize_map(text, labels)

(20800,)




In [8]:
dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.20)
print(len(train_dataset),'training samples', len(val_dataset),  'validation samples')

16640 training samples 4160 validation samples


In [9]:
batch_size = 15

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size)

validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size )

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False,
)
model.to(device)
optimizer = AdamW(model.parameters(),lr = 6e-6,eps = 1e-8 )

cuda:0


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [11]:
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)

In [12]:
def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

In [13]:

training_stats = []
for epoch_i in range(epochs):

    print('Epoch {} / {} '.format(epoch_i + 1, epochs))
    print('Training')
    
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):   
        if step % 100 == 0:
            print("Step completed:", step)
        
        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)
        
        model.zero_grad()        
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)[0]
        logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[1]
        total_train_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_dataloader)            
    print('Average training loss:', avg_train_loss)
    
    #validation
    print('Validation')
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0
    
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
                
        with torch.no_grad():        
            loss = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[0]

            logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[1]
        
        total_eval_loss += loss.item() 
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += calc_accuracy(logits, label_ids)
            
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('Accuracy', avg_val_accuracy)    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print('Loss:', avg_val_loss)
    
    training_stats.append({'epoch': epoch_i + 1,'train_loss': avg_train_loss,'val_loss': avg_val_loss,'val_acc.': avg_val_accuracy})
print("DONE")

Epoch 1 / 3 
Training
Step completed: 0
Step completed: 100
Step completed: 200
Step completed: 300
Step completed: 400
Step completed: 500
Step completed: 600
Step completed: 700
Step completed: 800
Step completed: 900
Step completed: 1000
Step completed: 1100
Average training loss: 0.1390891531834469
Validation
Accuracy 0.9784172661870512
Loss: 0.10225286170421084
Epoch 2 / 3 
Training
Step completed: 0
Step completed: 100
Step completed: 200
Step completed: 300
Step completed: 400
Step completed: 500
Step completed: 600
Step completed: 700
Step completed: 800
Step completed: 900
Step completed: 1000
Step completed: 1100
Average training loss: 0.03466083063148953
Validation
Accuracy 0.982973621103118
Loss: 0.09205385164573959
Epoch 3 / 3 
Training
Step completed: 0
Step completed: 100
Step completed: 200
Step completed: 300
Step completed: 400
Step completed: 500
Step completed: 600
Step completed: 700
Step completed: 800
Step completed: 900
Step completed: 1000
Step completed: 1100


In [15]:
# calculating metrics

from sklearn.metrics import precision_score, recall_score
def calc_f1(preds, labels):    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat)

def calc_precision(preds, labels):    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat)

def calc_recall(preds, labels):    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat) 

model.eval()
accuracy_sum = 0
total_eval_loss = 0
f1_sum = 0
precision_sum = 0
recall_sum = 0
nb_eval_steps = 0
    
for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():        
        loss = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[0]

        logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)[1]

    total_eval_loss += loss.item() 
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    accuracy_sum += calc_accuracy(logits, label_ids)
    f1_sum += calc_f1(logits, label_ids)
    precision_sum += calc_precision(logits, label_ids)
    recall_sum += calc_recall(logits, label_ids)


print('Accuracy',accuracy_sum / len(validation_dataloader))
print('Precision', precision_sum / len(validation_dataloader))
print('Recall', recall_sum / len(validation_dataloader))
print('F1', f1_sum / len(validation_dataloader))
print('Loss:', total_eval_loss / len(validation_dataloader))

Accuracy 0.9848920863309357
Precision 0.9913967527276881
Recall 0.9913967527276881
F1 0.9833987518233149
Loss: 0.09388610958160452
