In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
# Choose the compute device once; later cells send tensors to it with .to(device).
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_data = pd.read_csv("../input/nlp-getting-started/train.csv")
test_data = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# How often each keyword value occurs in the training set.
train_data['keyword'].value_counts()

In [None]:
# Number of distinct keyword values.
train_data['keyword'].nunique()


In [None]:
# How often each location value occurs in the training set.
train_data['location'].value_counts()


In [None]:
# Show one raw tweet (the row with index label 4) before any cleaning.
train_data["text"][4]

In [None]:
import seaborn as sns
# `plt` must be the pyplot submodule; `import matplotlib as plt` would bind the
# top-level package, which has no xlim/xlabel/subplots.
import matplotlib.pyplot as plt

# Class balance of the target column. The column is passed by keyword because
# recent seaborn releases no longer accept it as a positional argument.
ax = sns.countplot(x="target", data=train_data)

In [None]:
import string
import re
import nltk
# Fetch NLTK's stop-word corpus and keep the English list;
# clean_text reads this module-level `stopwords` name.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# data preprocessing
def clean_text(text, stop_words=None):
    """Normalise a raw tweet into a space-separated string of plain word tokens.

    Steps, in order: lowercase, strip ``http...`` links, delete ASCII
    punctuation, turn every remaining non-word character into a space, drop
    stop words, then drop every token that contains a digit.

    Parameters
    ----------
    text : str
        Raw tweet text.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the module-level ``stopwords`` list
        is used, so ``Series.apply(clean_text)`` keeps working unchanged.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership checks instead of a list scan per token.
    stop_words = set(stop_words)

    text = text.lower()
    # remove hyperlinks
    text = re.sub(r"http\S+", "", text)
    # remove ASCII punctuation characters in a single pass
    text = text.translate(str.maketrans("", "", string.punctuation))
    # raw string: a plain "\W" literal is an invalid escape sequence
    text = re.sub(r"\W", " ", text)
    # drop stop words and any token that contains a digit
    tokens = [
        word for word in text.split()
        if word not in stop_words and re.search(r"\d", word) is None
    ]
    return " ".join(tokens)

# Add a cleaned copy of every tweet as a new `text_clean` column on both frames.
train_data['text_clean'] = train_data['text'].apply(clean_text)

test_data['text_clean'] = test_data['text'].apply(clean_text)

In [None]:

# Preview the training frame again, now including the text_clean column.
train_data.head()

In [None]:
# NOTE(review): the result of drop() is only displayed, never assigned, so
# train_data itself still contains id/keyword/location/text after this cell.
train_data.drop(['id','keyword','location', 'text'], axis = 1) 


In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# NOTE(review): the result of drop() is only displayed, never assigned, so
# test_data keeps all of its columns (a later cell still reads test_data.text).
test_data.drop(['id','keyword','location', 'text'], axis = 1) 


In [None]:
# Number of rows per target class.
train_data["target"].value_counts()


In [None]:
# Model inputs are the cleaned tweets; labels come from the target column.
data = train_data['text_clean'].values
labels = train_data['target'].values

ELECTRA 

In [None]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification,AdamW
import torch
# Load the pretrained ELECTRA tokenizer and a two-class classification head on top
# of the base discriminator.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator',num_labels=2)
# Move the model to the device selected earlier. A bare model.cuda() raises on a
# CPU-only session, while the batches in the training loop already use .to(device).
model.to(device)


In [None]:
#important to know the max len of each sentence

import matplotlib.pyplot as plt
def plot_sentence_embeddings_length(text_list, tokenizer):
    """Plot a histogram of per-text token counts and return the largest count.

    Each entry of ``text_list`` is split with ``tokenizer.tokenize``; the
    number of resulting tokens per text is drawn as a 60-bin histogram.
    """
    lengths = [len(tokenizer.tokenize(text)) for text in text_list]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(lengths, bins=60)
    ax.set_xlabel("Length of Comment Embeddings")
    ax.set_ylabel("Number of Comments")
    return max(lengths)


# Draw the histogram for the cleaned tweets; the returned maximum token count
# is shown as the cell output.
plot_sentence_embeddings_length(data, tokenizer)

In [None]:
# Length of each tweet's encoded id sequence (encode called with max_length=70).
token_lens = [len(tokenizer.encode(txt, max_length=70)) for txt in data]

# Distribution of those lengths, zoomed to the 0-40 range.
sns.distplot(token_lens)
plt.xlim([0, 40]);
plt.xlabel('Token count')

In [None]:
# Longest encoded sequence observed above.
max(token_lens)

From the graph we can conclude that most tweets have fewer than 30 tokens, so let us take max_len as 38.

In [None]:
# Encode every cleaned tweet to exactly 38 token ids (padded or truncated) plus an
# attention mask. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True` flag and pads in the same way.
indices = tokenizer.batch_encode_plus(
    data,
    max_length=38,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
)

In [None]:
# Inspect which fields the tokenizer returned.
indices.keys()

In [None]:
# Pull the token ids and the matching attention masks out of the encoding.
input_ids=indices["input_ids"]
attention_masks=indices["attention_mask"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for validation (80% train). Ids, masks and labels
# go through one call with one seed, so all three are split on the same rows.
(train_ids, val_ids,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=42, test_size=0.2,
)

In [None]:
# Number of training examples after the split.
len(train_ids)

Convert the data to tensors

In [None]:
# Turn every split (ids, labels, masks) into a torch tensor, keeping the same names.
train_ids, val_ids, train_labels, val_labels, train_masks, val_masks = (
    torch.tensor(split)
    for split in (train_ids, val_ids, train_labels, val_labels, train_masks, val_masks)
)

In [None]:

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#TRAINING DATA

# Use a dedicated name for the tensor dataset: rebinding `train_data` here would
# overwrite the DataFrame of the same name and break earlier cells on re-run.
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
# Shuffle the training examples each epoch and serve them in batches of 32.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,sampler = train_sampler,batch_size = 32)

In [None]:
# Number of batches per training epoch.
len(train_dataloader)

In [None]:
# Scratch check: build an iterator over the training loader and print its
# type and its length. train_iter3 is not used by any later cell shown here.
train_iter3 = iter(train_dataloader)
print(type(train_iter3))

print(len(train_iter3))

In [None]:
#Validation Data

val_data = TensorDataset(val_ids, val_masks, val_labels)
# Evaluation gains nothing from shuffling; a sequential pass keeps the order of
# the collected predictions reproducible, like the prediction loader below.
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data,sampler = val_sampler,batch_size = 32)

In [None]:
# AdamW over all model parameters, with a learning rate of 6e-6.
optimizer = AdamW(model.parameters(),lr = 6e-6, # learning rate used here (run_glue's args.learning_rate default is 5e-5)
eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                 )


In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs.
# NOTE(review): the original comment cited an author recommendation of 2-4
# epochs, while this notebook trains for 5.
epochs = 5

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (linear schedule, no warm-up steps).
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [None]:
import numpy as np


In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    ``preds`` is a 2-D array of per-class scores (one row per example);
    ``labels`` is an array holding the expected class index for each row.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [None]:
def acc_score(y_pred, y_true):
    """Return the share of positions where the prediction equals the true label.

    Positions are compared index by index over ``range(len(y_pred))``.
    """
    total = len(y_pred)
    # count the positions where predicted and actual labels agree
    hits = sum(1 for idx in range(total) if y_pred[idx] == y_true[idx])
    return hits / total

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds into an h:mm:ss string.

    The duration is rounded to the nearest whole second first.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Seed every random number generator in use (Python, NumPy, torch CPU and
# torch CUDA) with the same value so runs are repeatable.
seed_val = 42

for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    apply_seed(seed_val)

In [None]:
# Collects the average training loss values appended by the training loop,
# so they can be plotted afterwards.
loss_values = []


In [None]:
# Fine-tune the model for `epochs` passes over train_dataloader.
# The per-epoch bookkeeping (average loss, loss_values, timing) sits AFTER the
# batch loop; inside it, every step would record a partial sum divided by the
# full batch count and print "Training complete!".
for epoch_i in range(epochs):
    print("epoch is" + str(epoch_i))
    print("training...")
    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Occasional progress line instead of one print per batch.
        if step % 50 == 0 and step != 0:
            print("  step {} of {}".format(step, len(train_dataloader)))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        # Passing labels makes the model return the loss as the first output.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Advance the learning-rate schedule once per optimizer step.
        scheduler.step()

    # Mean loss over all batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

print("Training complete!")


Validation

In [None]:
# Score the fine-tuned model on the held-out validation batches.
model.eval()

# Per-batch logits and labels; the following cells flatten these two lists.
pred = []
true = []

for batch in val_dataloader:
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No labels are passed, so the first output is the logits; no gradients needed.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    pred.append(outputs[0].detach().cpu().numpy())
    true.append(b_labels.to('cpu').numpy())

nb_eval_steps = len(pred)
# Accuracy over ALL validation examples. Averaging per-batch accuracies would
# give the (usually smaller) final batch the same weight as a full batch.
eval_acc = flat_accuracy(np.concatenate(pred), np.concatenate(true))

print("  Accuracy: {0:.2f}".format(eval_acc))

In [None]:
# Flatten the per-batch logits into one row per example, then keep the index of
# the highest-scoring class (0 or 1) for each row.
all_logit_rows = [row for batch_logits in pred for row in batch_logits]
flat_predictions = np.argmax(all_logit_rows, axis=1).flatten()
# Flatten the per-batch label arrays into one list in the same order.
flat_true_labels = [label for batch_labels in true for label in batch_labels]

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and support counts the predictions.
print(classification_report(flat_true_labels, flat_predictions))

Testing

In [None]:
# NOTE(review): although this cell sits under the "Testing" heading, it iterates
# val_dataloader again, so it repeats the validation pass rather than scoring the
# test set. pred/true/eval_acc/nb_eval_steps are reset and recomputed here.
model.eval()

pred = []
true = []
eval_acc = 0
nb_eval_steps = 0

for batch in val_dataloader:
    # Move every tensor of the batch to the selected device.
    batch = tuple(t.to(device) for t in batch)
    
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # No labels are passed; the first output is read as the logits.
        outputs = model(b_input_ids,token_type_ids = None, attention_mask = b_input_mask)
        logits = outputs[0]
        
        # Bring logits and labels back to the CPU as NumPy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        pred.append(logits)
        true.append(label_ids)
        # Accumulate this batch's accuracy; the mean over batches is printed below.
        temp_eval_acc = flat_accuracy(logits,label_ids)
        eval_acc += temp_eval_acc
        nb_eval_steps += 1
        
print("  Accuracy: {0:.2f}".format(eval_acc/nb_eval_steps))

In [None]:
# Predict on the same cleaned text the model was trained on (text_clean), not the
# raw tweets. The DataFrame is left bound to `test_data`, so re-running this cell
# keeps working.
test_texts = test_data['text_clean'].values

# Same encoding settings as the training set; `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True` flag.
indices = tokenizer.batch_encode_plus(
    test_texts,
    max_length=38,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
)

In [None]:
# Token ids and attention masks of the encoded test tweets.
# Note: this rebinds input_ids, which earlier held the training encodings.
input_ids = indices["input_ids"]
att_mask = indices["attention_mask"]

In [None]:
# Convert the test encodings to tensors.
test_ids = torch.tensor(input_ids)
test_mask = torch.tensor(att_mask)

In [None]:
# Batch size for inference.
batch_size = 32


# Wrap ids and masks in a dataset and iterate it in order (SequentialSampler),
# so the predictions come out in the same order as the encoded test rows.
prediction_data = TensorDataset(test_ids, test_mask)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


In [None]:
# Number of test examples to predict.
len(test_ids)

In [None]:
# Run the model over the test batches and collect the logits of every batch.
model.eval()

predictions = []

for batch in prediction_dataloader:
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # Inference only: no gradients, no labels; the first output holds the logits.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    predictions.append(outputs[0].detach().cpu().numpy())

# One logit row per test example, then the index of the highest-scoring class.
all_logit_rows = [row for batch_logits in predictions for row in batch_logits]
flat_predictions = np.argmax(all_logit_rows, axis=1).flatten()

In [None]:
# Build the submission: ids come from the sample submission file, targets from
# the predicted classes, paired by position.
sample_sub = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
submission_columns = {
    'id': sample_sub['id'].values.tolist(),
    'target': flat_predictions,
}
submit = pd.DataFrame(submission_columns)
submit.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
submit.head()