In [None]:
!pip install transformers

In [None]:
!pip install torch torchtext torchaudio
!pip install pandas

In [None]:
!nvidia-smi

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset,random_split,DataLoader,RandomSampler,SequentialSampler
import tensorflow as tf
import random
from transformers import AdamW 
from transformers import BertForSequenceClassification, BertConfig
from transformers import BertTokenizer as bt
from transformers import get_linear_schedule_with_warmup
import time, datetime

In [None]:
# Load the training data from train.csv and report how many rows it has.
df = pd.read_csv("train.csv")
print('Number of training sentences: ', len(df))
# Display five randomly sampled rows as a quick sanity check.
df.sample(5)

In [None]:
# Show ten randomly drawn tweets from the negative class (target == 0)
negative_texts = df.loc[df.target == 0, "text"]
for tweet_text in negative_texts.sample(10).values:
    print(tweet_text)

In [None]:
# Print some positive sample tweets (rows with target == 1)
for txt in df[df.target==1].text.sample(10).values:
  print(txt)

In [None]:
# Count the rows whose `text` value is missing.
df.text.isna().sum()

In [None]:
# Class balance: percentage of rows carrying each target value
n_rows = len(df)
n_positive = len(df[df.target == 1])
n_negative = len(df[df.target == 0])
print("Positive data: {:.2f}%".format(n_positive * 100 / n_rows))
print("Negative data: {:.2f}%".format(n_negative * 100 / n_rows))

In [None]:
# Pull the text and target columns out of the frame as arrays.
tweets = df.text.values
labels = df.target.values

In [None]:
# Collect the tweets containing an "http://" link once, then report count and share
with_link = [t for t in tweets if "http://" in t]
print("{} out of {} tweets have a http://... link within itself. ({:.2f}%)".format(
    len(with_link), len(tweets), len(with_link) * 100 / len(tweets)))

In [None]:
# Show the first two tweets that contain an "http://" URL.
[t for t in tweets if "http://" in t][:2]

In [None]:
# First five tweets whose last whitespace-separated token contains an "http://" URL.
# The first condition also guarantees that split() is non-empty before indexing [-1].
[t for t in tweets if "http://" in t and "http://" in t.split()[-1]][:5]

In [None]:
# Count the tweets that contain an "@", then show the first five of them
tweets_with_at = [t for t in tweets if "@" in t]
print(len(tweets_with_at))
tweets_with_at[:5]

In [None]:
# Number and share of tweets containing an "@"
mention_count = sum(1 for t in tweets if "@" in t)
print("{} out of {} tweets have a @ user_id tag within itself. ({:.2f}%)".format(
    mention_count, len(tweets), mention_count * 100 / len(tweets)))

In [None]:
# Share of tweets containing "@", computed separately for each target class
pos_texts = df[df['target'] == 1]['text']
neg_texts = df[df['target'] == 0]['text']
pos_with_at = [t for t in pos_texts if "@" in t]
neg_with_at = [t for t in neg_texts if "@" in t]
print("percentage of POSITIVE samples containing @user_id tag: {:.2f}%".format(len(pos_with_at) * 100 / len(pos_texts)))
print("percentage of NEGATIVE samples containing @user_id tag: {:.2f}%".format(len(neg_with_at) * 100 / len(neg_texts)))

In [None]:
# Number and share of tweets containing a "#"
hashtag_count = sum(1 for t in tweets if "#" in t)
print("{} out of {} tweets has # tag within itself ({:.2f}%)".format(
    hashtag_count, len(tweets), hashtag_count * 100 / len(tweets)))

In [None]:
# Share of tweets containing "#", computed separately for each target class
pos_texts = df[df['target'] == 1]['text']
neg_texts = df[df['target'] == 0]['text']
pos_with_hash = [t for t in pos_texts if "#" in t]
neg_with_hash = [t for t in neg_texts if "#" in t]
print("percentage of POSITIVE samples containing # hash_tag: {:.2f}%".format(len(pos_with_hash) * 100 / len(pos_texts)))
print("percentage of NEGATIVE samples containing # hash_tag: {:.2f}%".format(len(neg_with_hash) * 100 / len(neg_texts)))

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer with lower-casing enabled.
tokenizer = bt.from_pretrained("bert-base-uncased",do_lower_case=True) 

In [None]:
# Tokenize the first tweet once and show the tweet, its tokens and their vocabulary ids
first_tokens = tokenizer.tokenize(tweets[0])
print(' Tweets and Labels: ', tweets[0], labels[0])
print('Bert Tokenizer output: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

In [None]:
# Replace every whitespace-separated word that contains "http://" with the
# placeholder "http"; all other words are kept and re-joined with single spaces.
tweets = [
    " ".join("http" if 'http://' in word else word for word in t.split())
    for t in tweets
]
tweets[-1]

In [None]:
# Inspect one tweet (17th from the end), its label and its tokenization.
# NOTE(review): if the URL-masking cell has already run, `tweets` holds the
# masked text here, so the 'Original' label refers to that version.
print(' Original: ', tweets[-17])
print('   Target: ', labels[-17])
print('Tokenized: ', tokenizer.tokenize(tweets[-17]))

In [None]:
# Swap every "@" for a space inside each word, then re-join the words with single
# spaces (str.replace leaves words without an "@" untouched).
tweets = [" ".join(word.replace("@", " ") for word in t.split()) for t in tweets]
tweets[-4]

In [None]:
# Encode every tweet to token ids and measure lengths in *tokens*, which is the
# unit the tokenizer's max_length refers to. Measuring len() of the raw strings
# would count characters instead and overstate the sequence length needed.
enc_tweets = [tokenizer.encode(t) for t in tweets]
lens = np.array([len(ids) for ids in enc_tweets])

print("# of Sentences:",len(tweets))
print("Max Sentence Length:",max(lens))
print("Average Sentence Length:",np.mean(lens))
print("Median Sentence Length:",np.median(lens))

In [None]:
# Bar chart of the length distribution: one bar per distinct length.
# np.unique returns the sorted distinct values and their counts in a single pass,
# replacing a hand-written count that rescanned `lens` once per distinct value.
unique, count = np.unique(lens, return_counts=True)
plt.bar(unique, count)
plt.xlabel("Length")
plt.ylabel("Number of tweets")
plt.title("Distribution of tweet lengths")
plt.show()

In [None]:
def encode(sentences,labels,tokenize,max_len):
    """Tokenize sentences into fixed-length id and attention-mask tensors.

    Args:
        sentences: iterable of strings to encode.
        labels: sequence of labels, converted with ``torch.tensor``.
        tokenize: tokenizer object exposing ``encode_plus``.
        max_len: length every encoded sentence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)``. The first two are the
        per-sentence tensors concatenated along dimension 0.
    """
    encode_dicts = [
        tokenize.encode_plus(sentence,                    # sentence to encode
                             add_special_tokens=True,     # add the special start/separator tokens
                             max_length=max_len,          # target length
                             padding='max_length',        # pad shorter sentences up to max_len
                             truncation=True,             # cut longer sentences down to max_len
                             return_attention_mask=True,  # also build the attention mask
                             return_tensors='pt')         # return PyTorch tensors
        for sentence in sentences
    ]
    # Without truncation an over-long sentence would produce a wider row and the
    # concatenation below would fail on mismatched sizes.
    input_ids = torch.cat([d['input_ids'] for d in encode_dicts],dim=0)
    attention_mask = torch.cat([d['attention_mask'] for d in encode_dicts],dim=0)
    labels = torch.tensor(labels)

    return input_ids,attention_mask,labels

In [None]:
# Encode all tweets to a fixed length of 160.
# Note: this rebinds `labels` to the tensor returned by encode().
input_ids,attention_masks,labels = encode(tweets,labels,tokenizer,max_len=160)

In [None]:
# Compare the second tweet's text with its encoded id tensor.
print('Original: ', tweets[1],'\n')
print('Token IDs:', input_ids[1])

In [None]:
# Print the encoded length of the first tweet, then map its first 30 ids back to tokens.
print(len(input_ids[0]))
tokenizer.convert_ids_to_tokens(input_ids[0][:30])

In [None]:
def makeDataLoader(input_ids,attention_masks,labels,split=True,batch_size=32,train_frac=0.6):
    """Wrap the encoded tensors in DataLoader objects.

    Args:
        input_ids: tensor of token ids, one row per example.
        attention_masks: tensor of attention masks aligned with ``input_ids``.
        labels: tensor of labels aligned with ``input_ids``.
        split: when truthy, randomly split into training and validation parts
            and return two loaders; otherwise return one loader over everything.
        batch_size: batch size used by every loader (default 32).
        train_frac: fraction of examples assigned to the training part when
            ``split`` is truthy (default 0.6); must lie strictly between 0 and 1.

    Returns:
        ``(train_dataloader, valid_dataloader)`` when ``split`` is truthy,
        otherwise a single ``train_dataloader``.

    Raises:
        ValueError: if ``split`` is truthy and ``train_frac`` is not in (0, 1).
    """
    dataset = TensorDataset(input_ids,attention_masks,labels)

    if not split:
        # Whole dataset, visited in random order.
        return DataLoader(dataset,sampler=RandomSampler(dataset),batch_size=batch_size)

    if not 0.0 < train_frac < 1.0:
        raise ValueError("train_frac must be strictly between 0 and 1, got {}".format(train_frac))

    train_size = int(train_frac*len(dataset))
    val_size = len(dataset)-train_size
    trainData,valData = random_split(dataset,[train_size,val_size])

    # Training batches are drawn in random order; validation keeps a fixed order.
    train_dataloader = DataLoader(trainData,sampler=RandomSampler(trainData),batch_size=batch_size)
    valid_dataloader = DataLoader(valData,sampler=SequentialSampler(valData),batch_size=batch_size)
    return train_dataloader,valid_dataloader

In [None]:
# Build the training and validation loaders (split defaults to True in makeDataLoader).
train_dataloader, validation_dataloader = makeDataLoader(input_ids, attention_masks, labels)

In [None]:
# Load the pretrained checkpoint with a two-label classification head;
# attention weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# NOTE(review): parameters() returns an iterator, so this cell displays only the
# iterator object rather than the weights themselves.
model.parameters()

In [None]:
# Note: AdamW is a class from the huggingface library (not pytorch) - 'W' = 'Weight Decay fix'
optimizer = AdamW(
                    model.parameters(),
                    lr = 5e-5,         # learning rate chosen for fine-tuning
                    eps = 1e-8,        # epsilon term passed to the optimizer
                    no_deprecation_warning=True         # suppress the deprecation warning of this class
                )

In [None]:
def flat_accuracy(preds,labels):
    """Return the fraction of rows in ``preds`` whose arg-max column equals the label.

    ``preds`` is a 2-D array of per-class scores; ``labels`` is a NumPy array
    holding one label per row.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

In [None]:
def rand_seed(seed_val=50):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed_val: seed applied to all three generators. Defaults to 50, the
            value this notebook has always used, so ``rand_seed()`` is unchanged.
    """
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

In [None]:
def format_time(elapsed):
    """Format a duration in seconds as a ``timedelta`` string, rounded to whole seconds."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))

In [None]:
# Number of batches in the training and validation loaders.
len(train_dataloader), len(validation_dataloader)

In [None]:
def _forward_batch(model, batch, output_hidden):
    """Run one forward pass over ``batch``.

    Returns ``(loss, logits, hidden_states, labels)``. ``hidden_states`` is
    ``None`` unless ``output_hidden`` is truthy; ``labels`` is the label tensor
    after being moved to the model's device.
    """
    # Put the inputs on whichever device holds the weights so the same loop
    # works whether or not the model has been moved to a GPU.
    device = next(model.parameters()).device
    input_ids, attn_masks, labels = (t.to(device) for t in batch)
    # A single call provides both loss and logits. Reading them from two
    # separate calls doubles the cost and, with dropout active, pairs a loss
    # and logits that come from different stochastic passes.
    outputs = model(input_ids,
                    token_type_ids=None,
                    attention_mask=attn_masks,
                    labels=labels,
                    output_hidden_states=bool(output_hidden),
                    return_dict=True)
    hidden = outputs.hidden_states if output_hidden else None
    return outputs.loss, outputs.logits, hidden, labels


def train_bert(train_loader,val_loader,model,optimizer,n_epochs,output_hidden=0):
    """Fine-tune ``model`` and optionally evaluate it after every epoch.

    Args:
        train_loader: loader yielding ``(input_ids, attention_mask, labels)`` batches.
        val_loader: loader of the same shape used for validation, or ``None``
            to skip validation.
        model: model called with ``input_ids``, ``attention_mask`` and ``labels``
            whose output exposes ``loss`` and ``logits``.
        optimizer: optimizer already bound to ``model``'s parameters.
        n_epochs: number of passes over ``train_loader``.
        output_hidden: when truthy, request hidden states and collect the last
            layer's states for every training batch of the final epoch.

    Returns:
        ``training_stats`` — a list with one dict per epoch holding the training
        metrics and, when validation ran, the validation metrics — or
        ``(training_stats, hidden_states)`` when ``output_hidden`` is truthy.
    """
    rand_seed()
    total_time = time.time()
    training_stats = []
    hidden_states = []

    n_tr_btchs = len(train_loader)
    # Linear decay of the learning rate over all optimisation steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=n_tr_btchs*n_epochs)

    for i in range(n_epochs):
        start_time = time.time()
        total_train_loss,total_train_accuracy = 0,0
        model.train()

        for batch in train_loader:
            model.zero_grad()
            loss, logits, h, labels = _forward_batch(model, batch, output_hidden)
            # Only the final epoch's states are kept, so skip the host copy otherwise.
            if output_hidden and i == n_epochs-1:
                hidden_states.append(h[-1].detach().cpu().numpy())
            total_train_loss += loss.item()
            total_train_accuracy += flat_accuracy(logits.detach().cpu().numpy(),labels.detach().cpu().numpy())
            loss.backward()
            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
            optimizer.step()
            scheduler.step()

        train_time = format_time(time.time() - start_time)
        print("Epoch: {}/{}".format((i+1), n_epochs),
              "  Train loss: {0:.4f}".format(total_train_loss/n_tr_btchs),
              "  Train Acc: {0:.4f}".format(total_train_accuracy/n_tr_btchs),
              "  ({:})".format(train_time))
        # list.append takes exactly one item, so the epoch summary is stored as one dict.
        epoch_stats = {'Epoch': i+1,
                       'Train Loss': total_train_loss/n_tr_btchs,
                       'Train Acc': total_train_accuracy/n_tr_btchs,
                       'Training Time': train_time}
        training_stats.append(epoch_stats)

        if val_loader is not None:
            n_valid_btchs = len(val_loader)
            start_time = time.time()
            model.eval()
            total_eval_accuracy,total_eval_loss = 0,0

            for batch in val_loader:
                with torch.no_grad():
                    # Hidden states are never used during validation, so they are not requested.
                    loss, logits, _, labels = _forward_batch(model, batch, False)
                total_eval_loss += loss.item()
                total_eval_accuracy += flat_accuracy(logits.detach().cpu().numpy(),labels.detach().cpu().numpy())

            valid_time = format_time(time.time()-start_time)
            print("Valid Loss: {0:.4f}".format(total_eval_loss/n_valid_btchs),
                  "Valid Acc: {0:.4f}".format(total_eval_accuracy/n_valid_btchs),
                  "({:})".format(valid_time))

            epoch_stats.update({'Valid. Loss': total_eval_loss/n_valid_btchs,
                                'Valid. Acc': total_eval_accuracy/n_valid_btchs,
                                'Validation Time': valid_time})

    print("\nTraining complete.")
    print("Duration: {:} (h:mm:ss)".format(format_time(time.time()-total_time)))

    if output_hidden:
        return training_stats, hidden_states
    else:
        return training_stats

In [None]:
# Train for two epochs, evaluating on the validation loader after each epoch.
training_stats = train_bert(train_dataloader, validation_dataloader, 
                            model=model, optimizer=optimizer, 
                            n_epochs=2,output_hidden=0)

In [None]:
# Rebuild the training loader over the whole dataset (split=0 disables the train/validation split).
train_dataloader = makeDataLoader(input_ids, attention_masks, labels, split=0)

In [None]:
# Load the pretrained checkpoint again; this rebinds `model`, so the next
# training run does not continue from the previously fine-tuned weights.
model = BertForSequenceClassification.from_pretrained(
              "bert-base-uncased",          # 12-layer BERT base model, w/ uncased vocab
              num_labels = 2,               # number of output labels (2 for binary classification)  
              output_attentions = False,    # Whether the model returns attentions weights.
              output_hidden_states = False, # Whether the model returns all hidden-states.
        )

In [None]:
# Create a new optimizer bound to the parameters of the current `model`.
optimizer = AdamW(  model.parameters(), lr = 5e-5, eps = 1e-8)

In [None]:
# Train for two epochs with no validation loader (None is passed as val_loader).
training_stats = train_bert(train_dataloader, None,
                            model=model, optimizer=optimizer, 
                            n_epochs=2)

# Preparing Test Data for Prediction 

In [None]:
test_df = pd.read_csv("test.csv")
test_sentences = test_df.text.values

# Apply the same clean-up that the training tweets received, so the model sees
# test text in the form it was trained on:
#   1. any word containing "http://" becomes the placeholder "http";
#   2. "@" is replaced by a space inside each word, and words are re-joined with
#      single spaces (joining on "" would glue the whole tweet into one string).
# `('http://' or 'https://')` evaluates to just 'http://', so it is written out
# plainly here. "https://" links are left untouched on purpose because the
# training clean-up does not mask them either; extend both places together.
test_sentences = [" ".join("http" if 'http://' in word else word for word in t.split()) for t in test_sentences]
test_sentences = [" ".join(word.replace("@", " ") for word in t.split()) for t in test_sentences]

# Token-level lengths of the cleaned test tweets.
test_encoded_sentences = [tokenizer.encode(sentence) for sentence in test_sentences]
test_sent_lens = np.array([len(s) for s in test_encoded_sentences])

print("# of sentences:",len(test_sentences))
print('Max Sentence Length',max(test_sent_lens))
print('Avg sentence Length',np.mean(test_sent_lens))
print('Median sentence Length',np.median(test_sent_lens))

In [None]:
# Encode each test sentence to a fixed length of 100 tokens.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True keeps over-long sentences from producing wider rows that
# would break the concatenation below.
test_encoded_dicts = [tokenizer.encode_plus(sent,
                                            add_special_tokens=True,
                                            max_length=100,
                                            padding='max_length',
                                            truncation=True,
                                            return_attention_mask=True,
                                            return_tensors='pt') for sent in test_sentences]
# Note: this rebinds `input_ids` and `attention_masks` to the test tensors.
input_ids = torch.cat([d['input_ids'] for d in test_encoded_dicts],dim=0)
attention_masks = torch.cat([d['attention_mask'] for d in test_encoded_dicts],dim=0)
prediction_data = TensorDataset(input_ids,attention_masks)
# DataLoader's keyword is `sampler`; a sequential sampler keeps predictions in
# the same order as the rows of the test file.
prediction_dataloader = DataLoader(dataset=prediction_data,
                                   sampler=SequentialSampler(prediction_data),
                                   batch_size=32)

In [None]:
# Number of batches in the prediction loader.
len(prediction_dataloader)

In [None]:
print('Predicting Labels for {:} Test Sentences'.format(len(input_ids)))
model.eval()
# Run inference on whichever device currently holds the model's weights.
device = next(model.parameters()).device
predictions,true_labels = [],[]
for batch in prediction_dataloader:
    b_input_ids,b_input_mask = (t.to(device) for t in batch)
    with torch.no_grad():
        logits = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask)['logits']
    # Keep the logits of every row in the batch; indexing [0] here would keep
    # only the first sentence of each batch and drop the rest.
    predictions.append(logits.detach().cpu().numpy())

print('Predictions Done!....')

In [None]:
# Stack the per-batch arrays along axis 0, then take the arg-max along axis 1
# to obtain one predicted class index per row.
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

In [None]:
# Print the array of predicted class indices.
print(flat_predictions)