In [None]:
# Colab environment setup: install SentencePiece, Hugging Face transformers and PyTorch Lightning.
# NOTE(review): none of these installs is version-pinned, and Lightning is built from the moving
# `master` branch, so a re-run may resolve to different (possibly incompatible) versions —
# consider pinning exact versions.
!pip install SentencePiece
!pip install transformers
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade

Collecting git+https://github.com/PytorchLightning/pytorch-lightning.git@master
  Cloning https://github.com/PytorchLightning/pytorch-lightning.git (to revision master) to /tmp/pip-req-build-mn37_fku
  Running command git clone -q https://github.com/PytorchLightning/pytorch-lightning.git /tmp/pip-req-build-mn37_fku
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [None]:
import pandas as pd
import numpy as np
import re
import sklearn
import sklearn.model_selection
import transformers
import torch
import torch.nn as nn
import pytorch_lightning as pl
from google.colab import drive
from torch.utils.data import random_split, DataLoader, TensorDataset, RandomSampler
from transformers import AlbertTokenizer

In [None]:
# Mount Google Drive so the CSV files under /content/drive can be read below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# trd: training CSV, ted: test CSV, both read from the mounted Drive folder.
# NOTE(review): paths are hard-coded to one Drive layout — adjust before running elsewhere.
trd = pd.read_csv('/content/drive/My Drive/bert-tf/Khilnani_LP_train_data.csv')
ted = pd.read_csv('/content/drive/My Drive/bert-tf/Khilnani_LP_test_data.csv')

In [None]:
# Summary statistics of the numeric columns of the training frame.
trd.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


In [None]:
# Peek at the first three tweets, the column index, and the first three labels.
trd['tweet'].head(3), trd.columns, trd['label'].head(3)

(0     @user when a father is dysfunctional and is s...
 1    @user @user thanks for #lyft credit i can't us...
 2                                  bihday your majesty
 Name: tweet, dtype: object,
 Index(['id', 'label', 'tweet'], dtype='object'),
 0    0
 1    0
 2    0
 Name: label, dtype: int64)

In [None]:
# Summary statistics and row count of the test frame.
ted.describe(), len(ted)

(                 id
 count  17197.000000
 mean   40561.000000
 std     4964.490625
 min    31963.000000
 25%    36262.000000
 50%    40561.000000
 75%    44860.000000
 max    49159.000000, 17197)

In [None]:
# Compare the column layout of the two frames: first train row next to the first five test rows.
trd.head(1), ted.head(5)

(   id  label                                              tweet
 0   1      0   @user when a father is dysfunctional and is s...,
       id                                              tweet
 0  31963  #studiolife #aislife #requires #passion #dedic...
 1  31964   @user #white #supremacists want everyone to s...
 2  31965  safe ways to heal your #acne!!    #altwaystohe...
 3  31966  is the hp and the cursed child book up for res...
 4  31967    3rd #bihday to my amazing, hilarious #nephew...)

In [None]:
def remove_mentions(tweets_list):
  """Return a new list with every ``@handle`` mention stripped from each tweet.

  A mention is an ``@`` followed by word characters that sits at the start of
  the text or right after a whitespace character; that leading whitespace is
  removed together with the mention. Each result is also trimmed of
  surrounding whitespace.

  Args:
    tweets_list: iterable of tweet strings.

  Returns:
    A list of cleaned strings, in the same order as the input.
  """
  mention = re.compile(r'(\s|^)@\w+')
  return [mention.sub('', tweet).strip() for tweet in tweets_list]

In [None]:
# Strip @-mentions from both splits and collect the training labels as plain Python ints.
# The test frame only has `id` and `tweet` columns, so there are no test labels to extract.
test_data = remove_mentions(ted['tweet'])
train_full = remove_mentions(trd['tweet'])
trlabels = list(map(int, trd['label']))

In [None]:
class DataModule(pl.LightningDataModule):
  """Tokenizes tweets with the ALBERT tokenizer and serves train/val/test dataloaders.

  Args:
    train_data: sequence of raw training texts.
    train_labels: sequence of integer class labels aligned with ``train_data``.
    test_data: sequence of raw, unlabeled test texts.
    batch_size: number of samples per batch for every dataloader.
    max_length: token length every text is padded/truncated to.
    train_fraction: share of the labeled data used for training; the rest
      becomes the validation split.
    seed: seed of the generator that draws the train/validation split.
  """

  def __init__(self, train_data, train_labels,
               test_data, batch_size=32, max_length=64,
               train_fraction=0.9, seed=42):
    super().__init__()
    self.train_data = train_data
    self.train_labels = train_labels
    self.test_data = test_data
    self.tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    self.batch_size = batch_size
    self.max_length = max_length
    self.train_fraction = train_fraction
    self.seed = seed
    # Set once the datasets are built so a repeated setup() call is a no-op.
    self._datasets_ready = False

  def _encode(self, texts):
    """Tokenizes ``texts`` into fixed-length PyTorch tensors."""
    return self.tokenizer(
        list(texts),                  # Accept any sequence (list, pandas Series, ...).
        add_special_tokens=True,      # Add '[CLS]' and '[SEP]'.
        max_length=self.max_length,   # Pad & truncate all sentences.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,   # Construct attn. masks.
        return_token_type_ids=True,   # Needed below; make it explicit.
        return_tensors='pt')          # Return pytorch tensors.

  def setup(self, stage=None):
    """Builds ``train_dataset``, ``val_dataset`` and ``test_dataset``.

    Args:
      stage: stage name passed by the Lightning trainer. It is accepted so the
        trainer's ``setup(stage=...)`` call works, but all three datasets are
        always built regardless of its value.

    Raises:
      ValueError: if ``train_data`` and ``train_labels`` differ in length.
    """
    if self._datasets_ready:
      # The trainer may call setup() again after a manual call; tokenizing
      # the whole corpus twice would only waste time.
      return
    if len(self.train_data) != len(self.train_labels):
      raise ValueError(
          'train_data and train_labels must have the same length, got '
          '{} and {}'.format(len(self.train_data), len(self.train_labels)))

    train_enc = self._encode(self.train_data)
    test_enc = self._encode(self.test_data)

    labeled_dataset = TensorDataset(
        train_enc['input_ids'],
        train_enc['attention_mask'],
        train_enc['token_type_ids'],
        torch.tensor(self.train_labels))
    train_size = int(self.train_fraction * len(labeled_dataset))
    val_size = len(labeled_dataset) - train_size
    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))

    # Seeded generator keeps the train/validation split identical across runs.
    self.train_dataset, self.val_dataset = random_split(
        labeled_dataset, [train_size, val_size],
        generator=torch.Generator().manual_seed(self.seed))

    # Test batches carry no label tensor (3 tensors instead of 4).
    self.test_dataset = TensorDataset(
        test_enc['input_ids'],
        test_enc['attention_mask'],
        test_enc['token_type_ids'])
    self._datasets_ready = True

  def train_dataloader(self):
    """Returns the training loader; samples are reshuffled every epoch."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True)

  def val_dataloader(self):
    """Returns the validation loader in fixed (unshuffled) order."""
    return DataLoader(
        self.val_dataset,
        batch_size=self.batch_size,
        shuffle=False)

  def test_dataloader(self):
    """Returns the test loader in dataset order.

    Keeping the original order lets predictions be matched back to the rows
    of ``test_data``.
    """
    return DataLoader(
        self.test_dataset,
        batch_size=self.batch_size,
        shuffle=False)

In [None]:
# Build the data module from the cleaned tweets and tokenize everything up front.
dls = DataModule(train_data=train_full,
                 train_labels=trlabels,
                 test_data=test_data)
dls.setup()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
28,765 training samples
3,197 validation samples


In [None]:
# Pull one validation batch and show the shape of each tensor in it.
a = next(iter(dls.val_dataloader()))
tuple(t.shape for t in a)

(torch.Size([32, 64]),
 torch.Size([32, 64]),
 torch.Size([32, 64]),
 torch.Size([32]))

In [None]:
from transformers import AlbertModel

In [None]:
class HSModel(pl.LightningModule):
  """ALBERT encoder followed by a small two-class classification head.

  The head is Linear -> ReLU -> Dropout(0.5) -> Linear(2) applied to the
  encoder's pooled output, trained with negative log-likelihood on
  log-softmax outputs.
  """

  def __init__(self, *args, **kwargs):
    # Positional/keyword arguments are accepted for call compatibility but unused.
    super().__init__()
    self.model = AlbertModel.from_pretrained('albert-base-v2')
    # Size the head from the encoder config instead of hard-coding the width.
    hidden_size = self.model.config.hidden_size
    self.loss_fn = nn.NLLLoss()
    self.linear1 = nn.Linear(hidden_size, hidden_size)
    self.relu1 = nn.ReLU()
    self.dropout1 = nn.Dropout(0.5)
    self.linear2 = nn.Linear(hidden_size, 2)

  def forward(self, batch):
    """Runs the encoder and head on one batch.

    Args:
      batch: sequence ``(input_ids, attention_mask, token_type_ids[, labels])``;
        the labels tensor is optional.

    Returns:
      ``(loss, log_probs)`` where ``log_probs`` holds per-class
      log-probabilities for each sample and ``loss`` is the NLL loss, or
      ``None`` when the batch carries no labels.
    """
    input_ids = batch[0]
    attention_mask = batch[1]
    token_type_ids = batch[2]
    labels = batch[3] if len(batch) > 3 else None

    outs = self.model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids)

    hidden = self.linear1(outs['pooler_output'])
    hidden = self.relu1(hidden)
    hidden = self.dropout1(hidden)
    scores = self.linear2(hidden)
    # Normalize over the class dimension explicitly; relying on the implicit
    # dimension is deprecated and emits a warning.
    log_probs = nn.functional.log_softmax(scores, dim=1)
    loss = None
    if labels is not None:
      # NLLLoss expects log-probabilities, which is what log_softmax produced.
      loss = self.loss_fn(log_probs, labels)
    return loss, log_probs

  def training_step(self, batch, idx):
    """Returns the training loss for one batch and logs it as ``train_loss``."""
    loss, _ = self(batch)
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, idx):
    """Returns the validation loss for one batch and logs it as ``val_loss``."""
    loss, _ = self(batch)
    self.log('val_loss', loss, prog_bar=True)
    return loss

  def configure_optimizers(self):
    """Adam over all parameters (encoder and head) with a learning rate of 1e-5."""
    return torch.optim.Adam(self.parameters(), lr=1e-5)

In [None]:
# Seed Python, NumPy and torch RNGs so weight init and batch order are repeatable.
pl.seed_everything(42)
m = HSModel()
# `gpus=1` fails on CPU-only runtimes and is not accepted by recent Lightning releases;
# select the accelerator explicitly and fall back to CPU when no GPU is present.
trainer = pl.Trainer(
    max_epochs=3,
    logger=True,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1)
trainer.fit(m, datamodule=dls)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | 

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  f'Your {mode}_dataloader has `shuffle=True`, it is best practice to turn'




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [None]:
# Display the module tree of the model (long output).
m

HSModel(
  (model): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# evaluation
m.eval()
val_dl = dls.val_dataloader()

In [None]:
# validation data
vbatch = next(iter(val_dl))
_, vlogits = m(vbatch)
vlogits
vpreds = torch.argmax(vlogits, dim=1)
preds_cpu, true_labels_cpu = vpreds.cpu().numpy(), vbatch[3].cpu().numpy()
hs_indices = np.where(true_labels_cpu==1)
hs_ins = len(hs_indices)
tp_and_fp_indices = np.where(preds_cpu==1)
tp_and_fp_ins = len(tp_and_fp_indices)
tp_ins = sum(preds_cpu[hs_indices] == 1)
print(f'tp_ins: {tp_ins}, tp_and_fp_indices: {tp_and_fp_indices}, {len(tp_and_fp_indices)}, preds: {preds_cpu}')
precision = None
if tp_and_fp_ins > 0:
  precision = tp_ins/tp_and_fp_ins
print(f'precision : {precision}')

tp_ins: 2, tp_and_fp_indices: (array([24, 27]),), 1, preds: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0]
precision : 2.0




In [None]:
# test data
test_dl = dls.test_dataloader()
tbatch = next(iter(test_dl))
_, tlogits = m(tbatch)
tlogits, torch.argmax(tlogits)



(tensor([[-1.7927e-03, -6.3249e+00],
         [-1.0201e-03, -6.8883e+00],
         [-1.2352e-03, -6.6972e+00],
         [-1.2228e-03, -6.7072e+00],
         [-1.0712e+00, -4.1945e-01],
         [-3.2001e-03, -5.7462e+00],
         [-8.1207e-03, -4.8174e+00],
         [-8.9748e-04, -7.0164e+00],
         [-1.0635e-03, -6.8468e+00],
         [-1.5389e-03, -6.4775e+00],
         [-1.3275e-03, -6.6251e+00],
         [-2.2297e-03, -6.1070e+00],
         [-3.2696e-03, -5.7247e+00],
         [-1.1627e-03, -6.7576e+00],
         [-1.5774e-03, -6.4527e+00],
         [-7.6715e-03, -4.8741e+00],
         [-1.2627e-02, -4.3783e+00],
         [-6.7981e-03, -4.9945e+00],
         [-1.2526e-01, -2.1393e+00],
         [-1.1488e-03, -6.7697e+00],
         [-1.1527e-03, -6.7662e+00],
         [-1.0201e-03, -6.8883e+00],
         [-5.7087e-03, -5.1686e+00],
         [-1.0792e-03, -6.8321e+00],
         [-1.1772e-03, -6.7452e+00],
         [-1.2027e-03, -6.7238e+00],
         [-2.1867e-02, -3.8337e+00],
 