In [None]:
!pip install SentencePiece
!pip install transformers
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade

In [None]:
import pandas as pd
import re
import sklearn
import sklearn.model_selection
import transformers
import torch
import pytorch_lightning as pl
from google.colab import drive
from torch.utils.data import random_split, DataLoader, TensorDataset, RandomSampler
from transformers import AlbertTokenizer

In [None]:
# Mount Google Drive at /content/drive; the CSV files loaded below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the labelled training set and the unlabelled test set on Drive.
train_csv = '/content/drive/My Drive/bert-tf/Khilnani_LP_train_data.csv'
test_csv = '/content/drive/My Drive/bert-tf/Khilnani_LP_test_data.csv'

# Read both files into DataFrames (train first, then test).
trd, ted = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
# Overview of the training frame: summary statistics, one raw tweet (row 3),
# the number of rows, and the column index.
(
    trd.describe(),
    trd.loc[3, 'tweet'],
    trd.shape[0],
    trd.columns,
)

(                 id         label
 count  31962.000000  31962.000000
 mean   15981.500000      0.070146
 std     9226.778988      0.255397
 min        1.000000      0.000000
 25%     7991.250000      0.000000
 50%    15981.500000      0.000000
 75%    23971.750000      0.000000
 max    31962.000000      1.000000,
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 31962,
 Index(['id', 'label', 'tweet'], dtype='object'))

In [None]:
# Overview of the test frame: summary statistics and the number of rows.
(
    ted.describe(),
    ted.shape[0],
)

(                 id
 count  17197.000000
 mean   40561.000000
 std     4964.490625
 min    31963.000000
 25%    36262.000000
 50%    40561.000000
 75%    44860.000000
 max    49159.000000, 17197)

In [None]:
# First training row next to the first five test rows.
(
    trd.iloc[:1],
    ted.iloc[:5],
)

(   id  label                                              tweet
 0   1      0   @user when a father is dysfunctional and is s...,
       id                                              tweet
 0  31963  #studiolife #aislife #requires #passion #dedic...
 1  31964   @user #white #supremacists want everyone to s...
 2  31965  safe ways to heal your #acne!!    #altwaystohe...
 3  31966  is the hp and the cursed child book up for res...
 4  31967    3rd #bihday to my amazing, hilarious #nephew...)

In [None]:
def remove_mentions(tweets_list):
  """Return the tweets with every @-mention removed and outer whitespace trimmed.

  A mention is an ``@`` followed by one or more word characters that sits at
  the start of the string or right after a whitespace character; that leading
  whitespace character is removed together with the mention. An ``@`` in the
  middle of a word (e.g. an e-mail address) is left alone.

  Args:
    tweets_list: iterable of tweet strings.

  Returns:
    A new list of cleaned strings, in the same order as the input.
  """
  mention = re.compile(r'(\s|^)@\w+')
  return [mention.sub('', tweet).strip() for tweet in tweets_list]

In [None]:
# Strip @-mentions from the tweets of both splits.
train_full, test_data = (remove_mentions(frame['tweet']) for frame in (trd, ted))
# Labels as plain Python ints.
trlabels = list(map(int, trd['label']))
# The test frame only has 'id' and 'tweet' columns, so no test labels are built.

In [None]:
class DataModule(pl.LightningDataModule):
  """Lightning data module that tokenises tweets with the ``albert-base-v2`` tokenizer.

  ``setup`` encodes the training and test texts into fixed-length tensors,
  splits the training set 90/10 into train/validation subsets (fixed seed 42)
  and stores the resulting datasets for the three ``*_dataloader`` methods.

  Batch layout: ``(input_ids, attention_mask, token_type_ids, labels)`` for
  train/validation and ``(input_ids, attention_mask, token_type_ids)`` for test.
  """

  def __init__(self, train_data, train_labels,
               test_data, batch_size=32, max_length=64):
    """Store the raw data and load the tokenizer.

    Args:
      train_data: list of training texts.
      train_labels: integer labels, aligned with ``train_data``.
      test_data: list of test texts (no labels).
      batch_size: batch size used by every dataloader.
      max_length: number of tokens every text is padded/truncated to.
    """
    super().__init__()
    self.train_data = train_data
    self.train_labels = train_labels
    self.test_data = test_data
    self.tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    self.batch_size = batch_size
    self.max_length = max_length

  def _encode(self, texts):
    """Tokenise ``texts`` into PyTorch tensors of width ``self.max_length``."""
    return self.tokenizer(
        texts,  # Sentences to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
        max_length=self.max_length,  # Pad & truncate all sentences.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,  # Construct attn. masks.
        return_tensors='pt'  # Return pytorch tensors.
    )

  def setup(self, stage=None):
    """Tokenise the data and build the train/val/test datasets.

    Args:
      stage: stage name passed in by the Lightning Trainer when it invokes
        this hook. It is accepted so the hook matches the signature the
        Trainer calls, but it is not used: all three datasets are always
        built, so calling ``setup()`` by hand keeps working.
    """
    trk = self._encode(self.train_data)
    tek = self._encode(self.test_data)
    print(trk.keys())

    tr_labels = torch.tensor(self.train_labels)
    tr_dataset = TensorDataset(
        trk['input_ids'], trk['attention_mask'], trk['token_type_ids'],
        tr_labels)

    # 90/10 train/validation split; the remainder goes to validation so the
    # two sizes always add up to the full dataset.
    train_size = int(0.9 * len(tr_dataset))
    val_size = len(tr_dataset) - train_size
    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))

    # Fixed generator seed so the split is the same on every run.
    self.train_dataset, self.val_dataset = random_split(
        tr_dataset, [train_size, val_size],
        generator=torch.Generator().manual_seed(42))

    self.test_dataset = TensorDataset(
        tek['input_ids'], tek['attention_mask'], tek['token_type_ids'])

  def train_dataloader(self):
    """Return the training loader; samples are drawn in random order."""
    return DataLoader(
        self.train_dataset,  # The training samples.
        batch_size=self.batch_size,
        shuffle=True)  # Reshuffle every epoch.

  def val_dataloader(self):
    """Return the validation loader; samples are visited in dataset order."""
    return DataLoader(
        self.val_dataset,  # The validation samples.
        batch_size=self.batch_size,
        shuffle=False)  # Evaluation does not need shuffling.

  def test_dataloader(self):
    """Return the test loader; samples are visited in dataset order.

    The order is kept sequential so that the i-th prediction corresponds to
    the i-th entry of ``test_data``.
    """
    return DataLoader(
        self.test_dataset,  # The test samples.
        batch_size=self.batch_size,
        shuffle=False)  # Keep predictions aligned with the input rows.

In [None]:
# Build the data module and run setup() by hand so its loaders can be inspected.
dls = DataModule(
    train_data=train_full,
    train_labels=trlabels,
    test_data=test_data,
)
dls.setup()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
28,765 training samples
3,197 validation samples


In [None]:
# Pull a single validation batch and show the shape of each tensor in it
# (input ids, attention mask, token type ids, labels).
a = next(iter(dls.val_dataloader()))
tuple(part.shape for part in a)

(torch.Size([32, 64]),
 torch.Size([32, 64]),
 torch.Size([32, 64]),
 torch.Size([32]))