## Installing necessary dependencies

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 20.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling 

In [None]:
!nvidia-smi

Wed Aug  3 15:26:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## T5 fine-tuning

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import tensorflow as tf
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch with the same value.

  When CUDA is available, every CUDA device is seeded as well.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

## Model

We'll be using the awesome [pytorch-lightning](https://github.com/PytorchLightning/pytorch-lightning) library for training. Most of the below code is adapted from here https://github.com/huggingface/transformers/blob/master/examples/lightning_base.py

The trainer is generic and can be used for any text-2-text task. You'll just need to change the dataset. Rest of the code will stay unchanged for all the tasks.


In [None]:
class T5FineTuner(pl.LightningModule):
  """LightningModule that fine-tunes a T5 model on a text-to-text dataset.

  `hparams` is an `argparse.Namespace`-like object. The fields read by this class are:
  `model_name_or_path`, `tokenizer_name_or_path`, `weight_decay`, `learning_rate`,
  `adam_epsilon`, `warmup_steps`, `train_batch_size`, `eval_batch_size`, `n_gpu`,
  `gradient_accumulation_steps` and `num_train_epochs` (plus whatever the module-level
  `get_dataset` function reads).

  The datasets are obtained through the module-level `get_dataset(tokenizer, type_path, args)`
  function, which must be defined before the dataloaders are requested.
  """

  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.save_hyperparameters(hparams)

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Return True on the process whose global rank is 0 (the one that should write logs)."""
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    """Delegate to the wrapped `T5ForConditionalGeneration` and return its output."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Run one forward pass on `batch` and return the loss (first element of the model output).

    `batch` must provide "source_ids", "source_mask", "target_ids" and "target_mask".
    """
    # Work on a copy: masking the pad positions in place would silently rewrite
    # the caller's `batch["target_ids"]`.
    labels = batch["target_ids"].clone()
    # -100 marks padded target positions so they are excluded from the loss.
    labels[labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Compute the training loss for one batch and log it as `train_loss`."""
    loss = self._step(batch)
    # `self.log` replaces the legacy {"log": ...} return key, which current
    # Lightning 1.x releases ignore.
    self.log("train_loss", loss)
    return {"loss": loss}

  def training_epoch_end(self, outputs):
    """Log the mean training loss of the epoch as `avg_train_loss`."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    self.log("avg_train_loss", avg_train_loss)
    return None

  def validation_step(self, batch, batch_idx):
    """Compute the validation loss for one batch."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Log the mean validation loss as `val_loss` and return the legacy summary dict."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    # Logging is what makes `val_loss` visible to callbacks (e.g. a checkpoint
    # monitor) and to `trainer.callback_metrics`.
    self.log("val_loss", avg_loss, prog_bar=True)
    tensorboard_logs = {"val_loss": avg_loss}
    # The returned dict is kept only for code that calls this hook directly.
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    # Kept on the module because `train_dataloader` builds the LR scheduler from it.
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None
                     ):
    """Apply one optimizer update, then advance the linear LR schedule by one step.

    This must be a method of the class (not a function nested inside
    `configure_optimizers`), otherwise Lightning never calls it and the
    scheduler built in `train_dataloader` is never stepped.
    """
    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the running loss and the current learning rate formatted for a progress bar."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the training dataloader and the linear warmup/decay LR scheduler.

    The scheduler is created here because its length depends on the dataset size;
    it is stored on `self.lr_scheduler` and stepped from `optimizer_step`.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)

    # Size the schedule from the accumulation factor the trainer actually applies.
    # If the trainer does not accumulate but the hyperparameter says it does, the
    # schedule would otherwise decay to zero long before training ends.
    accumulation = getattr(getattr(self, "trainer", None), "accumulate_grad_batches", None)
    if not isinstance(accumulation, int) or accumulation < 1:
      accumulation = self.hparams.gradient_accumulation_steps

    batches_per_epoch = len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu))
    # Ceil division: a trailing, incomplete accumulation window still triggers an optimizer step.
    steps_per_epoch = max(1, -(-batches_per_epoch // accumulation))
    t_total = int(steps_per_epoch * float(self.hparams.num_train_epochs))

    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation dataloader."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Lightning callback that logs the trainer's metrics after validation and test runs.

  After a test run the metrics are additionally written to
  `<hparams.output_dir>/test_results.txt`.
  """

  @staticmethod
  def _format_metrics(metrics):
    """Return one "key = value\\n" line per metric, sorted by key, skipping legacy keys."""
    return [
        "{} = {}\n".format(key, str(metrics[key]))
        for key in sorted(metrics)
        if key not in ["log", "progress_bar"]
    ]

  def on_validation_end(self, trainer, pl_module):
    """Log every callback metric once validation has finished (logging process only)."""
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      for line in self._format_metrics(trainer.callback_metrics):
        logger.info(line)

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric after testing and save them to `test_results.txt`."""
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      lines = self._format_metrics(trainer.callback_metrics)

      output_dir = pl_module.hparams.output_dir
      # Create the directory first so the results are not lost to a
      # FileNotFoundError; an empty `output_dir` means "current directory".
      if output_dir:
        os.makedirs(output_dir, exist_ok=True)

      output_test_results_file = os.path.join(output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for line in lines:
          logger.info(line)
          writer.write(line)

Let's define the hyperparameters and other arguments. You can override this `dict` for a specific task as needed. In most cases you'll only need to change `data_dir` and `output_dir`.

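Here `train_batch_size` is 16 and `gradient_accumulation_steps` is 16, so the effective batch size is 16 × 16 = 256, provided gradient accumulation is actually enabled on the trainer (`accumulate_grad_batches`).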

In [None]:
# Default hyperparameters and paths; override individual entries per task.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_seq_length": 512,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "test_batch_size": 16,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    # if you want to enable 16-bit training then install apex and set this to true
    "fp_16": False,
    # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "max_grad_norm": 1.0,
    "seed": 42,
}

## Twitter Data

#### Reading Data

In [None]:
#importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


In [None]:
#importing Dataset
data = pd.read_csv("/content/complaints-data.csv")

In [None]:
#naming columns
# Plain assignment instead of `set_axis(..., inplace=True)`: the `inplace` flag of
# `set_axis` is deprecated in newer pandas and no longer accepted in pandas 2.x.
data.columns = ['id', 'tweet', 'target', 'apparel']

In [None]:
#labeling target variable
# Use the same label strings as the rest of the notebook: the evaluation cells
# compare against '>>Complaint</s>' / 'NoComplaint</s>', so any other mapping
# makes the metrics cell fail on a fresh Restart & Run All.
data = data.replace({'target': {0: 'NoComplaint',
                                 1: '>>Complaint'}})

In [None]:
data.shape

(3448, 4)

In [None]:
data.head()

Unnamed: 0,id,tweet,target,apparel
0,23364081385734144,@FC_Help Hi - I'm writing a piece for MSN Him ...,Not,apparel
1,25550410899005441,@FC_Help i need to check my order,Not,apparel
2,29494955818876928,@FC_Help I need to get in contact with someone...,Com,apparel
3,32523695972945920,@FC_Help How can I get a hold of you so we can...,Not,apparel
4,34600441576824832,@FC_Help Will you be getting the wendy cotton ...,Com,apparel


In [None]:
# Show the full tweet text instead of truncating long cells.
# `None` means "no limit"; the older `-1` spelling is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

  


In [None]:
data.tweet

0       @FC_Help Hi - I'm writing a piece for MSN Him and wondered who I could talk in the PR dept about this season's clothes/trends, etc? Help!   
1       @FC_Help   i need to check my order                                                                                                         
2       @FC_Help I need to get in contact with someone regarding the fc.com website. I work for a major search engine. Tried enquiries@ and web form
3       @FC_Help How can I get a hold of you so we can discuss the problem I am havnig with my coat?                                                
4       @FC_Help Will you be getting the wendy cotton v neck dress in pavlova back in stock on the site?                                            
                                                      ...                                                                                           
3443    @AmazonHelp You did not upload the ep14 of Middle series S9 again?? It was deleted ?              

In [None]:
data["target"].value_counts()

Not    2216
Com    1232
Name: target, dtype: int64

In [None]:
from torch import nn
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, AutoModel
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score

In [None]:
class TweetDataset(Dataset):
    """Text-to-text dataset read from `<data_dir>/<type_path>.csv`.

    The CSV must contain a 'tweet' column (model input) and a 'target' column
    (label text the model should generate).

    Args:
        tokenizer: object exposing `batch_encode_plus` (e.g. a `T5Tokenizer`).
        data_dir: directory that contains the CSV files.
        type_path: split name without extension, e.g. "train" or "val".
        max_len: inputs are padded/truncated to exactly this many tokens;
            targets are truncated to it and padded to the longest target of the split.

    Each item is a dict of 1-D tensors: "source_ids", "source_mask",
    "target_ids" and "target_mask".
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=512):
        self.path = os.path.join(data_dir, type_path + '.csv')

        self.source_column_1 = 'tweet'
        self.target_column = 'target'
        self.data = pd.read_csv(self.path)
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source = self.inputs[index]
        target = self.targets[index]

        return {
            "source_ids": source["input_ids"],
            "source_mask": source["attention_mask"],
            "target_ids": target["input_ids"],
            "target_mask": target["attention_mask"],
        }

    def _build(self):
        """Tokenize every row once and cache the per-example tensors."""
        if len(self.data) == 0:
            return

        columns = [self.source_column_1, self.target_column]
        missing = self.data[columns].isna().any(axis=1)
        if missing.any():
            # Fail with a clear message instead of an opaque TypeError during tokenization.
            raise ValueError(
                "{} row(s) in {} have a missing '{}' or '{}' value".format(
                    int(missing.sum()), self.path, self.source_column_1, self.target_column
                )
            )

        # `astype(str)` lets non-string cells (e.g. numeric labels) be tokenized too.
        sources = self.data[self.source_column_1].astype(str).tolist()
        targets = self.data[self.target_column].astype(str).tolist()

        # No manual "</s>" suffix: the T5 tokenizer appends the EOS token itself and
        # warns when the text already ends with one.
        tokenized_inputs = self.tokenizer.batch_encode_plus(
            sources, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt"
        )
        # Pad targets to the longest target of the split so that examples with
        # different target lengths can be collated into one batch.
        tokenized_targets = self.tokenizer.batch_encode_plus(
            targets, max_length=self.max_len, padding='longest', truncation=True, return_tensors="pt"
        )

        keys = ("input_ids", "attention_mask")
        for row in range(len(sources)):
            # Store 1-D tensors per example; no `squeeze()` is needed, so a
            # single-token sequence keeps its dimension.
            self.inputs.append({key: tokenized_inputs[key][row] for key in keys})
            self.targets.append({key: tokenized_targets[key][row] for key in keys})

### Splitting data into train and validation

In [None]:
# 90/10 split. The validation set is everything that was NOT sampled into `train`,
# so the two sets are disjoint. Drawing `val` with a second, independent `sample()`
# call would put most validation rows in the training set as well and inflate the
# validation metrics.
train = data.sample(frac = 0.9,random_state = 1)
val = data.drop(index = train.index)

In [None]:
train.shape,val.shape

((3103, 4), (345, 4))

In [None]:
train.columns

Index(['id', 'tweet', 'target', 'apparel'], dtype='object')

In [None]:
# Keep only the columns the model needs. Re-assigning (instead of `inplace=True`)
# together with `errors="ignore"` keeps this cell safe to re-run after the
# columns have already been removed.
drop_list = ['id', 'apparel']
train = train.drop(columns = drop_list, errors = "ignore")
val = val.drop(columns = drop_list, errors = "ignore")

In [None]:
!pwd

/content


In [None]:
# Create the exact directory the CSVs are written to. `!mkdir data` depends on the
# current working directory and prints an error when the folder already exists.
os.makedirs("/content/data", exist_ok=True)
train.to_csv("/content/data/train.csv",index = False)
val.to_csv("/content/data/val.csv",index = False)

### Prepare Dataset

In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-base', truncation = True)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [None]:
train.columns

Index(['tweet', 'target'], dtype='object')

The `TweetDataset` class defined above takes care of reading the tweets and processing the examples in text-2-text format.

No extra tweet cleaning is applied; the raw tweet text is tokenized as-is. The eos token `</s>` is appended at the end of both input and target, as required by the T5 model.

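For T5 the max input length is 512, and we can choose the max length for the target sequence depending on our dataset. If the `T5Tokenizer` encodes both '>>Complaint' and 'NoComplaint' as a single id each, a max target length of 2 is enough (the extra 1 is for the `</s>` token). Note that `TweetDataset` does not take a separate target length: targets are truncated at the same `max_len` as the inputs.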

In [None]:
!pwd

/content


All the examples are converted in the text-2-text format as shown in the paper. However I didn't use any task prefix here. The examples are encoded as follows,
if the tweet is complaint then the target is '>>Complaint' else 'NoComplaint'

**input**:  @FC_Help Hi - I'm writing a piece for MSN Him and wondered who I could talk in the PR dept about this season's clothes/trends, etc? Help!

**target**: NoComplaint

**input**:  @FC_Help I need to get in contact with someone regarding the fc.com website. I work for a major search engine. Tried enquiries@ and web form

**target**: >>Complaint

In [None]:
dataset = TweetDataset(tokenizer, '/content/data/', 'val', 128)
print("Val dataset: ",len(dataset))

Val dataset:  345


  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


In [None]:
# Inspect one encoded example. A separate name is used on purpose: assigning to
# `data` would replace the tweets DataFrame and break any re-run of the cells above.
sample = dataset[70]
print(tokenizer.decode(sample['source_ids']))
print(tokenizer.decode(sample['target_ids']))

@SubaruCustCare sitting at DMV for last 2 hours, 60 people still ahead of me, thanks Subaru for not turning in my old plates.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
>>Complaint</s>


In [None]:
if not os.path.exists('t5_data'): 
    os.makedirs('t5_data')

In [None]:
!ls

complaints-data.csv  data  sample_data	t5_data


## Train

In [None]:
!ls t5_data

In [None]:
args_dict.update({'data_dir': '/content/data/', 'output_dir': 't5_data', 'num_train_epochs':2,'max_seq_length':128})
args = argparse.Namespace(**args_dict)
print(args_dict)

{'data_dir': '/content/data/', 'output_dir': 't5_data', 'model_name_or_path': 't5-base', 'tokenizer_name_or_path': 't5-base', 'max_seq_length': 128, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 4, 'eval_batch_size': 4, 'num_train_epochs': 2, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}


In [None]:
# Checkpoint configuration: files go to `args.output_dir`, ranked by the lowest "val_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename="{epoch}-checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments forwarded to `pl.Trainer`.
# Options that are currently switched off:
#   accumulate_grad_batches=args.gradient_accumulation_steps
#   early_stop_callback=False
#   amp_level=args.opt_level
# NOTE(review): the run output reports `Trainer(checkpoint_callback=...)` as deprecated;
# the ModelCheckpoint instance presumably belongs in `callbacks` instead. Confirm that
# the module really logs "val_loss" before moving it, since the callback monitors that key.
train_params = {
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "precision": 16 if args.fp_16 else 32,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}


Define the `get_dataset` function to return the dataset. The model calls this function to get the train and val datasets. We are defining a dataset function so that we won't need to modify the model code at all. Redefine the function to return a different dataset according to the problem. While this is not the best solution, it works for now.

In [None]:
def get_dataset(tokenizer, type_path, args):
  """Return the `TweetDataset` for the split named `type_path`.

  The data directory and the maximum sequence length are taken from
  `args.data_dir` and `args.max_seq_length`.
  """
  dataset_kwargs = {
      "tokenizer": tokenizer,
      "data_dir": args.data_dir,
      "type_path": type_path,
      "max_len": args.max_seq_length,
  }
  return TweetDataset(**dataset_kwargs)

**Initialize model**

In [None]:
model = T5FineTuner(args)

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

**Initialize trainer**

In [None]:
trainer = pl.Trainer(**train_params)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


**start fine-tuning**

In [None]:
trainer.fit(model)

Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  cpuset_checked))


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
print ("training finished")

print ("Saving model")
model.model.save_pretrained('t5_data')

print ("Saved model")

training finished
Saving model
Saved model


## Evaluation

In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [None]:
!pwd

/content


In [None]:
# Encode the validation split with the same maximum length that was used for
# training (`args.max_seq_length`) instead of the class default of 512, so the
# evaluation inputs are truncated and padded exactly like the training inputs.
dataset = TweetDataset(tokenizer, data_dir='/content/data/', type_path='val', max_len=args.max_seq_length)
loader = DataLoader(dataset, batch_size=32, num_workers=4)

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  cpuset_checked))


In [None]:
outputs = []
targets = []

# Switch to inference mode: without this, dropout stays active during generation
# and the predictions are not deterministic.
model.model.eval()
# Feed the batches on whichever device the model currently lives on.
device = next(model.model.parameters()).device

for batch in tqdm(loader):
  outs = model.model.generate(input_ids=batch['source_ids'].to(device),
                              attention_mask=batch['source_mask'].to(device),
                              max_length=20)

  # Special tokens are kept in the decoded strings; the cell that builds
  # `results` strips them.
  dec = [tokenizer.decode(ids) for ids in outs]
  tweets = [tokenizer.decode(ids) for ids in batch['source_ids']]
  target = [tokenizer.decode(ids) for ids in batch["target_ids"]]

  outputs.extend(dec)
  targets.extend(target)

  cpuset_checked))


  0%|          | 0/11 [00:00<?, ?it/s]

Let's visualize a few predictions on the validation dataset

In [None]:
# for i in range(10):
#     lines = textwrap.wrap("Tweets:\n%s\n" % tweets[i], width=100)
#     print("\n".join(lines))
#     print("\nActual sentiment: %s" % targets[i])
#     print("Predicted sentiment: %s" % dec[i])
#     print("=====================================================================\n")

In [None]:
outputs[:8]

['<pad> NoComplaint</s>',
 '<pad> NoComplaint</s>',
 '<pad> NoComplaint</s>',
 '<pad> NoComplaint</s>',
 '<pad> >>Complaint</s>',
 '<pad> NoComplaint</s>',
 '<pad> NoComplaint</s>',
 '<pad> >>Complaint</s>']

In [None]:
#filtering outputs for evaluating with targets
# Remove every `<pad>` token wherever it occurs. A fixed `[6:]` slice only removes
# the leading "<pad> " and leaves the trailing pads that batched generation adds to
# shorter sequences, which would then never match their targets.
results = [output.replace("<pad>", "").strip() for output in outputs]

print(results[:8])

['NoComplaint</s>', 'NoComplaint</s>', 'NoComplaint</s>', 'NoComplaint</s>', '>>Complaint</s>', 'NoComplaint</s>', 'NoComplaint</s>', '>>Complaint</s>']


In [None]:
targets[:6]

['NoComplaint</s>',
 'NoComplaint</s>',
 'NoComplaint</s>',
 'NoComplaint</s>',
 '>>Complaint</s>',
 'NoComplaint</s>']

### Metrics

In [None]:
print("accuracy : ",metrics.accuracy_score(targets, results))
#print("roc_auc: ",metrics.roc_auc_score(targets, results))
print("precision: ",metrics.precision_score(targets, results, pos_label='>>Complaint</s>'))
print("recall   : ",metrics.recall_score(targets, results, pos_label='>>Complaint</s>'))
print("f1       : ",metrics.f1_score(targets, results, pos_label='>>Complaint</s>'))

accuracy :  0.9565217391304348
precision:  0.9349593495934959
recall   :  0.9426229508196722
f1       :  0.9387755102040818


In [None]:
print(metrics.classification_report(targets, results))

                 precision    recall  f1-score   support

>>Complaint</s>       0.93      0.94      0.94       122
NoComplaint</s>       0.97      0.96      0.97       223

       accuracy                           0.96       345
      macro avg       0.95      0.95      0.95       345
   weighted avg       0.96      0.96      0.96       345



### Custom Evaluations

In [None]:
# def topic(string):
#     text = "Text : " + string + "</s>"
#     encoding = tokenizer.encode_plus(text,pad_to_max_length=False, return_tensors="pt")
# #     input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]
#     outs = model.model.generate(input_ids=encoding["input_ids"], 
#                               attention_mask=encoding["attention_mask"], 
#                               max_length=20)
#     print ("\nOriginal Text ::")
#     print (string)
#     print ("Topic :: ")
#     string_final = [tokenizer.decode(ids) for ids in outs]    
#     return(" ".join(string_final))


    

In [None]:
# str1 = " @FC_Help Hi - I'm writing a piece for MSN Him and wondered who I could talk in the PR dept about this season's clothes/trends, etc? Help!"
# topic(str1)

In [None]:
# str1 = "@FC_Help I need to get in contact with someone regarding the fc.com website. I work for a major search engine. Tried enquiries@ and web form"
# topic(str1)

In [None]:
# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

# free_gpu_cache()    