## Previous Configurations

In [7]:
#Install dependencies and set configuration variables
def install_dependecies():
  !pip install sentencepiece
  !pip install transformers
  !pip install pytorch-lightning
  !pip install scikit-multilearn

install_dependecies()

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 9.9MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 50.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 33.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=614b956

In [8]:
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig

from re import T
import pandas as pd
from torch import cuda
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import transformers
import numpy as np
from sklearn import metrics
from sklearn.metrics import accuracy_score
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from sklearn.metrics import accuracy_score
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from joblib import dump, load
from skmultilearn.problem_transform import BinaryRelevance
import abc

## Helper Classes

In [10]:
#Helper Classes
class CustomModel(pl.LightningModule):

    def __init__(self, hparams, training_dataset, validation_dataset, labels, model_to_use):
        super().__init__()

        self.hparams = hparams
        self.training_dataset = training_dataset
        self.validation_dataset = validation_dataset
        self.labels = labels

        self.define_model(model_to_use)

    @abc.abstractmethod
    def define_model(self, model_to_use):
        pass

    @abc.abstractmethod
    def forward(self, ids, mask, token_type_ids):
        pass

    def loss_fn(self, outputs, targets):
      return torch.nn.BCEWithLogitsLoss()(outputs, targets)

    def general_step(self, batch, batch_idx, mode):
      ids = batch['ids']
      mask = batch['mask']
      token_type_ids = batch['token_type_ids']
      targets = batch['targets']

      outputs = self.forward(ids, mask, token_type_ids)

      return {'outputs': outputs, 'targets': targets}

    #******Training******
    #This method runs on each GPU
    def training_step(self, batch, batch_idx):
      return self.general_step(batch, batch_idx, "train")
    
    #This method aggregates the results of training_step in the different GPUs
    def training_step_end(self, aggregated_outputs):
      loss = self.loss_fn(aggregated_outputs["outputs"], aggregated_outputs["targets"])
      self.log('training_loss',loss)
      return {'loss':loss}
    
    #This method runs at the end of each epoch
    def training_epoch_end(self, results_of_each_batch):
      pass

    #******Validation******
    #This method runs in each GPU
    def validation_step(self, batch, batch_idx):
      return self.general_step(batch, batch_idx, "val")

    #This method aggregates the results of validation_step in the different GPUs
    def validation_step_end (self, aggregated_outputs):
      outputs = torch.sigmoid(aggregated_outputs['outputs']).cpu().detach().numpy().tolist()
      predictions = (np.array(outputs) >= 0.5).astype(int)

      targets = aggregated_outputs['targets'].cpu().detach().numpy()

      return {'predictions': predictions, 'targets': targets}

    #This method runs at the end of each epoch
    def validation_epoch_end(self, results_of_each_batch):
      predictions = np.empty([0,len(self.labels)])
      targets = np.empty([0,len(self.labels)])

      for result in results_of_each_batch:
        predictions = np.concatenate((predictions,result['predictions']))
        targets = np.concatenate((targets,result['targets']))
      
      total_accuracy, accuracy_per_label = self.evaluate_results(predictions, targets)
      self.log('total_accuracy', total_accuracy)
      self.log('accuracy_per_label',accuracy_per_label)
    
    #******Test******
    #This method runs in each GPU
    def test_step(self, batch, batch_idx):
      return self.general_step(batch, batch_idx, "val")
    
    #This method aggregates the results of validation_step in the different GPUs
    def test_step_end (self, aggregated_outputs):
      outputs = torch.sigmoid(aggregated_outputs['outputs']).cpu().detach().numpy().tolist()
      predictions = (np.array(outputs) >= 0.5).astype(int)

      targets = aggregated_outputs['targets'].cpu().detach().numpy()

      return {'predictions': predictions, 'targets': targets}
    
    #This method runs at the end of each epoch
    def test_epoch_end(self, results_of_each_batch):
      predictions = np.empty([0,len(self.labels)])
      targets = np.empty([0,len(self.labels)])

      for result in results_of_each_batch:
        predictions = np.concatenate((predictions,result['predictions']))
        targets = np.concatenate((targets,result['targets']))
      
      total_accuracy, accuracy_per_label = self.evaluate_results(predictions, targets)
      print(f"Total accuracy: {total_accuracy}")
      print(f"Accuracy per label: {accuracy_per_label}")

    def evaluate_results(self, predictions, targets):
      #binary relevance
      total_accuracy = ModelEvaluator.get_total_accuracy(targets, predictions)
      #one vs rest
      accuracy_per_label = ModelEvaluator.get_accuracy_per_label(self.labels,targets,predictions)
      return total_accuracy, accuracy_per_label

    def configure_optimizers(self):
      return torch.optim.Adam(params = self.parameters(), lr=self.hparams["learning_rate"])

    #******Dataloaders******
    def train_dataloader(self):
      return DataLoader(self.training_dataset, batch_size=self.hparams["train_batch_size"], shuffle= self.hparams["shuffle"], num_workers=32)

    def val_dataloader(self):
      return DataLoader(self.validation_dataset, batch_size=self.hparams["validation_batch_size"], shuffle= False, num_workers=32)

class XLNetCustomModel(CustomModel):

    def define_model(self, model_to_use):
        self.l1 = transformers.XLNetModel.from_pretrained(model_to_use)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, len(self.labels))

    def forward(self, ids, mask, token_type_ids):
        last_hidden_state = self.l1(input_ids = ids, token_type_ids = token_type_ids, attention_mask= mask).last_hidden_state
        output_1 = self.pool_hidden_state(last_hidden_state)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
    
    def pool_hidden_state(self,last_hidden_state):
      return torch.mean(last_hidden_state, 1)

class RoBERTaCustomModel(CustomModel):

    def define_model(self, model_to_use):
        self.l1 = transformers.RobertaModel.from_pretrained(model_to_use)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, len(self.labels))

    def forward(self, ids, mask, token_type_ids):
        last_hidden_state = self.l1(input_ids = ids, token_type_ids = token_type_ids, attention_mask= mask).last_hidden_state
        output_1 = self.pool_hidden_state(last_hidden_state)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
    
    def pool_hidden_state(self,last_hidden_state):
      return torch.mean(last_hidden_state, 1)

class BERTCustomModel(CustomModel):

    def define_model(self, model_to_use):
        self.l1 = transformers.BertModel.from_pretrained(model_to_use)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, len(self.labels))

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

class DataProcessor():

    @classmethod
    def obtain_boolean_mask_from_dataset(cls,dataset):
        dataset_topics = cls.get_column_indexes_as_list(dataset["topic"])
        boolean_mask = cls.generate_boolean_mask(dataset["topic"],dataset_topics).astype(int)
        return boolean_mask
    
    @classmethod
    def generate_boolean_mask(cls,item_lists, unique_items):
        bool_dict = {}
        
        # Loop through all the tags
        for i, item in enumerate(unique_items):
            
            # Apply boolean mask
            bool_dict[item] = item_lists.apply(lambda x: item in x)
                
        # Return the results as a dataframe
        return pd.DataFrame(bool_dict)
    
    @classmethod
    def get_column_indexes_as_list(cls,column):
        return cls.convert_list_to_series(column).value_counts().index.tolist()
    
    @classmethod
    def get_column_indexes(cls,column):
        return cls.convert_list_to_series(column).value_counts()

    @classmethod
    def convert_list_to_series(cls,list):
        return pd.Series([x for _list in list for x in _list])
    
    @classmethod
    def get_underrepresented_topics(cls,dataset,threshold):
        dataset_topics = cls.get_column_indexes(dataset["topic"])
        return dataset_topics[dataset_topics < threshold].index.tolist()
    
    @classmethod
    def remove_topics_from_dataset(cls,dataset,boolean_mask,topics):

        dataset = dataset.join(boolean_mask)

        for topic in topics:
            dataset = dataset[~cls.row_contains_only_this_topic(dataset,topic)]
        
        dataset = dataset.drop(columns=topics)
        #dataset["list"] = dataset.iloc[:,9:].values.tolist()

        remaining_topics = dataset.iloc[:,8:].columns

        return dataset, remaining_topics

    @classmethod
    def row_contains_only_this_topic(cls,dataset, topic):
        row_contains_topic = (dataset[topic] == 1)
        row_has_only_one_topic = ((dataset.iloc[:,9:].drop(columns=topic) == 0).all(axis = 1))
        return row_contains_topic & row_has_only_one_topic

class ModelEvaluator():
    
    @classmethod
    def get_total_accuracy(cls, targets, predictions):
        accuracy = accuracy_score(targets, predictions)
        return accuracy
    
    @classmethod
    def get_accuracy_per_label(cls, labels, targets, predictions):
        accuracy_per_label = {}
        i = 0
        for label in labels:
            label_targets = targets[:,i]
            label_predicitons = predictions[:,i]

            label_accuracy = accuracy_score(label_targets, label_predicitons)

            accuracy_per_label[label] = label_accuracy

            i += 1
        
        accuracy_per_label["no topic"] = cls.get_accuracy_comments_with_no_topic(targets, predictions)
        
        return accuracy_per_label
    
    @classmethod
    def get_accuracy_comments_with_no_topic(cls, targets, predictions):

        targets = pd.DataFrame(targets).reset_index(drop=True)
        predictions = pd.DataFrame(predictions).reset_index(drop=True)

        #Get all comments with no topic
        targets = targets.loc[(targets==0).all(axis=1)]

        #Get the predicitions of those comments with no topic
        predictions = predictions[predictions.index.isin(targets.index)]

        accuracy = accuracy_score(targets, predictions)

        return accuracy

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.conversation
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        
        comment_text = str(self.comment_text[index])

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
          	padding='max_length',
            return_token_type_ids=True,
            truncation = True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
import pandas as pd

class DataProcessor():

    @classmethod
    def obtain_boolean_mask_from_dataset(cls,dataset):
        dataset_topics = cls.get_column_indexes_as_list(dataset["topic"])
        boolean_mask = cls.generate_boolean_mask(dataset["topic"],dataset_topics).astype(int)
        return boolean_mask
    
    @classmethod
    def generate_boolean_mask(cls,item_lists, unique_items):
        bool_dict = {}
        
        # Loop through all the tags
        for i, item in enumerate(unique_items):
            
            # Apply boolean mask
            bool_dict[item] = item_lists.apply(lambda x: item in x)
                
        # Return the results as a dataframe
        return pd.DataFrame(bool_dict)
    
    @classmethod
    def get_column_indexes_as_list(cls,column):
        return cls.convert_list_to_series(column).value_counts().index.tolist()
    
    @classmethod
    def get_column_indexes(cls,column):
        return cls.convert_list_to_series(column).value_counts()

    @classmethod
    def convert_list_to_series(cls,list):
        return pd.Series([x for _list in list for x in _list])
    
    @classmethod
    def get_underrepresented_topics(cls,dataset,threshold):
        dataset_topics = cls.get_column_indexes(dataset["topic"])
        return dataset_topics[dataset_topics < threshold].index.tolist()
    
    @classmethod
    def remove_topics_from_dataset(cls,dataset,boolean_mask,topics):

        dataset = dataset.join(boolean_mask)

        for topic in topics:
            dataset = dataset[~cls.row_contains_only_this_topic(dataset,topic)]
        
        dataset = dataset.drop(columns=topics)
        #dataset["list"] = dataset.iloc[:,9:].values.tolist()

        remaining_topics = dataset.iloc[:,8:].columns

        return dataset, remaining_topics

    @classmethod
    def row_contains_only_this_topic(cls,dataset, topic):
        row_contains_topic = (dataset[topic] == 1)
        row_has_only_one_topic = ((dataset.iloc[:,9:].drop(columns=topic) == 0).all(axis = 1))
        return row_contains_topic & row_has_only_one_topic

## Configuration Variables

In [11]:
#Configuration Variables
root_PATH = '/tmp'
gpus_to_use = [0]
MAX_LEN = 200
remaining_topics = [
                    'Satisfied users',
                    'Bugs',
                    'Design & UX',
                    'Dissatisfied users',
                    'Performance',
                    'Use cases',
                    'Gaming',
                    'Feature Requests',
                    'Complexity',
                    'Pricing',
                    'Security & Accounts',
                    'Update',
                    'Camera & Photos',
                    'Video',
                    'Customer Support',
                    'Notifications & Alerts',
                    'Frequency',
                    'Advertising',
                    'Payment',
                    'Connectivity',
                    'Devices',
                    'Audio',
                    'Sign Up & Login',
                    'Location Services',
                    'Privacy',
                    'Internationalization'
                    ]

## Donwloading the models

In [12]:
#Download and load classical models
models_location = root_PATH
classical_models = {}

#Naive Bayes
!gdown --id 1yZGIl7kugEb-u_JdHBQOQudFJ2xAq4iz -O $models_location/naive_bayes.joblib
classical_models["Naive Bayes"] = load(models_location + '/naive_bayes.joblib')

#SVC
!gdown --id 1SbaEIgkYWIHRhyx5lsX2bRhtkVbM9zBV -O $models_location/SVC.joblib
classical_models["SVC"] = load(models_location + '/SVC.joblib')

#Logistic-regression
!gdown --id 12pESs9-j_BwcmuArgqr1mXFKDHrXh5XZ -O $models_location/logistic_regression.joblib
classical_models["Logistic Regression"] = load(models_location + '/logistic_regression.joblib')

#KNN
!gdown --id 1MTMqHhTxdgqvPwVA4X5ixhxp3g3kCu1D -O $models_location/KNN.joblib
classical_models["KNN"] = load(models_location + '/KNN.joblib')




Downloading...
From: https://drive.google.com/uc?id=1yZGIl7kugEb-u_JdHBQOQudFJ2xAq4iz
To: /tmp/naive_bayes.joblib
117MB [00:01, 105MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1SbaEIgkYWIHRhyx5lsX2bRhtkVbM9zBV
To: /tmp/SVC.joblib
36.1MB [00:00, 98.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=12pESs9-j_BwcmuArgqr1mXFKDHrXh5XZ
To: /tmp/logistic_regression.joblib
36.1MB [00:00, 77.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1MTMqHhTxdgqvPwVA4X5ixhxp3g3kCu1D
To: /tmp/KNN.joblib
118MB [00:00, 129MB/s] 


In [13]:
#Download and load deep learning models

dl_tokenizers={}
dl_models = {}

#########BERT#########
model_to_use = 'bert-base-uncased'
model_path = root_PATH + '/BERT.ckpt'

#Donwload trained model
!gdown --id 1O4UFoxTYUWrvfWYSCD6LkH3yeb23qbrT -O $model_path

#Load model and tokenizer
dl_tokenizers["BERT"] = BertTokenizer.from_pretrained(model_to_use)

dl_models["BERT"] = BERTCustomModel.load_from_checkpoint(
    model_path, 
    hparams = {}, 
    training_dataset=None, 
    validation_dataset=None, 
    labels=remaining_topics, 
    model_to_use=model_to_use
    )

#########RoBERTa#########
model_to_use = 'roberta-base'
model_path = root_PATH + '/RoBERTa.ckpt'

#Donwload trained model
!gdown --id 19lHUriPF1w6j1Q4ggF4VRzAQ7hqhJwam -O $model_path

#Load model and tokenizer
dl_tokenizers["RoBERTa"] = RobertaTokenizer.from_pretrained(model_to_use)

dl_models["RoBERTa"] = RoBERTaCustomModel.load_from_checkpoint(
    model_path, 
    hparams = {}, 
    training_dataset=None, 
    validation_dataset=None, 
    labels=remaining_topics, 
    model_to_use=model_to_use
    )

#########XLNet#########
model_to_use = 'xlnet-base-cased'
model_path = root_PATH + '/XLNet.ckpt'

#Donwload trained model
!gdown --id 1CHOabPIIpeWZzQ9q9ysa5hJFKGvroBDo -O $model_path

#Load model and tokenizer
dl_tokenizers["XLNet"] = XLNetTokenizer.from_pretrained(model_to_use)

dl_models["XLNet"] = XLNetCustomModel.load_from_checkpoint(
    model_path, 
    hparams = {}, 
    training_dataset=None, 
    validation_dataset=None, 
    labels=remaining_topics, 
    model_to_use=model_to_use
    )

Downloading...
From: https://drive.google.com/uc?id=1O4UFoxTYUWrvfWYSCD6LkH3yeb23qbrT
To: /tmp/BERT.ckpt
1.31GB [00:14, 87.7MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Downloading...
From: https://drive.google.com/uc?id=19lHUriPF1w6j1Q4ggF4VRzAQ7hqhJwam
To: /tmp/RoBERTa.ckpt
1.50GB [00:16, 92.6MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…


Downloading...
From: https://drive.google.com/uc?id=1CHOabPIIpeWZzQ9q9ysa5hJFKGvroBDo
To: /tmp/XLNet.ckpt
1.40GB [00:15, 92.1MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [18]:
#Put all deep-learning models in evaluation mode
for key,model in dl_models.items():
  model.eval()

## Sentences Topic Prediction

In [19]:
#Sentences to evaluate
sentences = [
             "Poor photo management, Need more feature I need to backup all phote than delete them",
             "I had an appointment for pickup between 1:30-1:40 today. At approx 1:20 I was notified that driver cancelled and new one would be dispatched. Unfortunately, the new drivers ETA was 1:50 or later. I cancelled and had to make other arrangements get to airport to make my transcontinental flight. I am apoplectic that after this stressful inconvenience you are Charging me $20. Please fix this or I will delete the Uber app and use another car service.",
             "the app is amazing, but it is too expensive",
             "It will send some emails and never a picture   Pic shows in my outlook but will not get to  my home computer. Not intuitive at all. Maybe iphone issu?  In any wvent waste of time",
             "i like it but i loged out and now i cant login"
             ]

In [20]:
def print_predictions(predictions):
  i=0
  predicted_topics = []
  for prediction in predictions[0]:
    if prediction == 1:
      predicted_topics.append(remaining_topics[i])
    i += 1
  print(f'{key}: {predicted_topics}')

In [21]:
for sentence in sentences:
  print(f'Sentence: {sentence}')
  for key,model in classical_models.items():
    
    predictions = model.predict([sentence]).toarray()

    print_predictions(predictions)

  for key,model in dl_models.items():

    tokenizer = dl_tokenizers.get(key)
    inputs = tokenizer(sentence, return_tensors="pt")
    ids = inputs.get('input_ids')
    mask = inputs.get('attention_mask')
    token_type_ids = inputs.get("token_type_ids")

    outputs = model(ids,mask,token_type_ids).cpu().detach().numpy().tolist()
    predictions = (np.array(outputs) >= 0.5).astype(int)

    print_predictions(predictions)
    
  print('-------------------------------------------------')

Sentence: Poor photo management, Need more feature I need to backup all phote than delete them
Naive Bayes: []
SVC: ['Camera & Photos']
Logistic Regression: ['Camera & Photos']
KNN: []
BERT: ['Feature Requests', 'Camera & Photos']
RoBERTa: ['Feature Requests', 'Camera & Photos']
XLNet: ['Feature Requests', 'Camera & Photos']
-------------------------------------------------
Sentence: I had an appointment for pickup between 1:30-1:40 today. At approx 1:20 I was notified that driver cancelled and new one would be dispatched. Unfortunately, the new drivers ETA was 1:50 or later. I cancelled and had to make other arrangements get to airport to make my transcontinental flight. I am apoplectic that after this stressful inconvenience you are Charging me $20. Please fix this or I will delete the Uber app and use another car service.
Naive Bayes: []
SVC: ['Bugs', 'Feature Requests', 'Pricing']
Logistic Regression: ['Bugs', 'Feature Requests']
KNN: []
BERT: ['Bugs', 'Dissatisfied users', 'Featur