In [51]:
#Mount Drive and install dependencies if running in Colab
def install_dependecies():
  !pip install sentencepiece
  !pip install transformers
  !pip install pytorch-lightning
  !pip install scikit-multilearn

from sys import path
import os
import sys

if 'google.colab' in str(get_ipython()):
  from google.colab import drive

  root_PATH = '/content/drive/My Drive/nlp-seminar/repository'
  drive_mount_location = '/content/drive'
  module_path = root_PATH + '/src'
  
  drive.mount(drive_mount_location, force_remount=True)
  path.append(root_PATH)

  install_dependecies()
else:
  root_PATH = os.path.abspath("../../..")
  module_path = os.path.abspath(os.path.join('../../../src'))

%load_ext autoreload
%autoreload 2

if module_path not in sys.path:
    sys.path.append(module_path)

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |███▊                            | 10kB 11.7MB/s eta 0:00:01[K     |███████▍                        | 20kB 17.1MB/s eta 0:00:01[K     |███████████                     | 30kB 20.5MB/s eta 0:00:01[K     |██████████████▊                 | 40kB 13.7MB/s eta 0:00:01[K     |██████████████████▍             | 51kB 9.4MB/s eta 0:00:01[K     |██████████████████████          | 61kB 9.5MB/s eta 0:00:01[K     |█████████████████████████▊      | 71kB 9.6MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81kB 9.6MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 4.8MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
from data_processor import DataProcessor
from model_evaluator import ModelEvaluator
from custom_dataset import CustomDataset
from models.bert_custom_model import BERTCustomModel
from models.roberta_custom_model import RoBERTaCustomModel
from models.xlnet_custom_model import XLNetCustomModel

from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig

import pandas as pd
from torch import cuda
from transformers import BertTokenizer, BertModel, BertConfig
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import transformers
import numpy as np
from sklearn import metrics
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from sklearn.metrics import accuracy_score
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from joblib import dump, load
from skmultilearn.problem_transform import BinaryRelevance

In [55]:
#Configuration Variables

# root_PATH is rebound here: the download cells below store their files under it.
root_PATH = '/tmp'
# NOTE(review): gpus_to_use and MAX_LEN are not read by any cell visible in
# this notebook section - confirm whether they are still needed.
gpus_to_use = [0]
MAX_LEN = 200

# Topic label names. The prediction cell maps output column i to
# remaining_topics[i], so the order of this list matters.
remaining_topics = [
    'Satisfied users',
    'Bugs',
    'Design & UX',
    'Dissatisfied users',
    'Performance',
    'Use cases',
    'Gaming',
    'Feature Requests',
    'Complexity',
    'Pricing',
    'Security & Accounts',
    'Update',
    'Camera & Photos',
    'Video',
    'Customer Support',
    'Notifications & Alerts',
    'Frequency',
    'Advertising',
    'Payment',
    'Connectivity',
    'Devices',
    'Audio',
    'Sign Up & Login',
    'Location Services',
    'Privacy',
    'Internationalization',
]

In [72]:
#Download and load classical models
models_location = root_PATH
classical_models = {}

#Naive Bayes
!gdown --id 1yZGIl7kugEb-u_JdHBQOQudFJ2xAq4iz -O $models_location/naive_bayes.joblib
classical_models["Naive Bayes"] = load(models_location + '/naive_bayes.joblib')

#SVC
!gdown --id 1SbaEIgkYWIHRhyx5lsX2bRhtkVbM9zBV -O $models_location/SVC.joblib
classical_models["SVC"] = load(models_location + '/SVC.joblib')

#Logistic-regression
!gdown --id 12pESs9-j_BwcmuArgqr1mXFKDHrXh5XZ -O $models_location/logistic_regression.joblib
classical_models["Logistic Regression"] = load(models_location + '/logistic_regression.joblib')

#KNN
!gdown --id 1MTMqHhTxdgqvPwVA4X5ixhxp3g3kCu1D -O $models_location/KNN.joblib
classical_models["KNN"] = load(models_location + '/KNN.joblib')




Downloading...
From: https://drive.google.com/uc?id=1yZGIl7kugEb-u_JdHBQOQudFJ2xAq4iz
To: /tmp/naive_bayes.joblib
117MB [00:00, 206MB/s]
Downloading...
From: https://drive.google.com/uc?id=1SbaEIgkYWIHRhyx5lsX2bRhtkVbM9zBV
To: /tmp/SVC.joblib
36.1MB [00:00, 114MB/s] 
Downloading...
From: https://drive.google.com/uc?id=12pESs9-j_BwcmuArgqr1mXFKDHrXh5XZ
To: /tmp/logistic_regression.joblib
36.1MB [00:00, 115MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1MTMqHhTxdgqvPwVA4X5ixhxp3g3kCu1D
To: /tmp/KNN.joblib
118MB [00:00, 168MB/s]


In [4]:
#Download and load deep learning models

dl_tokenizers={}
dl_models = {}

#########BERT#########
model_to_use = 'bert-base-uncased'
model_path = root_PATH + '/BERT.ckpt'

#Donwload trained model
!gdown --id 1O4UFoxTYUWrvfWYSCD6LkH3yeb23qbrT -O $model_path

#Load model and tokenizer
dl_tokenizers["BERT"] = BertTokenizer.from_pretrained(model_to_use)

dl_models["BERT"] = BERTCustomModel.load_from_checkpoint(
    model_path, 
    hparams = {}, 
    training_dataset=None, 
    validation_dataset=None, 
    labels=remaining_topics, 
    model_to_use=model_to_use
    )

#########RoBERTa#########
model_to_use = 'roberta-base'
model_path = root_PATH + '/RoBERTa.ckpt'

#Donwload trained model
!gdown --id 19lHUriPF1w6j1Q4ggF4VRzAQ7hqhJwam -O $model_path

#Load model and tokenizer
dl_tokenizers["RoBERTa"] = RobertaTokenizer.from_pretrained(model_to_use)

dl_models["RoBERTa"] = RoBERTaCustomModel.load_from_checkpoint(
    model_path, 
    hparams = {}, 
    training_dataset=None, 
    validation_dataset=None, 
    labels=remaining_topics, 
    model_to_use=model_to_use
    )

#########XLNet#########
model_to_use = 'xlnet-base-cased'
model_path = root_PATH + '/XLNet.ckpt'

#Donwload trained model
!gdown --id 1CHOabPIIpeWZzQ9q9ysa5hJFKGvroBDo -O $model_path

#Load model and tokenizer
dl_tokenizers["XLNet"] = XLNetTokenizer.from_pretrained(model_to_use)

dl_models["XLNet"] = XLNetCustomModel.load_from_checkpoint(
    model_path, 
    hparams = {}, 
    training_dataset=None, 
    validation_dataset=None, 
    labels=remaining_topics, 
    model_to_use=model_to_use
    )

Downloading...
From: https://drive.google.com/uc?id=1O4UFoxTYUWrvfWYSCD6LkH3yeb23qbrT
To: /tmp/BERT.ckpt
1.31GB [00:11, 118MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


Downloading...
From: https://drive.google.com/uc?id=19lHUriPF1w6j1Q4ggF4VRzAQ7hqhJwam
To: /tmp/RoBERTa.ckpt
1.50GB [00:17, 87.2MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…


Downloading...
From: https://drive.google.com/uc?id=1CHOabPIIpeWZzQ9q9ysa5hJFKGvroBDo
To: /tmp/XLNet.ckpt
1.40GB [00:14, 93.6MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [8]:
# Call eval() on every loaded deep learning model before running predictions.
for network in dl_models.values():
  network.eval()

In [88]:
#sentences = ["Poor photo management, Need more feature I need to backup all phote than delete them"]
#sentences = ["I had an appointment for pickup between 1:30-1:40 today. At approx 1:20 I was notified that driver cancelled and new one would be dispatched. Unfortunately, the new drivers ETA was 1:50 or later. I cancelled and had to make other arrangements get to airport to make my transcontinental flight. I am apoplectic that after this stressful inconvenience you are Charging me $20. Please fix this or I will delete the Uber app and use another car service."]
#sentences = ["the app is amazing, but it is too expensive"]
#sentences = ["It will send some emails and never a picture   Pic shows in my outlook but will not get to  my home computer. Not intuitive at all. Maybe iphone issu?  In any wvent waste of time"]
#sentences = ["i like it but i loged out and now i cant login"]

In [95]:
# Sample app reviews fed to every model in the prediction cell (typos are part of the input).
sentences = [
    "Poor photo management, Need more feature I need to backup all phote than delete them",
    "I had an appointment for pickup between 1:30-1:40 today. At approx 1:20 I was notified that driver cancelled and new one would be dispatched. Unfortunately, the new drivers ETA was 1:50 or later. I cancelled and had to make other arrangements get to airport to make my transcontinental flight. I am apoplectic that after this stressful inconvenience you are Charging me $20. Please fix this or I will delete the Uber app and use another car service.",
    "the app is amazing, but it is too expensive",
    "It will send some emails and never a picture   Pic shows in my outlook but will not get to  my home computer. Not intuitive at all. Maybe iphone issu?  In any wvent waste of time",
    "i like it but i loged out and now i cant login",
]

In [98]:
def topics_from_flags(flags, topics):
  """Return the entries of `topics` whose positional flag in `flags` equals 1.

  Args:
    flags: iterable of 0/1 values, one per label column.
    topics: sequence of label names, indexed by column position.

  Returns:
    List of the selected topic names, in column order.
  """
  return [topics[column] for column, flag in enumerate(flags) if flag == 1]


# Print, for every sample sentence, the topics predicted by each classical
# model and each deep learning model.
for sentence in sentences:
  print(f'Sentence: {sentence}')

  for key, model in classical_models.items():
    # One sentence in, so row 0 of the densified result holds its flags.
    predictions = model.predict([sentence]).toarray()
    predicted_topics = topics_from_flags(predictions[0], remaining_topics)
    print(f'{key}: {predicted_topics}')

  for key, model in dl_models.items():
    # Index instead of .get(): a missing tokenizer should fail with a KeyError
    # here rather than with a "NoneType is not callable" error below.
    tokenizer = dl_tokenizers[key]

    # NOTE(review): the input is not truncated and MAX_LEN is not applied
    # here - confirm this matches how the models were trained.
    inputs = tokenizer(sentence, return_tensors="pt")
    ids = inputs.get('input_ids')
    mask = inputs.get('attention_mask')
    # .get() because token_type_ids may be absent from the tokenizer output.
    token_type_ids = inputs.get("token_type_ids")

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
      outputs = model(ids, mask, token_type_ids)

    # Scores of at least 0.5 count as a predicted topic.
    predictions = (outputs.detach().cpu().numpy() >= 0.5).astype(int)
    predicted_topics = topics_from_flags(predictions[0], remaining_topics)
    print(f'{key}: {predicted_topics}')

  print('-------------------------------------------------')

Sentence: Poor photo management, Need more feature I need to backup all phote than delete them
Naive Bayes: []
SVC: ['Camera & Photos']
Logistic Regression: ['Camera & Photos']
KNN: []
BERT: ['Feature Requests', 'Camera & Photos']
RoBERTa: ['Feature Requests', 'Camera & Photos']
XLNet: ['Feature Requests', 'Camera & Photos']
-------------------------------------------------
Sentence: I had an appointment for pickup between 1:30-1:40 today. At approx 1:20 I was notified that driver cancelled and new one would be dispatched. Unfortunately, the new drivers ETA was 1:50 or later. I cancelled and had to make other arrangements get to airport to make my transcontinental flight. I am apoplectic that after this stressful inconvenience you are Charging me $20. Please fix this or I will delete the Uber app and use another car service.
Naive Bayes: []
SVC: ['Bugs', 'Feature Requests', 'Pricing']
Logistic Regression: ['Bugs', 'Feature Requests']
KNN: []
BERT: ['Bugs', 'Dissatisfied users', 'Featur