In [None]:
!pip install transformers
!pip install pytorch-transformers

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 78.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 56.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import BertTokenizer, TFBertModel, BertForSequenceClassification, BertConfig

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Detect hardware and pick the matching tf.distribute strategy.
try:
    # TPU detection. No parameters are necessary if the TPU_NAME environment
    # variable is set. A ValueError here is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Pretrained checkpoint name, sequence length and output location.
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 256
ARTIFACTS_PATH = '../artifacts/'

# Global batch size: 8 examples for each replica of the distribution strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
EPOCHS = 3

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(ARTIFACTS_PATH, exist_ok=True)

In [None]:
! gdown --id 1LxH5sC6AdrwzClPEGUAV2HVcPjWIPmc8  ##SupportTickets
#! gdown --id 1etLBrBTdokVHIuaxEmr1Koacos4IbMZL ## ConsumerComplaints

Downloading...
From: https://drive.google.com/uc?id=1LxH5sC6AdrwzClPEGUAV2HVcPjWIPmc8
To: /content/FinalFinalTwoTierSnowmirror.csv
100% 101M/101M [00:00<00:00, 217MB/s] 


In [None]:
import io
import pandas as pd

# Read the downloaded ticket export from the working directory into a DataFrame.
data = pd.read_csv('FinalFinalTwoTierSnowmirror.csv')

In [None]:
# Keep only the two columns used below: the ticket text and its category label.
cols = ['FinalText', 'DV_CATEGORY']
# Alternative column pair (presumably for the ConsumerComplaints file -- unverified):
#cols = ['TokenizedText', 'Product']
data = data.loc[:, cols]
data.head(10)

Unnamed: 0,FinalText,DV_CATEGORY
0,mass data migration device rental invoice,Project Office
1,ibm cloud backup non successful hello use evau...,Compute
2,accounting request pro rate credit,Project Office
3,feature code purchase subscription receive sen...,Project Office
4,subscription account link ibm cloud bluemix si...,Project Office
5,virtual server high customer like replicate ib...,Compute
6,incorrectly early provision due error replace ...,Project Office
7,security compliance require vmware answer impl...,Compute
8,service cancellation,3rd Party Reseller
9,bluemix asset monitor compose postgresql,Platform / Console


In [None]:
# Distinct category labels, in order of first appearance.
data['DV_CATEGORY'].unique()

array(['Project Office', 'Compute', '3rd Party Reseller',
       'Platform / Console', 'Services', 'VPC', 'Apps',
       'Security and Identity', 'Networking', 'Integration',
       'Project Office (internal)', 'Storage'], dtype=object)

In [None]:
# Flat 1-D arrays of texts and labels. copy=True keeps them independent of
# `data`, so later in-place edits to the arrays cannot leak into the DataFrame.
X_data = data['FinalText'].to_numpy(copy=True)
y_data = data['DV_CATEGORY'].to_numpy(copy=True)

In [None]:
def bert_encode(texts, tokenizer, max_len=None):
    """Convert raw texts into fixed-length model inputs.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in the
    tokenizer's CLS/SEP tokens and right-padded with the tokenizer's PAD token.
    The special-token ids are read from ``tokenizer`` rather than hard-coded,
    because they differ between vocabularies (hard-coding 0 / 2 / 1 for
    CLS / SEP / PAD does not match the ids of a BERT vocabulary).

    Args:
        texts: Sized iterable of texts; every element is passed through ``str()``.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids`` and the
            attributes ``cls_token_id``, ``sep_token_id`` and ``pad_token_id``.
        max_len: Length of every encoded row, special tokens included.
            Defaults to the module-level ``MAX_LEN``.

    Returns:
        dict with three int32 arrays of shape ``(len(texts), max_len)``:
        ``'input_word_ids'`` (token ids), ``'input_mask'`` (1 on real tokens,
        0 on padding) and ``'input_type_ids'`` (all zeros).

    Raises:
        ValueError: if ``max_len`` is smaller than 2 (no room for CLS and SEP).
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError('max_len must be at least 2 to hold the CLS and SEP tokens')

    n_texts = len(texts)
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    # Rows start out fully padded; real tokens overwrite the leading positions.
    input_ids = np.full((n_texts, max_len), tokenizer.pad_token_id, dtype='int32')
    attention_mask = np.zeros((n_texts, max_len), dtype='int32')
    token_type_ids = np.zeros((n_texts, max_len), dtype='int32')  # Not used in text classification

    for row, text in enumerate(texts):
        # Tokenize, leaving room for the two special tokens.
        tokens = tokenizer.tokenize(str(text))[:max_len - 2]

        # CLS + token ids + SEP; never longer than max_len thanks to the truncation.
        row_ids = [cls_id] + list(tokenizer.convert_tokens_to_ids(tokens)) + [sep_id]
        n_used = len(row_ids)

        input_ids[row, :n_used] = np.asarray(row_ids, dtype='int32')
        # Attend only to the positions that hold real tokens.
        attention_mask[row, :n_used] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [None]:
# Transform categories into numbers.
# Ids are assigned in order of first appearance. The labels are read from the
# DataFrame column and y_data is rebound to a fresh int32 array instead of being
# overwritten element by element, so re-running this cell rebuilds the same
# name <-> id dictionaries (an in-place overwrite would, on a second run, map
# ids to ids and lose the category names).
labels = data['DV_CATEGORY'].to_numpy()

category_to_id = {}
for label in labels:
    # setdefault only inserts unseen labels; len() is the next free id.
    category_to_id.setdefault(label, len(category_to_id))

category_to_name = {category_id: name for name, category_id in category_to_id.items()}

y_data = np.asarray([category_to_id[label] for label in labels], dtype='int32')

# Display dictionary
category_to_name

{0: 'Project Office',
 1: 'Compute',
 2: '3rd Party Reseller',
 3: 'Platform / Console',
 4: 'Services',
 5: 'VPC',
 6: 'Apps',
 7: 'Security and Identity',
 8: 'Networking',
 9: 'Integration',
 10: 'Project Office (internal)',
 11: 'Storage'}

In [None]:
# Distinct labels in the category column and how many of them there are.
categories = data['DV_CATEGORY'].unique()
n_categories = categories.size

In [None]:
n_categories

12

In [None]:
# Hold out 30% of the samples as the test set; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=777,
)

In [None]:
X_train

array(['increase upgrade bandwidth option sale agent hello need set server account expect consumption last month local ibm',
       'make lite account get confirmation mail hour address trial ibm must spam box think proceed',
       'user provide create dev environment access name hasan mohammad regard organization space resource field',
       ..., 'test ticket ignore',
       'confirmation behavior auto scale hello support team use node red like ask question automatically start instance number alive become low minimal least confirm scaling answer yes long take since',
       'request survey charge san provide usage report something check content renewal order total balance platform support month reflect amount overage investigate whether follow invoice correct'],
      dtype=object)

In [None]:
# Import tokenizer from HuggingFace: load the pretrained tokenizer named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Replace the raw text arrays by their encoded model inputs.
# NOTE: the names are reused, so this cell must run exactly once per split --
# a second run would hand the already-encoded dicts back to bert_encode.
X_train, X_test = (bert_encode(split, tokenizer) for split in (X_train, X_test))

# Labels as int32 arrays.
y_train, y_test = (np.asarray(split, dtype='int32') for split in (y_train, y_test))

In [None]:
def build_model(n_categories, learning_rate=1e-5):
    """Build and compile a BERT-based text classifier.

    The model takes three int32 inputs of shape ``(MAX_LEN,)`` named
    ``input_word_ids``, ``input_mask`` and ``input_type_ids``. The first output
    of the pretrained BERT model goes through dropout, is flattened, and feeds a
    256-unit ReLU layer followed by a softmax layer with ``n_categories`` units.
    Everything is created inside ``strategy.scope()``.

    Args:
        n_categories: Number of output classes.
        learning_rate: Learning rate of the Adam optimizer (default 1e-5).

    Returns:
        A compiled ``tf.keras.Model`` using sparse categorical cross-entropy
        loss and the accuracy metric.
    """
    with strategy.scope():
        input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
        input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

        # Import BERT model from HuggingFace
        bert_model = TFBertModel.from_pretrained(MODEL_NAME)
        x = bert_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

        # Huggingface transformers have multiple outputs, embeddings are the first one,
        # so let's slice out the first position
        x = x[0]

        x = tf.keras.layers.Dropout(0.1)(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

        model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
        model.compile(
            # `learning_rate` is the supported argument name; `lr` is a
            # deprecated alias that newer Keras versions no longer accept.
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])

        return model

In [None]:
# Instantiate the classifier under the distribution strategy.
with strategy.scope():
    model = build_model(n_categories)

# Printing the layer table does not need the strategy scope.
model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_3 (TFBertModel)   TFBaseModelOutputWit 109482240   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [None]:
print('Training...')
with strategy.scope():
    # The held-out split doubles as the validation set reported after each epoch.
    history = model.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_test, y_test),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=1,
    )

Training...
Epoch 1/3


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None,) dtype=int32>]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None,) dtype=int32>]








INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 256) dtype=int32>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None,) dtype=int32>]


Epoch 2/3
Epoch 3/3


In [None]:
# y_pred is not assigned by any earlier cell, so compute it here; otherwise this
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
# Each row holds the predicted class probabilities for one test sample.
y_pred = model.predict(X_test, batch_size=BATCH_SIZE)
y_pred

array([[0.9132952 , 0.08670478],
       [0.09615043, 0.90384954],
       [0.90086865, 0.09913131],
       [0.91933906, 0.08066092],
       [0.12343215, 0.87656784],
       [0.8872177 , 0.11278225],
       [0.16934107, 0.830659  ],
       [0.90292263, 0.09707733],
       [0.04330614, 0.9566938 ],
       [0.9645212 , 0.03547875],
       [0.95503354, 0.04496641],
       [0.8570329 , 0.14296713],
       [0.06058738, 0.9394126 ],
       [0.8674663 , 0.13253374],
       [0.27896786, 0.72103214],
       [0.09123947, 0.9087604 ],
       [0.92922574, 0.07077418],
       [0.8877487 , 0.11225132],
       [0.26868767, 0.73131233],
       [0.06149247, 0.93850744],
       [0.85076886, 0.14923121],
       [0.2576749 , 0.7423251 ],
       [0.03756809, 0.96243197],
       [0.76388615, 0.23611389],
       [0.03847211, 0.9615279 ],
       [0.9822324 , 0.01776751],
       [0.04295966, 0.95704037],
       [0.83125776, 0.1687422 ],
       [0.88538367, 0.11461624],
       [0.8722662 , 0.12773384],
       [0.