In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm
2023-08-12 15:29:18.117651: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-12 15:29:18.180840: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-12 15:29:18.183384: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-12 15:29:18.183392: I tensorflow/stream_executor/cuda/cudart_stub.c

In [18]:
train_dataset = pd.read_csv('data/dataset/train.csv')
test_dataset = pd.read_csv('data/dataset/test.csv')
val_dataset = pd.read_csv('data/dataset/val.csv')

In [19]:
train_dataset.dropna(inplace = True)
val_dataset.dropna(inplace = True)
test_dataset.dropna(inplace = True)


In [20]:
train_dataset.reset_index(drop=True,inplace= True)
test_dataset.reset_index(drop=True,inplace= True)
val_dataset.reset_index(drop=True,inplace= True)


In [21]:

label_mapping = {'INFORMATION-TECHNOLOGY': 0,\
                 'ENGINEERING':1, \
                 'BUSINESS-DEVELOPMENT':2, \
                 'SALES':3,\
                 'HR':4, \
                 'FITNESS': 5 , \
                 'ARTS':6,\
                 'ADVOCATE':7,\
                 'CONSTRUCTION':8,\
                 'AVIATION':9,\
                 'FINANCE':10,\
                 'CHEF':11,\
                 'ACCOUNTANT':12,\
                 'BANKING':13,\
                 'HEALTHCARE':14,\
                 'CONSULTANT':15,\
                 'PUBLIC-RELATIONS':16,\
                 'DESIGNER':17, \
                 'TEACHER':18, \
                 'APPAREL':19, \
                 'DIGITAL-MEDIA':20,\
                 'AGRICULTURE':21, \
                 'AUTOMOBILE':22,\
                 'BPO':23
                 }

In [22]:
def labeling(label):
    return label_mapping[label]

In [23]:
train_dataset["Category"]= train_dataset["Category"].apply(labeling)


In [24]:

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [25]:

token = tokenizer.encode_plus(
    train_dataset['Resume_clean'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [26]:

token.input_ids

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  1900,  6213,  6678,  4160,   184,  8223, 10550,  4385,
         7087,  2377,  3645,  1718,  1788,  2377,  3645,  1718,  1788,
         3694, 11529,  1700,  2383,  1543,  4812, 11019,  9524,  2377,
         3645, 17092,  1788,  1992, 11147,  1671,  3693,  8918,  2989,
        25059,  6487,  5392,  2070,  1437,   180,  1643, 25247,   175,
         1306,  6442,  3285,   193,  1306,   177,  1830, 10182,  3094,
         2541,  1954,  1900,  6213,  6678,  1419,  1271,  1331,  1352,
         3689, 10700,  1359,  3044,  2818,  2731,  2319, 10209, 25344,
        11000,  3213,  1218,  4256,  1231,  5053,  7174,  7232,  3693,
         2134,   189, 20144, 21143, 14561,  4973,  1788, 14561,  5035,
         1844,  2057, 16878,  5494,  2134,  4422,  6298,  6678,  2344,
         9342,  2755,  1470,  4125,  3181,  6213,  6678,  2338,  1470,
         2029,  4291, 15187, 19396,  3881,  5911,  3294,  1934,  2394,
         2561,  2344,  8826, 

In [27]:
X_input_ids = np.zeros((len(train_dataset), 256))
X_attn_masks = np.zeros((len(train_dataset), 256))

In [28]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['Resume_clean'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [29]:
X_input_ids, X_attn_masks = generate_training_data(train_dataset, X_input_ids, X_attn_masks, tokenizer)


2249it [00:08, 266.31it/s]


In [31]:
labels = np.zeros((len(train_dataset), 24))
labels.shape

(2249, 24)

In [33]:
labels[np.arange(len(train_dataset)), train_dataset['Category'].values] = 1 # one-hot encoded target tensor

In [34]:
labels

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(24,), dtype=tf.float64, name=None))>

In [36]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels
     

In [37]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [38]:

dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(24,), dtype=tf.float64, name=None))>

In [39]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor


In [41]:
p = 0.8
train_size = int((len(train_dataset)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.
     

In [42]:
train_size

112

In [43]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [45]:
from transformers import TFBertModel

In [46]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Downloading tf_model.h5: 100%|██████████| 527M/527M [01:03<00:00, 8.24MB/s] 
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [51]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(24, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [52]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [53]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [55]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=50
)

Epoch 1/50

2023-08-12 15:53:03.589210: E tensorflow/core/framework/node_def_util.cc:675] NodeDef mentions attribute epsilon which is not in the op definition: Op<name=_MklFusedBatchMatMulV2; signature=x:T, y:T, args:num_args*T -> output:T; attr=T:type,allowed=[DT_BFLOAT16, DT_FLOAT]; attr=adj_x:bool,default=false; attr=adj_y:bool,default=false; attr=num_args:int,min=0; attr=fused_ops:list(string),default=[]> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node model_1/bert/encoder/layer_._0/attention/self/ArithmeticOptimizer/AddOpsRewrite_add_1}}


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [56]:
sentiment_model.save('model')



INFO:tensorflow:Assets written to: model/assets


INFO:tensorflow:Assets written to: model/assets
