In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
csv1 = pd.read_csv("../data/data_c.csv", index_col=0)
csv1 = csv1.iloc[:10000]
csv1.head()
csv1.shape

(10000, 2)

In [4]:
csv2 = pd.read_csv("../data/data_python.csv", index_col=0)
csv2 = csv2.iloc[:10000]
csv2.shape

(10000, 2)

In [5]:
df = pd.concat([csv1,csv2],ignore_index=True)
df.head()

Unnamed: 0,snippet,language
0,alignContentOffset + ABI38_0_0facebook::...,C
1,VMA_CALL_PRE void VMA_CALL_POST vmaFlushAlloca...,C
2,"q[9] = final_add(q[5], q[0]);\n q[6] = fina...",C
3,\nVKAPI_ATTR void VKAPI_CALL vkGetAcceleration...,C
4,#ifdef SPECIFIC_TX_POWER_SUPPORT\nINT Set_AP_P...,C


In [6]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,snippet,language
0,"Test pkgng.unlock\n """"""\n lock_cmd =...",Python
1,if not m:\n continu...,Python
2,"* particular, Hiragana, Katakana, and Han cha...",C
3,if self.base and self.base in all_clas...,Python
4,"def load_stream(self, stream):\n re...",Python


In [7]:
df.dropna(subset=['snippet'], inplace=True)
np.where(pd.isnull(df))

(array([], dtype=int64), array([], dtype=int64))

In [8]:
lang = {"Python":0 , "C":1}

df.language = [lang[item] for item in df.language]
df.head()

Unnamed: 0,snippet,language
0,"Test pkgng.unlock\n """"""\n lock_cmd =...",0
1,if not m:\n continu...,0
2,"* particular, Hiragana, Katakana, and Han cha...",1
3,if self.base and self.base in all_clas...,0
4,"def load_stream(self, stream):\n re...",0


In [11]:
seq_len = 512
num_samples = len(df)

Xids = np.zeros((num_samples,seq_len))
Xmask = np.zeros((num_samples,seq_len))

Xids.shape

(20000, 512)

In [12]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-win_amd64.whl (2.0 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.2.1 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.15.0


In [15]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i,snippet in enumerate(df['snippet']):
    tokens = tokenizer.encode_plus(snippet,max_length = seq_len,truncation = True,
                                    padding = 'max_length',add_special_tokens = True, return_tensors='tf')
    
    Xids[i,:] = tokens['input_ids']
    Xmask[i,:] = tokens['attention_mask']
    

In [16]:
Xids

array([[  101.,  5960.,   185., ...,     0.,     0.,     0.],
       [  101.,  1191.,  1136., ...,     0.,     0.,     0.],
       [  101.,   115.,  2440., ...,     0.,     0.,     0.],
       ...,
       [  101.,   196.,   188., ...,     0.,     0.,     0.],
       [  101., 13340.,   115., ...,     0.,     0.,     0.],
       [  101.,   107.,   107., ...,     0.,     0.,     0.]])

In [18]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [19]:
arr = df['language'].values
arr

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [20]:
labels = np.zeros((num_samples, arr.max()+1))

In [22]:
labels.shape

(20000, 2)

In [23]:
labels[np.arange(num_samples),arr] = 1
labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [27]:
import tensorflow as tf


In [28]:
dataset = tf.data.Dataset.from_tensor_slices((Xids,Xmask,labels))

dataset.take(1)

<TakeDataset shapes: ((512,), (512,), (2,)), types: (tf.float64, tf.float64, tf.float64)>

In [29]:
Xids[0,:]

array([  101.,  5960.,   185.,  1377., 17653.,  1403.,   119., 22436.,
         107.,   107.,   107.,  5842.,   168.,  3975.,  1181.,   134.,
        6734.,  2107.,  5559.,   113.,  1862.,   168.,  2860.,   134.,
         196.,   107.,   188.,  1204.,  2572.,  3818.,   107.,   131.,
         113.,   107.,   185.,  1377.,  2571.,   118.,   122.,   119.,
         121.,   165.,   183.,   107.,   107.,   185.,  1377.,  1403.,
        1830.,   118.,   123.,   119.,   121.,   165.,   183.,   107.,
         114.,   117.,   107.,  1231.,  1204., 13775.,   107.,   131.,
         121.,   198.,   114.,   102.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
      

In [30]:
def map_func(input_ids,masks,labels):
    return {'input_ids': input_ids,'attention_mask':masks}, labels

In [31]:
dataset = dataset.map(map_func)
dataset.take(1)

<TakeDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, (2,)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [32]:
batch_size = 16

dataset = dataset.shuffle(1000).batch(batch_size, drop_remainder = True)

In [33]:
dataset.take(1)

<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 2)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [40]:
split = 0.8

size = int((num_samples/batch_size)*0.8)
size

1000

In [37]:
train_ds = dataset.take(size)
test_ds = dataset.skip(size)

print(train_ds, test_ds)

<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 2)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)> <SkipDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 2)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>


In [42]:
from transformers import TFAutoModel
bert = TFAutoModel.from_pretrained('bert-base-uncased')

bert.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [46]:
input_ids = tf.keras.layers.Input(shape = (seq_len,),name = 'input_ids',dtype = 'int32')
mask = tf.keras.layers.Input(shape = (seq_len,),name = "attention_mask",dtype = 'int32')

embeddings = bert.bert(input_ids,attention_mask = mask)[1]

x = tf.keras.layers.Dense(1024,activation='relu')(embeddings)
y = tf.keras.layers.Dense(arr.max()+1,activation='softmax',name = 'outputs')(x)

In [53]:
model = tf.keras.Model(inputs=[input_ids,mask], outputs = y)
model.layers[2].trainable = False
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

In [54]:
optimizer = tf.keras.optimizers.Adam(learning_rate= 1e-5,decay = 1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')


In [55]:
model.compile(optimizer,loss,metrics=[acc])

In [56]:
history =  model.fit(train_ds,validation_data= test_ds,epochs = 5)

Epoch 1/5


ResourceExhaustedError:  OOM when allocating tensor with shape[16,12,512,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node model_1/bert/encoder/layer_._9/attention/self/dropout_65/dropout/random_uniform/RandomUniform
 (defined at C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\layers\core\dropout.py:105)
]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_96781]

Errors may have originated from an input operation.
Input Source operations connected to node model_1/bert/encoder/layer_._9/attention/self/dropout_65/dropout/random_uniform/RandomUniform:
In[0] model_1/bert/encoder/layer_._9/attention/self/dropout_65/dropout/Shape:

Operation defined at: (most recent call last)
>>>   File "c:\program files\python39\lib\runpy.py", line 197, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "c:\program files\python39\lib\runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 845, in launch_instance
>>>     app.start()
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelapp.py", line 619, in start
>>>     self.io_loop.start()
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\platform\asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "c:\program files\python39\lib\asyncio\base_events.py", line 596, in run_forever
>>>     self._run_once()
>>> 
>>>   File "c:\program files\python39\lib\asyncio\base_events.py", line 1890, in _run_once
>>>     handle._run()
>>> 
>>>   File "c:\program files\python39\lib\asyncio\events.py", line 80, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\ioloop.py", line 688, in <lambda>
>>>     lambda f: self._run_callback(functools.partial(callback, future))
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\ioloop.py", line 741, in _run_callback
>>>     ret = callback()
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 814, in inner
>>>     self.ctx_run(self.run)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 775, in run
>>>     yielded = self.gen.send(value)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 358, in process_one
>>>     yield gen.maybe_future(dispatch(*args))
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 261, in dispatch_shell
>>>     yield gen.maybe_future(handler(stream, idents, msg))
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 536, in execute_request
>>>     self.do_execute(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel\ipkernel.py", line 302, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\ipykernel\zmqshell.py", line 539, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 2898, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 2944, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3169, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3361, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3441, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "<ipython-input-56-b83625ee776c>", line 1, in <module>
>>>     history =  model.fit(train_ds,validation_data= test_ds,epochs = 5)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 808, in train_step
>>>     y_pred = self(x, training=True)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\functional.py", line 451, in call
>>>     return self._run_internal_graph(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\functional.py", line 589, in _run_internal_graph
>>>     outputs = node.layer(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_tf_bert.py", line 868, in call
>>>     encoder_outputs = self.encoder(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_tf_bert.py", line 535, in call
>>>     for i, layer_module in enumerate(self.layer):
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_tf_bert.py", line 541, in call
>>>     layer_outputs = layer_module(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_tf_bert.py", line 451, in call
>>>     self_attention_outputs = self.attention(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_tf_bert.py", line 367, in call
>>>     self_outputs = self.self_attention(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_tf_bert.py", line 310, in call
>>>     attention_probs = self.dropout(inputs=attention_probs, training=training)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\layers\core\dropout.py", line 111, in call
>>>     output = control_flow_util.smart_cond(training, dropped_inputs,
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\utils\control_flow_util.py", line 105, in smart_cond
>>>     return tf.__internal__.smart_cond.smart_cond(
>>> 
>>>   File "C:\Users\yashr\AppData\Roaming\Python\Python39\site-packages\keras\layers\core\dropout.py", line 105, in dropped_inputs
>>>     return tf.nn.dropout(
>>> 