In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os 
import shutil

In [4]:
# Load the C snippets. `nrows` stops parsing after the first 10,000 records
# instead of reading the whole file and slicing it afterwards; the first CSV
# column is used as the index.
csv1 = pd.read_csv("../data/data_c.csv", index_col=0, nrows=10000)
csv1.shape

(10000, 2)

In [6]:
# Load the Python snippets. `nrows` stops parsing after the first 10,000
# records instead of reading the whole file and slicing it afterwards; the
# first CSV column is used as the index.
csv2 = pd.read_csv("../data/data_python.csv", index_col=0, nrows=10000)
csv2.shape

(10000, 2)

In [7]:
# Stack the two per-language frames row-wise and renumber the rows 0..n-1 so
# the indices carried over from each CSV cannot collide.
df = pd.concat((csv1, csv2), axis=0, ignore_index=True)
df.head(5)

Unnamed: 0,snippet,language
0,alignContentOffset + ABI38_0_0facebook::...,C
1,VMA_CALL_PRE void VMA_CALL_POST vmaFlushAlloca...,C
2,"q[9] = final_add(q[5], q[0]);\n q[6] = fina...",C
3,\nVKAPI_ATTR void VKAPI_CALL vkGetAcceleration...,C
4,#ifdef SPECIFIC_TX_POWER_SUPPORT\nINT Set_AP_P...,C


In [9]:
# Shuffle the rows so the two source files are interleaved. The fixed seed
# makes the permutation repeatable on a fresh Restart & Run All; without it the
# row order, and therefore the later train/test partition, changes on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,snippet,language
0,\t\t\treturn (NULL);\n\t\t}\n\t}\n\tstcb = SCT...,C
1,\t\tbio->bi_status = BLK_STS_DM_REQUEUE;\n\t\t...,C
2,uint8_t *row = &frame->dat...,C
3,upb_fieldtype_t value_typ...,C
4,# ref attribute definitions\nREF_ATTRS = ['Phy...,Python


In [10]:
# Integer class ids for the binary classifier: Python -> 0, C -> 1.
lang = {"Python": 0, "C": 1}

# Reject anything that is neither a known label nor an already-encoded id.
# Unknown labels raise KeyError, as a plain dictionary lookup would.
unexpected = set(df["language"].unique()) - set(lang) - set(lang.values())
if unexpected:
    raise KeyError(f"Unmapped language labels: {sorted(map(str, unexpected))}")

# Translate labels through the table while passing already-encoded ids through
# unchanged, so re-running this cell is a no-op rather than a KeyError.
df["language"] = (
    df["language"]
    .map(lambda label: lang.get(label, label))
    .astype("int64")
)
df.head()

Unnamed: 0,snippet,language
0,\t\t\treturn (NULL);\n\t\t}\n\t}\n\tstcb = SCT...,1
1,\t\tbio->bi_status = BLK_STS_DM_REQUEUE;\n\t\t...,1
2,uint8_t *row = &frame->dat...,1
3,upb_fieldtype_t value_typ...,1
4,# ref attribute definitions\nREF_ATTRS = ['Phy...,0


In [11]:
# Drop rows that have no snippet text. Rebinding `df` (rather than mutating it
# with inplace=True) keeps the cell chainable and avoids silently changing a
# frame object that earlier cells already displayed.
df = df.dropna(subset=["snippet"])

# Row/column positions of any remaining nulls; two empty arrays mean none left.
np.where(df.isna())

(array([], dtype=int64), array([], dtype=int64))

In [12]:
# (rows, columns) of the frame after the rows with a missing snippet were dropped.
df.shape

(20000, 2)

In [13]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the partition
# repeatable for a given row order of `df`.
X_train, X_test, y_train, y_test = train_test_split(
    df["snippet"],
    df["language"],
    random_state=42,
    test_size=0.2,
)

X_train.head()

5894    }\n\nstatic void azx_shutdown(struct pci_dev *...
3728    \n\ndef test_regex_findall_index(hass):\n    "...
8958    #define AR_PHY_CCK_SPUR_MIT_CCK_SPUR_FREQ     ...
7671        clone_to_dir = tmpdir.mkdir('clone')\n\n  ...
5999    \n    arr_len = array_length(array)\n    if id...
Name: snippet, dtype: object

In [14]:
# Peek at the first five training labels (head() defaults to five rows).
y_train.head()

5894    1
3728    0
8958    1
7671    0
5999    0
Name: language, dtype: int64

In [15]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

In [16]:
# Standalone copy of the cased-BERT preprocessing model from TF Hub, used by
# the next cell to inspect what it produces for a raw string.
bert_preprocess_model = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
)

In [17]:
# Run one sample snippet through the preprocessing model and show the start of
# each tensor it returns.
test_text = ['#include<iostream> using namespace std;']
text_preprocessed = bert_preprocess_model(test_text)

word_ids = text_preprocessed["input_word_ids"]
input_mask = text_preprocessed["input_mask"]
type_ids = text_preprocessed["input_type_ids"]

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {word_ids.shape}')
# Only the first 12 positions of the single example are printed.
print(f'Word Ids   : {word_ids[0, :12]}')
print(f'Input Mask : {input_mask[0, :12]}')
print(f'Type Ids   : {type_ids[0, :12]}')

Keys       : ['input_mask', 'input_type_ids', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [  101   108  1511   133   178 15540 11811  1306   135  1606  2666 12204]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [18]:
# Standalone copy of the cased BERT-base encoder from TF Hub, loaded with its
# weights marked trainable; the next cell calls it on the preprocessed sample.
bert_model = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4",
    trainable=True,
)

In [19]:
# Encode the preprocessed sample and inspect both outputs of the encoder.
bert_results = bert_model(text_preprocessed)

pooled_output = bert_results["pooled_output"]
sequence_output = bert_results["sequence_output"]

print('Loaded BERT: https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4')
# For the single example: first 12 pooled values, then the first 12 rows of
# the sequence output.
print(f'Pooled Outputs Shape:{pooled_output.shape}')
print(f'Pooled Outputs Values:{pooled_output[0, :12]}')
print(f'Sequence Outputs Shape:{sequence_output.shape}')
print(f'Sequence Outputs Values:{sequence_output[0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4
Pooled Outputs Shape:(1, 768)
Pooled Outputs Values:[-0.34715572  0.39496866  0.99831665 -0.95313525  0.84872997  0.9579382
  0.78001404 -0.99887675 -0.7550599  -0.21460736  0.83877933  0.9870802 ]
Sequence Outputs Shape:(1, 128, 768)
Sequence Outputs Values:[[ 0.59650564  0.16838269  0.16422091 ... -0.22680387  0.2929708
   0.27553   ]
 [ 0.5543074   0.27467299  0.78604084 ...  0.43241164  0.31683612
   0.80753905]
 [ 0.5708918   0.39346924  0.15118726 ...  0.74248165 -0.0434835
   0.47575456]
 ...
 [ 0.491039    0.5923927   0.3665024  ...  0.31747705  0.42565614
   0.8804261 ]
 [ 0.6511216   0.5373208   0.61582774 ...  0.42865884  0.05547375
   0.906674  ]
 [ 0.24195792  0.6665267  -0.38054124 ... -0.16309077 -0.8584406
   0.5680795 ]]


In [20]:
def build_classifier_model(
    preprocess_url="https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3",
    encoder_url="https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4",
    dropout_rate=0.1,
    trainable=True,
):
    """Build a single-logit text classifier on top of a TF Hub BERT encoder.

    The model takes raw strings, runs them through the Hub preprocessing
    layer and the encoder, applies dropout to the encoder's ``pooled_output``
    and projects it to one unit.

    Args:
        preprocess_url: TF Hub handle of the text preprocessing model. It must
            be the one that matches ``encoder_url``.
        encoder_url: TF Hub handle of the BERT encoder.
        dropout_rate: Dropout rate applied to the pooled output before the
            classification layer.
        trainable: Whether the encoder weights are updated during training
            (``True`` fine-tunes the encoder, ``False`` freezes it).

    Returns:
        A ``tf.keras.Model`` mapping a batch of scalar strings to a
        ``(batch, 1)`` tensor. The final layer has no activation, so the
        output is a raw logit: apply a sigmoid to get a probability, and use a
        loss configured for logits.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    preprocessing_layer = hub.KerasLayer(preprocess_url, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)

    encoder = hub.KerasLayer(encoder_url, trainable=trainable, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    # Classification head: dropout, then a linear projection to a single logit.
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [21]:
# Smoke-test the untrained classifier on the sample snippet. The model's last
# layer has no activation, so the raw output is passed through a sigmoid for
# display.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(test_text))
print(tf.math.sigmoid(bert_raw_result))

tf.Tensor([[0.33707955]], shape=(1, 1), dtype=float32)


In [22]:
def _on_logits(metric_cls):
    """Return a subclass of `metric_cls` that applies a sigmoid to `y_pred` first.

    The classifier's last layer is `Dense(1, activation=None)`, so the model
    emits raw logits. Keras' threshold-based binary metrics expect
    probabilities in [0, 1], so the logits are squashed before the metric
    sees them.
    """

    class _FromLogits(metric_cls):
        def update_state(self, y_true, y_pred, sample_weight=None):
            return metric_cls.update_state(
                self, y_true, tf.sigmoid(y_pred), sample_weight=sample_weight
            )

    _FromLogits.__name__ = f"{metric_cls.__name__}FromLogits"
    return _FromLogits


metrics = [
    _on_logits(tf.keras.metrics.BinaryAccuracy)(name='accuracy'),
    _on_logits(tf.keras.metrics.Precision)(name='precision'),
    _on_logits(tf.keras.metrics.Recall)(name='recall'),
]

classifier_model.compile(
    # The default 'adam' step size (1e-3) is far above the ~2e-5 to 5e-5 range
    # normally used when fine-tuning BERT and tends to wreck the pretrained
    # weights, so use a small explicit learning rate.
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    # The model outputs logits, so the loss must apply the sigmoid itself.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=metrics,
)

In [25]:
# Fine-tune on the training split: two passes over the data, eight snippets
# per gradient step. The returned History object is kept for later inspection.
history = classifier_model.fit(
    x=X_train,
    y=y_train,
    epochs=2,
    batch_size=8,
)

Epoch 1/2


AlreadyExistsError: 2 root error(s) found.
  (0) ALREADY_EXISTS:  Resource __per_step_0/Adam/gradients/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/bert_encoder/StatefulPartitionedCall_grad/PartitionedCall/gradients/AddN_49/tmp_var/frame:0/iter:0/struct tensorflow::TemporaryVariableOp::TmpVar
	 [[{{node Adam/gradients/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/bert_encoder/StatefulPartitionedCall_grad/PartitionedCall/gradients/AddN_49/tmp_var}}]]
	 [[model/preprocessing/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert_pack_inputs/PartitionedCall/RaggedConcat/concat/_124]]
  (1) ALREADY_EXISTS:  Resource __per_step_0/Adam/gradients/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/bert_encoder/StatefulPartitionedCall_grad/PartitionedCall/gradients/AddN_49/tmp_var/frame:0/iter:0/struct tensorflow::TemporaryVariableOp::TmpVar
	 [[{{node Adam/gradients/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/bert_encoder/StatefulPartitionedCall_grad/PartitionedCall/gradients/AddN_49/tmp_var}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_134498]

Function call stack:
train_function -> train_function
