In [1]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pickle
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import argparse
from tokenize import tokenize, untokenize, COMMENT, STRING, NEWLINE, ENCODING, ENDMARKER, NL, INDENT, NUMBER
from io import BytesIO
import json

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [2]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
file_name = "Project_CodeNet_LangClass.tar.gz"
data_url = f"https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/{file_name}"

# Download tar archive to local disk
with open(file_name, "wb") as f:
    f.write(requests.get(data_url).content)
    
# Extract contents of archive to local disk
if os.path.exists("data"):
    shutil.rmtree("data")    
with tarfile.open(file_name) as tfile:
    tfile.extractall()

In [4]:
lits = json.load(open("literals.json"))

def process_string(token, special_chars={" ": "U+0020", ",": "U+002C"}):
    str_quote_options = ["'''", '"""', "'", '"']
    start_quote = ""
    end_quote = ""
    qualifier_regex = r"^[a-z]+"
    qualifier_match = re.search(qualifier_regex, token)
    # string qualifiers like 'r' for regex, 'f' for formatted string, 'b' for bytes, 'u' for unicode, etc (or combination of them)
    qualifier = "" if not qualifier_match else qualifier_match[0]
    # token string without qualifiers
    token_string = re.sub(qualifier_regex, "", token)
    # string literal without quotes
    str_lit = token_string
    for q in str_quote_options:
        if token_string.startswith(q):
            start_quote = q
            str_lit = str_lit[len(q) :]
            if token_string.endswith(q):
                end_quote = q
                str_lit = str_lit[: -len(q)]
            break
    # if start_quote in str_quote_options[:2]:
    #     return ""
    for sc in special_chars:
        str_lit = str_lit.replace(sc, special_chars[sc])
    return (
        f"{qualifier}{start_quote}<STR_LIT:{str_lit}>{end_quote}"
        if str_lit in lits['str']
        else f"{qualifier}{start_quote}<STR_LIT>{end_quote}"
    )

def py_tokenize(file_type):
    file_paths = glob.glob(os.path.join(os.getcwd(),"data/"+file_type+"/Python","*.*"))
    wf = open(os.path.join(os.getcwd(), f"{file_type}.txt"), 'w')
    local_corpus = []
    for path in file_paths:
        try:
            code = open(path).read()
            token_gen = tokenize(BytesIO(bytes(code, "utf8")).readline)
            out_tokens = []
            prev_eol = False
            for toknum, tokval, _, _, _ in token_gen:
                tokval = " ".join(tokval.split())
                if toknum == STRING:
                    add_token = process_string(tokval)
                    out_tokens.append(add_token)
                    prev_eol = False
                elif toknum == NUMBER:
                    if tokval in lits['num']:
                        out_tokens.append(f"<NUM_LIT:{tokval}>")
                    else:
                        out_tokens.append(f"<NUM_LIT>")
                    prev_eol = False
                elif toknum in [NEWLINE, NL]:
                    if not prev_eol:
                        out_tokens.append("<EOL>")
                        prev_eol = True
                elif toknum in [COMMENT, INDENT, ENCODING, ENDMARKER] or len(tokval) == 0:
                    continue
                else:
                    out_tokens.append(tokval)
                    prev_eol = False
            if out_tokens[0] == "<EOL>":
                out_tokens = out_tokens[1:]
            if out_tokens[-1] == "<EOL>":
                out_tokens = out_tokens[:-1]
        except Exception:
            out_tokens = []
#         local_corpus.extend((" ".join(out_tokens)).split('<EOL>'))
#         out_tokens = ["<s>"] + out_tokens + ["</s>"]
        out = " ".join(out_tokens)
        local_corpus.append(out)
        wf.write(out+"\n")
    print(f"{file_type}: are done")
    wf.close()
    return local_corpus

def read_corpus(directory):
    corpus = py_tokenize(directory)
    full_corpus = ''.join(corpus)
    corpus_new = []
    for code in corpus:
        corpus_new.extend(code.split('<EOL>'))
        
    return pd.DataFrame(corpus_new)

In [5]:
train_corpus = read_corpus("train")
train_corpus['target']=1
test_corpus = read_corpus("test")
test_corpus['target']=1

train: are done
test: are done


In [6]:
test_corpus[0:5]

Unnamed: 0,0,target
0,"""""""<STR_LIT>""""""",1
1,import sys,1
2,from sys import stdin,1
3,input = stdin . readline,1
4,class Point ( object ) :,1


In [7]:
train_corpus[0:5]

Unnamed: 0,0,target
0,class vector ( object ) :,1
1,"def __init__ ( self , a , b ) :",1
2,self . x = b . x - a . x,1
3,self . y = b . y - a . y,1
4,@ staticmethod,1


In [8]:
import transformers
from transformers import AutoModel, AutoTokenizer

In [9]:
model_name = "bert-base-uncased" 
model_name_code = "microsoft/codebert-base"
# Download pytorch model
model = AutoModel.from_pretrained(model_name_code)
tokenizer = AutoTokenizer.from_pretrained(model_name_code)

In [13]:
def tokenize_text(data):
    encoded = tokenizer.batch_encode_plus(data,max_length = 25, padding=True,truncation=True, return_tensors='np')
    return encoded.data

In [14]:
X_train, X_test, y_train, y_test = train_test_split(train_corpus[0],train_corpus['target'], test_size=0.2, random_state=42)
X_train.head()

839          cp3 = cross_product ( p3_to_p1 , p3_to_pp ) 
644                                    cnt = <NUM_LIT:0> 
1334                        def large ( x , y , area ) : 
2676                  def NO ( ) : print ( '<STR_LIT>' ) 
3651     if y + <NUM_LIT:2> <= self . size [ <NUM_LIT:...
Name: 0, dtype: object

In [15]:
train_data = tokenize_text(X_train.tolist())
test_data = tokenize_text(X_test.tolist())

In [16]:
train_data

{'input_ids': array([[    0, 46247,   246, ...,  4839,  1437,     2],
        [    0,   740,  3999, ...,     1,     1,     1],
        [    0,  3816,   739, ...,     1,     1,     1],
        ...,
        [    0,  1615,  1594, ...,     1,     1,     1],
        [    0,  1577,  5457, ...,   741,   646,     2],
        [    0,   386,  1215, ...,     1,     1,     1]]),
 'attention_mask': array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0]])}

In [17]:
!nvidia-smi

Fri Nov 19 18:13:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 470.42.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:02:00.0 Off |                    0 |
| N/A   23C    P0    29W / 250W |  11557MiB / 12198MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [22]:
from transformers import TFRobertaForSequenceClassification, TFTrainer, TFTrainingArguments

model = TFRobertaForSequenceClassification.from_pretrained(model_name_code)

ResourceExhaustedError: OOM when allocating tensor with shape[50265,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:TruncatedNormal]

In [None]:
training_args = TFTrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
)

trainer = TFTrainer(
    model=model,                        
    args=training_args,                 
    train_dataset=tf_train_data,    
    eval_dataset=tf_test_data       
)

trainer.train()