In [1]:
# Mount Google Drive at /content/drive/ (Colab only); the paths used by the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Switch the working directory to the AVATAR project folder.
# NOTE(review): presumably needed so the `codegen` package imports below resolve — confirm.
%cd /content/drive/MyDrive/AVATAR

/content/drive/MyDrive/AVATAR


In [10]:
# Same directory change as the previous cell, re-run at a later execution count.
%cd /content/drive/MyDrive/AVATAR

/content/drive/MyDrive/AVATAR


In [17]:
from codegen.preprocessing.lang_processors.tree_sitter_processor import (
    TreeSitterLangProcessor,
    NEW_LINE
)
from codegen.preprocessing.obfuscation.utils_deobfuscation import dico_to_string
from codegen.preprocessing.obfuscation import javalang_obfuscator

from codegen.preprocessing.lang_processors.tokenization_utils import (
    ind_iter,
    NEWLINE_TOKEN,
)
import re

# Placeholder token -> character sequence. Tokens are numbered in declaration
# order ("STOKEN00", "STOKEN01", ...); JavaProcessor hands this table to
# TreeSitterLangProcessor as `stokens_to_chars`.
JAVA_TOKEN2CHAR = {
    f"STOKEN{position:02d}": chars
    for position, chars in enumerate(
        (
            "//",
            "/*",
            "*/",
            "/**",
            "**/",
            '"""',
            "\\n",
            "\\r",
            ";",
            "{",
            "}",
            r"\'",
            r"\"",
            r"\\",
        )
    )
}
# Reverse table: character sequence -> placeholder token padded with one space
# on each side (passed to TreeSitterLangProcessor as `chars_to_stokens`).
JAVA_CHAR2TOKEN = {chars: f" {stoken} " for stoken, chars in JAVA_TOKEN2CHAR.items()}


class JavaProcessor(TreeSitterLangProcessor):
    """Java flavour of ``TreeSitterLangProcessor``.

    Adds Java-specific obfuscation, function extraction from tokenized code,
    and small helpers for function names and arguments.
    """

    def __init__(self, root_folder):
        """Configure the base processor for Java.

        Args:
            root_folder: Forwarded unchanged to ``TreeSitterLangProcessor``.
        """
        super().__init__(
            language="java",
            ast_nodes_type_string=["comment", "string_literal", "character_literal"],
            stokens_to_chars=JAVA_TOKEN2CHAR,
            chars_to_stokens=JAVA_CHAR2TOKEN,
            root_folder=root_folder,
        )

    def obfuscate_code(self, code):
        """Obfuscate ``code`` with ``javalang_obfuscator``.

        Returns:
            ``(obfuscated_code, mapping)`` where ``mapping`` is the obfuscation
            dictionary rendered as a string by ``dico_to_string``.
        """
        res, dico = javalang_obfuscator.obfuscate(code)
        return res, dico_to_string(dico)

    def extract_functions(self, tokenized_code):
        """Extract functions from tokenized Java code.

        A function is detected at a ``)`` token that is followed (ignoring
        newline tokens) either by ``{`` or by ``throws <one token> {``.  From
        there the scan walks back to the previous statement/block delimiter
        (pulling in a directly preceding comment), then forward to the brace
        that closes the body.

        Args:
            tokenized_code: Either a space-separated token string or a list of
                tokens.

        Returns:
            ``(functions_standalone, functions_class)``: two lists of
            space-joined function strings with leading annotations removed.
            Functions that have ``static`` before their opening brace go to the
            first list, all others to the second.
        """
        if isinstance(tokenized_code, str):
            tokens = tokenized_code.split()
        else:
            assert isinstance(tokenized_code, list)
            tokens = tokenized_code
        # `i` is a movable cursor over token indices.
        # NOTE(review): next()/prev() presumably raise StopIteration when they
        # step out of range (the body-scanning loop below catches it) — confirm.
        i = ind_iter(len(tokens))
        functions_standalone = []
        functions_class = []
        try:
            token = tokens[i.i]
        except Exception:
            # Nothing readable at the cursor's start position (e.g. empty input).
            return [], []
        while True:
            try:
                # detect function
                is_function = False
                if token == ")":
                    # Peek at up to three following tokens, skipping newline
                    # tokens.  The bound is checked BEFORE advancing so the
                    # peek never reads past the end of `tokens`.
                    tokens_no_newline = []
                    index = i.i
                    while index + 1 < len(tokens) and len(tokens_no_newline) < 3:
                        index += 1
                        if tokens[index].startswith(NEWLINE_TOKEN):
                            continue
                        tokens_no_newline.append(tokens[index])
                    # Slices tolerate a look-ahead shorter than three tokens.
                    is_function = tokens_no_newline[:1] == ["{"] or (
                        tokens_no_newline[:1] == ["throws"]
                        and tokens_no_newline[2:3] == ["{"]
                    )

                if is_function:
                    # go previous until the start of function
                    while token not in [";", "}", "{", "*/", "ENDCOM", NEW_LINE, "\n"]:
                        i.prev()
                        token = tokens[i.i]

                    if token == "*/":
                        # A block comment sits right before the function:
                        # rewind to its opening and keep the whole comment.
                        while token != "/*":
                            i.prev()
                            token = tokens[i.i]
                        function = [token]
                        while token != "*/":
                            i.next()
                            token = tokens[i.i]
                            function.append(token)
                    elif token == "ENDCOM":
                        # Same for a line comment ("//" ... "ENDCOM").
                        while token != "//":
                            i.prev()
                            token = tokens[i.i]
                        function = [token]
                        while token != "ENDCOM":
                            i.next()
                            token = tokens[i.i]
                            function.append(token)
                    else:
                        # Stopped on a delimiter: the function starts right after it.
                        i.next()
                        token = tokens[i.i]
                        function = [token]

                    # Collect the signature up to and including the opening brace.
                    while token != "{":
                        i.next()
                        token = tokens[i.i]
                        function.append(token)

                    # Collect the body, tracking brace depth until the brace
                    # that closes the function.  Running out of tokens keeps
                    # whatever was collected so far.
                    number_indent = 1
                    while not (token == "}" and number_indent == 0):
                        try:
                            i.next()
                            token = tokens[i.i]
                            if token == "{":
                                number_indent += 1
                            elif token == "}":
                                number_indent -= 1
                            function.append(token)
                        except StopIteration:
                            break

                    extracted = self.remove_annotation(" ".join(function))
                    if "static" in function[0: function.index("{")]:
                        functions_standalone.append(extracted)
                    else:
                        functions_class.append(extracted)
                i.next()
                token = tokens[i.i]
            except Exception:
                # End of the token stream (or a malformed construct): stop
                # scanning and return what has been extracted so far.
                # KeyboardInterrupt is not an Exception and still propagates.
                break
        return functions_standalone, functions_class

    def remove_annotation(self, function):
        """Strip leading ``@ Override`` / ``@ Deprecated`` / ``@ SuppressWarnings``
        annotations (with optional parenthesised arguments) from a tokenized
        function string.

        The argument match is non-greedy so it ends at the annotation's own
        closing parenthesis; a greedy match would run to the last `` ) `` of
        the function and delete the signature and most of the body.
        """
        return re.sub(
            r"^(@ (Override|Deprecated|SuppressWarnings) (\( .*? \) )?)*", "", function
        )

    def get_function_name(self, function):
        """Return the function name via the base-class helper
        ``get_first_token_before_first_parenthesis``."""
        return self.get_first_token_before_first_parenthesis(function)

    def extract_arguments(self, function):
        """Return the function arguments via the base-class helper
        ``extract_arguments_using_parentheses``."""
        return self.extract_arguments_using_parentheses(function)

In [16]:
!pip install javalang

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [18]:
import os

# Tokenize every Java file in the dataset folder and group the tokenized
# programs by file stem (the part of the file name before the first dot).
java_codes_list = []   # tokenized programs, in directory-listing order
java_codes_dict = {}   # file stem -> list of tokenized programs with that stem
folder_path = '/content/drive/MyDrive/AVATAR/java'

# Built once instead of once per file.
# NOTE(review): assumes tokenize_code keeps no per-file state on the processor — confirm.
# NOTE(review): root_folder points at a single .java file in another project
# directory; confirm this is what JavaProcessor expects.
jprocessor = JavaProcessor('/content/drive/MyDrive/leetcode/java/0051-n-queens.java')

for file_name in os.listdir(folder_path):
    # Named `source_path` so this loop does not overwrite the global
    # `file_path` that a later cell uses as the JSONL output destination.
    source_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(source_path):
        # Skip sub-directories (e.g. checkpoint folders); open() would fail on them.
        continue
    names = file_name.split('.')
    with open(source_path, 'r', encoding='utf-8') as file:
        code = file.read()
    code_tokens = jprocessor.tokenize_code(code)
    code = ' '.join(code_tokens)
    java_codes_list.append(code)
    # Accumulate per stem; the previous version stored the new program twice
    # and dropped the one already recorded for that stem.
    java_codes_dict.setdefault(names[0], []).append(code)

/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-ja

In [19]:
import os

# Load every Java file in the dataset folder, keeping both the raw text and
# the space-joined token sequence, keyed by file stem (the part of the file
# name before the first dot).
java_codes_list = []   # tokenized programs, in directory-listing order
java_codes_dict = {}   # file stem -> tokenized program (space-joined tokens)
java_dict_2 = {}       # file stem -> raw file contents
folder_path = '/content/drive/MyDrive/AVATAR/java'

# Built once instead of once per file.
# NOTE(review): assumes tokenize_code keeps no per-file state on the processor — confirm.
# NOTE(review): root_folder points at a single .java file in another project
# directory; confirm this is what JavaProcessor expects.
jprocessor = JavaProcessor('/content/drive/MyDrive/leetcode/java/0051-n-queens.java')

for file_name in os.listdir(folder_path):
    # Named `source_path` so this loop does not overwrite the global
    # `file_path` that a later cell uses as the JSONL output destination.
    source_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(source_path):
        # Skip sub-directories (e.g. checkpoint folders); open() would fail on them.
        continue
    names = file_name.split('.')
    with open(source_path, 'r', encoding='utf-8') as file:
        code = file.read()
    java_dict_2[names[0]] = code
    code_tokens = jprocessor.tokenize_code(code)
    code = ' '.join(code_tokens)
    java_codes_list.append(code)
    java_codes_dict[names[0]] = code

/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-java
/content/drive/MyDrive/AVATAR/third_party/tree-sitter-ja

In [21]:
class PythonProcessor():
    """Tokenizer, detokenizer and function extractor for Python source code.

    Tokenized code is a flat sequence of tokens in which line structure is
    encoded with the marker tokens ``NEW_LINE``, ``INDENT`` and ``DEDENT``.
    """

    def __init__(self, root_folder=None):
        """Set up the special-token tables.

        Args:
            root_folder: Not used by this class; accepted so the constructor
                can be called with the same argument as ``JavaProcessor``.
        """
        # Placeholder token -> character sequence; handed to `process_string`
        # together with the reverse table below.
        self.spetoken2char = {
            "STOKEN00": "#",
            "STOKEN1": "\\n",
            "STOKEN2": '"""',
            "STOKEN3": "'''",
        }
        # Character sequence -> placeholder token padded with spaces.
        self.char2spetoken = {
            value: " " + key + " " for key, value in self.spetoken2char.items()
        }
        self.language = "python"

    def tokenize_code(self, code, keep_comments=False, process_strings=True):
        """Tokenize Python source into a flat list of token strings.

        Args:
            code: Source code as a ``str``.
            keep_comments: When true, comments and docstrings are kept (after
                going through ``process_string``); otherwise they are dropped.
            process_strings: Forwarded to ``process_string`` as
                ``do_whole_processing``.

        Returns:
            The token list, without the final ``ENDMARKER`` token.

        Raises:
            SyntaxError: If ``tokenize.tokenize`` rejects the input up front.
            ValueError: If the tokenizer fails while iterating over the source.
            Exception: If the token stream ends before an ENDMARKER token.
        """
        assert isinstance(code, str)
        # Drops both the two-character sequence backslash-r and real carriage returns.
        code = code.replace(r"\r", "")
        code = code.replace("\r", "")
        tokens = []

        try:
            iterator = tokenize.tokenize(BytesIO(code.encode("utf-8")).readline)
        except SyntaxError as excep:
            raise SyntaxError(excep) from excep

        # Set to 1 right after a docstring was dropped so that the NEWLINE
        # token that follows it is dropped as well.
        removed_docstr = 0
        while True:
            try:
                toktype, tok, _, _, line = next(iterator)
            except (
                    tokenize.TokenError,
                    IndentationError,
                    SyntaxError,
                    UnicodeDecodeError,
            ) as e:
                raise ValueError(
                    f'Impossible to parse tokens because of incorrect source code "{e}" ...'
                ) from e
            except StopIteration:
                raise Exception("End of iterator before ENDMARKER token.")

            if toktype == tokenize.ENCODING or toktype == tokenize.NL:
                continue

            elif toktype == tokenize.NEWLINE:
                if removed_docstr == 1:
                    removed_docstr = 0
                    continue
                tokens.append("NEW_LINE")

            elif toktype == tokenize.COMMENT:
                if keep_comments:
                    com = process_string(
                        tok,
                        self.char2spetoken,
                        self.spetoken2char,
                        True,
                        do_whole_processing=process_strings,
                    )
                    if len(com) > 0:
                        tokens.append(com)
                else:
                    continue

            elif toktype == tokenize.STRING:
                # A string that is the only thing on its line is treated as a docstring.
                if tok == line.strip():  # docstring
                    if not keep_comments:
                        removed_docstr = 1
                        continue
                    else:
                        coms = process_string(
                            tok,
                            self.char2spetoken,
                            self.spetoken2char,
                            True,
                            do_whole_processing=process_strings,
                        )
                        if len(coms) > 0:
                            tokens.append(coms)
                        else:
                            removed_docstr = 1
                else:
                    tokens.append(
                        process_string(
                            tok,
                            self.char2spetoken,
                            self.spetoken2char,
                            False,
                            do_whole_processing=process_strings,
                        )
                    )

            elif toktype == tokenize.INDENT:
                tokens.append("INDENT")

            elif toktype == tokenize.DEDENT:
                # empty block: cancel the INDENT that was just emitted
                if tokens[-1] == "INDENT":
                    tokens = tokens[:-1]
                else:
                    tokens.append("DEDENT")

            elif toktype == tokenize.ENDMARKER:
                tokens.append("ENDMARKER")
                break

            else:
                tokens.append(tok)

        assert tokens[-1] == "ENDMARKER", "Error, no end marker"
        return tokens[:-1]

    def detokenize_code(self, code):
        """Rebuild source text from tokenized code.

        Args:
            code: A space-separated token string or a list of tokens.

        Returns:
            Source text in which ``NEW_LINE`` / ``INDENT`` / ``DEDENT`` markers
            are turned back into line breaks and four-space indentation, and
            string/comment placeholders are restored on a best-effort basis.
        """
        # Recreate lines with \n and the appropriate indentation, removing the
        # INDENT / DEDENT marker tokens.
        assert isinstance(code, str) or isinstance(code, list)
        if isinstance(code, list):
            code = " ".join(code)
        code = code.replace("ENDCOM", "NEW_LINE")
        code = code.replace("▁", "SPACETOKEN")
        lines = code.split("NEW_LINE")
        tabs = ""
        for i, line in enumerate(lines):
            line = line.strip()
            if line.startswith("INDENT "):
                tabs += "    "
                line = line.replace("INDENT ", tabs)
            elif line.startswith("DEDENT"):
                # Also covers a line that is exactly "DEDENT": it becomes empty.
                number_dedent = line.count("DEDENT")
                tabs = tabs[4 * number_dedent:]
                line = line.replace("DEDENT", "")
                line = line.strip()
                line = tabs + line
            else:
                line = tabs + line
            lines[i] = line
        untok_s = "\n".join(lines)
        # find string and comment with parser and detokenize string correctly
        try:
            for toktype, tok, _, _, line in tokenize.tokenize(
                    BytesIO(untok_s.encode("utf-8")).readline
            ):
                if toktype == tokenize.STRING or toktype == tokenize.COMMENT:
                    tok_ = (
                        tok.replace("STRNEWLINE", "\n")
                            .replace("TABSYMBOL", "\t")
                            .replace(" ", "")
                            .replace("SPACETOKEN", " ")
                    )
                    untok_s = untok_s.replace(tok, tok_)
        except Exception:
            # Best effort: if the rebuilt text does not tokenize, keep what we have.
            # TODO raise ValueError(f'Invalid python function \n {code}\n')
            pass
        # detokenize imports
        untok_s = (
            untok_s.replace(". ", ".")
                .replace(" .", ".")
                .replace("import.", "import .")
                .replace("from.", "from .")
        )
        # special strings: re-attach string prefixes to their opening quote
        string_modifiers = ["r", "u", "f", "rf", "fr", "b", "rb", "br"]
        for modifier in string_modifiers + [s.upper() for s in string_modifiers]:
            untok_s = untok_s.replace(f" {modifier} '", f" {modifier}'").replace(
                f' {modifier} "', f' {modifier}"'
            )
        untok_s = untok_s.replace("> >", ">>").replace("< <", "<<")
        return untok_s

    def obfuscate_code(self, code):
        """Obfuscate ``code`` and return ``(obfuscated_code, mapping_string)``.

        NOTE(review): ``obfuscateString`` is not imported in any visible cell
        of this notebook, so calling this method would raise ``NameError``
        unless the name is provided elsewhere — confirm.
        """
        res, dico = obfuscateString(code, obfuscateNames=True, removeDocstrings=False)
        return res, dico_to_string(dico)

    def extract_functions(self, tokenized_code: str):
        """Extract functions from tokenized python code.

        Args:
            tokenized_code: A space-separated token string or a list of tokens.

        Returns:
            ``(functions_standalone, functions_class)``: two lists of
            space-joined function strings.  A function whose first parameter
            token is ``self`` goes to the second list, any other to the first.
            Functions that match one of the Python 2 patterns below are dropped.
        """
        if isinstance(tokenized_code, str):
            tokenized_code = tokenized_code.split()
        else:
            assert isinstance(tokenized_code, list)
            tokenized_code = tokenized_code

        def filter_functions_python_2_3(function):
            """Return ``function`` unless it contains Python 2-only constructs
            (print statement, old raise/except syntax, long literals,
            iter*/xrange/imap calls); return ``None`` in that case."""
            if (
                    re.search(r"print [^(]", function) is None
                    and re.search(r"raise \w+ ,", function) is None
                    and re.search(r"except \w+ ,", function) is None
                    and re.search(r"[^ ]+ = \d+ L", function) is None
                    and ". iterkeys ( )" not in function
                    and ". itervalues ( )" not in function
                    and ". iteritems ( )" not in function
                    and "xrange (" not in function
                    and "imap (" not in function
            ):
                return function
            else:
                return None

        tokens = iter(tokenized_code)
        functions_standalone = []
        functions_class = []
        number_indent = 0
        try:
            token = next(tokens)
        except StopIteration:
            return [], []
        while True:
            try:
                if token == "def":
                    function = ["def"]
                    # Consume tokens until the DEDENT that brings the
                    # indentation depth back to zero.
                    while not (token == "DEDENT" and number_indent == 0):
                        token = next(tokens)
                        if token == "INDENT":
                            number_indent += 1
                        elif token == "DEDENT":
                            number_indent -= 1
                        function.append(token)
                    try:
                        is_method = function[function.index("(") + 1] == "self"
                    except (ValueError, IndexError):
                        # No "(" (or nothing after it): report the token list and move on.
                        print(function)
                        token = next(tokens)
                    else:
                        kept = filter_functions_python_2_3(" ".join(function))
                        if kept is not None:
                            if is_method:
                                functions_class.append(kept)
                            else:
                                functions_standalone.append(kept)
                else:
                    token = next(tokens)
            except StopIteration:
                # Token stream exhausted (a function cut off mid-way is dropped).
                break
        return functions_standalone, functions_class

    def get_function_name(self, function):
        """Return the token that follows the first ``def`` in ``function``.

        Args:
            function: A space-separated token string or a list of tokens.
        """
        assert isinstance(function, str) or isinstance(function, list)
        if isinstance(function, str):
            function = function.split()
        return function[function.index("def") + 1]


In [23]:
import tokenize
from io import BytesIO
import re
from codegen.preprocessing.lang_processors.tokenization_utils import (
    process_string,
)
# Load every Python file in the dataset folder, keeping both the raw text and
# the space-joined token sequence, keyed by file stem (the part of the file
# name before the first dot).
python_codes_list = []   # tokenized programs, in directory-listing order
python_codes_dict = {}   # file stem -> tokenized program (space-joined tokens)
py_dict_2 = {}           # file stem -> raw file contents
folder_path = '/content/drive/MyDrive/AVATAR/python'

# Built once instead of once per file.
pyprocessor = PythonProcessor()

for file_name in os.listdir(folder_path):
    # Named `source_path` so this loop does not overwrite the global
    # `file_path` that a later cell uses as the JSONL output destination.
    source_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(source_path):
        # Skip sub-directories (e.g. checkpoint folders); open() would fail on them.
        continue
    names = file_name.split('.')
    with open(source_path, 'r', encoding='utf-8') as file:
        code = file.read()
    py_dict_2[names[0]] = code
    code_tokens = pyprocessor.tokenize_code(code)
    code = ' '.join(code_tokens)
    python_codes_list.append(code)
    python_codes_dict[names[0]] = code

In [24]:
# Keys of the tokenized Java programs, in insertion order.
java_keys_list = list(java_codes_dict)

In [27]:
# Pair up the tokenized Java and Python programs that share a key:
# final_dict[key] == [java_tokens, python_tokens].
final_dict = {}
for i in java_keys_list:
    if i not in python_codes_dict:
        # No Python counterpart for this key.
        continue
    lst = [java_codes_dict[i], python_codes_dict[i]]
    final_dict[i] = lst


In [28]:
# Keys that have both a Java and a Python program, in insertion order.
final_keys_list = [*final_dict]

In [29]:
import json

# Output file that receives one JSON object per line.
file_path = '/content/drive/MyDrive/data_file_1.jsonl'


def append_to_jsonl(dictionary, file_path):
    """Append ``dictionary`` to ``file_path`` as one JSON-encoded line.

    The file is opened in append mode, so existing content is kept and every
    call adds exactly one line (calling twice with the same record writes it
    twice).

    Args:
        dictionary: JSON-serialisable object to write.
        file_path: Path of the JSON Lines file; created if it does not exist.
    """
    with open(file_path, mode='a') as out_file:
        json.dump(dictionary, out_file)
        out_file.write('\n')


In [32]:
# Inspect the [java_tokens, python_tokens] pair stored under key '0'.
final_dict['0']

['class Solution { List & lt ; Integer & gt ; nodeValues = new ArrayList & lt ; & gt ; ( ) ; void dfs ( TreeNode node ) { if ( node == null ) { return ; } nodeValues . add ( node . val ) ; dfs ( node . left ) ; dfs ( node . right ) ; } int getMinimumDifference ( TreeNode root ) { dfs ( root ) ; Collections . sort ( nodeValues ) ; int minDifference = Integer . MAX_VALUE ; for ( int i = 1 ; i & lt ; nodeValues . size ( ) ; i ++ ) { minDifference = Math . min ( minDifference , nodeValues . get ( i ) - nodeValues . get ( i - 1 ) ) ; } return minDifference ; } } ;',
 'class Solution : NEW_LINE INDENT def getMinimumDifference ( self , root : Optional [ TreeNode ] ) - & gt ; int : NEW_LINE INDENT nodeValues = [ ] NEW_LINE def dfs ( node ) : NEW_LINE INDENT if node is None : NEW_LINE INDENT return NEW_LINE DEDENT nodeValues . append ( node . val ) NEW_LINE dfs ( node . left ) NEW_LINE dfs ( node . right ) NEW_LINE DEDENT dfs ( root ) NEW_LINE nodeValues . sort ( ) NEW_LINE minDifference = 1e9 

In [33]:
# Number of keys present in both the Java and the Python dictionaries.
len(final_keys_list)

1069

In [50]:
 # For every paired key build two entries and write one of them to disk:
 #   req_dict_2   <- {key: [raw_java, raw_python]}
 #   initial_dict <- {"id": key, "java": [java_tokens], "python": [python_tokens]}
 # Each tokenized record is also appended to the JSONL file, so re-running
 # this cell appends the same records again.
 initial_dict, req_dict_2 = [], []
 for i in final_keys_list:
    # Raw (untokenized) sources for this key.
    lst = [java_dict_2[i], py_dict_2[i]]
    i_2 = {i: lst}
    req_dict_2.append(i_2)
    # Tokenized sources, each wrapped in a one-element list.
    lst_1 = [final_dict[i][0]]
    lst_2 = [final_dict[i][1]]
    i_dict = {"id": i, "java": lst_1, "python": lst_2}
    initial_dict.append(i_dict)
    append_to_jsonl(i_dict, file_path)


In [56]:
# Inspect the fourth tokenized record (id plus one-element java/python lists).
initial_dict[3]

{'id': '72',
 'java': ['class Solution { public int [ ] twoSum ( int [ ] nums , int target ) { for ( int i = 0 ; i & lt ; nums . length ; i ++ ) { for ( int j = i + 1 ; j & lt ; nums . length ; j ++ ) { if ( nums [ j ] == target - nums [ i ] ) { return new int [ ] { i , j } ; } } } return null ; } }'],
 'python': ['class Solution : NEW_LINE INDENT def twoSum ( self , nums : List [ int ] , target : int ) - & gt ; List [ int ] : NEW_LINE INDENT for i in range ( len ( nums ) ) : NEW_LINE INDENT for j in range ( i + 1 , len ( nums ) ) : NEW_LINE INDENT if nums [ j ] == target - nums [ i ] : NEW_LINE INDENT return [ i , j ] NEW_LINE DEDENT DEDENT DEDENT DEDENT DEDENT']}

In [55]:
# Inspect the fourth raw record: {key: [raw_java, raw_python]}.
req_dict_2[3]

{'72': ["class Solution {\n    public int[] twoSum(int[] nums, int target) {\n        for (int i = 0; i &lt; nums.length; i++) {\n            for (int j = i + 1; j &lt; nums.length; j++) {\n                if (nums[j] == target - nums[i]) {\n                    return new int[] { i, j };\n                }\n            }\n        }\n        // In case there is no solution, we'll just return null\n        return null;\n    }\n}",
  'class Solution:\n    def twoSum(self, nums: List[int], target: int) -&gt; List[int]:\n        for i in range(len(nums)):\n            for j in range(i + 1, len(nums)):\n                if nums[j] == target - nums[i]:\n                    return [i, j]']}

In [41]:
# Inspect the raw Java source stored under key '70'.
java_dict_2['70']

'class Solution {\n    public List&lt;List&lt;String&gt;&gt; groupAnagrams(String[] strs) {\n        if (strs.length == 0) return new ArrayList();\n        Map&lt;String, List&gt; ans = new HashMap&lt;String, List&gt;();\n        for (String s : strs) {\n            char[] ca = s.toCharArray();\n            Arrays.sort(ca);\n            String key = String.valueOf(ca);\n            if (!ans.containsKey(key)) ans.put(key, new ArrayList());\n            ans.get(key).add(s);\n        }\n        return new ArrayList(ans.values());\n    }\n}'

In [42]:
# Inspect the raw Python source stored under key '70'.
py_dict_2['70']

'class Solution(object):\n    def groupAnagrams(self, strs):\n        ans = collections.defaultdict(list)\n        for s in strs:\n            ans[tuple(sorted(s))].append(s)\n        return ans.values()'