In [144]:
import clang.cindex
from clang.cindex import CursorKind
from anytree import RenderTree
from tree_node import Node
import utils
from node_handler import *
import re
import subprocess
import pexpect
from tqdm import tqdm

In [4]:
import pandas as pd

In [5]:
solutions = pd.read_csv('../codeforces-scraper/data/contests_solutions_metadata/solutions_600.csv')

Unnamed: 0,solutionId,contestId,problem,programmingLanguage
30,106210928,1070,K,GNU C++17


In [46]:
problems = pd.read_csv('../codeforces-scraper/data/contests_problems/problems_data_600.csv')

In [6]:
class AST_parser:
    def __init__(self, clang_lib_file='/usr/lib/x86_64-linux-gnu/libclang-6.0.so.1'):
        try:
            clang.cindex.Config.set_library_file(clang_lib_file)
        except Exception as e:
            print(f'Skipped setting library file: {e}')

        self.index = clang.cindex.Index.create()


    def parse_ast(self, input_file_path):
        tu = self.index.parse(input_file_path)
        cursor_items = self.get_cursor_items(tu.cursor)
        root_node = Node('root', is_reserved=True)

        for cursor_item in cursor_items:
            self.parse_item(cursor_item, root_node)

        return root_node


    def get_cursor_items(self, cursor):
        cursor_items = []
        user_code_started = False
        root_node = Node('root', True)

        for child in cursor.get_children():
            if user_code_started:
                cursor_items.append(child)
            if child.kind == CursorKind.USING_DIRECTIVE:
                user_code_started = True

        return cursor_items


    def parse_item(self, ast_item, parent_node):
        # skip meaningless AST primitives
        if ast_item.kind == CursorKind.DECL_STMT or ast_item.kind == CursorKind.UNEXPOSED_EXPR \
            or ast_item.kind == CursorKind.TEMPLATE_REF or ast_item.kind == CursorKind.NAMESPACE_REF:
            pass


        # Parse typdef
        elif utils.is_typedef(ast_item):
            handle_typedef(ast_item, parent_node)


        # parse declaration
        elif ast_item.kind.is_declaration():
            parent_node = handle_declaration(ast_item, parent_node, self.parse_item)

        # parse operator
        elif utils.is_operator(ast_item):
            parent_node = handle_operator(ast_item, parent_node)

        # parse literal
        elif utils.is_literal(ast_item):
            handle_literal(ast_item, parent_node)


        # parse call expression
        elif utils.is_call_expr(ast_item):
            parent_node = handle_call_expr(ast_item, parent_node)


        # parse reference
        elif utils.is_reference(ast_item):
            handle_reference(ast_item, parent_node)
        

        # parse type ref
        elif ast_item.kind == CursorKind.TYPE_REF and parent_node.label != 'root':
            handle_type_ref(ast_item, parent_node)


        # if not one of the above -> create simple parent node of the kind of the item
        else:
            parent_node = Node(ast_item.kind.name, is_reserved=True, parent=parent_node)


        for child in ast_item.get_children():
            if child.kind != CursorKind.PARM_DECL:
                self.parse_item(child, parent_node)

In [7]:
ast_parser = AST_parser()
ast = ast_parser.parse_ast('../data/subset/cpp_preprocessed/104558927.cpp')

Skipped setting library file: library file must be set before before using any other functionalities in libclang.


NameError: name 'CursorKind' is not defined

In [None]:
for pre, fill, node in RenderTree(ast):
    treestr = u"%s%s" % (pre, node.label)
    print(treestr)

In [None]:
from anytree.exporter import JsonExporter

exporter = JsonExporter(indent=2)
with open('tree.json', 'w') as file:
    file.write(exporter.export(root_node))


In [124]:
for _,_,files in os.walk('../data/subset/ast_trees_to_code/'):
    print(files)

['104554194.cpp', '104554251.cpp', '104558927.cpp', '104560220.cpp', '104604737.cpp', '104606100.cpp', '104663473.cpp', '104696038.cpp', '104759177.cpp', '104759215.cpp', '104760965.cpp', '104766018.cpp', '104930101.cpp', '104931090.cpp', '104939468.cpp', '105028826.cpp', '105052007.cpp', '105066757.cpp', '105085785.cpp', '105087511.cpp', '105087586.cpp', '105113472.cpp', '105121226.cpp', '105129923.cpp', '106067757.cpp', '106076703.cpp', '106076991.cpp', '106079532.cpp', '106079587.cpp', '106081350.cpp', '106081812.cpp', '106085118.cpp', '106098695.cpp', '106181322.cpp', '106187233.cpp', '106188303.cpp', '106207179.cpp', '106209681.cpp', '106210901.cpp', '106210928.cpp', '106219172.cpp', '106283598.cpp', '106293015.cpp', '106295849.cpp', '106389953.cpp', '106395892.cpp', '106456255.cpp', '106461660.cpp', '106504120.cpp', '106508975.cpp', '106509000.cpp', '106534201.cpp', '106612149.cpp', '106637474.cpp', '106643718.cpp', '106696279.cpp', '106697152.cpp', '106701857.cpp', '106705445.cp

In [141]:
solution = 104554194


solutions[solutions['solutionId'] == solution]
solution_data = solutions[solutions['solutionId'] == solution]

tests = problems[(problems['contestId'] == solution_data['contestId'].iloc[0]) & (problems['problem'] == solution_data['problem'].iloc[0])]['allTests'].iloc[0]
print(solution_data)
tests[:500]

     solutionId  contestId problem programmingLanguage
107   104554194       1070       B           GNU C++11


"[('1\\n-149.154.167.99', '1\\n0.0.0.0/0'), ('4\\n-149.154.167.99\\n+149.154.167.100/30\\n+149.154.167.128/25\\n-149.154.167.120/29', '2\\n149.154.167.96/30\\n149.154.167.112/28'), ('5\\n-127.0.0.4/31\\n+127.0.0.8\\n+127.0.0.0/30\\n-195.82.146.208/29\\n-127.0.0.6/31', '2\\n127.0.0.4/30\\n128.0.0.0/1'), ('2\\n+127.0.0.1/32\\n-127.0.0.1', '-1'), ('1\\n-0.0.0.0/32', '1\\n0.0.0.0/0'), ('1\\n-0.0.0.10', '1\\n0.0.0.0/0'), ('1\\n-0.0.0.1', '1\\n0.0.0.0/0'), ('1\\n-0.0.0.13/32', '1\\n0.0.0.0/0'), ('1\\n-0.0.0.7', '1\\n0.0.0.0/0'),"

In [200]:

not_compiled = []
tests_failed = []
success = []

for dirs,_,files in os.walk('../data/subset/ast_trees_to_code/'):
    for file in tqdm(files):
        if file.endswith('.cpp'):
            solution = int(file.split('.')[0])
            proc = subprocess.Popen(['g++', f'{dirs}{file}', '-o', f'{dirs}{solution}.out'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            proc.wait()
            output, err = p.communicate()
            rc = p.returncode

            solution_data = solutions[solutions['solutionId'] == solution]

            tests = problems[(problems['contestId'] == solution_data['contestId'].iloc[0]) & (problems['problem'] == solution_data['problem'].iloc[0])]['allTests'].iloc[0]

            tests_split = re.split(r'([^()]+)', tests[1:-1])
            tests_split_filtered = list(filter(lambda a: a != '(' and a != ')' and a != ', ', tests_split))

            try:
                for test in tests_split_filtered:
                    program_input = test.replace('\\n', '\r\n').replace("'", '').split(', ')[0].strip()
                    program_output = test.replace('\\n', '\r\n').replace("'", '').split(', ')[1].strip()

                    # Scrip scraped tests that were cut off by codeforces (too long)
                    if not program_input.endswith('...'):
                        analyzer = pexpect.spawn(f'{dirs}{solution}.out', encoding='utf-8')

                        for inp in program_input.split('\n'):
                            analyzer.expect('')
                            analyzer.sendline(inp)
                        try:
                            analyzer.expect('program_output')
                            # print(f'Sucess on {file}')
                            success.append(file)
                        except Exception as e:
                            # print(f'Failed on {file}')
                            tests_failed.append(file)
                            break

            except Exception as e:
                not_compiled.append(file)
                # print(f'File {file} could not be compiled: {e}')


 91%|█████████▏| 115/126 [04:58<00:28,  2.59s/it]


KeyboardInterrupt: 

In [203]:
print(len(not_compiled))
print(len(tests_failed))
print(len(success))


63
26
0
