# In [None]:
'''
Semantic clone detector.
This project aims to build a type-4 (semantic) clone detector.
'''
import os
import csv
"""
source_data : original dataset of semantic clone bench
target_data : each file is divided into three parts.
------------------
Naming convention for the split files: c_000A, c_000B and scb_c_000_AB
------------------
c_000A or c_000B
------------------
c programming language (c), file# 000,
first method/function(A), second method/function(B)
------------------
source_c_000_AB
------------------
stack overflow source for c language and clone pairs A, B for file 000
"""

def save_directories(data_paths, lang):
    '''
    Dump the collected rows to a per-language CSV file.

    The file is written to ./dataset_splitted/_<lang>_dataset_all_positive.csv,
    relative to the current working directory.

    :param data_paths: list of rows (each a list of values) to write
    :param lang: programming language name, used to build the file name
    :return: nothing
    '''
    csv_path = "".join(('./dataset_splitted/_', lang, "_dataset_all_positive.csv"))
    with open(csv_path, "w", newline="") as handle:
        csv.writer(handle).writerows(data_paths)
        print("saved as ", csv_path)

def write2text(contents, fp):
    """
    Write each part of a split file to its own text file.

    Parts are matched positionally with a fixed list of file-name suffixes
    (source, first fragment, second fragment, question id, first answer id,
    second answer id, label). If fewer parts than suffixes are given, the
    remaining files are simply not written.

    :param contents: sequence of parts; each part is either a list of lines
                     or a single string (e.g. an ID)
    :param fp: path prefix to which each suffix is appended
    :return: nothing
    """
    suffixes = ('_source', '01', '02',
                '_question_id', '_first_fragment_answer_id', '_second_fragment_answer_id',
                '_label'
                )
    for content, suffix in zip(contents, suffixes):
        # A bare string is one entry; iterating it directly would write
        # every character on its own line.
        lines = [content] if isinstance(content, str) else content
        # The context manager closes the file even if a write fails.
        with open(fp + suffix, 'w') as textfile:
            for element in lines:
                textfile.write(element + "\n")


def extractor(file_path, line_number, lang=None):
    """
    Read one section of a file, starting after a given line.

    Lines whose 0-based index is greater than ``line_number`` are collected
    until a line starting with ``*/`` or a blank line is reached; the
    terminating line is included in the result. For python sources, blank
    lines do not terminate the section once ``line_number`` is greater
    than 1.

    :param file_path: file to open
    :param line_number: 0-based index of the last line to skip
    :param lang: source programming language; when omitted, the module-level
                 ``lang`` variable is used if it is defined
    :return: tuple (content, number, qaa) where content is the list of
             collected lines, number is the index of the last line read
             (``line_number`` if the file is empty) and qaa is the list of
             Stack Overflow question/answer IDs found in the section
    """
    if lang is None:
        # Backward compatibility: earlier versions read the global `lang`.
        lang = globals().get('lang')

    # Header markers whose remainder is the question / answer ID.
    # Java specific targets, might work for other languages.
    id_prefixes = (
        '*  Stack overflow Question #:',
        '*  Stack Overflow answer #:',
        '*  And Stack Overflow answer#:',
    )

    content = []
    qaa = []
    # Keeps `number` defined when the file has no lines at all.
    number = line_number

    with open(file_path, 'r') as file:
        for number, line in enumerate(file):
            if number <= line_number:
                continue
            content.append(line)

            for prefix in id_prefixes:
                if line.startswith(prefix):
                    qaa.append(line[len(prefix):])
                    break

            if line.startswith('*/'):
                break
            if not line.strip():
                # Python fragments may contain blank lines, so only the
                # first section of a python file stops at one.
                if lang == "python" and line_number > 1:
                    continue
                break

    return content, number, qaa


def split_section(file_path, lang):
    """
    Split one dataset file into its source header and two clone fragments.

    :param file_path: file to open
    :param lang: source programming language
    :return: tuple (source, clone_a, clone_b, question_id, answer_one_id,
             answer_two_id); for python files the source is a placeholder
             and the three IDs are 'u'
    """
    # 'u' stands for "unavailable"
    ids = ['u', 'u', 'u']
    header = ['source unavailable']

    # Line index after which reading starts
    cursor = 0 if lang == 'java' else -1

    if lang != 'python':
        # Non-python files begin with a header section holding the IDs
        header, cursor, found_ids = extractor(file_path, cursor)
        ids = [found_ids[0], found_ids[1], found_ids[2]]

    # Each call resumes after the last line read by the previous one
    first_clone, cursor, _ = extractor(file_path, cursor)
    second_clone, cursor, _ = extractor(file_path, cursor)

    if lang == 'java':
        # Drop the last collected line of the second fragment for java
        second_clone.pop()

    return header, first_clone, second_clone, ids[0], ids[1], ids[2]




def process_all_files(source, target, lang):
    """
    Split every file found under ``source`` and record the results in a CSV.

    Each file is split with split_section(), its parts are written with
    write2text() under ``target``, and one row per file is collected and
    finally saved with save_directories().

    :param source: source file directory
    :param target: target file directory (used as a string prefix)
    :param lang: programming language in use
    :return: nothing
    """
    # List of rows (header first) saved as CSV after the loop
    data_paths = [["source", "first_fragment", "second_fragment", "question_id",
                   "first_fragment_answer_id", "second_fragment_answer_id", "label"]]
    for root, _dirs, files in os.walk(source):
        print(root)
        for file in files:
            # os.walk descends into sub-directories, so the path must be
            # built from the directory currently being visited.
            file_path = os.path.join(root, file)
            contents = split_section(file_path, lang)
            # Drops the first 5 and last 4 characters of the file name
            # (presumably a fixed prefix and the extension - verify).
            fname = str(file)[5:-4].lower()
            ftarget = target + fname

            # The CSV row starts with the paths of the three split files.
            # NOTE(review): these paths end in '.txt' while write2text()
            # writes files without that extension - confirm which is intended.
            to_save = [ftarget + '_source.txt', ftarget + '01.txt', ftarget + '02.txt']
            if lang != 'python':
                # Real question/answer IDs, with the last character
                # (presumably the trailing newline) removed
                to_save += [contents[3][:-1], contents[4][:-1], contents[5][:-1], 1]
            else:
                # u -> unavailable; the label is still emitted so that every
                # row has the same number of columns as the header
                to_save += ['u', 'u', 'u', 1]
            data_paths.append(to_save)
            print(to_save)

            # Write each part of the file in each iteration
            write2text(contents, ftarget)
    # Save all rows as CSV once every file has been processed
    save_directories(data_paths, lang)

# In [None]:
if __name__ == '__main__':
    # Run configuration: normally only `lang` needs to change; the source and
    # target directories below are derived from it.
    # NOTE: `lang` is assigned at module level here and extractor() reads it.
    lang =  'python'# options: java, python, cs, c
    source = './dataset_standalones/'+lang+'/' # input directory; change this when files need to be split again
    target = './dataset_splitted/'+lang+'/'
    # Uncomment the following line to run the split.
    #process_all_files(source, target, lang)