# Create datasets

In [1]:
import yaml
import os

import tqdm.autonotebook as tqdm
import pandas as pd

from utils.lang_enum import languages

DF_CACHE_PATH = "../../datasets/cache/all.pickle"
DF_PATHONLY_CACHE_PATH = "../../datasets/cache/path_only.pickle"

  import tqdm.autonotebook as tqdm


In [2]:
MAX_FILESIZE = 0.5 * 1024 * 1024  # 0.5 MiB; files of this size or larger are skipped

def collect_files_content(root_dir, yaml_path):
    """Collect the text of every source file described by a language-mapping YAML.

    The YAML at ``yaml_path`` is read as a mapping of
    ``language_tag -> {language: extension | [extensions]}``. For each
    ``language`` the directory ``root_dir/language`` is walked recursively
    (following symlinks) and every file whose name ends with one of the
    extensions is read. The special extension ``".*"`` matches every file.

    Args:
        root_dir: Directory containing one sub-directory per ``language``.
        yaml_path: Path to the YAML mapping file.

    Returns:
        A list of ``(language_tag, language, content, abs_path)`` tuples.
        Empty files and files of ``MAX_FILESIZE`` bytes or more are skipped,
        as are files that cannot be read or decoded. A file that matches
        several extensions of the same language is appended once per match.
    """
    with open(yaml_path, 'r') as f:
        mappings = yaml.safe_load(f)

    data_list = []

    for language_tag, language_mapping in tqdm.tqdm(mappings.items()):
        for language, extensions in language_mapping.items():
            if not isinstance(extensions, list):
                extensions = [extensions]

            # The directory depends only on the language, so check it once
            # instead of once per extension.
            lang_dir = os.path.join(root_dir, language)
            if not os.path.exists(lang_dir):
                continue

            for extension in extensions:
                for dirpath, _dirnames, filenames in os.walk(lang_dir, followlinks=True):
                    for filename in filenames:
                        if extension != ".*" and not filename.endswith(extension):
                            continue

                        file_path = os.path.join(dirpath, filename)
                        try:
                            filesize = os.path.getsize(file_path)
                            if not 0 < filesize < MAX_FILESIZE:
                                continue
                            with open(file_path, 'r') as file:
                                content = file.read()
                            if len(content) > 0:
                                abs_path = os.path.abspath(file_path)
                                data_list.append((language_tag, language, content, abs_path))
                        except (OSError, UnicodeError):
                            # Best effort: unreadable or undecodable files are
                            # skipped, but KeyboardInterrupt and programming
                            # errors are no longer swallowed.
                            continue

    return data_list

### RosettaCodeData

In [3]:
# RosettaCode corpus: mapping file under Conf/, sources under Lang/.
datafile = "../../datasets/RosettaCodeData/Conf/nlang.yaml"
dataroot = "../../datasets/RosettaCodeData/Lang"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
rosetta_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="rosetta")  # tag every row with its origin

  0%|          | 0/20 [00:00<?, ?it/s]

### DLLDCodeData

In [4]:
# deep-learning-lang-detection corpus.
datafile = "../../datasets/deep-learning-lang-detection/data/nlang.yaml"
dataroot = "../../datasets/deep-learning-lang-detection/data/all"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
dlld_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="dlld")  # tag every row with its origin

  0%|          | 0/13 [00:00<?, ?it/s]

### GitHubCodeData

In [5]:
# GitHub corpus, "langs" subset.
datafile = "../../datasets/github/langs/nlang.yaml"
dataroot = "../../datasets/github/langs/"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
github_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="github_langs")  # tag every row with its origin

  0%|          | 0/28 [00:00<?, ?it/s]

In [6]:
# GitHub corpus, "other" subset.
datafile = "../../datasets/github/other/nlang.yaml"
dataroot = "../../datasets/github/other/"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
github_other_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="github_other")  # tag every row with its origin

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# GitHub corpus, "other_langs" subset.
datafile = "../../datasets/github/other_langs/nlang.yaml"
dataroot = "../../datasets/github/other_langs/"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
github_other_langs_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="github_other_langs")  # tag every row with its origin

  0%|          | 0/1 [00:00<?, ?it/s]

### Generated

In [8]:
# Files under datasets/generated/.
datafile = "../../datasets/generated/nlang.yaml"
dataroot = "../../datasets/generated/"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
gen_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="generated")  # tag every row with its origin

  0%|          | 0/2 [00:00<?, ?it/s]

### Tgdataset

In [9]:
# First tgdataset directory.
datafile = "../../datasets/tgdataset/nlang.yaml"
dataroot = "../../datasets/tgdataset/"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
tgdataset_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="tgdataset")

# Second tgdataset directory, kept as a separate source label.
datafile = "../../datasets/tgdataset2/nlang.yaml"
dataroot = "../../datasets/tgdataset2/"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
tgdataset2_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="tgdataset2")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

### Stackoverflow

In [10]:
# Stack Overflow snippets.
datafile = "../../datasets/stackoverflow/nlang.yaml"
dataroot = "../../datasets/stackoverflow/"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
stack_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="stackoverflow")  # tag every row with its origin

  0%|          | 0/1 [00:00<?, ?it/s]

### LLAMA

In [11]:
# Each sub-directory of datasets/llama has its own nlang.yaml and is loaded
# into its own frame with its own source label.

# tasks
datafile = "../../datasets/llama/tasks/nlang.yaml"
dataroot = "../../datasets/llama/tasks"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
llama_tasks_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="llama_tasks")

# tasks2
datafile = "../../datasets/llama/tasks2/nlang.yaml"
dataroot = "../../datasets/llama/tasks2"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
llama_tasks2_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="llama_tasks2")

# libs
datafile = "../../datasets/llama/libs/nlang.yaml"
dataroot = "../../datasets/llama/libs"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
llama_libs_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="llama_libs")

# keywords
# NOTE(review): the directory is spelled "keywrods"; presumably this matches
# the on-disk name — confirm before renaming.
datafile = "../../datasets/llama/keywrods/nlang.yaml"
dataroot = "../../datasets/llama/keywrods"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
llama_keywords_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="llama_keywords")

# stories
datafile = "../../datasets/llama/stories/nlang.yaml"
dataroot = "../../datasets/llama/stories"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
llama_stories_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="llama_stories")

# shell
datafile = "../../datasets/llama/shell/nlang.yaml"
dataroot = "../../datasets/llama/shell"
data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
llama_shell_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="llama_shell")

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

### CPP_TEST

In [12]:
# Files under datasets/CPP_TEST/.
datafile = "../../datasets/CPP_TEST/nlang.yaml"
dataroot = "../../datasets/CPP_TEST/"

data = collect_files_content(root_dir=dataroot, yaml_path=datafile)
cpp_test_df = pd.DataFrame(
    data, columns=["language_tag", "language", "code", "path"]
).assign(source="cpp_test")  # tag every row with its origin

  0%|          | 0/29 [00:00<?, ?it/s]

In [13]:
# Stack all per-source frames into one frame with a fresh 0..n-1 index.
# The order of the frames determines the row index, so it is kept as is.
df = pd.concat(
    [
        rosetta_df,
        dlld_df,
        github_df,
        gen_df,
        stack_df,
        github_other_df,
        github_other_langs_df,
        llama_tasks_df,
        llama_tasks2_df,
        llama_stories_df,
        llama_libs_df,
        llama_keywords_df,
        llama_shell_df,
        cpp_test_df,
        tgdataset_df,
        tgdataset2_df,
    ],
    ignore_index=True,
)
print(f"Number of samples: {len(df):,}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 ** 3:.2f} GB")

Number of samples: 5,154,973
Memory usage: 32.20 GB


In [17]:
import random

CROP_LEN = 4096  # maximum number of characters kept per sample

def crop_text(text):
    """Return ``text`` unchanged if it fits in ``CROP_LEN`` characters,
    otherwise a window of exactly ``CROP_LEN`` characters starting at a
    uniformly random offset (drawn from the global ``random`` state)."""
    overflow = len(text) - CROP_LEN
    if overflow <= 0:
        return text
    offset = random.randint(0, overflow)
    return text[offset:offset + CROP_LEN]

# Replace each sample with its (randomly positioned) crop to bound memory.
df["code"] = df["code"].apply(crop_text)
print(f"Memory usage after slice: {df.memory_usage(deep=True).sum() / 1024 ** 3:.2f} GB")

Memory usage after slice: 11.63 GB


In [18]:
from utils.lang_enum import languages
# For every source, print which language tags it contains and which tags
# from `languages` it lacks.
for s in df.source.unique():
    source_tags = df.loc[df.source == s, "language_tag"]
    existing_langs = set(source_tags.unique().tolist())
    missing_langs = set(languages) - existing_langs
    print("Source:", s)
    print("Existing langs:", existing_langs)
    print("Missing langs:", missing_langs)
    print()


Source: rosetta
Existing langs: {'TGLANG_LANGUAGE_KOTLIN', 'TGLANG_LANGUAGE_LUA', 'TGLANG_LANGUAGE_PYTHON', 'TGLANG_LANGUAGE_RUBY', 'TGLANG_LANGUAGE_RUST', 'TGLANG_LANGUAGE_DART', 'TGLANG_LANGUAGE_SHELL', 'TGLANG_LANGUAGE_CSHARP', 'TGLANG_LANGUAGE_POWERSHELL', 'TGLANG_LANGUAGE_C', 'TGLANG_LANGUAGE_OBJECTIVE_C', 'TGLANG_LANGUAGE_CPLUSPLUS', 'TGLANG_LANGUAGE_SQL', 'TGLANG_LANGUAGE_OTHER', 'TGLANG_LANGUAGE_JAVASCRIPT', 'TGLANG_LANGUAGE_JAVA', 'TGLANG_LANGUAGE_GO', 'TGLANG_LANGUAGE_SWIFT', 'TGLANG_LANGUAGE_PHP', 'TGLANG_LANGUAGE_TYPESCRIPT'}
Missing langs: {'TGLANG_LANGUAGE_JSON', 'TGLANG_LANGUAGE_CSS', 'TGLANG_LANGUAGE_DOCKER', 'TGLANG_LANGUAGE_FUNC', 'TGLANG_LANGUAGE_HTML', 'TGLANG_LANGUAGE_SOLIDITY', 'TGLANG_LANGUAGE_XML', 'TGLANG_LANGUAGE_TL', 'TGLANG_LANGUAGE_NGINX'}

Source: dlld
Existing langs: {'TGLANG_LANGUAGE_CSS', 'TGLANG_LANGUAGE_JAVA', 'TGLANG_LANGUAGE_RUBY', 'TGLANG_LANGUAGE_GO', 'TGLANG_LANGUAGE_SHELL', 'TGLANG_LANGUAGE_CPLUSPLUS', 'TGLANG_LANGUAGE_PYTHON', 'TGLANG_LANGUAGE_

In [46]:
# Row counts per language tag, rarest first, printed without truncation.
print(
    df.groupby("language_tag")
    .count()
    .sort_values(by="code", ascending=True)
    .to_string()
)

                             language     code     path   source
language_tag                                                    
TGLANG_LANGUAGE_FUNC             3112     3112     3112     3112
TGLANG_LANGUAGE_NGINX            3930     3930     3930     3930
TGLANG_LANGUAGE_CODE             3944     3944     3944     3944
TGLANG_LANGUAGE_TL              10071    10071    10071    10071
TGLANG_LANGUAGE_CSS             20972    20972    20972    20972
TGLANG_LANGUAGE_DOCKER          32124    32124    32124    32124
TGLANG_LANGUAGE_JAVASCRIPT      43645    43645    43645    43645
TGLANG_LANGUAGE_POWERSHELL      46920    46920    46920    46920
TGLANG_LANGUAGE_HTML            48679    48679    48679    48679
TGLANG_LANGUAGE_LUA             54067    54067    54067    54067
TGLANG_LANGUAGE_SQL             54486    54486    54486    54486
TGLANG_LANGUAGE_JSON            57001    57001    57001    57001
TGLANG_LANGUAGE_SOLIDITY        59440    59440    59440    59440
TGLANG_LANGUAGE_PYTHON   

In [19]:
import pickle

df["code_len"] = df.code.str.len()

# Drop samples whose path is listed (one per line) in the exclusion file.
with open("../../datasets/excluded_paths.txt", 'r') as f:
    excluded_paths = f.read().splitlines()
df = df[~df.path.isin(excluded_paths)]
print(f"Number of samples: {len(df):,}")

# Use context managers so the pickle files are flushed and closed
# deterministically instead of relying on garbage collection.
with open(DF_CACHE_PATH, "wb") as cache_file:
    pickle.dump(df, cache_file)
print(f"Saved to {DF_CACHE_PATH}")

# Lightweight companion cache without the "code" column.
with open(DF_PATHONLY_CACHE_PATH, "wb") as cache_file:
    pickle.dump(df[["language_tag", "language", "path", "source", "code_len"]], cache_file)
print(f"Saved to {DF_PATHONLY_CACHE_PATH}")

Number of samples: 5,154,971
Saved to ../../datasets/cache/all.pickle
Saved to ../../datasets/cache/path_only.pickle


# Export for submission

In [20]:
from pathlib import Path
import pickle
import tqdm
import pandas as pd

DF_CACHE_PATH = "../../datasets/cache/all.pickle"
# NOTE: pickle.load can execute arbitrary code; only load caches written by
# this notebook.
with open(DF_CACHE_PATH, "rb") as f:
    df = pickle.load(f)

tdf = df.loc[:, ["language_tag", "code", "source"]]
print(f"Memory usage: {tdf.memory_usage(deep=True).sum()/1024/1024/1024:.2f} GB")

# Write every sample to <root>/<source>/<language_tag>/<row index>.txt.
# Paths are built with pathlib (imported in this cell) rather than `os`, which
# this cell never imports, so the cell also runs on a fresh kernel.
root_dir_for_save = "../../datasets/export"
for s in tqdm.tqdm(df.source.unique(), leave=True):
    subdf = df[df.source == s]
    source_dir = Path(root_dir_for_save) / s
    source_dir.mkdir(parents=True, exist_ok=True)
    for lang in tqdm.tqdm(subdf.language_tag.unique(), leave=True):
        subsubdf = subdf[subdf.language_tag == lang]
        file_save_dir = source_dir / lang
        file_save_dir.mkdir(parents=True, exist_ok=True)
        # Iterate over (index, code) pairs directly; iterrows() would build a
        # Series per row only to read one field.
        for i, code in zip(subsubdf.index, subsubdf["code"]):
            with open(file_save_dir / f"{i}.txt", 'w') as f:
                f.write(code)

Memory usage: 10.56 GB


100%|██████████| 20/20 [00:05<00:00,  3.42it/s]
100%|██████████| 13/13 [00:04<00:00,  2.94it/s]
100%|██████████| 28/28 [04:46<00:00, 10.25s/it]
100%|██████████| 2/2 [00:01<00:00,  1.30it/s]t]
100%|██████████| 1/1 [01:44<00:00, 104.05s/it] 
100%|██████████| 1/1 [00:02<00:00,  2.83s/it]]
100%|██████████| 1/1 [01:55<00:00, 116.00s/it]
100%|██████████| 26/26 [00:00<00:00, 29.24it/s]
100%|██████████| 7/7 [00:03<00:00,  2.05it/s]]
100%|██████████| 1/1 [00:00<00:00,  1.33it/s]]
100%|██████████| 18/18 [00:00<00:00, 58.23it/s]
100%|██████████| 26/26 [00:01<00:00, 23.66it/s]
100%|██████████| 3/3 [00:01<00:00,  2.88it/s]t]
100%|██████████| 29/29 [00:00<00:00, 383.57it/s]
100%|██████████| 2/2 [00:02<00:00,  1.23s/it]t]
100%|██████████| 2/2 [00:02<00:00,  1.33s/it]t]
100%|██████████| 16/16 [08:57<00:00, 33.59s/it]


In [5]:
# Row counts per language tag, rarest first (displayed as the cell output).
(
    tdf.groupby("language_tag")
    .count()
    .sort_values("code")
)

Unnamed: 0_level_0,code,source
language_tag,Unnamed: 1_level_1,Unnamed: 2_level_1
TGLANG_LANGUAGE_NGINX,3930,3930
TGLANG_LANGUAGE_CODE,3944,3944
TGLANG_LANGUAGE_FUNC,4853,4853
TGLANG_LANGUAGE_TL,11172,11172
TGLANG_LANGUAGE_CSS,20972,20972
TGLANG_LANGUAGE_DOCKER,32124,32124
TGLANG_LANGUAGE_POWERSHELL,46920,46920
TGLANG_LANGUAGE_HTML,48679,48679
TGLANG_LANGUAGE_JAVASCRIPT,49722,49722
TGLANG_LANGUAGE_LUA,54067,54067


### Create df from exported

In [36]:
import os
import pickle
import pandas as pd

root_path = "../../datasets/tmp/export"

# Index every .txt file that sits in a directory exactly two levels below
# root_path; the two directory names are taken as <source>/<language_tag>.
data = []
for root, dirs, files in os.walk(root_path):
    if root[len(root_path):].count(os.sep) != 2:
        continue
    source, tag = root.split(os.sep)[-2:]
    data.extend(
        {
            'source': source,
            'language_tag': tag,
            'path': os.path.abspath(os.path.join(root, file)),
        }
        for file in files
        if file.endswith('.txt')
    )

df = pd.DataFrame(data)

with open("../../datasets/tmp/export.pickle", "wb") as f:
    pickle.dump(df, f)

# Generate frequency tables

In [1]:
import pickle
DF_CACHE_PATH = "../../datasets/cache/all.pickle"
# Open the cache in a context manager so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load trusted caches.
with open(DF_CACHE_PATH, "rb") as cache_file:
    df = pickle.load(cache_file)
# Lower-cased code of every sample, materialised as a list.
iterator = df.code.str.lower().to_list()

In [2]:
from transformers import BertTokenizerFast
import string
# Characters the new tokenizer is seeded with: lowercase ASCII letters,
# digits, punctuation and whitespace.
alphabet = string.ascii_lowercase + string.digits + string.punctuation + string.whitespace

# Train a new tokenizer of the same kind as the one loaded from vocab.txt on
# the lower-cased corpus.
new_tokenizer = BertTokenizerFast(
    "../../datasets/vocab.txt",
    do_lower_case=True,
).train_new_from_iterator(
    iterator,
    vocab_size=5500,
    limit_alphabet=len(alphabet) + 1,
    initial_alphabet=list(alphabet),
)
print(len(new_tokenizer.vocab))
new_tokenizer.save_vocabulary("../../datasets/")

In [2]:
import tqdm
import utils.lang_enum as lang_enum
from collections import Counter
from typing import List, Dict
import re

def tokenize_code(code: str) -> List[str]:
    """Return every run of word characters in ``code``, lower-cased, in
    order of appearance (duplicates included)."""
    lowered = code.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

def filter_vocab(vocab: Dict[str, int]) -> Dict[str, int]:
    """Return a copy of ``vocab`` without tokens of length <= 1 and without
    tokens made only of digits. Insertion order is preserved."""
    kept = {}
    for token, idx in vocab.items():
        if len(token) <= 1 or token.isdigit():
            continue
        kept[token] = idx
    return kept

def create_vocabulary(codes: List[str], min_freq: int, top_k: int) -> Dict[str, int]:
    """Build a token -> id map from document frequencies.

    Each snippet contributes at most one count per distinct token. Of the
    ``top_k`` most frequent tokens, those seen in at least ``min_freq``
    snippets are kept and numbered consecutively from 1 in frequency order.
    """
    document_frequency = Counter()
    for snippet in codes:
        document_frequency.update(set(tokenize_code(snippet.lower())))

    vocabulary = {}
    for token, count in document_frequency.most_common(top_k):
        if count >= min_freq:
            vocabulary[token] = len(vocabulary) + 1
    return vocabulary

from transformers import AutoTokenizer
def create_vocabulary_gpt(codes: List[str], min_freq: int, top_k: int) -> Dict[str, int]:
    """Train a new tokenizer from the pretrained "gpt2" tokenizer on ``codes``
    with ``vocab_size=top_k`` and return its vocabulary.

    ``min_freq`` is accepted for signature parity with ``create_vocabulary``
    but is not used here.
    """
    base_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    trained_tokenizer = base_tokenizer.train_new_from_iterator(codes, vocab_size=top_k)
    return trained_tokenizer.vocab

# Per-language vocabularies: a word-level one (top 500 tokens) and a
# GPT-2-style one (1000 entries for TGLANG_LANGUAGE_OTHER, 600 otherwise).
# The minimum frequency is 0.1% of the language's sample count.
vocabs = {}
vocabs_gpt = {}
for lang in tqdm.tqdm(lang_enum.languages):
    ldf = df[df["language_tag"] == lang]
    codes_lower = ldf.code.str.lower().to_list()
    min_freq = int(len(ldf) * 0.001)

    vocabs[lang] = filter_vocab(create_vocabulary(codes_lower, min_freq, 500))

    size = 1000 if lang == "TGLANG_LANGUAGE_OTHER" else 600
    vocabs_gpt[lang] = filter_vocab(create_vocabulary_gpt(codes_lower, min_freq, size))

100%|██████████| 29/29 [13:48<00:00, 28.59s/it]  


In [3]:
import json
# Only the tokens (dict keys) are persisted; the ids are dropped.
res_vocabs = {lang: list(d) for lang, d in vocabs.items()}
with open("../../datasets/vocabs.json", "w") as f:
    json.dump(res_vocabs, f)

res_vocabs_gpt = {lang: list(d) for lang, d in vocabs_gpt.items()}
with open("../../datasets/vocabs_gpt2.json", "w") as f:
    json.dump(res_vocabs_gpt, f)

# Generate vocabulary

In [1]:
import json
import re
import utils.lang_constructs
with open("../../datasets/vocabs.json", "r") as f:
    vocabs = json.load(f)
with open("../../datasets/vocabs_gpt2.json", "r") as f:
    vocabs_gpt2 = json.load(f)

# Strip one leading "Ġ" from each GPT-2 token (presumably the tokenizer's
# space marker — confirm).
# NOTE(review): vocabs_gpt2 is not referenced again in this cell.
for lang_tag in vocabs:
    stripped = []
    for token in vocabs_gpt2[lang_tag]:
        stripped.append(token[1:] if token.startswith("Ġ") else token)
    vocabs_gpt2[lang_tag] = stripped

with open("../../datasets/old_vocabulary.txt", "r") as f:
    old_vocab = f.read().split("\n")

# Keep only ASCII entries longer than one character that contain no "#".
with open("../../datasets/vocab.txt", "r") as f:
    bert_vocab = [
        token
        for token in f.read().split("\n")
        if len(token) > 1 and "#" not in token and token.isascii()
    ]

# Flatten the (name, acronym, libs) triples of every language into one list.
# The name is kept only when it is purely alphabetic; the acronym and the
# libs are always kept.
libs_kws = []
for vs in utils.lang_constructs.lang_libs.values():
    for name, acronym, libs in vs:
        if name.isalpha():
            libs_kws.append(name)
        libs_kws.append(acronym)
        libs_kws.extend(libs)

kws = utils.lang_constructs.lang_keywords

# Merge every keyword source into one list. Start from a copy: binding
# all_keywords directly to bert_vocab would make the += below extend
# bert_vocab in place as well.
all_keywords = list(bert_vocab)
for lang, vocab in vocabs.items():
    all_keywords += vocab
for lang, vocab in kws.items():
    all_keywords += vocab
all_keywords += old_vocab
all_keywords += libs_kws

def clean(kws):
    """Return the distinct keywords of ``kws`` that are 2 to 29 characters
    long and not made only of digits. The order of the result is that of a
    ``set`` and therefore unspecified."""
    kept = [kw for kw in kws if 1 < len(kw) < 30 and not kw.isdigit()]
    # Anything of length > 1 is already non-empty, so no extra "" filter is needed.
    return list(set(kept))

def remove_prefix(kws, prefix, new_vals=None):
    """Strip ``prefix`` from the start of every keyword that has it, append
    ``new_vals`` (if given) and return the result passed through ``clean``."""
    stripped = []
    for kw in kws:
        stripped.append(kw[len(prefix):] if kw.startswith(prefix) else kw)

    if new_vals is not None:
        stripped.extend(new_vals)

    return clean(stripped)

def split_by(kws, sep, min_len=0):
    """Split keywords on ``sep`` and return the cleaned result.

    A keyword is replaced by its ``sep``-separated pieces when it contains
    ``sep`` and is at least ``min_len`` characters long; every other keyword
    is kept as is. The combined list is passed through ``clean``.
    """
    to_split = [kw for kw in kws if len(kw) >= min_len and sep in kw]
    # Set membership keeps the filter linear; testing against the list made
    # it O(len(kws) * len(to_split)).
    to_split_lookup = set(to_split)

    rest = [kw for kw in kws if kw not in to_split_lookup]
    for kw in to_split:
        rest += kw.split(sep)

    return clean(rest)


# Normalise: lower-case everything and keep ASCII-only keywords.
all_keywords = [low for low in (kw.lower() for kw in all_keywords) if low.isascii()]

# Whitespace separators split keywords of any length ...
for sep in (" ", "\n", "\t"):
    all_keywords = split_by(all_keywords, sep)
# ... while punctuation separators split only keywords of 5+ characters.
for sep in ("\\", "/", ".", "_", ":", "()", "}", "{"):
    all_keywords = split_by(all_keywords, sep, min_len=5)
all_keywords = split_by(all_keywords, "::")

# Hand-picked tokens added after the splitting passes.
all_keywords.extend(["::", "std::", "dart::", "()", "-i", "##.h"])

# Keep keywords shorter than 15 characters with at most one digit.
all_keywords = [
    kw
    for kw in all_keywords
    if len(kw) < 15 and len(re.findall(r"\d", kw)) < 2
]

# Strip numeric-literal prefixes and add the bare prefix itself as a keyword.
for prefix in ("0x", "0b"):
    all_keywords = remove_prefix(all_keywords, prefix, [prefix])
all_keywords = list(set(clean(all_keywords)))
print(len(all_keywords))

6144


In [3]:
# One keyword per line, no trailing newline.
with open("./vocabulary.txt", "w") as vocab_file:
    vocabulary_text = "\n".join(all_keywords)
    vocab_file.write(vocabulary_text)

# Generator

In [26]:
import tqdm.autonotebook as tqdm
import random
import os

def read_files_to_string(path, splitter='\n'):
    """Read every file under ``path`` and return its non-blank chunks.

    All files found by walking ``path`` recursively are read as text
    (undecodable bytes are replaced), joined with ``splitter`` and split on
    ``splitter`` again; chunks that are empty or whitespace-only are dropped.

    Args:
        path: Directory to walk.
        splitter: Separator between chunks (a newline by default).

    Returns:
        A list of chunk strings, despite the function's name. The list is
        empty when no non-blank chunk exists (previously ``[""]``).
    """
    content = []

    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            with open(filepath, 'r', errors='replace') as file:
                content.append(file.read())

    chunks = splitter.join(content).split(splitter)
    # Filtering once is enough; re-joining and re-splitting the filtered
    # chunks only turned an empty result into [""].
    return [chunk for chunk in chunks if chunk.strip()]

def generate_random_files(strings, output_path, num_files, k=100, header="", seed=None):
    """Write ``num_files`` text files, each made of ``k`` randomly chosen strings.

    Files are named ``output_file_<n>.txt`` (n starting at 1) inside
    ``output_path``, which is created if missing. Strings are sampled with
    replacement and written one per line.

    Args:
        strings: Population to sample from.
        output_path: Destination directory.
        num_files: Number of files to write.
        k: Number of strings per file.
        header: Optional first line of every file; skipped when empty.
        seed: Optional seed for reproducible sampling. When ``None`` (the
            default) the global ``random`` state is used, as before.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)

    rng = random if seed is None else random.Random(seed)

    for i in tqdm.tqdm(range(num_files)):
        random_selection = rng.choices(strings, k=k)

        file_name = f"output_file_{i+1}.txt"
        file_path = os.path.join(output_path, file_name)

        with open(file_path, 'w') as file:
            if header:
                file.write(header + "\n")
            file.writelines(line + "\n" for line in random_selection)

## Generate TL Lang

In [10]:
# Build 10000 synthetic TL files by sampling lines from the downloaded sources.
path_to_folder = '../../datasets/downloaded_files/TL Type Language'
output_path = '../../datasets/generated/TL Type Language'

result = read_files_to_string(path_to_folder)
generate_random_files(result, output_path, num_files=10000)

 11%|█         | 1051/10000 [00:00<00:00, 10502.50it/s]

100%|██████████| 10000/10000 [00:00<00:00, 11532.93it/s]


## Generate FUNC

In [31]:
# Build 3000 synthetic FUNC files; the inputs are split into chunks on a
# newline followed by "---" rather than line by line.
path_to_folder = '../../datasets/generated/custom_func'
output_path = '../../datasets/generated/FUNC contract'

result = read_files_to_string(path_to_folder, splitter='\n---')
generate_random_files(result, output_path, num_files=3000, k=100)

100%|██████████| 3000/3000 [00:00<00:00, 8415.59it/s]
