# Grab GitHub

In [1]:
import os
import subprocess
import requests
import uuid

import tqdm.autonotebook as tqdm

# --- GitHub API configuration -------------------------------------------
GITHUB_API_URL = "https://api.github.com"
PER_PAGE = 100        # page size requested from the search endpoint
MAX_SIZE_KB = 40000   # repositories whose reported 'size' exceeds this are skipped

# Never commit a real token: take it from the GITHUB_TOKEN environment
# variable. The '???' placeholder is kept as the fallback so the notebook
# still imports when the variable is unset (requests will then be rejected
# or rate-limited by GitHub).
TOKEN = os.environ.get('GITHUB_TOKEN', '???')

# Headers sent with every search request.
HEADERS = {
    'Authorization': f'token {TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

def get_repositories_for_language(language, limit=10):
    """Search GitHub for repositories written in ``language``.

    Pages through ``/search/repositories`` with the qualifier
    ``language:"<language>"`` until ``limit`` repositories were collected
    or the API stops returning items.

    Args:
        language: GitHub language name (e.g. ``"Python"``, ``"C++"``).
        limit: Maximum number of repositories to return.

    Returns:
        A list with at most ``limit`` repository dicts, exactly as they
        appear in the ``items`` array of the API response.
    """
    repos = []
    page = 1
    while len(repos) < limit:
        # Let requests build the query string: names such as "C++" or "C#"
        # contain characters ('+', '#') that are mangled when spliced
        # into the URL by hand.
        response = requests.get(
            f"{GITHUB_API_URL}/search/repositories",
            params={
                'q': f'language:"{language}"',
                'per_page': PER_PAGE,
                'page': page,
            },
            headers=HEADERS,
            timeout=30,
        )
        if not response.ok:
            # Error response (rate limit, paging past the last available
            # result, ...): keep what was collected so far.
            break
        items = response.json().get('items', [])
        if not items:
            break
        # Never take more than what is still missing to reach ``limit``.
        repos.extend(items[:limit - len(repos)])
        page += 1
    return repos

def get_repositories_for_query(language, limit=10):
    """Search GitHub for repositories matching a free-text query.

    Unlike ``get_repositories_for_language`` the ``language`` argument is
    sent verbatim as the search query (no ``language:`` qualifier).

    Args:
        language: Free-text search query (the parameter keeps its
            historical name for backward compatibility).
        limit: Maximum number of repositories to return.

    Returns:
        A list with at most ``limit`` repository dicts, exactly as they
        appear in the ``items`` array of the API response.
    """
    repos = []
    page = 1
    while len(repos) < limit:
        # Passing the query through ``params`` URL-encodes spaces and
        # reserved characters instead of relying on a hand-built URL.
        response = requests.get(
            f"{GITHUB_API_URL}/search/repositories",
            params={
                'q': f'{language}',
                'per_page': PER_PAGE,
                'page': page,
            },
            headers=HEADERS,
            timeout=30,
        )
        if not response.ok:
            # Error response (rate limit, paging past the last available
            # result, ...): keep what was collected so far.
            break
        items = response.json().get('items', [])
        if not items:
            break
        # Never take more than what is still missing to reach ``limit``.
        repos.extend(items[:limit - len(repos)])
        page += 1
    return repos

def clone_and_process_repo(repo, language, extensions):
    """Clone a repository and move its matching source files aside.

    The repository is cloned into ``downloaded_repos/<language>/<author>/<repo>``
    (skipped when that folder already exists). Every file whose name ends
    with one of ``extensions`` (case-insensitive) is then moved into the
    flat folder ``downloaded_files/<language>/<author>/<repo>``.

    Args:
        repo: Repository dict from the GitHub search API; the keys
            ``full_name``, ``size`` and ``clone_url`` are read.
        language: Name used for the per-language sub-folder.
        extensions: Iterable of file name suffixes to collect.

    Returns:
        None. Repositories larger than ``MAX_SIZE_KB`` and repositories
        that fail to clone are skipped silently.
    """
    # Reject oversized repositories before doing any other work.
    if repo['size'] > MAX_SIZE_KB:
        return

    author, repo_name = repo['full_name'].split('/')
    clone_dest_folder = os.path.join('downloaded_repos', language, author, repo_name)
    files_dest_folder = os.path.join('downloaded_files', language, author, repo_name)

    if not os.path.exists(clone_dest_folder):
        clone = subprocess.run(
            ['git', 'clone', repo['clone_url'], clone_dest_folder],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if clone.returncode != 0:
            # Nothing usable was checked out.
            return

    # str.endswith accepts a tuple, so normalise the suffixes once.
    suffixes = tuple(ext.lower() for ext in extensions)

    for root, dirs, files in os.walk(clone_dest_folder):
        # Do not descend into git's own metadata: it is not source code.
        if '.git' in dirs:
            dirs.remove('.git')

        for filename in files:
            if not filename.lower().endswith(suffixes):
                continue

            source_filepath = os.path.join(root, filename)
            dest_filepath = os.path.join(files_dest_folder, filename)
            os.makedirs(files_dest_folder, exist_ok=True)

            if os.path.exists(dest_filepath):
                # The destination is flat, so equal names from different
                # sub-folders collide: keep the extension, randomise the stem.
                _, ext = os.path.splitext(filename)
                dest_filepath = os.path.join(files_dest_folder, uuid.uuid4().hex + ext)

            os.rename(source_filepath, dest_filepath)

def count_files_in_directory(directory):
    """Return how many files exist anywhere below ``directory``.

    Sub-directories are traversed recursively; a path that does not exist
    yields 0 because ``os.walk`` produces nothing for it.
    """
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(directory):
        total += len(filenames)
    return total


  import tqdm.autonotebook as tqdm


In [2]:
from utils.lang_enum import TGLANG_LANGUAGE_EXTENSIONS

# Maps a GitHub language name / search query (key) to the TGLANG enum name
# (value) that is used to look up file extensions in
# TGLANG_LANGUAGE_EXTENSIONS. Only the uncommented entries are visited by
# the download loop; the commented-out entries are presumably languages that
# were already downloaded in earlier runs (everything below "Not processed"
# is still active) -- TODO confirm before re-enabling any of them.
# NOTE(review): the "Langsearch" / "QSearch" section names look like they
# refer to get_repositories_for_language / get_repositories_for_query
# respectively -- confirm.
GITHUB2TGLANG = {

    # Other
    # 'Markdown': 'TGLANG_LANGUAGE_MARKDOWN',
    # 'JSON': 'TGLANG_LANGUAGE_JSON',
    # 'CSV': 'TGLANG_LANGUAGE_CSV',

    # Langsearch
    # '1C Enterprise': 'TGLANG_LANGUAGE_1S_ENTERPRISE',
    # 'ABAP': 'TGLANG_LANGUAGE_ABAP',
    # 'HTML': 'TGLANG_LANGUAGE_HTML',
    # 'QML': 'TGLANG_LANGUAGE_QML',
    # 'YAML': 'TGLANG_LANGUAGE_YAML',
    # 'Batchfile': 'TGLANG_LANGUAGE_BATCH',
    # 'ASP': 'TGLANG_LANGUAGE_ASP',
    # 'Dockerfile': 'TGLANG_LANGUAGE_DOCKER',
    # 'Protocol Buffer': 'TGLANG_LANGUAGE_PROTOBUF',
    # 'NGINX': 'TGLANG_LANGUAGE_NGINX',
    # 'Solidity': 'TGLANG_LANGUAGE_SOLIDITY',
    # 'GAMS': 'TGLANG_LANGUAGE_GAMS',
    # 'AutoHotkey': 'TGLANG_LANGUAGE_AUTOHOTKEY',
    # 'HACK': 'TGLANG_LANGUAGE_HACK',
    # 'TypeScript': 'TGLANG_LANGUAGE_TYPESCRIPT',
    # 'Verilog': 'TGLANG_LANGUAGE_VERILOG',
    # 'PLSQL': 'TGLANG_LANGUAGE_PL_SQL',
    # 'Makefile': 'TGLANG_LANGUAGE_MAKEFILE',
    # 'Apex': 'TGLANG_LANGUAGE_APEX',
    # 'Cpp': 'TGLANG_LANGUAGE_CPLUSPLUS',
    # 'Python': 'TGLANG_LANGUAGE_PYTHON',
    # 'JavaScript': 'TGLANG_LANGUAGE_JAVASCRIPT',
    # 'C': 'TGLANG_LANGUAGE_C',
    # 'Csharp': 'TGLANG_LANGUAGE_CSHARP',
    # 'Java': 'TGLANG_LANGUAGE_JAVA',
    # 'CSS': 'TGLANG_LANGUAGE_CSS',
    # 'Go': 'TGLANG_LANGUAGE_GO',
    # 'SQL': 'TGLANG_LANGUAGE_SQL',
    # 'XML': 'TGLANG_LANGUAGE_XML',
    # 'Rust': 'TGLANG_LANGUAGE_RUST',
    # 'Shell': 'TGLANG_LANGUAGE_SHELL',
    # 'Common Lisp': 'TGLANG_LANGUAGE_COMMON_LISP',
    # 'IDL': 'TGLANG_LANGUAGE_IDL',
    # 'Emacs Lisp': 'TGLANG_LANGUAGE_LISP',
    # 'Assembly': 'TGLANG_LANGUAGE_ASSEMBLY',
    # 'TeX': 'TGLANG_LANGUAGE_LATEX',
    # 'Elm': 'TGLANG_LANGUAGE_ELM',
    # 'OpenEdge ABL': 'TGLANG_LANGUAGE_OPENEDGE_ABL',
    # 'Julia': 'TGLANG_LANGUAGE_JULIA',
    # 'Fsharp': 'TGLANG_LANGUAGE_FSHARP',
    # 'Objective-C': 'TGLANG_LANGUAGE_OBJECTIVE_C',
    # 'PHP': 'TGLANG_LANGUAGE_PHP',
    # 'PowerShell': 'TGLANG_LANGUAGE_POWERSHELL',
    # 'ActionScript': 'TGLANG_LANGUAGE_ACTIONSCRIPT',
    # 'Groovy': 'TGLANG_LANGUAGE_APACHE_GROOVY',
    # 'Ada': 'TGLANG_LANGUAGE_ADA',
    # 'AppleScript': 'TGLANG_LANGUAGE_APPLESCRIPT',
    # 'BASIC': 'TGLANG_LANGUAGE_BASIC',
    # 'AWK': 'TGLANG_LANGUAGE_AWK',
    # 'Crystal': 'TGLANG_LANGUAGE_CRYSTAL',
    # 'D': 'TGLANG_LANGUAGE_D',
    # 'Dart': 'TGLANG_LANGUAGE_DART',
    # 'Clojure': 'TGLANG_LANGUAGE_CLOJURE',
    # 'COBOL': 'TGLANG_LANGUAGE_COBOL',
    # 'Delphi': 'TGLANG_LANGUAGE_DELPHI',
    # 'Elixir': 'TGLANG_LANGUAGE_ELIXIR',
    # 'CoffeeScript': 'TGLANG_LANGUAGE_COFFESCRIPT',
    # 'Erlang': 'TGLANG_LANGUAGE_ERLANG',
    # 'Forth': 'TGLANG_LANGUAGE_FORTH',
    # 'Fortran': 'TGLANG_LANGUAGE_FORTRAN',
    # 'Haskell': 'TGLANG_LANGUAGE_HASKELL',
    # 'Kotlin': 'TGLANG_LANGUAGE_KOTLIN',
    # 'Lua': 'TGLANG_LANGUAGE_LUA',
    # 'MATLAB': 'TGLANG_LANGUAGE_MATLAB',
    # 'Nim': 'TGLANG_LANGUAGE_NIM',
    # 'Pascal': 'TGLANG_LANGUAGE_PASCAL',
    # 'OCaml': 'TGLANG_LANGUAGE_OCAML',
    # 'R': 'TGLANG_LANGUAGE_R',
    # 'Perl': 'TGLANG_LANGUAGE_PERL',
    # 'Prolog': 'TGLANG_LANGUAGE_PROLOG',
    # 'Ruby': 'TGLANG_LANGUAGE_RUBY',
    # 'Scala': 'TGLANG_LANGUAGE_SCALA',
    # 'Swift': 'TGLANG_LANGUAGE_SWIFT',
    # 'Visual Basic .NET': 'TGLANG_LANGUAGE_VISUAL_BASIC',
    # 'Scheme': 'TGLANG_LANGUAGE_SCHEME',
    # 'SAS': 'TGLANG_LANGUAGE_SAS',
    # 'Raku': 'TGLANG_LANGUAGE_RAKU',
    # 'Smalltalk': 'TGLANG_LANGUAGE_SMALLTALK',
    # 'Tcl': 'TGLANG_LANGUAGE_TCL',
    # 'Vala': 'TGLANG_LANGUAGE_VALA',
    # 'VBScript': 'TGLANG_LANGUAGE_VBSCRIPT',

    # QSearch
    # 'Gradle': 'TGLANG_LANGUAGE_GRADLE',
    # 'GraphQL': 'TGLANG_LANGUAGE_GRAPHQL',
    # 'Wolfram': 'TGLANG_LANGUAGE_WOLFRAM',
    # 'TEXTILE': 'TGLANG_LANGUAGE_TEXTILE',
    # 'INI': 'TGLANG_LANGUAGE_INI',
    # 'Bison': 'TGLANG_LANGUAGE_BISON',
    # 'Keyman': 'TGLANG_LANGUAGE_KEYMAN',
    # 'Logo language': 'TGLANG_LANGUAGE_LOGO',
    # 'FUNC contract': 'TGLANG_LANGUAGE_FUNC',

    ### Not processed

    # QSearch
    'TL Type Language': 'TGLANG_LANGUAGE_TL',
    'FIFT TON': 'TGLANG_LANGUAGE_FIFT',
    'Icon language': 'TGLANG_LANGUAGE_ICON',
}

In [None]:
LIMIT_REPOS = 500    # maximum number of repositories fetched per language
LIMIT_FILES = 20000  # stop once this many files were collected (a different value was used for each language)

# For every active entry of GITHUB2TGLANG: search repositories, clone them
# one by one and harvest matching files until LIMIT_FILES is reached.
for github_lang, tglang_name in GITHUB2TGLANG.items():
    EXTENSIONS = TGLANG_LANGUAGE_EXTENSIONS[tglang_name]

    # repos = get_repositories_for_language(github_lang, LIMIT_REPOS)
    repos = get_repositories_for_query(github_lang, LIMIT_REPOS)

    # All harvested files of this language end up below this folder.
    files_path = os.path.join('downloaded_files', github_lang)

    file_progress = tqdm.tqdm(total=LIMIT_FILES, desc=f"Files ({github_lang})", position=0, leave=True)
    try:
        for repo in repos:
            clone_and_process_repo(repo, github_lang, EXTENSIONS)

            # Re-count on disk so the bar always reflects the real total,
            # including files left over from previous runs.
            num_files = count_files_in_directory(files_path)
            file_progress.update(num_files - file_progress.n)
            if num_files >= LIMIT_FILES:
                break
    finally:
        # Close the bar even on error/interrupt so the notebook is not left
        # with a dangling, never-finished progress widget.
        file_progress.close()

print("Done!")

# Create datasets

In [None]:
import yaml
import os

import tqdm.autonotebook as tqdm
import pandas as pd

from utils.lang_enum import languages

# Pickle cache of the combined DataFrame including the file contents.
DF_CACHE_PATH = "../datasets/cache/all.pickle"
# Pickle cache of the same frame without the "code" column.
DF_PATHONLY_CACHE_PATH = "../datasets/cache/path_only.pickle"

In [2]:
def collect_files_content(root_dir, yaml_path):
    """Read every source file described by a language-mapping YAML file.

    The YAML file is expected to hold a mapping of the form
    ``{language_tag: {language: extension_or_list_of_extensions}}``.
    For each ``language`` the folder ``root_dir/<language>`` is walked
    (following symlinks) and every file whose name ends with one of the
    extensions is read; the special extension ``".*"`` matches every file.

    Files of 4 MB or more, empty files and files that cannot be read or
    decoded are skipped.

    Args:
        root_dir: Folder containing one sub-folder per language.
        yaml_path: Path of the YAML mapping file.

    Returns:
        A list of ``(language_tag, language, content, absolute_path)`` tuples.
    """
    max_file_size = 4 * 1024 * 1024  # 4MB

    with open(yaml_path, 'r') as f:
        mappings = yaml.safe_load(f)

    data_list = []

    for language_tag, language_mapping in tqdm.tqdm(mappings.items()):
        for language, extensions in language_mapping.items():
            if not isinstance(extensions, list):
                extensions = [extensions]

            # The folder only depends on the language, not on the extension.
            lang_dir = os.path.join(root_dir, language)
            if not os.path.exists(lang_dir):
                continue

            for extension in extensions:
                for dirpath, dirnames, filenames in os.walk(lang_dir, followlinks=True):
                    for filename in filenames:
                        if extension != ".*" and not filename.endswith(extension):
                            continue

                        file_path = os.path.join(dirpath, filename)
                        try:
                            if os.path.getsize(file_path) >= max_file_size:
                                continue
                            with open(file_path, 'r') as file:
                                content = file.read()
                        except Exception:
                            # Best effort: unreadable or undecodable files are
                            # skipped. Catching Exception instead of using a
                            # bare except keeps KeyboardInterrupt working.
                            continue

                        if len(content) > 0:
                            data_list.append((language_tag, language, content, os.path.abspath(file_path)))

    return data_list

### RosettaCodeData

In [3]:
# RosettaCode samples: language mapping and per-language folders.
dataroot = "../datasets/RosettaCodeData/Lang"
datafile = "../datasets/RosettaCodeData/Conf/nlang.yaml"

data = collect_files_content(dataroot, datafile)
rosetta_df = pd.DataFrame(
    data,
    columns=["language_tag", "language", "code", "path"],
).assign(source="rosetta")

  0%|          | 0/74 [00:00<?, ?it/s]

100%|██████████| 74/74 [00:03<00:00, 21.04it/s]


### DLLDCodeData

In [4]:
# deep-learning-lang-detection samples.
dataroot = "../datasets/deep-learning-lang-detection/data/all"
datafile = "../datasets/deep-learning-lang-detection/data/nlang.yaml"

data = collect_files_content(dataroot, datafile)
dlld_df = pd.DataFrame(
    data,
    columns=["language_tag", "language", "code", "path"],
).assign(source="dlld")

100%|██████████| 16/16 [00:01<00:00, 12.93it/s]


### GitHubCodeData

In [5]:
# Files harvested from GitHub under downloaded_files.
dataroot = "../datasets/downloaded_files/"
datafile = "../datasets/downloaded_files/nlang.yaml"

data = collect_files_content(dataroot, datafile)
github_df = pd.DataFrame(
    data,
    columns=["language_tag", "language", "code", "path"],
).assign(source="github")

100%|██████████| 100/100 [00:56<00:00,  1.77it/s]


### Generated

In [6]:
# Samples stored under the "generated" dataset folder.
dataroot = "../datasets/generated/"
datafile = "../datasets/generated/nlang.yaml"

data = collect_files_content(dataroot, datafile)
gen_df = pd.DataFrame(
    data,
    columns=["language_tag", "language", "code", "path"],
).assign(source="generated")

100%|██████████| 3/3 [00:00<00:00,  5.71it/s]


### StackOverflow

In [7]:
# Samples stored under the "stackoverflow" dataset folder.
dataroot = "../datasets/stackoverflow/"
datafile = "../datasets/stackoverflow/nlang.yaml"

data = collect_files_content(dataroot, datafile)
stack_df = pd.DataFrame(
    data,
    columns=["language_tag", "language", "code", "path"],
).assign(source="stackoverflow")

100%|██████████| 1/1 [00:02<00:00,  2.58s/it]


### Overfit

In [8]:
# Samples stored under the "overfit" dataset folder.
dataroot = "../datasets/overfit/"
datafile = "../datasets/overfit/nlang.yaml"

data = collect_files_content(dataroot, datafile)
overfit_df = pd.DataFrame(
    data,
    columns=["language_tag", "language", "code", "path"],
).assign(source="overfit")

100%|██████████| 6/6 [00:00<00:00, 3053.36it/s]


In [9]:
# Stack all per-source frames into one frame with a fresh 0..n-1 index.
source_frames = [rosetta_df, dlld_df, github_df, gen_df, stack_df, overfit_df]
df = pd.concat(source_frames, ignore_index=True)

# deep=True also counts the Python strings held in the object columns.
total_bytes = df.memory_usage(deep=True).sum()
print(f"Number of samples: {len(df):,}")
print(f"Memory usage: {total_bytes/1024/1024/1024:.2f} GB")

# Figures recorded from an earlier run:
# Number of samples: 2,025,423
# Memory usage: 22.21 GB

Number of samples: 2,154,545
Memory usage: 22.29 GB


In [10]:
# Samples per language tag, rarest tags first; to_string() prints every row
# instead of pandas' truncated default repr.
tag_counts = df.groupby("language_tag").count().sort_values(by="code", ascending=True)
print(tag_counts.to_string())

                               language    code    path  source
language_tag                                                   
TGLANG_LANGUAGE_KEYMAN              115     115     115     115
TGLANG_LANGUAGE_TEXTILE             127     127     127     127
TGLANG_LANGUAGE_REGEX               824     824     824     824
TGLANG_LANGUAGE_LOGO                982     982     982     982
TGLANG_LANGUAGE_BISON              1101    1101    1101    1101
TGLANG_LANGUAGE_WOLFRAM            1627    1627    1627    1627
TGLANG_LANGUAGE_1S_ENTERPRISE      1681    1681    1681    1681
TGLANG_LANGUAGE_ICON               1770    1770    1770    1770
TGLANG_LANGUAGE_FUNC               3092    3092    3092    3092
TGLANG_LANGUAGE_FIFT               3166    3166    3166    3166
TGLANG_LANGUAGE_HACK               3403    3403    3403    3403
TGLANG_LANGUAGE_NGINX              3437    3437    3437    3437
TGLANG_LANGUAGE_INI                3831    3831    3831    3831
TGLANG_LANGUAGE_APPLESCRIPT        3838 

In [11]:
import pickle

# Full frame (including file contents).
# The cache folder may not exist on a fresh checkout.
cache_dir = os.path.dirname(DF_CACHE_PATH)
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
# "with" guarantees the handle is flushed and closed even if dumping fails.
with open(DF_CACHE_PATH, "wb") as cache_file:
    pickle.dump(df, cache_file)
print(f"Saved to {DF_CACHE_PATH}")

# Lightweight variant without the "code" column.
cache_dir = os.path.dirname(DF_PATHONLY_CACHE_PATH)
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
with open(DF_PATHONLY_CACHE_PATH, "wb") as cache_file:
    pickle.dump(df[["language_tag", "language", "path", "source"]], cache_file)
print(f"Saved to {DF_PATHONLY_CACHE_PATH}")

Saved to ../datasets/cache/all.pickle
Saved to ../datasets/cache/path_only.pickle


In [None]:
import pickle
# Load the path-only cache written by this notebook. pickle can execute
# arbitrary code while loading, so only open cache files you created yourself.
with open(DF_PATHONLY_CACHE_PATH, "rb") as cache_file:
    path_only_df = pickle.load(cache_file)
path_only_df

# Generator

In [26]:
import tqdm.autonotebook as tqdm
import random
import os

def read_files_to_string(path, splitter='\n'):
    """Read all files below ``path`` and return their non-blank chunks.

    The contents of every file (undecodable bytes are replaced) are glued
    together with ``splitter`` and cut at every ``splitter`` again; chunks
    consisting only of whitespace are dropped.

    Args:
        path: Folder that is walked recursively.
        splitter: Separator between chunks (a line break by default).

    Returns:
        The list of remaining chunks. When nothing remains the result is
        ``['']`` (a single empty string), not an empty list.
    """
    texts = []
    for folder, _subfolders, names in os.walk(path):
        for name in names:
            with open(os.path.join(folder, name), 'r', errors='replace') as handle:
                texts.append(handle.read())

    merged = splitter.join(texts)
    kept_chunks = (chunk for chunk in merged.split(splitter) if chunk.strip())

    # The join/split round trip is kept on purpose: it is what turns an
    # empty selection into [''].
    return splitter.join(kept_chunks).split(splitter)

def generate_random_files(strings, output_path, num_files, k=100, header=""):
    """Write ``num_files`` text files made of randomly drawn strings.

    Each file ``output_file_<n>.txt`` (n starts at 1) contains ``k``
    entries drawn from ``strings`` with replacement, one per line,
    optionally preceded by a ``header`` line.

    Args:
        strings: Pool of strings to sample from.
        output_path: Target folder; created when it does not exist.
        num_files: Number of files to generate.
        k: Number of entries per file.
        header: Optional first line of every file.
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for index in tqdm.tqdm(range(num_files)):
        picked = random.choices(strings, k=k)
        target = os.path.join(output_path, f"output_file_{index + 1}.txt")

        with open(target, 'w') as out:
            if header:
                out.write(header + "\n")
            out.writelines(entry + "\n" for entry in picked)

## Generate TL Lang

In [10]:
# Build synthetic TL files by sampling lines from the downloaded TL sources.
path_to_folder = '../datasets/downloaded_files/TL Type Language'
output_path = '../datasets/generated/TL Type Language'

result = read_files_to_string(path_to_folder)
generate_random_files(result, output_path, num_files=10000)

 11%|█         | 1051/10000 [00:00<00:00, 10502.50it/s]

100%|██████████| 10000/10000 [00:00<00:00, 11532.93it/s]


## Generate FIFT

In [27]:
# Build synthetic FIFT files; the custom snippets are separated by "\n---".
path_to_folder = '../datasets/generated/custom_fift'
output_path = '../datasets/generated/FIFT TON'

result = read_files_to_string(path_to_folder, splitter='\n---')
generate_random_files(result, output_path, num_files=3000, k=100)

100%|██████████| 3000/3000 [00:00<00:00, 14930.17it/s]


## Generate FUNC

In [31]:
# Build synthetic FUNC files; the custom snippets are separated by "\n---".
path_to_folder = '../datasets/generated/custom_func'
output_path = '../datasets/generated/FUNC contract'

result = read_files_to_string(path_to_folder, splitter='\n---')
generate_random_files(result, output_path, num_files=3000, k=100)

100%|██████████| 3000/3000 [00:00<00:00, 8415.59it/s]


## Parse StackOverflow Comments

In [1]:
import os
import tqdm.autonotebook as tqdm
import xml.etree.ElementTree as ET

step = 1000  # keep every step-th comment
filename = "../datasets/stackoverflow/Comments.xml"
comments = []

# Ask for "start" events as well: the very first event delivers the root
# element, which must be emptied regularly to keep memory usage flat.
context = ET.iterparse(filename, events=("start", "end"))
context = iter(context)
event, root = next(context)

progress = tqdm.tqdm(total=90000000, desc="Comments", position=0, leave=True)

i = 0
for event, elem in context:
    if event == "end" and elem.tag == "row":
        i += 1
        if i % step == 0:
            # A row without a Text attribute is stored as an empty string
            # so that writing it out below cannot fail.
            comments.append(elem.get('Text', ''))
            progress.update(step)

        # elem.clear() alone leaves the emptied <row> attached to the root,
        # so the tree would still grow by one element per row. Clearing the
        # root drops the finished rows themselves.
        root.clear()

progress.close()
del context

print(f"Number of comments: {len(comments):,}")

save_dir = "../datasets/stackoverflow/other/comments"
os.makedirs(save_dir, exist_ok=True)

# One text file per sampled comment, named by its position in the sample.
for i, comment in enumerate(tqdm.tqdm(comments)):
    with open(os.path.join(save_dir, f"{i}.txt"), "w") as f:
        f.write(comment)

print("Done!")

  import tqdm.autonotebook as tqdm
Comments:  99%|█████████▉| 89310000/90000000 [05:43<00:02, 263127.63it/s]

Number of comments: 89,336


## Parse StackOverflow Posts

In [None]:
import os
import tqdm.autonotebook as tqdm
import xml.etree.ElementTree as ET
import html
import xml.etree.ElementTree as ET
import re

# Sampling configuration for the Posts.xml scan below.
step = 1000              # only every step-th row is processed
total = 100000 * step    # progress-bar total; the scan stops once the row counter reaches it
offset = 40195000        # rows with a counter below this value are skipped

MIN_CODE_LEN = 50        # code blocks must be longer than this many characters to be kept

filename = "../datasets/stackoverflow/Posts.xml"
def extract_code_and_other_from_body(body_text):
    """Split the HTML body of a post into code snippets and plain text.

    ``<pre><code>`` blocks and inline ``<code>`` spans are extracted as
    code; whatever is left, with all remaining tags removed, is returned
    as the non-code text.

    Tag matching runs on the raw HTML and entities are decoded only
    afterwards. Decoding first would turn escaped characters such as
    ``&lt;`` into real ``<`` characters, which the tag regexes then
    mistake for markup (inline code containing ``<`` was lost and
    literal ``<...>`` text was stripped).

    Args:
        body_text: HTML body of the post.

    Returns:
        ``(code_blocks, non_code_text)`` where ``code_blocks`` lists the
        ``<pre><code>`` blocks first, followed by the inline spans, and
        ``non_code_text`` is the stripped remaining text. Both are
        entity-decoded.
    """
    pre_code_blocks = re.findall(r'<pre><code>(.*?)</code></pre>', body_text, re.DOTALL)
    body_without_pre_code = re.sub(r'<pre><code>.*?</code></pre>', '', body_text, flags=re.DOTALL)

    inline_code_blocks = re.findall(r'<code>([^<]+)</code>', body_without_pre_code)
    non_code_text = re.sub(r'<code>[^<]+</code>', '', body_without_pre_code)

    # Drop the remaining markup, then decode entities in the surviving text.
    non_code_text = re.sub(r'<[^>]+>', '', non_code_text)
    non_code_text = html.unescape(non_code_text).strip()

    all_code_blocks = [html.unescape(code) for code in pre_code_blocks + inline_code_blocks]
    return all_code_blocks, non_code_text

def extract_tags(tags_str):
    """Parse a ``<tag1><tag2>`` style tag string into a set of tag names.

    Args:
        tags_str: Value of the ``Tags`` attribute, e.g. ``"<python><pandas>"``.

    Returns:
        The set of tag names; an empty set when there are no tags
        (previously this produced ``{''}``).
    """
    inner = tags_str[1:-1]  # drop the leading '<' and the trailing '>'
    if not inner:
        return set()
    return set(inner.split('><'))

progress = tqdm.tqdm(total=total, desc="Posts", position=0, leave=True)

all_code = []   # code snippets longer than MIN_CODE_LEN
all_tags = []   # tags of the post each snippet came from (parallel to all_code)
all_other = []  # non-code text of the sampled posts
i = 0

# Ask for "start" events as well: the very first event delivers the root
# element, which must be emptied after every row to keep memory usage flat.
context = iter(ET.iterparse(filename, events=("start", "end")))
_, root = next(context)

for event, elem in context:
    if event != "end" or elem.tag != "row":
        continue

    i += 1
    if i >= offset and i % step == 0:
        body = elem.attrib.get("Body", "")
        tags_str = elem.attrib.get("Tags", "")
        tags = extract_tags(tags_str)
        code_blocks, other_text = extract_code_and_other_from_body(body)
        for code in code_blocks:
            if len(code) > MIN_CODE_LEN:
                all_code.append(code)
                all_tags.append(list(tags))
        if other_text:
            all_other.append(other_text)

        progress.update(step)

        if i >= total:
            break

    # Release EVERY finished row, not only the sampled ones: previously the
    # 999 of 1000 rows that were skipped stayed attached to the tree and
    # memory grew with the size of the dump.
    root.clear()

progress.close()

In [4]:
# Number of non-code text samples collected by the Posts scan.
len(all_other)

39786

In [540]:
import random
# Spot-check one randomly chosen non-code text sample.
print(random.choice(all_other))

If you put the image in the same directory as the class file then the following should work for you:



Also would suggest setting the icon image before you make the frame visible


In [5]:
print(f"Number of 'other' posts: {len(all_other):,}")

# One text file per non-code post, named by its position in all_other.
save_dir = "../datasets/stackoverflow/other/posts/0"
os.makedirs(save_dir, exist_ok=True)

for i, post in enumerate(tqdm.tqdm(all_other)):
    post_path = os.path.join(save_dir, f"{i}.txt")
    with open(post_path, "w") as f:
        f.write(post)

print("Done!")

Number of 'other' posts: 39,786


100%|██████████| 39786/39786 [00:01<00:00, 24327.86it/s]

Done!



