In [None]:
import pandas as pd
import json
from mapping import get_technology

EXCLUDED_DIRS = ("docs/", "data/", "lib/", "benchmark/", "annotations/", "examples/")

df = pd.read_csv("../data/technology_composition/project_technologies.csv")

df_sample = df.sample(20)

df_sample.to_csv("../data/technology_composition/project_technologies_sample.csv", index=False)


In [2]:
project_sample = df_sample["project"].tolist()

file_type_files = []

for project in project_sample:
    project_file = f"../data/projects_last_commit/{project}_last_commit.json"
    with open(project_file, "r", encoding="utf-8") as f:
        data = json.load(f)
        commit_data = data["latest_commit_data"]["network_data"]

        for config_file in commit_data["config_file_data"]:
            if config_file["file_path"].startswith(EXCLUDED_DIRS):
                continue

            if config_file["concept"] in ["json", "yaml", "toml", "configparser"]:
                tech = get_technology(config_file["file_path"])
                if not tech:
                    file_type_files.append(config_file["file_path"])

First iteration: 21
Second iteration: 

In [3]:
print(file_type_files)

['.github/codeql/codeql-config.yml', '.mocharc.yml', 'examples/schema/storing-schemas-as-json/schema.json', 'lgtm.yml', 'hugoreleaser.yaml', 'snap/snapcraft.yaml', '.github/mypy-primer-ty.toml', '.pre-commit-config.yaml', '_typos.toml', 'assets/badge/format.json', 'assets/badge/v0.json', 'assets/badge/v1.json', 'assets/badge/v2.json', 'crates/ruff_python_ast/ast.toml', 'dist-workspace.toml', 'mkdocs.insiders.yml', 'mkdocs.public.yml', 'mkdocs.template.yml', 'playground/api/wrangler.toml', 'ruff.schema.json', 'scripts/benchmarks/graph-spec.json', 'ty.schema.json', '.changeset/config.json', '.config/demos.json', '.github/configs/semgrep_rules.yaml', '.github/filters.json', 'client/python/gradio_client/types.json', 'demo/image_classifier/files/imagenet_labels.json', 'demo/image_classifier_2/files/imagenet_labels.json', 'demo/mini_leaderboard/assets/leaderboard_data.json', 'gradio/cli/commands/components/files/pyproject_.toml', 'gradio/cli/commands/deploy_space_action.yaml', 'js/_website/s

In [1]:
from mapping import get_technology

tests = [
    ("packages/vite/tsconfig.base.json", "tsconfig"),
    ("ci/k8s/jobset.yaml", "kubernetes"),
    ("docker-compose.server.example.yml", "docker-compose"),
    ("js/tsconfig.eslint.json", "eslint"),
    (".github/ISSUE_TEMPLATE/bug_report.yaml", "github"),
    ("docs/reference/docker_compose_run.yaml", "docker-compose"),
    ("apps/api/pnpm-lock.yaml", "pnpm"),
    ("apps/redis/fly.toml", "flyio"),
    ("apps/ui/ingestion-ui/package-lock.json", "npm"),
    ("docker-compose.yaml", "docker-compose"),
    ("examples/kubernetes/cluster-install/api.yaml", "kubernetes"),
    (".devcontainer/devcontainer.json", "devcontainer"),
    ( ".eslintrc.json", "eslint"),
    (".github/ISSUE_TEMPLATE/bug_report.yml", "github"),
    (".github/actions/build-electron/action.yml", "github-actions"),
    (".github/dependabot.yml", "dependabot"),
    (".github/ISSUE_TEMPLATE/config.yml", "github"),
    ("apollo-adminservice/src/main/resources/application-zookeeper-discovery.properties", "spring")
]

results = []

for filename, expected in tests:
    actual = get_technology(filename)
    success = (actual == expected)
    results.append((filename, expected, actual, success))

results

[('packages/vite/tsconfig.base.json', 'tsconfig', 'tsconfig', True),
 ('ci/k8s/jobset.yaml', 'kubernetes', 'kubernetes', True),
 ('docker-compose.server.example.yml',
  'docker-compose',
  'docker-compose',
  True),
 ('js/tsconfig.eslint.json', 'eslint', 'eslint', True),
 ('.github/ISSUE_TEMPLATE/bug_report.yaml', 'github', 'github', True),
 ('docs/reference/docker_compose_run.yaml',
  'docker-compose',
  'docker-compose',
  True),
 ('apps/api/pnpm-lock.yaml', 'pnpm', 'pnpm', True),
 ('apps/redis/fly.toml', 'flyio', 'flyio', True),
 ('apps/ui/ingestion-ui/package-lock.json', 'npm', 'npm', True),
 ('docker-compose.yaml', 'docker-compose', 'docker-compose', True),
 ('examples/kubernetes/cluster-install/api.yaml',
  'kubernetes',
  'kubernetes',
  True),
 ('.devcontainer/devcontainer.json', 'devcontainer', 'devcontainer', True),
 ('.eslintrc.json', 'eslint', 'eslint', True),
 ('.github/ISSUE_TEMPLATE/bug_report.yml', 'github', 'github', True),
 ('.github/actions/build-electron/action.yml'