In [1]:
import pandas as pd
import json
import sys
from pathlib import Path

ROOT = Path.cwd().parent   # adjust depth if needed
sys.path.insert(0, str(ROOT))
from mapping import get_technology

# Path prefixes that are skipped when scanning a project's config files
# (used with str.startswith, so only paths *beginning* with one of these match).
EXCLUDED_DIRS = ("docs/", "data/", "lib/", "benchmark/", "annotations/", "examples/")

# Sampling parameters. The seed is fixed so that Restart & Run All always
# selects the same projects and rewrites an identical sample CSV.
SAMPLE_SIZE = 50
SAMPLE_SEED = 42

df = pd.read_csv("../../data/technological/composition/project_technologies.csv")

# Cap n at the population size: DataFrame.sample raises ValueError when asked
# for more rows than exist (replace=False is the default).
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# Persist the drawn sample so later work can refer to the same projects.
df_sample.to_csv("../../data/technological/composition/project_technologies_sample.csv", index=False)


  "github action": [".*?\.github\/workflows\/[^\/]*\.yml$", ".github/workflows/*.yaml", ".github/workflows/*/*.yaml", ".github/workflows/*.yml", ".github/workflows/*/*.yml",  ".github/actions/*.yaml",  ".github/actions/*/*.yaml", ".github/actions/*.yml", ".github/actions/*/*.yml"],


In [2]:
# Collect, across all sampled projects, the config-file paths for which
# get_technology() returns a falsy value (i.e. no technology was resolved).
project_sample = df_sample["project"].tolist()
file_type_files = []

# Only files whose "concept" field is one of these values are checked.
generic_concepts = ("yaml", ".properties", "ini", "toml", "json", "xml")

for project in project_sample:
    project_file = f"../../data/projects_last_commit/{project}_last_commit.json"
    with open(project_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file is fully parsed at this point, so the handle can be released
    # before walking the payload.
    commit_data = data["latest_commit_data"]["network_data"]

    for config_file in commit_data["config_file_data"]:
        file_path = config_file["file_path"]
        # startswith() only matches at the beginning of the path, so nested
        # directories such as "packages/docs/..." are NOT excluded.
        # NOTE(review): confirm that prefix-only exclusion is intended.
        in_scope = (
            not file_path.startswith(EXCLUDED_DIRS)
            and config_file["concept"] in generic_concepts
        )
        # get_technology is only consulted for in-scope files.
        if in_scope and not get_technology(file_path):
            file_type_files.append(file_path)

First iteration: 21
Second iteration: TODO (not yet recorded)

In [3]:
# Show at most the first 100 collected paths that had no technology resolved.
preview = file_type_files[:100]
print(preview)

['.github/actionlint.yml', '.stylish-haskell.yaml', 'stack.yaml', 'packages/docs/project.inlang/settings.json', 'packages/docs/src/lib/mdsvex/shiki.theme.json', 'packages/docs/src/translation/ar.json', 'packages/docs/src/translation/bn.json', 'packages/docs/src/translation/ca.json', 'packages/docs/src/translation/cs.json', 'packages/docs/src/translation/de.json', 'packages/docs/src/translation/en.json', 'packages/docs/src/translation/es.json', 'packages/docs/src/translation/fa.json', 'packages/docs/src/translation/fr.json', 'packages/docs/src/translation/hu.json', 'packages/docs/src/translation/id.json', 'packages/docs/src/translation/it.json', 'packages/docs/src/translation/ja.json', 'packages/docs/src/translation/ko.json', 'packages/docs/src/translation/ms.json', 'packages/docs/src/translation/pl.json', 'packages/docs/src/translation/pt.json', 'packages/docs/src/translation/ro.json', 'packages/docs/src/translation/ru.json', 'packages/docs/src/translation/uk.json', 'packages/docs/src/

In [4]:
from pathlib import Path
import sys

ROOT = Path.cwd().parent   # adjust depth if needed
sys.path.insert(0, str(ROOT))
from mapping import get_technology

# Spot-check table for get_technology: each entry is
# (path passed to get_technology, expected return value).
# The loop further down in this cell compares the two with `==`.
#
# Some entries contain glob wildcards (`*`, `**`); they are handed to
# get_technology verbatim, exactly like the concrete paths.
#
# NOTE(review): in the recorded output of this cell several expectations do not
# match what get_technology returned ('docker-compose' vs 'docker compose',
# 'github' vs 'github issues', 'npm' vs 'nodejs') — confirm which labels are
# canonical and align either this table or the mapping.
tests = [
    ("packages/vite/tsconfig.base.json", "tsconfig"),
    ("ci/k8s/jobset.yaml", "kubernetes"),
    ("docker-compose.server.example.yml", "docker-compose"),
    ("js/tsconfig.eslint.json", "eslint"),
    (".github/ISSUE_TEMPLATE/bug_report.yaml", "github"),
    ("docs/reference/docker_compose_run.yaml", "docker-compose"),
    ("apps/api/pnpm-lock.yaml", "pnpm"),
    ("apps/redis/fly.toml", "flyio"),
    ("apps/ui/ingestion-ui/package-lock.json", "npm"),
    ("docker-compose.yaml", "docker-compose"),
    ("examples/kubernetes/cluster-install/api.yaml", "kubernetes"),
    (".devcontainer/devcontainer.json", "devcontainer"),
    (".eslintrc.json", "eslint"),
    (".github/ISSUE_TEMPLATE/bug_report.yml", "github"),
    (".github/actions/build-electron/action.yml", "github-actions"),
    (".github/dependabot.yml", "dependabot"),
    (".github/ISSUE_TEMPLATE/config.yml", "github"),
    ("apollo-adminservice/src/main/resources/application-zookeeper-discovery.properties", "spring"),
    (".mocharc.yml", "mocha"),
    ("packages/utils/src/utils/defaultSnapshots/chainConfig.json", "fuel"),
    ("deploy/charts/litellm-helm/templates/*.yaml", "helm"),
    ("charts/budibase/templates/*.yaml", "helm"),
    ("helm/minio/Chart.yaml", "helm"),
    ("helm/minio/values.yaml", "helm"),
    ("libs/automation/.verdaccio/config.yml", "verdaccio"),
    ("tsconfig.base.json", "tsconfig"),
    ("packages/*/src/tsconfig.cjs.json", "tsconfig"),
    ("ultralytics/cfg/models/**.yaml", "ultralytics yolo"),
    (".azure/pipelines/localization.yml", "azure pipelines"),
    (".azure/pipelines/jobs/*.yml", "azure pipelines"),
    (".github/DISCUSSION_TEMPLATE/camera-support.yml", "github"),
]

def _check_mapping(path, wanted):
    """Return (path, wanted, got, matched) for one spot-check entry.

    `got` is whatever get_technology returns for `path`; `matched` is the
    result of comparing it to `wanted` with `==`.
    """
    got = get_technology(path)
    return (path, wanted, got, got == wanted)


# One result tuple per entry of `tests`, in the same order.
results = [_check_mapping(path, wanted) for path, wanted in tests]

# Bare last expression so the notebook renders the full result list.
results

[('packages/vite/tsconfig.base.json', 'tsconfig', 'tsconfig', True),
 ('ci/k8s/jobset.yaml', 'kubernetes', 'kubernetes', True),
 ('docker-compose.server.example.yml',
  'docker-compose',
  'docker compose',
  False),
 ('js/tsconfig.eslint.json', 'eslint', 'eslint', True),
 ('.github/ISSUE_TEMPLATE/bug_report.yaml', 'github', 'github issues', False),
 ('docs/reference/docker_compose_run.yaml',
  'docker-compose',
  'docker compose',
  False),
 ('apps/api/pnpm-lock.yaml', 'pnpm', 'pnpm', True),
 ('apps/redis/fly.toml', 'flyio', 'flyio', True),
 ('apps/ui/ingestion-ui/package-lock.json', 'npm', 'nodejs', False),
 ('docker-compose.yaml', 'docker-compose', 'docker compose', False),
 ('examples/kubernetes/cluster-install/api.yaml',
  'kubernetes',
  'kubernetes',
  True),
 ('.devcontainer/devcontainer.json', 'devcontainer', 'devcontainer', True),
 ('.eslintrc.json', 'eslint', 'eslint', True),
 ('.github/ISSUE_TEMPLATE/bug_report.yml', 'github', 'github issues', False),
 ('.github/actions/bui