In [3]:
import os
import shutil
import pandas as pd
from pathlib import Path

# === SETUP ROOT PATH ===
# Everything is resolved against the parent of the notebook's working directory.
project_root = Path("..").resolve()
csv_path = project_root / "license_data_summary.csv"
source_dirs = [project_root / "Split-DB-Foss-Licenses", project_root / "Split-SPDX-Licenses"]
output_base = project_root / "Combined-Licenses"

# === READ CSV AND GET UNIQUE LICENSES ===
df = pd.read_csv(csv_path)
unique_licenses = df['license'].dropna().unique()

# === CREATE OUTPUT STRUCTURE ===
output_base.mkdir(parents=True, exist_ok=True)

# === COPY FILES FROM BOTH SOURCES TO THE COMBINED FOLDER ===
# For every license, merge the files of each source directory into one folder.
# A file name is only treated as a conflict when another file has already
# claimed it during THIS run. Checking `dest_file.exists()` instead would also
# fire on files left behind by a previous execution, so re-running the cell
# would copy every first-source file a second time under a tagged name.
for license_name in unique_licenses:
    combined_license_dir = output_base / license_name
    combined_license_dir.mkdir(parents=True, exist_ok=True)

    # Destination names written in this run, case-folded so that names
    # differing only by case still count as a clash on case-insensitive
    # file systems.
    claimed_names = set()

    for source in source_dirs:
        license_source_dir = source / license_name
        if not license_source_dir.is_dir():
            continue

        # Tag appended to a file name to disambiguate it from another source.
        source_tag = source.name.replace('-', '_')

        # Sorted so the outcome does not depend on directory listing order.
        for src_file in sorted(license_source_dir.iterdir()):
            if not src_file.is_file():
                continue

            dest_name = src_file.name
            if dest_name.casefold() in claimed_names:
                base, ext = os.path.splitext(src_file.name)
                # Add source name to resolve conflicts
                dest_name = f"{base}_{source_tag}{ext}"
                # The tagged name may itself be taken; number it until free
                # rather than silently overwriting another file.
                counter = 2
                while dest_name.casefold() in claimed_names:
                    dest_name = f"{base}_{source_tag}_{counter}{ext}"
                    counter += 1

            # Files left over from an earlier run are overwritten in place.
            shutil.copy2(src_file, combined_license_dir / dest_name)
            claimed_names.add(dest_name.casefold())

print(f"✅ All licenses combined into: {output_base}")


✅ All licenses combined into: /Users/rajuljha/Documents/gsoc/atarashi_classifier/Combined-Licenses


In [4]:
import os
import pandas as pd
from pathlib import Path

# Count the .txt files inside each per-license folder of the combined dataset,
# print the counts from largest to smallest, and save them as a CSV.
combined_path = Path("..") / "Combined-Licenses"
license_file_counts = {}

for license_dir in combined_path.iterdir():
    if license_dir.is_dir():
        txt_files = list(license_dir.glob("*.txt"))
        license_file_counts[license_dir.name] = len(txt_files)

# Convert to DataFrame and sort.
# Columns are given explicitly so the frame still has them when no license
# folder was found; otherwise sort_values would raise KeyError on an empty frame.
df_counts = pd.DataFrame(
    list(license_file_counts.items()),
    columns=["license_name", "file_count"],
)

# Largest folders first; ties are broken by name so the printed order and the
# CSV are reproducible instead of following directory iteration order.
df_counts = df_counts.sort_values(
    by=["file_count", "license_name"], ascending=[False, True]
)
for row in df_counts.itertuples(index=False):
    print(f"{row.license_name}: {row.file_count} files")

# int() keeps the total a plain integer even when the frame is empty.
total_files = int(df_counts['file_count'].sum())
print(f"\nTotal .txt license files: {total_files}")

output_csv_path = Path("..") / "combined_license_file_counts.csv"
df_counts.to_csv(output_csv_path, index=False)

print(f"\n Saved sorted license file counts to: {output_csv_path}")

APL-1.0: 1795 files
BitTorrent-1.1: 1792 files
BitTorrent-1.0: 1644 files
RPL-1.1: 1558 files
NPL-1.1: 1502 files
RPL-1.5: 1410 files
NPL-1.0: 1317 files
MPL-1.1: 1294 files
SPL-1.0: 1283 files
CUA-OPL-1.0: 1280 files
RSCPL: 1278 files
RHeCos-1.1: 1254 files
ODbL-1.0: 1245 files
Interbase-1.0: 1226 files
RPSL-1.0: 1194 files
NOSL: 1181 files
SugarCRM-1.1.3: 1168 files
Hacktivismo: 1165 files
Motosoto: 1147 files
OPL-1.0: 1125 files
GPL-3.0: 1071 files
GPL-3.0+: 1055 files
CC-BY-SA-3.0-AT: 1033 files
AGPL-3.0: 1011 files
Zimbra-1.2: 1010 files
CPAL-1.0: 973 files
LGPL-2.1: 956 files
MPL-1.0: 950 files
gSOAP-1.3b: 945 files
SNIA: 940 files
LGPL-2.1+: 935 files
NASA-1.3: 924 files
GFDL-1.3: 916 files
CC-BY-NC-ND-3.0-IGO: 909 files
CC-BY-NC-SA-4.0: 907 files
FreeImage: 892 files
LGPL-2.0+: 892 files
CC-BY-SA-4.0: 891 files
ODC-By-1.0: 879 files
CC-BY-NC-SA-3.0: 873 files
LGPL-2.0: 867 files
CDDL-1.1: 857 files
GFDL-1.2: 856 files
Jabber-1.0: 854 files
CC-BY-3.0-AT: 847 files
CDDL-1.0: 846 