In [26]:
import os
from collections import defaultdict
from fuzzywuzzy import fuzz
import re
import pandas as pd

In [27]:
# Reviewer identifiers; each one is looked up as a sub-folder of output/csv/
# when the measurement files are collected.
reviewers = [
    'achoreviews', 'aftersound', 'animagus', 'arn',
    'bedrock', 'bryaudioreviews', 'cammyfi', 'soundjedi',
    'eplv', 'timmyv', 'harpo', 'hbb',
    'cqtek', 'hobbytalk', 'ianfann', 'iemworld',
    'jacstone', 'kr0mka', 'kurin', 'melatonin',
    'nymz', 'pw', 'recode', 'rg',
    'shortbus', 'suporsalad', 'tgx78', 'vortexreviews',
    'vsg', 'wdym', 'akros', 'data_mrs',
]

In [28]:
def remove_channel_suffix(file_name):
    """Normalise a measurement file name so left/right channel files compare equal.

    The name is lower-cased, any parenthesised text is dropped, runs of
    whitespace are collapsed to a single space, and a trailing " l.csv" or
    " r.csv" channel marker is reduced to ".csv".

    Args:
        file_name: Original file name, e.g. "7HZ DIOKO (bass mod) L.csv".

    Returns:
        The normalised name, e.g. "7hz dioko.csv". Names without a channel
        marker are returned lower-cased and whitespace-normalised only.
    """
    normalized = re.sub(r'\(.*?\)', '', file_name.lower())  # Remove content in brackets

    # Dropping "(...)" leaves a doubled space behind ("dioko  l.csv"); collapse it
    # so the channel marker is removed cleanly instead of yielding "dioko .csv".
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    # Anchor at the end of the name so only the real channel marker is stripped,
    # never an " l.csv"/" r.csv" sequence that happens to occur mid-name.
    return re.sub(r' [lr]\.csv$', '.csv', normalized)

In [29]:
import re

def read_iem_list(file_path):
    """Parse the IEM list file into a canonical-name -> variations dictionary.

    Each useful line has the form ``'<name>' : [<variation>, <variation>, ...]``.
    Blank lines and lines that do not match that shape are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each IEM name to the list of its variation strings, each
        stripped of surrounding whitespace. Empty entries (from ``[]`` or a
        trailing comma) are dropped, so the list may be empty.
    """
    iem_list = {}
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            match = re.match(r"'(.*?)' :\s*\[(.*?)\]", line)
            if match:
                iem_name = match.group(1)
                variations = match.group(2)
                # ''.split(',') yields [''], and an empty variation is a substring
                # of every file name, so it would match every file downstream.
                iem_list[iem_name] = [
                    cleaned
                    for cleaned in (v.strip() for v in variations.split(','))
                    if cleaned
                ]
    return iem_list

def process_iem_list(iem_list):
    """Invert a name -> variations dictionary into a variation -> name lookup.

    Args:
        iem_list: dict mapping an IEM name to an iterable of its variations.

    Returns:
        dict mapping every variation to its IEM name. When a variation is
        listed under several names, the name seen last wins.
    """
    return {
        alias: canonical
        for canonical, aliases in iem_list.items()
        for alias in aliases
    }

def match_iem_name(file_name, iem_mapping):
    """Return the IEM name of the first variation contained in ``file_name``.

    The comparison is case-insensitive and variations are tried in the
    mapping's iteration order.

    Args:
        file_name: File name to inspect.
        iem_mapping: dict mapping variation strings to IEM names.

    Returns:
        The IEM name for the first matching variation, or None if none match.
    """
    haystack = file_name.lower()
    candidates = (
        canonical
        for alias, canonical in iem_mapping.items()
        if alias.lower() in haystack
    )
    return next(candidates, None)

In [30]:
#def group_files_by_iem(files, iem_mapping, threshold=100):
#   groups = defaultdict(list)
#   added_files = set()
#
#   # Create a special "target" group
#   target_group = "target"
#
#   for file1 in files:
#       modified_file1 = remove_channel_suffix(file1)
#
#       # Check if the file name contains "target"
#       if "target" in file1.lower():
#           if file1 not in added_files:
#               groups[target_group].append(file1)
#               added_files.add(file1)
#           continue
#
#       # Use the IEM mapping to match the IEM name
#       iem_name = match_iem_name(modified_file1, iem_mapping)
#       if iem_name is not None:
#           modified_file1 = iem_name
#
#       if file1 not in added_files:
#           groups[file1].append(file1)
#           added_files.add(file1)
#
#       for file2 in files:
#           if file2 not in added_files:
#               modified_file2 = remove_channel_suffix(file2)
#
#               # Use the IEM mapping to match the IEM name
#               iem_name2 = match_iem_name(modified_file2, iem_mapping)
#               if iem_name2 is not None:
#                   modified_file2 = iem_name2
#
#               similarity = fuzz.token_set_ratio(modified_file1, modified_file2)
#
#               # If the similarity score is above the threshold, add the file to the group
#               if similarity >= threshold:
#                   groups[file1].append(file2)
#                   added_files.add(file2)
#               # If the similarity score is between 99 and the threshold, compare only the IEM part
#               elif 99 <= similarity < threshold:
#                   iem1 = re.sub(r'[^a-zA-Z0-9]', '', modified_file1.split()[1]) if len(modified_file1.split()) > 1 else ''
#                   iem2 = re.sub(r'[^a-zA-Z0-9]', '', modified_file2.split()[1]) if len(modified_file2.split()) > 1 else ''
#                   iem_similarity = fuzz.token_set_ratio(iem1, iem2)
#                   
#                   if iem_similarity >= threshold:
#                       groups[file1].append(file2)
#                       added_files.add(file2)
#
#   return groups


In [31]:
def group_files_by_iem(files, iem_mapping, threshold=100):
    """Group file names by the IEM they belong to.

    Every distinct file name is placed in at most one group:

    * names containing "target" (case-insensitive) go to the "target" group;
    * otherwise the name is normalised with ``remove_channel_suffix`` and
      looked up with ``match_iem_name``; a hit files it under that IEM name;
    * names that match no variation are left out of the result.

    Args:
        files: Iterable of file names. Repeated names are grouped once.
        iem_mapping: dict mapping variation strings to IEM names.
        threshold: Unused; kept so existing callers that pass it keep working.

    Returns:
        defaultdict(list) mapping each group key to its file names, in the
        order the names first appear in ``files``.
    """
    groups = defaultdict(list)
    seen_files = set()

    # Special group that collects target curves rather than IEM measurements.
    target_group = "target"

    # One pass is enough: a file's group depends only on its own name. Checking
    # "target" first for every file keeps the outcome independent of list order.
    for file_name in files:
        if file_name in seen_files:
            continue
        seen_files.add(file_name)

        if "target" in file_name.lower():
            group_key = target_group
        else:
            group_key = match_iem_name(remove_channel_suffix(file_name), iem_mapping)
            if group_key is None:
                continue

        groups[group_key].append(file_name)

    return groups


In [32]:
# Accumulator for the (reviewer, file name) pairs gathered from the reviewer folders
all_files = list()

In [33]:
# Rebuild the list from scratch so that re-running this cell does not append a
# second copy of every (reviewer, file) pair to a list left over from an earlier run.
all_files = []

for reviewer in reviewers:
    reviewer_path = f'output/csv/{reviewer}'

    # Reviewers without an output folder are skipped; isdir also keeps a stray
    # plain file of the same name from reaching os.listdir.
    if os.path.isdir(reviewer_path):
        # os.listdir returns entries in arbitrary order; sort for reproducible runs.
        files = sorted(os.listdir(reviewer_path))
        all_files.extend([(reviewer, file) for file in files])

In [34]:
# # Group files by IEM
# grouped_files = group_files_by_iem([file for _, file in all_files])

In [35]:


# Load the IEM name -> variations table from the text file, then invert it
# into a variation -> IEM name lookup.
iem_list = read_iem_list('IEM Lists/output.txt')
iem_mapping = process_iem_list(iem_list)

# Group the collected file names (reviewer dropped) by IEM.
grouped_files = group_files_by_iem(
    [file_name for _reviewer, file_name in all_files],
    iem_mapping,
)

In [36]:
# NOTE: all_files holds one (reviewer, file) pair per reviewer, whereas a group
# lists each file name only once, so "not in groups" also counts repeated names.
total_file_count = len(all_files)
group_count = len(grouped_files)
grouped_file_count = sum(len(files) for files in grouped_files.values())

# Avoid ZeroDivisionError when nothing was grouped (e.g. no input files found).
avg_files_per_group = grouped_file_count / group_count if group_count else 0.0

print(f"Total number of files: {total_file_count}")
print(f"Total number of groups: {group_count}")
print(f"Total number of files in groups: {grouped_file_count}")
print(f"Total number of files not in groups: {total_file_count - grouped_file_count}")
print(f"\nAvg number of files per group: {avg_files_per_group}")


Total number of files: 9882
Total number of groups: 1104
Total number of files in groups: 6402
Total number of files not in groups: 3480

Avg number of files per group: 5.798913043478261


In [37]:

# Print grouped files
# Index the reviewers by file name once, instead of rescanning all_files for
# every grouped file. Reviewer order follows all_files, so the output is unchanged.
reviewers_by_file = defaultdict(list)
for reviewer, file_name in all_files:
    reviewers_by_file[file_name].append(reviewer)

for representative, files in grouped_files.items():
    print(f"{representative}:")
    for file in files:
        # .get avoids inserting empty entries into the index for unknown names.
        reviewers_with_file = reviewers_by_file.get(file, [])
        print(f"  {file} - {', '.join(reviewers_with_file)}")
    print()

7Hz Salnotes Dioko:
  7Hz Dioko L.csv - achoreviews, bedrock, timmyv, iemworld, data_mrs
  7HZ SALNOTES DIOKO L.csv - aftersound
  7HZ SALNOTES DIOKO R.csv - aftersound
  7Hz Salnotes Dioko L.csv - arn, pw
  7Hz Dioko R.csv - bedrock, timmyv, hbb, iemworld, data_mrs
  Salnotes Dioko L.csv - eplv
  Salnotes Dioko R.csv - eplv
  7hz Dioko L.csv - hobbytalk
  7hz Dioko R.csv - hobbytalk
  7HZ DIOKO (bass mod) L.csv - ianfann
  7HZ DIOKO (bass mod) R.csv - ianfann
  7HZ DIOKO L.csv - ianfann
  7HZ DIOKO R.csv - ianfann
  Dioko L.csv - kr0mka
  Dioko R.csv - kr0mka
  Dioko Spring R.csv - kr0mka
  Resolve 7Hz Salnotes Dioko Gras L.csv - kurin
  7hz Salnotes Dioko L.csv - nymz
  7hz Salnotes Dioko R.csv - nymz
  7Hz Salnotes Dioko R.csv - pw
  7Hz Salnotes Dioko TapedVent L.csv - pw
  7hz x Crinacle Dioko L.csv - vortexreviews
  7hz x Crinacle Dioko R.csv - vortexreviews
  7hz x Crinacle Dioko Updated L.csv - vortexreviews
  7hz x Crinacle Dioko Updated R.csv - vortexreviews
  7Hz x Crinacle 