In [3]:
# List of reviewers
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

In [39]:
#from fuzzywuzzy import fuzz
#from collections import defaultdict
#
#def group_files_by_iem(files, threshold=80):
#    groups = defaultdict(list)
#    grouped_files = set()
#    
#    for file1 in files:
#        if file1 not in grouped_files:
#            groups[file1].append(file1)
#            grouped_files.add(file1)
#            
#            for file2 in files:
#                if file2 not in grouped_files:
#                    similarity = fuzz.token_set_ratio(file1, file2)
#                    
#                    if similarity >= threshold:
#                        groups[file1].append(file2)
#                        grouped_files.add(file2)
#    
#    return groups

In [40]:
from fuzzywuzzy import fuzz
from collections import defaultdict
import re

In [41]:
def custom_similarity(file1, file2):
    brand_pattern = r'^(.*?)\s'
    iem_pattern = r'\s(.*?)\s'
    variation_pattern = r'\s(.*)$'
    
    brand1 = re.search(brand_pattern, file1).group(1) if re.search(brand_pattern, file1) else ''
    iem1 = re.search(iem_pattern, file1).group(1) if re.search(iem_pattern, file1) else ''
    variation1 = re.search(variation_pattern, file1).group(1) if re.search(variation_pattern, file1) else ''
    
    brand2 = re.search(brand_pattern, file2).group(1) if re.search(brand_pattern, file2) else ''
    iem2 = re.search(iem_pattern, file2).group(1) if re.search(iem_pattern, file2) else ''
    variation2 = re.search(variation_pattern, file2).group(1) if re.search(variation_pattern, file2) else ''
    
    brand_similarity = fuzz.token_set_ratio(brand1, brand2)
    iem_similarity = fuzz.token_set_ratio(iem1, iem2)
    variation_similarity = fuzz.token_set_ratio(variation1, variation2)
    
    return brand_similarity, iem_similarity, variation_similarity

In [42]:
def group_files_by_iem(files, threshold=85):
    groups = defaultdict(list)
    grouped_files = set()
    
    # Create a special "target" group
    target_group = "target"
    
    for file1 in files:
        if file1 not in grouped_files:
            # Check if the file name contains "target"
            if "target" in file1.lower():
                groups[target_group].append(file1)
                grouped_files.add(file1)
                continue
            
            groups[file1].append(file1)
            grouped_files.add(file1)
            
            for file2 in files:
                if file2 not in grouped_files:
                    similarity = fuzz.token_set_ratio(file1, file2)
                    
                    if similarity >= threshold:
                        groups[file1].append(file2)
                        grouped_files.add(file2)
    
    return groups


In [43]:
import os

# Gather all file names
all_files = []

for reviewer in reviewers:
    reviewer_path = f'output/csv/{reviewer}'
    
    if os.path.exists(reviewer_path):
        files = os.listdir(reviewer_path)
        all_files.extend([(reviewer, file) for file in files])

# Group files by IEM
grouped_files = group_files_by_iem([file for _, file in all_files])


In [44]:
# Print the names of the created groups
print("Names of the created groups:")
for group_name in grouped_files.keys():
    print(group_name)
print()

Names of the created groups:
7Hz Dioko L.csv
7Hz Eternal L.csv
7Hz Legato L.csv
7Hz Timeless L.csv
7Hz Zero L.csv
target
Audeze iSine LX (Spinfit) L.csv
Aune Jasper L.csv
Blon BL01 L.csv
Blon Fat Girl L.csv
Blon Z200 L.csv
CCA CRA L.csv
CCZ Emerald L.csv
CCZ Warrior L.csv
Celest Pandamon L.csv
Dunu Kima L.csv
Dunu Talos (Planar + BA) (IE600 Foam Tips) L.csv
Dunu Titan S (Isolation) L.csv
Dunu Vulkan L.csv
Effect Audio Axiom L.csv
Fiio FHE Eclipse L.csv
Final Audio E500 L.csv
Hidition Violet L.csv
Hifiman RE600s v2 L.csv
Hifiman Svanar L.csv
Hifiman TWS800 L.csv
HZ Sound Heart Mirror Pro L.csv
Ikko Opal OH02 L.csv
ITK Unknown L.csv
Joyodio Shine (0000) L.csv
KBEAR Lark (S1) L.csv
KBEAR Little Q L.csv
KBEAR Qinglong L.csv
KBEAR Robin L.csv
Kinera BD005 Pro L.csv
Kiwi Ears Cadenza L.csv
Kiwi Ears Orchestra Lite L.csv
Koss KEB90 L.csv
KZ AS16 Pro L.csv
KZ ATE L.csv
KZ DQ6 L.csv
KZ EDX L.csv
KZ PR1 L.csv
KZ ZAX L.csv
KZ ZES L.csv
KZ ZSN Pro X L.csv
Letshuoer D13 (Gold Filter) L.csv
Letshuoe

In [45]:
# Print grouped files
for representative, files in grouped_files.items():
    print(f"{representative}:")
    for file in files:
        reviewers_with_file = [reviewer for reviewer, f in all_files if f == file]
        print(f"  {file} - {', '.join(reviewers_with_file)}")
    print()

7Hz Dioko L.csv:
  7Hz Dioko L.csv - achoreviews, bedrock, timmyv, iemworld, data_mrs
  7HZ SALNOTES DIOKO L.csv - aftersound
  7HZ SALNOTES DIOKO R.csv - aftersound
  7Hz Salnotes Dioko L.csv - arn, pw
  7Hz Dioko R.csv - bedrock, timmyv, hbb, iemworld, data_mrs
  Salnotes Dioko L.csv - eplv
  7hz Dioko L.csv - hobbytalk
  7hz Dioko R.csv - hobbytalk
  7HZ DIOKO (bass mod) L.csv - ianfann
  7HZ DIOKO (bass mod) R.csv - ianfann
  7HZ DIOKO L.csv - ianfann
  7HZ DIOKO R.csv - ianfann
  Dioko L.csv - kr0mka
  Dioko R.csv - kr0mka
  Resolve 7Hz Salnotes Dioko Gras L.csv - kurin
  7hz Salnotes Dioko L.csv - nymz
  7hz Salnotes Dioko R.csv - nymz
  7Hz Salnotes Dioko R.csv - pw
  7Hz Salnotes Dioko TapedVent L.csv - pw
  7hz x Crinacle Dioko L.csv - vortexreviews
  7hz x Crinacle Dioko R.csv - vortexreviews
  7hz x Crinacle Dioko Updated L.csv - vortexreviews
  7hz x Crinacle Dioko Updated R.csv - vortexreviews
  7Hz x Crinacle Salnotes Dioko L.csv - vsg
  7Hz x Crinacle Salnotes Dioko R.cs

In [46]:
print(f"Total number of files: {len(all_files)}")
print(f"Total number of groups: {len(grouped_files)}")
print(f"Total number of files in groups: {sum([len(files) for files in grouped_files.values()])}")
print(f"Total number of files not in groups: {len(all_files) - sum([len(files) for files in grouped_files.values()])}")
print(f"\nAvg number of files per group: {sum([len(files) for files in grouped_files.values()]) / len(grouped_files)}")

Total number of files: 9884
Total number of groups: 1627
Total number of files in groups: 7670
Total number of files not in groups: 2214

Avg number of files per group: 4.71419791026429


In [48]:
# all_files = []
# 
# for reviewer in reviewers:
#     reviewer_path = f'output/csv/{reviewer}'
#     
#     if os.path.exists(reviewer_path):
#         files = os.listdir(reviewer_path)
#         cleaned_files = []
#         for file in files:
#             file_lower = file.lower()
#             if file_lower.endswith(' l.csv'):
#                 cleaned_files.append(file_lower[:-6])
#             elif file_lower.endswith(' r.csv'):
#                 cleaned_files.append(file_lower[:-6])
#             else:
#                 cleaned_files.append(file_lower)
#         all_files.extend(cleaned_files)
# 
# # Remove duplicates and sort the list
# unique_files = sorted(set(all_files))
# 
# # Print unique, lowercased, and sorted file names
# for file in unique_files:
#     print(file)


07m
07m_retune
1 custom pm-01
10mm bio v1
10mm bio v2
10mm bio v3
10mm bio v4
10mm bio v5
1more colorbuds 2
1more comfobudspro
1more comfobudsprooff
1more evo
1more pistonclassic
1more quad driver
1more triple driver (foam eartips)
1more triple driver (no tip on coupler 8k peak disappeared foam)
1more triple driver (orignal silicone eartips)
2ht
2p
32bg
4acoustic harmonie
4acoustic symphonie 1 on
4acoustic symphonie 2 on
4acoustic symphonie 3 on
4acoustic symphonie all off
4acoustic symphonie all on
4acoustic unison
4p
4ss
556
55hz 03xx
55hz 1
55hz 350
55hz 3d
55hz b
55hz eki
55hz eki fil fo
55hz eki tune
55hz fo
55hz hbb m
55hz hbb maka
55hz hok
55hz mystery
55hz no damp
55hz os
55hz osummer
55hz osun sil
55hz p1 foam
55hz p1 sil
55hz p2
55hz qk
55hz qkz foa
55hz qkz sil
55hz r
55hz secretz 429
55hz see
55hz sil
55hz ss
55hz st
55hz t
55hz x hbb kai
5i
5i anc
5i anc bass
5i anc treble
5i bass
5i transparency
5i treble
64 audio duo
64 audio fourte
64 audio fourte blanc
64 audio fourte 

In [12]:
# import os
# from collections import defaultdict
# import re
# 
# def remove_channel_suffix(file_name):
#     file_name = file_name.lower()
#     file_name = re.sub(r'\(.*?\)', '', file_name)  # Remove content in brackets
#     file_name = file_name.strip()  # Remove leading and trailing spaces
# 
#     if " l.csv" in file_name:
#         file_name = file_name.replace(" l.csv", ".csv")
#     elif " r.csv" in file_name:
#         file_name = file_name.replace(" r.csv", ".csv")
# 
#     return file_name.strip()  # Remove any extra spaces after channel suffix removal
# 
# def group_files_by_iem(files, threshold=80):
#     groups = defaultdict(list)
#     added_files = set()
# 
#     # Create a special "target" group
#     target_group = "target"
# 
#     for file1 in files:
#         modified_file1 = remove_channel_suffix(file1)
# 
#         # Check if the file name contains "target"
#         if "target" in file1.lower():
#             if file1 not in added_files:
#                 groups[target_group].append(file1)
#                 added_files.add(file1)
#             continue
# 
#         if file1 not in added_files:
#             groups[file1].append(file1)
#             added_files.add(file1)
# 
#         for file2 in files:
#             if file2 not in added_files:
#                 modified_file2 = remove_channel_suffix(file2)
#                 similarity = fuzz.token_set_ratio(modified_file1, modified_file2)
# 
#                 if similarity >= threshold:
#                     groups[file1].append(file2)
#                     added_files.add(file2)
# 
#     return groups
# 
# # Gather all file names
# all_files = []
# 
# for reviewer in reviewers:
#     reviewer_path = f'output/csv/{reviewer}'
# 
#     if os.path.exists(reviewer_path):
#         files = os.listdir(reviewer_path)
#         all_files.extend([(reviewer, file) for file in files])
# 
# # Group files by IEM
# grouped_files = group_files_by_iem([file for _, file in all_files])
# 
# # Print grouped files
# for representative, files in grouped_files.items():
#     print(f"{representative}:")
#     for file in files:
#         reviewers_with_file = [reviewer for reviewer, f in all_files if f == file]
#         print(f"  {file} - {', '.join(reviewers_with_file)}")
#     print()
# 

KeyboardInterrupt: 

In [1]:
import os
from collections import defaultdict
from fuzzywuzzy import fuzz
import re

# List of reviewers
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

def remove_channel_suffix(file_name):
    file_name = file_name.lower()
    file_name = re.sub(r'\(.*?\)', '', file_name)  # Remove content in brackets
    file_name = file_name.strip()  # Remove leading and trailing spaces

    if " l.csv" in file_name:
        file_name = file_name.replace(" l.csv", ".csv")
    elif " r.csv" in file_name:
        file_name = file_name.replace(" r.csv", ".csv")

    return file_name.strip()  # Remove any extra spaces after channel suffix removal

def group_files_by_iem(files, threshold=90):
    groups = defaultdict(list)
    added_files = set()

    # Create a special "target" group
    target_group = "target"

    for file1 in files:
        modified_file1 = remove_channel_suffix(file1)

        # Check if the file name contains "target"
        if "target" in file1.lower():
            if file1 not in added_files:
                groups[target_group].append(file1)
                added_files.add(file1)
            continue

        if file1 not in added_files:
            groups[file1].append(file1)
            added_files.add(file1)

        for file2 in files:
            if file2 not in added_files:
                modified_file2 = remove_channel_suffix(file2)
                similarity = fuzz.token_set_ratio(modified_file1, modified_file2)

                # If the similarity score is above the threshold, add the file to the group
                if similarity >= threshold:
                    groups[file1].append(file2)
                    added_files.add(file2)
                # If the similarity score is between 80 and the threshold, compare only the IEM part
                elif 80 <= similarity < threshold:
                    iem1 = re.sub(r'[^a-zA-Z0-9]', '', modified_file1.split()[1]) if len(modified_file1.split()) > 1 else ''
                    iem2 = re.sub(r'[^a-zA-Z0-9]', '', modified_file2.split()[1]) if len(modified_file2.split()) > 1 else ''
                    iem_similarity = fuzz.token_set_ratio(iem1, iem2)
                    
                    if iem_similarity >= threshold:
                        groups[file1].append(file2)
                        added_files.add(file2)

    return groups

# Gather all file names
all_files = []

for reviewer in reviewers:
    reviewer_path = f'output/csv/{reviewer}'

    if os.path.exists(reviewer_path):
        files = os.listdir(reviewer_path)
        all_files.extend([(reviewer, file) for file in files])

# Group files by IEM
grouped_files = group_files_by_iem([file for _, file in all_files])

# Print grouped files
for representative, files in grouped_files.items():
    print(f"{representative}:")
    for file in files:
        reviewers_with_file = [reviewer for reviewer, f in all_files if f == file]
        print(f"  {file} - {', '.join(reviewers_with_file)}")
    print()



7Hz Dioko L.csv:
  7Hz Dioko L.csv - achoreviews, bedrock, timmyv, iemworld, data_mrs
  7HZ SALNOTES DIOKO L.csv - aftersound
  7HZ SALNOTES DIOKO R.csv - aftersound
  7Hz Salnotes Dioko L.csv - arn, pw
  7Hz Dioko R.csv - bedrock, timmyv, hbb, iemworld, data_mrs
  Salnotes Dioko L.csv - eplv
  Salnotes Dioko R.csv - eplv
  7hz Dioko L.csv - hobbytalk
  7hz Dioko R.csv - hobbytalk
  7HZ DIOKO (bass mod) L.csv - ianfann
  7HZ DIOKO (bass mod) R.csv - ianfann
  7HZ DIOKO L.csv - ianfann
  7HZ DIOKO R.csv - ianfann
  Dioko L.csv - kr0mka
  Dioko R.csv - kr0mka
  Resolve 7Hz Salnotes Dioko Gras L.csv - kurin
  7hz Salnotes Dioko L.csv - nymz
  7hz Salnotes Dioko R.csv - nymz
  7Hz Salnotes Dioko R.csv - pw
  7Hz Salnotes Dioko TapedVent L.csv - pw
  7hz x Crinacle Dioko L.csv - vortexreviews
  7hz x Crinacle Dioko R.csv - vortexreviews
  7hz x Crinacle Dioko Updated L.csv - vortexreviews
  7hz x Crinacle Dioko Updated R.csv - vortexreviews
  7Hz x Crinacle Salnotes Dioko L.csv - vsg
  7Hz 

In [2]:
print(f"Total number of files: {len(all_files)}")
print(f"Total number of groups: {len(grouped_files)}")
print(f"Total number of files in groups: {sum([len(files) for files in grouped_files.values()])}")
print(f"Total number of files not in groups: {len(all_files) - sum([len(files) for files in grouped_files.values()])}")
print(f"\nAvg number of files per group: {sum([len(files) for files in grouped_files.values()]) / len(grouped_files)}")

Total number of files: 9882
Total number of groups: 1603
Total number of files in groups: 7668
Total number of files not in groups: 2214

Avg number of files per group: 4.7835308796007485


In [4]:
import os

# List of reviewers
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

for reviewer in reviewers:
    reviewer_path = f'output/csv/{reviewer}'
    
    if os.path.exists(reviewer_path):
        print(f"Files in '{reviewer_path}':")
        for file_name in os.listdir(reviewer_path):
            print(f"  {file_name}")
        print()
    else:
        print(f"Directory '{reviewer_path}' not found.\n")


Files in 'output/csv/achoreviews':
  7Hz Dioko L.csv
  7Hz Eternal L.csv
  7Hz Legato L.csv
  7Hz Timeless L.csv
  7Hz Zero L.csv
  Acho Reviews Target.csv
  Antdroid Target.csv
  Audeze iSine LX (Spinfit) L.csv
  Aune Jasper L.csv
  Bad Guy Target.csv
  Banbeucmas Target.csv
  Blon BL01 L.csv
  Blon BL03 L.csv
  Blon BL05s L.csv
  Blon Fat Girl L.csv
  Blon T1 L.csv
  Blon T3 L.csv
  Blon Z200 L.csv
  CCA CRA L.csv
  CCA CRA Plus L.csv
  CCZ Emerald L.csv
  CCZ Melody L.csv
  CCZ Warrior L.csv
  Celest Pandamon L.csv
  Crinacle Target.csv
  Diffuse Field Target.csv
  Dunu Kima L.csv
  Dunu Talos (Planar + BA) (IE600 Foam Tips) L.csv
  Dunu Talos (Planar + BA) L.csv
  Dunu Talos (Planar Only) (IE600 Foam Tips) L.csv
  Dunu Talos (Planar Only) L.csv
  Dunu Titan S (Isolation) L.csv
  Dunu Titan S L.csv
  Dunu Vulkan L.csv
  Effect Audio Axiom L.csv
  Etymotic Target.csv
  Fiio FHE Eclipse L.csv
  Final Audio E500 L.csv
  Free Field Target.csv
  Harman Target.csv
  Hidition Violet L.csv
