In [40]:
import os
import re
import pandas as pd

In [41]:
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

In [42]:
# Replace double backslashes with single forward slashes in file paths
def fix_file_path(file_path):
    return file_path.replace("\\", "/")

In [43]:
# Read the data from the .csv file
def read_iem_list(file_path):
    with open(file_path, 'r') as f:
        content = f.read()

    # Parse the content of the .txt file using regex to get the iem_list 
    iem_list = {}
    pattern = re.compile(r"'(.*?)' : \[(.*?)\]")
    matches = pattern.findall(content)

    for iem, variations in matches:
        iem = iem.strip()
        variations = [v.strip() for v in variations.split(',')]
        iem_list[iem] = variations

    return iem_list

# Read the iem_list from the .txt file
iem_list = read_iem_list('IEM Lists/output.txt')

In [44]:
# Gets the list of all the csv files
def get_csv_files():
    # Creates a list of all the csv files
    csv_files = []
    # Iterates through all the reviewers
    for reviewer in reviewers:
        # Gets the path of the reviewer
        path = os.path.join("output/csv", reviewer)
        # Iterates through all the files in the path
        for file in os.listdir(path):
            # Checks if the file is a csv file
            if file.endswith(".csv"):
                # Adds the file to the list
                csv_files.append(os.path.join(path, file))
    return csv_files

In [45]:
def process_filename(filename):
    # Remove file extension
    filename = os.path.splitext(filename)[0]
    
    # Convert to lowercase
    filename = filename.lower()
    
    # Remove text in brackets (round, square, and curly)
    filename = re.sub(r'(\[.*?\]|\{.*?\}|\(.*?\))', '', filename)
    
    # Remove trailing L or R
    filename = re.sub(r'[lr]$', '', filename)
    
    # Strip leading and trailing whitespaces
    filename = filename.strip()

    return filename

In [46]:
#def group_by_iem(csv_files):
#    grouped_files = {}
#
#    # Sort the variations in descending order of their length
#    # This is to ensure that the longest variations are matched first
#    for iem in iem_list:
#        iem_list[iem].sort(key=len, reverse=True)
#
#    # Sort the IEM list keys by their length in descending order
#    # This is to ensure that the longest IEMs are matched first
#    sorted_iem_list = sorted(iem_list.keys(), key=len, reverse=True)
#
#    for filepath in csv_files:
#        filename = os.path.basename(filepath)
#        processed_name = process_filename(filename)
#
#        # Iterate through the sorted IEM list keys
#        for iem_key in sorted_iem_list:
#            variations = iem_list[iem_key]
#
#            for variation in variations:
#                # Use regex to match the beginning of the processed file name
#                if re.match(f"^{variation}( |$)", processed_name, re.IGNORECASE):
#                    if iem_key not in grouped_files:
#                        grouped_files[iem_key] = []
#                    grouped_files[iem_key].append(filepath)
#                    break
#            else:
#                continue
#            break
#
#    # Sort the final dictionary alphabetically (case-insensitive) by the keys
#    sorted_grouped_files = {key: grouped_files[key] for key in sorted(grouped_files, key=lambda k: k.lower())}
#
#    return sorted_grouped_files


In [47]:
def group_by_iem(csv_files):
    grouped_files = {}

    # Sort the variations in descending order of their length
    # This is to ensure that the longest variations are matched first
    for iem in iem_list:
        iem_list[iem].sort(key=len, reverse=True)

    # Sort the IEM list keys by their length in descending order
    # This is to ensure that the longest IEMs are matched first
    sorted_iem_list = sorted(iem_list.keys(), key=len, reverse=True)

    for filepath in csv_files:
        filename = os.path.basename(filepath)
        processed_name = process_filename(filename)

        # Iterate through the sorted IEM list keys
        for iem_key in sorted_iem_list:
            variations = iem_list[iem_key]

            for variation in variations:
                # Escape the '+' character in variation
                escaped_variation = variation.replace('+', r'\+')

                # Use regex to match the beginning of the processed file name
                if re.match(f"^{escaped_variation}( |$)", processed_name):
                    if iem_key not in grouped_files:
                        grouped_files[iem_key] = []
                    grouped_files[iem_key].append(filepath)
                    break
            else:
                continue
            break

    # Sort the final dictionary alphabetically (case-insensitive) by the keys
    sorted_grouped_files = {key: grouped_files[key] for key in sorted(grouped_files, key=lambda k: k.lower())}

    return sorted_grouped_files


In [48]:
def create_iem_dicts(grouped_files):
    iem_dicts = []
    for iem, files in grouped_files.items():
        iem_dict = {'IEM': iem, 'Files': files}
        iem_dicts.append(iem_dict)
    return iem_dicts

csv_files = [fix_file_path(file) for file in get_csv_files()]
grouped_files = group_by_iem(csv_files)
iem_dicts = create_iem_dicts(grouped_files)

print(iem_dicts)

[{'IEM': '1Custom Apex Ti', 'Files': ['output/csv/recode/apexti L.csv', 'output/csv/recode/apexti R.csv']}, {'IEM': '1Custom PM01', 'Files': ['output/csv/hobbytalk/1 Custom PM-01 L.csv', 'output/csv/hobbytalk/1 Custom PM-01 R.csv']}, {'IEM': '1MORE ColorBuds 2', 'Files': ['output/csv/vsg/1MORE ColorBuds 2 L.csv', 'output/csv/vsg/1MORE ColorBuds 2 R.csv']}, {'IEM': '1MORE ComfoBuds Pro', 'Files': ['output/csv/eplv/1More ComfoBudsPro L.csv', 'output/csv/eplv/1More ComfoBudsProOff L.csv', 'output/csv/eplv/1More ComfoBudsProOff R.csv']}, {'IEM': '1MORE Evo', 'Files': ['output/csv/vsg/1MORE EVO L.csv', 'output/csv/vsg/1MORE EVO R.csv']}, {'IEM': '1MORE Piston Classic', 'Files': ['output/csv/eplv/1More PistonClassic L.csv', 'output/csv/eplv/1More PistonClassic R.csv']}, {'IEM': '1MORE Quad Driver', 'Files': ['output/csv/iemworld/1MORE Quad Driver L.csv', 'output/csv/iemworld/1MORE Quad Driver R.csv', 'output/csv/kurin/Oratory 1MORE Quad Driver L.csv']}, {'IEM': '1MORE Single Driver', 'Files'

In [49]:
# Count the total number of files
total_files = len(csv_files)

# Count the total number of groups (IEMs)
total_groups = len(iem_dicts)

# Count the total number of files in groups
files_in_groups = sum([len(d['Files']) for d in iem_dicts])

# Calculate the total number of files not in groups
files_not_in_groups = total_files - files_in_groups

# Calculate the average number of files per group
avg_files_per_group = files_in_groups / total_groups

In [50]:
# Print the stats
print(f"Total number of files: {total_files}")
print(f"Total number of groups: {total_groups}")
print(f"Total number of files in groups: {files_in_groups}")
print(f"Total number of files not in groups: {files_not_in_groups}")
print(f"Average number of files per group: {avg_files_per_group:.2f}")

Total number of files: 9882
Total number of groups: 1274
Total number of files in groups: 7451
Total number of files not in groups: 2431
Average number of files per group: 5.85


In [51]:
# Find and print the file paths of files not in groups
grouped_file_paths = [file for d in iem_dicts for file in d['Files']]
files_not_grouped = [file for file in csv_files if file not in grouped_file_paths]

print("\nFiles not in groups:\n")
for file in files_not_grouped:
    file_basename = os.path.basename(file)
    processed_file = process_filename(file_basename)
    print(processed_file)


Files not in groups:

innerfidelity id target
precogvision target
rtings target
simgot ea500
simgot ea500
sonarworks target
strauss and wagner em205
super review target
t force yuan li
tanchjim ola
tanchjim tanya
tanchjim zero
tangzu shimin li
tangzu wan er sg
thieaudio legacy 2
tin c2 mech warrior
tin c3
tin t1 plus
tin t1s
tin t2 dlc
tin t2
tin t2 plus
tin t3 buds
tin t3 plus
tin t4 plus
tipsy ttromso pine stone sea
tkzk ouranos
tri i one
tri meteor
tri x hbb kai
tripowin cencibel
tripowin lea
tripowin rhombus
trn mt1
trn st5
trn ta1
trn ta1 max
trn xuanwu
unique melody mest mk2
urbanfun ybf-iss014
urbanfun ybf-iss014
venture electronics grand duke
z_isolation comparison free field
ief neutral target
ief neutra
innerfidelity id target
innerfidelity target
precogvision target
rtings target
simgot ea500 classic
simgot ea500 classic
simgot ea500 harman
simgot ea500 harman
simgot em2 roltion
simgot em2 roltion
skald s11
smabat nco
smabat nco
sonarworks target
sony ier m9
sony ier m9
son

In [52]:
# Sort the output alphabetically and case-insensitive, and remove duplicates
print("\nSorted unique files not in groups:\n")
seen = set()
sorted_files_not_grouped = sorted(files_not_grouped, key=lambda x: process_filename(os.path.basename(x)).lower())
for file in sorted_files_not_grouped:
    file_basename = os.path.basename(file)
    processed_file = process_filename(file_basename)
    if processed_file not in seen:
        print(processed_file)
        seen.add(processed_file)



Sorted unique files not in groups:

32bg
55hz 03xx
55hz 1
55hz 350
55hz 3d
55hz b
55hz eki
55hz eki fil fo
55hz eki tune
55hz fo
55hz hbb m
55hz hbb maka
55hz hok
55hz mystery
55hz no damp
55hz os
55hz osummer
55hz osun sil
55hz p1 foam
55hz p1 sil
55hz p2
55hz qk
55hz qkz foa
55hz qkz sil
55hz r
55hz secretz 429
55hz see
55hz sil
55hz ss
55hz st
55hz t
55hz x hbb kai
7a pro1stock
7hz sample
7hz sample med
7hz sample sha
7hz sample shal
a2
aful new
aful soloist
agent 007
agent 008
akros erasmus
anole vx 003
anole vx 020
anole vx 100
arere
audiosense dt300+36
audiosense dt300+75
avid
blon aa
blon bl03v2 stock
blon bl05sv2
blon bl05v2
bonus ie pro
bt
cca ca16v2
ccz coffee beanv2
ccz coffeebean
dkm
dm2_dm1
dm2_eqmdm1
dm2_olina
drop x etymotic erx
du b
edifier neobuds pro
enco x2 - punchy
enco x2 - real
enco x2 - simple and clear
enco x2 eq classic
enco x2 eq punchy
enco x2 eq real
enco x2 eq simple & clear
fd5
foam and 450
goldpla filt 2
graph x
gs audio se12b
harman 2017 target
harman 2

In [53]:
# Gather all file names
all_files = []

In [54]:
# Group files by IEM
grouped_files = group_by_iem([file for _, file in all_files])

In [55]:
for reviewer in reviewers:
    reviewer_path = f'output/csv/{reviewer}'

    if os.path.exists(reviewer_path):
        files = os.listdir(reviewer_path)
        all_files.extend([(reviewer, file) for file in files])

In [56]:
# Print grouped files
for representative, files in grouped_files.items():
    print(f"{representative}:")
    for file in files:
        reviewers_with_file = [reviewer for reviewer, f in all_files if f == file]
        print(f"  {file} - {', '.join(reviewers_with_file)}")
    print()

In [57]:
# Create a dataframe from the list of dictionaries
iem_dicts_df = pd.DataFrame(iem_dicts)

In [58]:
iem_dicts_df

Unnamed: 0,IEM,Files
0,1Custom Apex Ti,"[output/csv/recode/apexti L.csv, output/csv/re..."
1,1Custom PM01,"[output/csv/hobbytalk/1 Custom PM-01 L.csv, ou..."
2,1MORE ColorBuds 2,"[output/csv/vsg/1MORE ColorBuds 2 L.csv, outpu..."
3,1MORE ComfoBuds Pro,"[output/csv/eplv/1More ComfoBudsPro L.csv, out..."
4,1MORE Evo,"[output/csv/vsg/1MORE EVO L.csv, output/csv/vs..."
...,...,...
1269,Xingshenglong DIY E3000,"[output/csv/eplv/DIY E3000 L.csv, output/csv/e..."
1270,Xingshenglong DIY E4000,"[output/csv/eplv/DIY E4000 L.csv, output/csv/e..."
1271,XINHS Erasmus,"[output/csv/harpo/Erasmus 00 R.csv, output/csv..."
1272,Yanyin Canon,[output/csv/tgx78/Canon L.csv]


In [59]:
iem_dicts_df.to_csv('output/iem_dicts.csv', index=False)