In [1]:
import os
import re
import pandas as pd

In [2]:
# Reviewer identifiers. Each entry is joined onto "output/csv" to locate that
# reviewer's measurement folder, and the list order is the order in which the
# folders are scanned.
reviewers = [
    'achoreviews', 'aftersound', 'animagus', 'arn',
    'bedrock', 'bryaudioreviews', 'cammyfi', 'soundjedi',
    'eplv', 'timmyv', 'harpo', 'hbb',
    'cqtek', 'hobbytalk', 'ianfann', 'iemworld',
    'jacstone', 'kr0mka', 'kurin', 'melatonin',
    'nymz', 'pw', 'recode', 'rg',
    'shortbus', 'suporsalad', 'tgx78', 'vortexreviews',
    'vsg', 'wdym', 'akros', 'data_mrs',
]

In [3]:
# Read the IEM alias table from a text file
def read_iem_list(file_path):
    """Parse an IEM alias file into a ``{name: [variation, ...]}`` dict.

    Every occurrence of ``'<name>' : [<v1>, <v2>, ...]`` in the file is
    collected. The name and each comma-separated variation are stripped of
    surrounding whitespace. If the same name occurs more than once, the last
    occurrence wins.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        dict mapping each IEM name to its list of variation strings. A name
        written with an empty list (``[]``) maps to an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r') as f:
        content = f.read()

    # Non-greedy groups: group 1 is the quoted name, group 2 the raw text
    # between the square brackets.
    pattern = re.compile(r"'(.*?)' : \[(.*?)\]")

    iem_list = {}
    for iem, variations in pattern.findall(content):
        stripped = (v.strip() for v in variations.split(','))
        # Drop empty entries (from "[]" or a trailing comma): an empty
        # variation would otherwise match any empty processed file name.
        iem_list[iem.strip()] = [v for v in stripped if v]

    return iem_list

# Load the IEM alias table into the module-level name `iem_list`, which
# group_by_iem reads. The path is relative to the current working directory.
iem_list = read_iem_list('IEM Lists/output.txt')

In [4]:
# Gets the list of all the csv files
def get_csv_files(base_dir="output/csv", reviewer_names=None):
    """Collect the paths of every ``.csv`` file in each reviewer's folder.

    Args:
        base_dir: Directory holding one sub-directory per reviewer.
        reviewer_names: Iterable of sub-directory names to scan. Defaults to
            the module-level ``reviewers`` list.

    Returns:
        List of paths (``base_dir/reviewer/file``), ordered by reviewer as
        given and by file name within each reviewer.
    """
    if reviewer_names is None:
        reviewer_names = reviewers

    csv_files = []
    for reviewer in reviewer_names:
        path = os.path.join(base_dir, reviewer)
        # A reviewer without a folder is skipped instead of aborting the scan
        if not os.path.isdir(path):
            continue
        # os.listdir returns entries in arbitrary order; sort for reproducible runs
        for file in sorted(os.listdir(path)):
            if file.endswith(".csv"):
                csv_files.append(os.path.join(path, file))
    return csv_files

In [5]:
def process_filename(filename):
    """Normalise a measurement file name for matching against IEM variations.

    Steps: drop the extension, lowercase, remove bracketed text (round,
    square and curly), collapse whitespace, then remove a trailing
    stand-alone channel marker ``l`` or ``r``.

    Args:
        filename: File name (not a full path), e.g. ``"Tin T2 Plus L.csv"``.

    Returns:
        The normalised name, e.g. ``"tin t2 plus"``.
    """
    # Remove file extension and convert to lowercase
    name = os.path.splitext(filename)[0].lower()

    # Remove text in brackets (round, square, and curly)
    name = re.sub(r'(\[.*?\]|\{.*?\}|\(.*?\))', '', name)

    # Collapse whitespace runs (removing a bracket can leave a double space)
    # and trim before looking for the channel marker, so the marker is still
    # found when a bracketed suffix followed it.
    name = re.sub(r'\s+', ' ', name).strip()

    # Remove a trailing L or R only when it is a separate token; a bare
    # "[lr]$" would also eat the last letter of words such as "summer".
    name = re.sub(r'(?:^|\s)[lr]$', '', name)

    return name.strip()

In [6]:
def _match_iem(processed_name, ordered_iems):
    """Return the first IEM key whose variation prefixes ``processed_name``, else None."""
    for iem_key, variations in ordered_iems:
        for variation in variations:
            # Literal comparison: the variation must be the whole name or be
            # followed by a space. Variations are never treated as regex
            # syntax, so characters such as "+", "." or "(" are safe.
            if processed_name == variation or processed_name.startswith(variation + " "):
                return iem_key
    return None


def group_by_iem(csv_files, iem_map=None):
    """Group file paths by the IEM whose variation prefixes the file name.

    Each path's base name is normalised with ``process_filename`` and compared
    against every variation. IEM keys are tried longest first, and within a
    key the longest variation first, so the most specific entry wins. Paths
    that match nothing are left out of the result.

    Args:
        csv_files: Iterable of file paths (or bare file names).
        iem_map: Mapping of IEM name to a list of variation strings. Defaults
            to the module-level ``iem_list``. The mapping is not modified.

    Returns:
        dict of IEM name to list of matching paths (in input order), with
        keys inserted in case-insensitive alphabetical order.
    """
    if iem_map is None:
        iem_map = iem_list

    # Build the search order once, on copies, so the caller's lists keep
    # their order. Empty variations are dropped because they would match any
    # empty processed name.
    ordered_iems = [
        (key, sorted((v for v in iem_map[key] if v), key=len, reverse=True))
        for key in sorted(iem_map, key=len, reverse=True)
    ]

    grouped_files = {}
    for filepath in csv_files:
        processed_name = process_filename(os.path.basename(filepath))
        iem_key = _match_iem(processed_name, ordered_iems)
        if iem_key is not None:
            grouped_files.setdefault(iem_key, []).append(filepath)

    # Sort the final dictionary alphabetically (case-insensitive) by the keys
    return {key: grouped_files[key] for key in sorted(grouped_files, key=lambda k: k.lower())}


In [7]:
def create_iem_dicts(grouped_files):
    """Convert an ``{iem: files}`` mapping into a list of records.

    Each record is ``{'IEM': <name>, 'Files': <list of paths>}``, in the
    mapping's iteration order. The file lists are referenced, not copied.
    """
    return [
        {'IEM': name, 'Files': paths}
        for name, paths in grouped_files.items()
    ]

# Run the pipeline: collect every reviewer CSV, bucket the paths by IEM, then
# flatten the buckets into a list of records. `csv_files` and `iem_dicts` are
# read again by the statistics cells further down.
csv_files = get_csv_files()
grouped_files = group_by_iem(csv_files)
iem_dicts = create_iem_dicts(grouped_files)

# Prints the complete list of records as plain text
print(iem_dicts)

[{'IEM': '1Custom Apex Ti', 'Files': ['output/csv\\recode\\apexti L.csv', 'output/csv\\recode\\apexti R.csv']}, {'IEM': '1Custom PM01', 'Files': ['output/csv\\hobbytalk\\1 Custom PM-01 L.csv', 'output/csv\\hobbytalk\\1 Custom PM-01 R.csv']}, {'IEM': '1MORE ColorBuds 2', 'Files': ['output/csv\\vsg\\1MORE ColorBuds 2 L.csv', 'output/csv\\vsg\\1MORE ColorBuds 2 R.csv']}, {'IEM': '1MORE ComfoBuds Pro', 'Files': ['output/csv\\eplv\\1More ComfoBudsPro L.csv', 'output/csv\\eplv\\1More ComfoBudsProOff L.csv', 'output/csv\\eplv\\1More ComfoBudsProOff R.csv']}, {'IEM': '1MORE Evo', 'Files': ['output/csv\\vsg\\1MORE EVO L.csv', 'output/csv\\vsg\\1MORE EVO R.csv']}, {'IEM': '1MORE Piston Classic', 'Files': ['output/csv\\eplv\\1More PistonClassic L.csv', 'output/csv\\eplv\\1More PistonClassic R.csv']}, {'IEM': '1MORE Quad Driver', 'Files': ['output/csv\\iemworld\\1MORE Quad Driver L.csv', 'output/csv\\iemworld\\1MORE Quad Driver R.csv', 'output/csv\\kurin\\Oratory 1MORE Quad Driver L.csv']}, {'IEM'

In [8]:
# Count the total number of files
total_files = len(csv_files)

# Count the total number of groups (IEMs)
total_groups = len(iem_dicts)

# Count the total number of files in groups
files_in_groups = sum(len(d['Files']) for d in iem_dicts)

# Calculate the total number of files not in groups
files_not_in_groups = total_files - files_in_groups

# Calculate the average number of files per group.
# With no groups at all the average is reported as 0.0 instead of raising
# ZeroDivisionError.
avg_files_per_group = files_in_groups / total_groups if total_groups else 0.0

In [9]:
# Print the stats, one per line
print(
    f"Total number of files: {total_files}",
    f"Total number of groups: {total_groups}",
    f"Total number of files in groups: {files_in_groups}",
    f"Total number of files not in groups: {files_not_in_groups}",
    f"Average number of files per group: {avg_files_per_group:.2f}",
    sep="\n",
)

Total number of files: 9882
Total number of groups: 1266
Total number of files in groups: 7406
Total number of files not in groups: 2476
Average number of files per group: 5.85


In [10]:
# Find and print the file paths of files not in groups
grouped_file_paths = [file for d in iem_dicts for file in d['Files']]
# Membership is tested once per CSV file, so look paths up in a set rather
# than scanning the list each time.
grouped_path_set = set(grouped_file_paths)
files_not_grouped = [file for file in csv_files if file not in grouped_path_set]

print("\nFiles not in groups:\n")
for file in files_not_grouped:
    print(file)



Files not in groups:

output/csv\achoreviews\Innerfidelity ID Target.csv
output/csv\achoreviews\Precogvision Target.csv
output/csv\achoreviews\Rtings Target.csv
output/csv\achoreviews\Simgot EA500 (black) L.csv
output/csv\achoreviews\Simgot EA500 (red) L.csv
output/csv\achoreviews\Sonarworks Target.csv
output/csv\achoreviews\Strauss and Wagner EM205 L.csv
output/csv\achoreviews\Super Review Target.csv
output/csv\achoreviews\T Force Yuan Li L.csv
output/csv\achoreviews\Tanchjim Ola L.csv
output/csv\achoreviews\Tanchjim Tanya L.csv
output/csv\achoreviews\Tanchjim Zero L.csv
output/csv\achoreviews\Tangzu Shimin Li L.csv
output/csv\achoreviews\Tangzu Wan Er SG L.csv
output/csv\achoreviews\Thieaudio Legacy 2 L.csv
output/csv\achoreviews\Tin C2 Mech Warrior L.csv
output/csv\achoreviews\Tin C3 L.csv
output/csv\achoreviews\Tin T1 Plus L.csv
output/csv\achoreviews\Tin T1S L.csv
output/csv\achoreviews\Tin T2 DLC L.csv
output/csv\achoreviews\Tin T2 L.csv
output/csv\achoreviews\Tin T2 Plus L.csv


In [12]:
# Start with an empty list of (reviewer, file name) pairs; the per-reviewer
# loop in a later cell fills it.
all_files = []

In [13]:
# Group files by IEM, using only the file-name half of each (reviewer, file)
# pair. This rebinds the `grouped_files` computed from csv_files earlier.
# NOTE(review): in file order `all_files` is still empty here, because the
# loop that fills it sits in a later cell. Run top to bottom, this yields an
# empty grouping; confirm the intended cell order.
grouped_files = group_by_iem([file for _, file in all_files])

In [None]:
# Rebuild the (reviewer, file name) pairs from scratch so that re-running this
# cell does not append duplicate entries to all_files.
all_files = []
for reviewer in reviewers:
    reviewer_path = f'output/csv/{reviewer}'

    # Reviewers without an output directory are skipped
    if os.path.isdir(reviewer_path):
        # Sorted so the pair order does not depend on os.listdir's arbitrary order
        all_files.extend((reviewer, file) for file in sorted(os.listdir(reviewer_path)))

# Regroup now that all_files is populated. The grouping cell placed before
# this one sees an empty all_files when the notebook is run top to bottom.
grouped_files = group_by_iem([file for _, file in all_files])

In [15]:
# Print grouped files
# Index the reviewers by file name once, instead of rescanning all_files for
# every printed file. Reviewer order follows all_files.
reviewers_by_file = {}
for reviewer, f in all_files:
    reviewers_by_file.setdefault(f, []).append(reviewer)

for representative, files in grouped_files.items():
    print(f"{representative}:")
    for file in files:
        reviewers_with_file = reviewers_by_file.get(file, [])
        print(f"  {file} - {', '.join(reviewers_with_file)}")
    print()

In [None]:
# Create a dataframe from the list of dictionaries: one row per IEM, with
# columns 'IEM' (name) and 'Files' (the list of matching paths).
iem_dicts_df = pd.DataFrame(iem_dicts)

In [None]:
# Display the frame as the cell's output
iem_dicts_df

Unnamed: 0,IEM,Files
0,1Custom Apex Ti,"[output/csv\recode\apexti L.csv, output/csv\re..."
1,1Custom PM01,"[output/csv\hobbytalk\1 Custom PM-01 L.csv, ou..."
2,1MORE ColorBuds 2,"[output/csv\vsg\1MORE ColorBuds 2 L.csv, outpu..."
3,1MORE ComfoBuds Pro,"[output/csv\eplv\1More ComfoBudsPro L.csv, out..."
4,1MORE Evo,"[output/csv\vsg\1MORE EVO L.csv, output/csv\vs..."
...,...,...
1261,Xingshenglong DIY E3000,"[output/csv\eplv\DIY E3000 L.csv, output/csv\e..."
1262,Xingshenglong DIY E4000,"[output/csv\eplv\DIY E4000 L.csv, output/csv\e..."
1263,XINHS Erasmus,"[output/csv\harpo\Erasmus 00 R.csv, output/csv..."
1264,Yanyin Canon,[output/csv\tgx78\Canon L.csv]


In [None]:
# Export the grouping without the index column.
# NOTE(review): each 'Files' cell holds a Python list, so it is presumably
# written as the list's string form and needs parsing when read back; confirm
# this is the intended file format.
iem_dicts_df.to_csv('output/iem_dicts.csv', index=False)