In [2]:
import os
from collections import defaultdict
from fuzzywuzzy import fuzz, process
import re
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np

In [3]:
# List of reviewers
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

In [4]:
known_iems = {
    'moondrop blessing 2 dusk': ['dusk', 'blessing 2 dusk', 'b2 dusk', 'md b2 dusk', 'moondrop dusk', 'moondrop blessing 2 dusk']
    # Add more IEMs and their variations here
}


In [5]:
def normalize_file_name(file_name, known_iems, threshold=80):
    file_name = re.sub(r'\.csv$', '', file_name, flags=re.IGNORECASE)  # Remove file extension
    file_name = file_name.strip()  # Remove leading/trailing spaces
    file_name = file_name.lower()  # Convert to lowercase

    best_match = None
    best_ratio = 0

    for iem_name, variations in known_iems.items():
        for variation in variations:
            ratio = fuzz.token_set_ratio(file_name, variation)
            if ratio > best_ratio and ratio >= threshold:
                best_ratio = ratio
                best_match = iem_name

    if best_match:
        return best_match
    else:
        return file_name

In [6]:
def merge_similar_keys(grouped_data, threshold=80):
    keys = list(grouped_data.keys())
    merged_data = {}
    
    for key in keys:
        found_similar = False
        for merged_key in merged_data:
            if fuzz.token_set_ratio(key, merged_key) >= threshold:
                merged_data[merged_key].update(grouped_data[key])
                found_similar = True
                break

        if not found_similar:
            merged_data[key] = grouped_data[key]

    return merged_data

grouped_data = {}

In [7]:
for reviewer in reviewers:
    dir_path = f'output/csv/{reviewer}'
    for file_name in os.listdir(dir_path):
        normalized_name = normalize_file_name(file_name, known_iems)
        file_path = os.path.join(dir_path, file_name)

        if normalized_name not in grouped_data:
            grouped_data[normalized_name] = {}

        if reviewer not in grouped_data[normalized_name]:
            grouped_data[normalized_name][reviewer] = []

        grouped_data[normalized_name][reviewer].append(file_path)

In [8]:
merged_data = merge_similar_keys(grouped_data, threshold=80)

# Optional: Convert the dictionary to a pandas DataFrame with lists of file paths
df = pd.DataFrame.from_dict(merged_data, orient='index')

In [9]:
total_files = sum([len(files) for iem_group in grouped_data.values() for files in iem_group.values()])
total_groups = len(grouped_data)
total_files_in_groups = sum([len(files) for iem_group in grouped_data.values() for files in iem_group.values() if len(files) > 1])
total_files_not_in_groups = total_files - total_files_in_groups
avg_files_per_group = total_files / total_groups

In [10]:
print(f"Total number of files: {total_files}")
print(f"Total number of groups: {total_groups}")
print(f"Total number of files in groups: {total_files_in_groups}")
print(f"Total number of files not in groups: {total_files_not_in_groups}")
print(f"\nAvg number of files per group: {avg_files_per_group}")

Total number of files: 11473
Total number of groups: 6807
Total number of files in groups: 68
Total number of files not in groups: 11405

Avg number of files per group: 1.6854708388423683


In [11]:
# Print grouped files
for representative, group in merged_data.items():
    print(f"{representative}:")
    for reviewer, files in group.items():
        for file in files:
            file_name = os.path.basename(file)
            print(f"  {file_name} - {reviewer}")
    print()


7hz dioko l:
  7Hz Dioko L.csv - achoreviews
  7Hz Dioko R.csv - bedrock
  7Hz Dioko R.csv - timmyv
  7hz Dioko R.csv - hobbytalk
  7HZ DIOKO (bass mod) R.csv - ianfann
  7Hz Dioko R.csv - iemworld
  7Hz Dioko (pre-op) R.csv - data_mrs
  7HZ SALNOTES DIOKO R.csv - aftersound
  7Hz Salnotes Dioko L.csv - arn
  7hz Salnotes Dioko R.csv - nymz
  7Hz Salnotes Dioko TapedVent L.csv - pw
  7Hz Dioko R.csv - hbb
  Dioko R.csv - kr0mka
  Resolve 7Hz Salnotes Dioko Gras L.csv - kurin
  7hz x Crinacle Dioko Updated R.csv - vortexreviews
  7Hz x Crinacle Salnotes Dioko R.csv - vsg

7hz eternal l:
  7Hz Eternal L.csv - achoreviews
  7Hz Eternal R.csv - bedrock
  7Hz ETERNAL R.csv - iemworld
  7hz Eternal R.csv - vortexreviews
  7Hz Eternal R.csv - vsg
  7Hz Eternal R.csv - harpo
  7Hz Eternal R.csv - hbb

7hz legato l:
  7Hz Legato L.csv - achoreviews
  7HZ LEGATO R.csv - aftersound
  7Hz Legato R.csv - timmyv
  7Hz Legato R.csv - cqtek
  7HZ LEGATO R.csv - ianfann
  7hz LEGATO R.csv - nymz
  7hz 

In [12]:
df

Unnamed: 0,achoreviews,bedrock,timmyv,hobbytalk,ianfann,iemworld,data_mrs,aftersound,arn,nymz,...,jacstone,recode,bryaudioreviews,animagus,cammyfi,eplv,shortbus,suporsalad,wdym,akros
7hz dioko l,[output/csv/achoreviews\7Hz Dioko L.csv],[output/csv/bedrock\7Hz Dioko R.csv],[output/csv/timmyv\7Hz Dioko R.csv],[output/csv/hobbytalk\7hz Dioko R.csv],[output/csv/ianfann\7HZ DIOKO (bass mod) R.csv],[output/csv/iemworld\7Hz Dioko R.csv],[output/csv/data_mrs\7Hz Dioko (pre-op) R.csv],[output/csv/aftersound\7HZ SALNOTES DIOKO R.csv],[output/csv/arn\7Hz Salnotes Dioko L.csv],[output/csv/nymz\7hz Salnotes Dioko R.csv],...,,,,,,,,,,
7hz eternal l,[output/csv/achoreviews\7Hz Eternal L.csv],[output/csv/bedrock\7Hz Eternal R.csv],,,,[output/csv/iemworld\7Hz ETERNAL R.csv],,,,,...,,,,,,,,,,
7hz legato l,[output/csv/achoreviews\7Hz Legato L.csv],,[output/csv/timmyv\7Hz Legato R.csv],,[output/csv/ianfann\7HZ LEGATO R.csv],,,[output/csv/aftersound\7HZ LEGATO R.csv],,[output/csv/nymz\7hz LEGATO R.csv],...,,,,,,,,,,
7hz timeless l,[output/csv/achoreviews\7Hz Timeless L.csv],,[output/csv/timmyv\7Hz Timeless R.csv],[output/csv/hobbytalk\7hz Timeless Tape Mod R....,[output/csv/ianfann\7HZ TIMELESS OG R.csv],[output/csv/iemworld\7Hz Timeless (MoonDroop T...,[output/csv/data_mrs\7Hz Timeless (S2) R.csv],[output/csv/aftersound\7HZ TIMELESS S2 R.csv],[output/csv/arn\7Hz Timeless L.csv],[output/csv/nymz\7hz Timeless AE R.csv],...,,,,,,,,,,
7hz zero l,[output/csv/achoreviews\7Hz Zero L.csv],[output/csv/bedrock\7Hz Zero R.csv],[output/csv/timmyv\7Hz Zero R.csv],[output/csv/hobbytalk\7hz Zero Tape R.csv],[output/csv/ianfann\7HZ ZERO MECHA R.csv],[output/csv/iemworld\7hz Zero (Red Orignal Ear...,[output/csv/data_mrs\7Hz Zero R.csv],[output/csv/aftersound\7HZ SALNOTES ZERO R.csv],[output/csv/arn\7Hz Salnotes Zero L.csv],[output/csv/nymz\7hz Salnotes Zero S2 R.csv],...,[output/csv/jacstone\7Hz ZERO S1 R.csv],[output/csv/recode\zero R.csv],,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
elex (elear pads) l,,,,,,,,,,,...,,,,,,,,,[output/csv/wdym\Elex (Elear Pads) L.csv],
elex (utopia pads) l,,,,,,,,,,,...,,,,,,,,,[output/csv/wdym\Elex (Utopia Pads) L.csv],
sparks (basshead tuning) l,,,,,,,,,,,...,,,,,,,,,[output/csv/wdym\Sparks (Stock Tuning) L.csv],
ola stock l,,,,,,,,,,,...,,,,,,,,,,[output/csv/akros\Ola Stock L.csv]


### Group Similar Names

In [18]:
import os
import re
from fuzzywuzzy import fuzz
from sklearn.cluster import DBSCAN
import numpy as np

def levenshtein_distance(a, b):
    return 1 - fuzz.ratio(a, b) / 100

def get_iem_names(reviewers):
    iem_names = set()

    for reviewer in reviewers:
        dir_path = f'output/csv/{reviewer}'
        for file_name in os.listdir(dir_path):
            file_name = re.sub(r'\.csv$', '', file_name, flags=re.IGNORECASE)  # Remove file extension
            file_name = file_name.strip()  # Remove leading/trailing spaces
            file_name = file_name.lower()  # Convert to lowercase
            iem_names.add(file_name)

    return list(iem_names)

# Get unique IEM names
iem_names = get_iem_names(reviewers)

# Create a pairwise distance matrix using the Levenshtein distance
distance_matrix = np.zeros((len(iem_names), len(iem_names)))

for i, name_a in enumerate(iem_names):
    for j, name_b in enumerate(iem_names):
        if i == j:
            continue
        distance_matrix[i, j] = levenshtein_distance(name_a, name_b)

# Cluster IEM names using DBSCAN
dbscan = DBSCAN(eps=0.3, min_samples=2, metric="precomputed")
clusters = dbscan.fit_predict(distance_matrix)

# Print clustered IEM names
clustered_iem_names = {}
for iem_name, cluster in zip(iem_names, clusters):
    if cluster not in clustered_iem_names:
        clustered_iem_names[cluster] = []

    clustered_iem_names[cluster].append(iem_name)

for cluster, names in clustered_iem_names.items():
    print(f"Cluster {cluster}:")
    for name in names:
        print(f"  - {name}")


KeyboardInterrupt: 

In [3]:
import os
import re
import spacy
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_iem_names(reviewers):
    iem_names = set()

    for reviewer in reviewers:
        dir_path = f'output/csv/{reviewer}'
        for file_name in os.listdir(dir_path):
            file_name = re.sub(r'\.csv$', '', file_name, flags=re.IGNORECASE)  # Remove file extension
            file_name = file_name.strip()  # Remove leading/trailing spaces
            file_name = file_name.lower()  # Convert to lowercase
            iem_names.add(file_name)

    return list(iem_names)

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def tokenize(text):
    tokens = [token.text for token in nlp(text)]
    return tokens

# List of reviewers
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

# Get unique IEM names
iem_names = preprocess_iem_names(reviewers)

# Tokenize IEM names and compute TF-IDF matrix
vectorizer = TfidfVectorizer(tokenizer=tokenize)
tfidf_matrix = vectorizer.fit_transform(iem_names)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

# Convert similarity matrix to distance matrix
distance_matrix = 1 - similarity_matrix

# Set small negative values to zero
distance_matrix[distance_matrix < 0] = 0

# Cluster IEM names using DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2, metric="precomputed")
clusters = dbscan.fit_predict(distance_matrix)

# Print clustered IEM names
clustered_iem_names = {}
for iem_name, cluster in zip(iem_names, clusters):
    if cluster not in clustered_iem_names:
        clustered_iem_names[cluster] = []

    clustered_iem_names[cluster].append(iem_name)

for cluster, names in clustered_iem_names.items():
    print(f"Cluster {cluster}:")
    for name in names:
        print(f"  - {name}")


Cluster 0:
  - lz a7 blue monitor r
  - dunu talos (s2 on) l
  - letshuoer galileo (pre-production) r
  - fiio fd3 (foam tips) l
  - tanchjim hana r
  - dunu titan 5 r
  - tangzu heyday r
  - moondrop blessing 2 dusk s1 l
  - supernova final r
  - moondrop blessing 2 (moondrop filter) l
  - letshuoer z12 l
  - cca cra l
  - tanchjim oxygen l
  - truthear hola (wide bore silicone tips) r
  - heart mirror r
  - tripowin x hbb olina a r
  - thieaudio oracle mkii r
  - resolve softears rsv gras l
  - 64 audio u6t xx (silicon tips wide) r
  - thieaudio legacy 3 1 on r
  - etymotic er2xr double r
  - final audio e500 r
  - md droplet dsp r
  - letshuoer z12 (s2) l
  - md kxxx r
  - cca cra (1dividedby8 foam) l
  - sony ier-m7 r
  - dunu titan tanya l
  - heart mirror pro l
  - bqeyz autumn (bass vent, sony import foam) l
  - fiio fd5 r
  - mooondrop kato l
  - bqeyz autumn (y3 wtf inner vent, 2 high-density tuning foam, 500 mesh) r
  - blon bl max r
  - moondrop b2 dusk l
  - bgvp ns9 r
  - 

In [1]:
import os
import re
import numpy as np
from sklearn.cluster import DBSCAN
import textdistance

def preprocess_iem_names(reviewers):
    iem_names = set()

    for reviewer in reviewers:
        dir_path = f'output/csv/{reviewer}'
        for file_name in os.listdir(dir_path):
            file_name = re.sub(r'\.csv$', '', file_name, flags=re.IGNORECASE)  # Remove file extension
            file_name = file_name.strip()  # Remove leading/trailing spaces
            file_name = file_name.lower()  # Convert to lowercase
            iem_names.add(file_name)

    return list(iem_names)

# List of reviewers
reviewers = [
    'achoreviews',
    'aftersound',
    'animagus',
    'arn',
    'bedrock',
    'bryaudioreviews',
    'cammyfi',
    'soundjedi',
    'eplv',
    'timmyv',
    'harpo',
    'hbb',
    'cqtek',
    'hobbytalk',
    'ianfann',
    'iemworld',
    'jacstone',
    'kr0mka',
    'kurin',
    'melatonin',
    'nymz', 
    'pw',
    'recode',
    'rg',
    'shortbus',
    'suporsalad',
    'tgx78',
    'vortexreviews',
    'vsg',
    'wdym',
    'akros',
    'data_mrs'
]

# Get unique IEM names
iem_names = preprocess_iem_names(reviewers)

# Compute the Levenshtein distance matrix
distance_matrix = np.zeros((len(iem_names), len(iem_names)))
for i, name1 in enumerate(iem_names):
    for j, name2 in enumerate(iem_names):
        distance_matrix[i, j] = textdistance.levenshtein.normalized_distance(name1, name2)

# Cluster IEM names using DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2, metric="precomputed")
clusters = dbscan.fit_predict(distance_matrix)

# Print clustered IEM names
clustered_iem_names = {}
for iem_name, cluster in zip(iem_names, clusters):
    if cluster not in clustered_iem_names:
        clustered_iem_names[cluster] = []

    clustered_iem_names[cluster].append(iem_name)

for cluster, names in clustered_iem_names.items():
    print(f"Cluster {cluster}:")
    for name in names:
        print(f"  - {name}")


Cluster 0:
  - tripowin x hbb olina (moondrop eartips) r
  - trn mt1 r
  - hidizs ms5 treble tk2 l
  - mim dark magician l
  - shuoer z12 l
  - tri meteor r
  - rikudougoku_target
  - 7a est7 l
  - akoustyx s-6 (tanchjim) l
  - mestjp l
  - tanchjim cora l
  - kinera celest pandamon l
  - intime sora29 l
  - geek wold gk80 mic r
  - gs audio gd5 l
  - thieaudio wraith pad a r
  - kxxs (symbio w peel) r
  - kz dq6 xl r
  - meizu live (blue) l
  - vision ears elysium r
  - celest pandamon l
  - audiosense dt600 (stock) r
  - qdc v14 standard r
  - letshuoer d13 stock nozzle l
  - thieaudio legacy 5 (75 ohm) l
  - trn bax r
  - harmonicdyne g200 flannel l
  - final audio a4000 v2 r
  - final e2000 (s2) r
  - aiderlot m5 (treble + sony import foam) l
  - calcini sweatpro 3 l
  - joyodio shine all on r
  - final ze3000 s2 l
  - trn reference r
  - qoa magarita r
  - tinhifi p1 max giant panda r
  - madoo type 512 r
  - softears volume (ef04 muse02 + hidizs sonata hd + y2 paper filter on ba 