# In [80]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Similarity is from 0 to 1 (0: totally different, 1: totally identical)
def get_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    A new TfidfVectorizer is fitted on just these two texts, so the term
    weights depend only on this pair and not on any wider corpus.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two TF-IDF vectors
        (0: totally different, 1: totally identical).
    """
    # Row 0 of the TF-IDF matrix is text1, row 1 is text2
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    first_vector = tfidf[0].reshape(1, -1)
    second_vector = tfidf[1].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix for two single-row inputs
    pairwise = cosine_similarity(first_vector, second_vector)
    return pairwise[0][0]

# Read processed train data, keeping only the transcription text and its label.
# .copy() makes the column subset an independent frame, so the column
# assignment below cannot trigger SettingWithCopyWarning.
df_processed_train = pd.read_csv('df_train_processed.csv')
df_processed_train = df_processed_train[['processed_transcription', 'labels']].copy()

# Number of labels: 40
nlabels = df_processed_train['labels'].nunique()

# Remove 'N' since we don't want numbers for similarity calculation
# (presumably 'N' is the number placeholder left by preprocessing - verify),
# then collapse the runs of spaces left behind.
# `regex` is passed explicitly: pandas >= 2.0 defaults to regex=False, which
# would treat ' +' as the literal two-character string and collapse nothing.
df_processed_train['processed_transcription'] = (
    df_processed_train['processed_transcription']
    .str.replace('N', '', regex=False)
    .str.replace(' +', ' ', regex=True)
)

# For selecting sample transcriptions for similarities
temp_df = df_processed_train.copy()

# Sample at most 35 transcriptions from each label.
# NOTE: this assumes the labels are the integers 0 .. nlabels - 1.
# The per-label samples are collected in a list and concatenated once,
# instead of growing the DataFrame inside the loop.
sampled_frames = []
for label in range(nlabels):
    label_rows = temp_df[temp_df['labels'] == label]
    sample_size = min(label_rows.shape[0], 35)
    sampled_frames.append(label_rows.sample(n = sample_size))

df_processed_train = pd.concat(sampled_frames).reset_index(drop = True)
nrows = df_processed_train.shape[0]

# Processed transcription column as list of all the processed transcriptions
texts = list(df_processed_train['processed_transcription'])

# Store similarity between texts in a 2D matrix.
# Only the strict lower triangle (i > j) is filled; the diagonal and the
# upper triangle keep their initial value of 0.
df_similarity_transcription_matrix = [[0] * nrows for _ in range(nrows)]

# Get similarity between each pair of transcriptions.
# NOTE: get_similarity is called once per pair, i.e. nrows * (nrows - 1) / 2
# times, so this loop dominates the runtime.
for i in range(nrows):
    for j in range(i):

        text_1 = texts[i]
        text_2 = texts[j]

        # Get the similarity between text_i & text_j
        df_similarity_transcription_matrix[i][j] = get_similarity(text_1, text_2)

# Get the mean similarity between two labels (using all of their relevant transcriptions)
def get_mean_similarity(idx_lst_1, idx_lst_2):
    """Return the mean pairwise similarity between two groups of transcriptions.

    Similarities are read from the module-level
    ``df_similarity_transcription_matrix``, which only stores values in its
    strict lower triangle (row index greater than column index). Each pair is
    therefore looked up as ``[larger index][smaller index]``, so the result
    does not depend on which list holds the larger indices.

    Args:
        idx_lst_1: Transcription indices of the first group.
        idx_lst_2: Transcription indices of the second group.

    Returns:
        The average similarity over all pairs (idx1, idx2) with idx1 != idx2,
        or 0.0 when there is no such pair (e.g. one of the lists is empty).
    """
    # We need the total count to get the average
    similarity_count = 0
    similarity_summation = 0.0

    for idx1 in idx_lst_1:
        for idx2 in idx_lst_2:

            # A transcription is never compared with itself (nothing is stored on the diagonal)
            if idx1 == idx2:
                continue

            # The matrix is lower-triangular: the larger index is always the row
            row, col = (idx1, idx2) if idx1 > idx2 else (idx2, idx1)

            similarity_summation += df_similarity_transcription_matrix[row][col]
            similarity_count += 1

    # Avoid ZeroDivisionError when there is nothing to average
    if similarity_count == 0:
        return 0.0

    return similarity_summation / similarity_count

# Mean similarity between every pair of distinct labels.
# Only the strict lower triangle (label_1 > label_2) is filled; the diagonal
# and the upper triangle keep their initial value of 0.
df_similarity_label_matrix = [[0] * nlabels for _ in range(nlabels)]

# Transcription index of each label, computed once up front instead of
# re-filtering the DataFrame for every label pair
transcription_idx_by_label = {
    label: list(df_processed_train[df_processed_train['labels'] == label].index)
    for label in range(nlabels)
}

for label_1 in range(nlabels):
    transcription_idx_label_1 = transcription_idx_by_label[label_1]

    for label_2 in range(label_1):
        transcription_idx_label_2 = transcription_idx_by_label[label_2]

        # Get the mean similarity between two labels
        df_similarity_label_matrix[label_1][label_2] = get_mean_similarity(transcription_idx_label_1, transcription_idx_label_2)

# Visualize df_similarity_label_matrix

# linkage:
# (1, 18, 17, 11, 37)
# (15, 36, 32, 29, 16)
# (23, 5, 34, 9, 4)
# (30, 2, 10, 19, 35)
# (14, 40, 22, 12, 31)
# (7, 21, 20, 13, 28)
# (6, 33, 39, 8, 24)
# (26, 25, 27, 3, 38)