In [2]:
# from summa.summarizer import summarize
# from summa import keywords
# import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Similarity is from 0 to 1 (0: totally different, 1: totally identical)
def get_similarity(text1, text2):

    # Create a TfidfVectorizer to convert the texts to vectors
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])

    # Calculate the cosine similarity between the vectors
    similarity = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))[0][0]
    return similarity

In [4]:
import pandas as pd
import numpy as np

# Read processed train data
df_processed_train = pd.read_csv('./sjang/data/df_train_processed.csv')#.head(100)
df_processed_train = df_processed_train[['processed_transcription', 'labels']]

nrows = df_processed_train.shape[0]
nlabels = df_processed_train['labels'].nunique()

# Remove 'N' since we don't want numbers for similarity calculation
df_processed_train['processed_transcription'] = df_processed_train['processed_transcription'].str.replace('N', '').str.replace(' +', ' ')

# Processed transcription column as list of all the processed transcriptions
texts = list(df_processed_train['processed_transcription'])

# Store similarity between texts in 2D matrix
df_similarity_transcription_matrix = [[0 for _ in range(nrows)] for _2 in range(nrows)]

# Get similarity between each transcription
for i in range(nrows):
    for j in range(i):

        text_1 = texts[i]
        text_2 = texts[j]

        # Get the simarilty between text_i & text_j
        df_similarity_transcription_matrix[i][j] = get_similarity(text_1, text_2)

FileNotFoundError: [Errno 2] No such file or directory: './sjang/data/df_train_processed.csv'

In [151]:
# Get the mean similarity between two labels (using all of their relevant transcriptions)
def get_mean_similarity(idx_lst_1, idx_lst_2):

    # We need the total count to get the average
    similarity_count = 0
    similarity_summation = 0
    
    for idx1 in idx_lst_1:
        for idx2 in idx_lst_2:

            # Skip since we didn't store for j bigger or equal to i in similarity matrix-(i, j)
            if idx1 <= idx2: continue

            # Count each instance
            similarity_count += 1
            
            # Add each similarity
            similarity_summation += df_similarity_transcription_matrix[idx1][idx2]

    return similarity_summation / similarity_count

In [182]:
df_similarity_label_matrix = [[0 for _ in range(nlabels)] for _2 in range(nlabels)]

for label_1 in range(nlabels):
    for label_2 in range(label_1):

        # Transcription index of each label
        transcription_idx_label_1 = list(df_processed_train[df_processed_train['labels'] == label_1].index)
        transcription_idx_label_2 = list(df_processed_train[df_processed_train['labels'] == label_2].index)
        
        # Get the mean similarity between two labels
        df_similarity_label_matrix[label_1][label_2] = get_mean_similarity(transcription_idx_label_1, transcription_idx_label_2)

# 여기까지 직접 돌려보고 df_similarity_label_matrix가 어떤 값들을 가지는지 봐주세요.
# 그거에 따라 threshold_similarity를 정해서 threshold_similarity 이상의 similarity를 가지는 label끼리 묶으면 됩니다
threshold_similarity = 0.1 # Change this value later

# Set value as 0 if below threshold, otherwise set as 1
for i in range(nlabels):
    for j in range(nlabels):
        similarity_val = df_similarity_label_matrix[i][j]
        df_similarity_label_matrix[i][j] = 0 if similarity_val < threshold_similarity else 1

# Set upper-triangle as well
for i in range(nlabels):
    for j in range(nlabels):
        if df_similarity_label_matrix[i][j] == 1:
            df_similarity_label_matrix[j][i] = 1

In [183]:
def get_connected_nodes(adj_matrix, start_node):
    visited = set()
    queue = [start_node]
    while queue:
        node = queue.pop(0)
        if node not in visited:
            visited.add(node)
            for idx, value in enumerate(adj_matrix[node]):
                if value == 1:
                    queue.append(idx)
    return visited

all_connected_labels = []

for label in range(nlabels):
    cur_group = get_connected_nodes(df_similarity_label_matrix, label)
    if cur_group in all_connected_labels: continue
    all_connected_labels.append(cur_group)

In [184]:
print(all_connected_labels)

[{0, 3, 4, 5, 7, 8, 10, 11, 13, 14, 15, 16, 17, 20, 21, 23}, {1, 18, 12, 6}, {2}, {9}, {19}, {22}]
