In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import math

def calculate_cosine_similarity(string1, string2):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two inputs, so the
    score depends only on the tokens these two texts contain.

    Args:
        string1: First text.
        string2: Second text.

    Returns:
        The similarity as a float. ``0.0`` is returned when either input
        is not a string (for example ``None`` or a NaN read by pandas),
        when either input is empty or whitespace-only, or when neither
        input yields any token for the vectorizer.
    """
    # Missing values arrive from pandas as float NaN / None; they have no
    # ``strip`` method, so treat them the same way as empty text.
    if not isinstance(string1, str) or not isinstance(string2, str):
        return 0.0
    if not string1.strip() or not string2.strip():
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([string1, string2])
    except ValueError:
        # scikit-learn raises ValueError when the fitted vocabulary is empty
        # (e.g. both texts are only punctuation or one-character words).
        # Texts without tokens share nothing, so report no similarity.
        return 0.0

    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    return float(cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0])

def _read_comment_text(comments_dir, video_id):
    """Return the contents of ``<comments_dir>/<video_id>.txt``, or '' if absent."""
    comment_path = os.path.join(comments_dir, str(video_id) + '.txt')
    if not os.path.exists(comment_path):
        return ''
    # NOTE(review): the file is opened with the platform default encoding,
    # as before; confirm how the comment files were written before pinning one.
    with open(comment_path, 'r') as file:
        return file.read()


def _similarity_or_zero(text1, text2):
    """Return the cosine similarity of two cells, or 0.0 if either is missing."""
    if pd.isna(text1) or pd.isna(text2):
        return 0.0
    return calculate_cosine_similarity(text1, text2)


def process_pairs(file_path, output_path='Datasets/Correlation_2.0.csv',
                  comments_dir='Datasets/final_comments/'):
    """Write a CSV of pairwise text similarities between the rows of a CSV.

    For every unordered pair of rows in ``file_path`` the title, description
    and comment texts are compared with ``calculate_cosine_similarity``.
    Comment text is loaded from ``<comments_dir>/<video_id>.txt``; a row
    whose comment file does not exist is compared as an empty string.
    A missing (NaN) title or description gives a similarity of 0.0 for
    that field.

    Args:
        file_path: CSV file with ``video_id``, ``title`` and ``description``
            columns.
        output_path: Destination CSV. It has the columns ``id1``, ``id2``,
            ``title``, ``description`` and ``comment``.
        comments_dir: Directory holding one ``<video_id>.txt`` file per video.

    Returns:
        None. The result is written to ``output_path``.
    """
    df = pd.read_csv(file_path)
    row_count = len(df)

    # Pull the columns out once; the pair loop below then works on plain
    # Python lists instead of doing a DataFrame lookup per cell per pair.
    video_ids = df['video_id'].tolist()
    titles = df['title'].tolist()
    descriptions = df['description'].tolist()

    # Read every comment file once up front rather than twice per pair.
    comments = [_read_comment_text(comments_dir, video_id)
                for video_id in video_ids]

    columns = ['id1', 'id2', 'title', 'description', 'comment']
    all_pairs = []

    # The context manager closes the progress bar even if a comparison raises.
    with tqdm(total=math.comb(row_count, 2), desc="Processing pairs",
              unit=" pair") as progress_bar:
        for i in range(row_count):
            for j in range(i + 1, row_count):
                all_pairs.append({
                    'id1': video_ids[i],
                    'id2': video_ids[j],
                    'title': _similarity_or_zero(titles[i], titles[j]),
                    'description': _similarity_or_zero(descriptions[i],
                                                       descriptions[j]),
                    'comment': calculate_cosine_similarity(comments[i],
                                                           comments[j]),
                })
                progress_bar.update(1)

    # Passing the columns explicitly keeps the header row in the output even
    # when the input has fewer than two rows and therefore no pairs.
    similarity_df = pd.DataFrame(all_pairs, columns=columns)
    similarity_df.to_csv(output_path, index=False)

# Script entry: build the pairwise similarity CSV for the full dataset.
# For a quick trial run, point file_path at a small sample such as 'test.csv'.
file_path = 'Datasets/final_data.csv'
process_pairs(file_path=file_path)
