In [11]:
# Import libraries
import os
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Function to extract text content from .ipynb files
def extract_text_from_ipynb(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = json.load(f)
    text_data = []
    for cell in content.get('cells', []):
        if cell.get('cell_type') == 'markdown' or cell.get('cell_type') == 'code':
            text_data.append(' '.join(cell.get('source', [])))
    return ' '.join(text_data)

# Function to calculate pairwise similarity
def calculate_similarity(file_texts, file_names):
    # Vectorize text using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(file_texts)
    
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)
    
    # Create a DataFrame for readability
    similarity_df = pd.DataFrame(similarity_matrix, index=file_names, columns=file_names)
    return similarity_df

# Main function
def analyze_similarity_in_folder(folder_path):
    # Gather .ipynb files in the folder
    files = [f for f in os.listdir(folder_path) if f.endswith('.ipynb')]
    file_texts = []
    for file in files:
        file_path = os.path.join(folder_path, file)
        file_texts.append(extract_text_from_ipynb(file_path))
    
    # Calculate similarity matrix
    similarity_df = calculate_similarity(file_texts, files)
    
    # Identify high similarity pairs
    high_similarity_pairs = []
    for i in range(len(files)):
        for j in range(i + 1, len(files)):
            if similarity_df.iloc[i, j] > 0.8:  # Threshold for high similarity
                high_similarity_pairs.append((files[i], files[j], similarity_df.iloc[i, j]))
    
    return similarity_df, high_similarity_pairs

# Specify the folder path
folder_path = "/Users/samirachaunkaria/Desktop/FIN 359 Coursework"

# Run the analysis
similarity_matrix, high_similarity_pairs = analyze_similarity_in_folder(folder_path)

# Output results
print("Similarity Matrix:")
print(similarity_matrix)

if high_similarity_pairs:
    print("\nHighly Similar File Pairs (Threshold > 0.8):")
    for pair in high_similarity_pairs:
        print(f"{pair[0]} and {pair[1]}: Similarity = {pair[2]:.2f}")
else:
    print("\nNo highly similar file pairs found.")

Similarity Matrix:
                                                    Untitled7.ipynb  \
Untitled7.ipynb                                                 0.0   
if-else statement.ipynb                                         0.0   
Launcher.ipynb                                                  0.0   
Untitled5.ipynb                                                 0.0   
Untitled1.ipynb                                                 0.0   
Intro to Pandas - In class demo (Oct 29).ipynb                  0.0   
Untitled3.ipynb                                                 0.0   
Untitled.ipynb                                                  0.0   
samira_chaunkaria_homework_3.ipynb                              0.0   
Untitled4.ipynb                                                 0.0   
Untitled6.ipynb                                                 0.0   
Robotic_OLS.ipynb                                               0.0   
NPV exercise.ipynb                                        