# %% [markdown]
# # VidyaVichar Code Similarity Analysis
# ## CS6.302 - Assignment 3, Question 1
# 
# This notebook performs comprehensive similarity analysis across 27 MERN stack implementations

# %% [markdown]
# ## Part A: Preprocessing & Data Understanding


In [2]:
import os
import re
import json
import glob
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# %%
class CodePreprocessor:
    """Handles preprocessing of code files for similarity analysis"""
    
    def __init__(self, projects_dir: str):
        self.projects_dir = Path(projects_dir)
        self.valid_extensions = {'.js', '.jsx', '.json', '.css'}
        self.projects_data = {}
        
    def remove_comments_and_logs(self, code: str, file_ext: str) -> str:
        """Remove comments and console logs from code"""
        if file_ext in ['.js', '.jsx']:
            # Remove single-line comments
            code = re.sub(r'//.*?\n', '\n', code)
            # Remove multi-line comments
            code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
            # Remove console.log statements
            code = re.sub(r'console\.log\([^)]*\);?', '', code)
        elif file_ext == '.css':
            # Remove CSS comments
            code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
        return code
    
    def normalize_formatting(self, code: str) -> str:
        """Normalize whitespace and indentation"""
        # Remove extra whitespace
        lines = [line.strip() for line in code.split('\n')]
        # Remove empty lines
        lines = [line for line in lines if line]
        return '\n'.join(lines)
    
    def is_minified(self, code: str) -> bool:
        """Check if code is minified"""
        lines = code.split('\n')
        if not lines:
            return False
        avg_line_length = sum(len(line) for line in lines) / len(lines)
        return avg_line_length > 200
    
    def preprocess_file(self, file_path: Path) -> str:
        """Preprocess a single file"""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            
            file_ext = file_path.suffix
            
            # Skip minified files
            if self.is_minified(content):
                return ""
            
            # Remove comments and logs
            content = self.remove_comments_and_logs(content, file_ext)
            
            # Normalize formatting
            content = self.normalize_formatting(content)
            
            return content
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return ""
    
    def analyze_project(self, project_path: Path) -> Dict:
        """Analyze a single project and return statistics"""
        stats = {
            'project_name': project_path.name,
            'total_files': 0,
            'total_folders': 0,
            'loc': 0,
            'js_files': 0,
            'jsx_files': 0,
            'json_files': 0,
            'css_files': 0,
            'react_components': 0,
            'express_routes': 0,
            'mongoose_models': 0,
            'file_contents': {}
        }
        
        # Count folders
        stats['total_folders'] = sum(1 for _ in project_path.rglob('*') if _.is_dir())
        
        # Process files
        for file_path in project_path.rglob('*'):
            if file_path.is_file() and file_path.suffix in self.valid_extensions:
                stats['total_files'] += 1
                
                # Count by extension
                ext = file_path.suffix
                if ext == '.js':
                    stats['js_files'] += 1
                elif ext == '.jsx':
                    stats['jsx_files'] += 1
                elif ext == '.json':
                    stats['json_files'] += 1
                elif ext == '.css':
                    stats['css_files'] += 1
                
                # Preprocess and store content
                content = self.preprocess_file(file_path)
                if content:
                    relative_path = str(file_path.relative_to(project_path))
                    stats['file_contents'][relative_path] = content
                    
                    # Count LOC
                    stats['loc'] += len(content.split('\n'))
                    
                    # Detect React components
                    if ext in ['.js', '.jsx']:
                        if re.search(r'(class\s+\w+\s+extends\s+React\.Component|function\s+\w+\s*\([^)]*\)\s*{.*return\s*\(?\s*<)', content):
                            stats['react_components'] += 1
                        
                        # Detect Express routes
                        if re.search(r'(app|router)\.(get|post|put|delete|patch)\s*\(', content):
                            stats['express_routes'] += 1
                        
                        # Detect Mongoose models
                        if re.search(r'mongoose\.model\s*\(|new\s+mongoose\.Schema\s*\(', content):
                            stats['mongoose_models'] += 1
        
        return stats
    
    def preprocess_all_projects(self) -> pd.DataFrame:
        """Preprocess all projects and return summary DataFrame"""
        all_stats = []
        
        # Get all project directories
        project_dirs = [d for d in self.projects_dir.iterdir() if d.is_dir()]
        
        print(f"Found {len(project_dirs)} projects to analyze...")
        
        for i, project_dir in enumerate(project_dirs, 1):
            print(f"Processing {i}/{len(project_dirs)}: {project_dir.name}")
            stats = self.analyze_project(project_dir)
            self.projects_data[project_dir.name] = stats
            all_stats.append({
                'Project': stats['project_name'],
                'Total Files': stats['total_files'],
                'Total Folders': stats['total_folders'],
                'Lines of Code': stats['loc'],
                'JS Files': stats['js_files'],
                'JSX Files': stats['jsx_files'],
                'JSON Files': stats['json_files'],
                'CSS Files': stats['css_files'],
                'React Components': stats['react_components'],
                'Express Routes': stats['express_routes'],
                'Mongoose Models': stats['mongoose_models']
            })
        
        return pd.DataFrame(all_stats)

# %%
# Initialize preprocessor
# CHANGE THIS PATH to where your 27 projects are located
PROJECTS_DIR = "ALL_PROJECTS"

preprocessor = CodePreprocessor(PROJECTS_DIR)
summary_df = preprocessor.preprocess_all_projects()

# %%
# Display summary
print("\n=== Project Summary Statistics ===")
print(summary_df.to_string(index=False))
print("\n=== Summary Statistics ===")
print(summary_df.describe())

# %%
# Save preprocessing summary
os.makedirs('results', exist_ok=True)
summary_df.to_csv('results/preprocessing_summary.csv', index=False)
print("\nPreprocessing summary saved to results/preprocessing_summary.csv")

Found 18 projects to analyze...
Processing 1/18: Team-26
Processing 2/18: Team-03
Processing 3/18: Team-27
Processing 4/18: Team-33
Processing 5/18: Team-28
Processing 6/18: Team-06
Processing 7/18: Team-29
Processing 8/18: Team-31
Processing 9/18: Team-16
Processing 10/18: Team-22
Processing 11/18: Team-25
Processing 12/18: Team-05
Processing 13/18: Team-20
Processing 14/18: Team-10
Processing 15/18: Team-14
Processing 16/18: Team-21
Processing 17/18: Team-13
Processing 18/18: Team-32

=== Project Summary Statistics ===
Project  Total Files  Total Folders  Lines of Code  JS Files  JSX Files  JSON Files  CSS Files  React Components  Express Routes  Mongoose Models
Team-26           29             27           4429        16          7           4          2                 0               5                4
Team-03           32             30           7850        12         12           6          2                 0               2                2
Team-27           53             35

# %% [markdown]
# ## Part B: Code Similarity Computation

In [4]:
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import itertools

# %%
class SimilarityAnalyzer:
    """Performs multi-level similarity analysis"""
    
    def __init__(self, projects_data: Dict):
        self.projects_data = projects_data
        self.project_names = list(projects_data.keys())
        self.n_projects = len(self.project_names)
        
    def get_combined_content(self, project_name: str) -> str:
        """Combine all file contents for a project"""
        contents = self.projects_data[project_name]['file_contents']
        return '\n\n'.join(contents.values())
    
    def textual_similarity_difflib(self) -> np.ndarray:
        """Compute textual similarity using difflib"""
        print("\nComputing textual similarity (difflib)...")
        similarity_matrix = np.zeros((self.n_projects, self.n_projects))
        
        for i, proj1 in enumerate(self.project_names):
            content1 = self.get_combined_content(proj1)
            for j, proj2 in enumerate(self.project_names):
                if i == j:
                    similarity_matrix[i][j] = 1.0
                elif i < j:
                    content2 = self.get_combined_content(proj2)
                    ratio = SequenceMatcher(None, content1, content2).ratio()
                    similarity_matrix[i][j] = ratio
                    similarity_matrix[j][i] = ratio
            print(f"  Processed {i+1}/{self.n_projects}")
        
        return similarity_matrix
    
    def textual_similarity_tfidf(self) -> np.ndarray:
        """Compute textual similarity using TF-IDF and cosine similarity"""
        print("\nComputing textual similarity (TF-IDF + Cosine)...")
        
        # Prepare documents
        documents = [self.get_combined_content(proj) for proj in self.project_names]
        
        # Create TF-IDF matrix
        vectorizer = TfidfVectorizer(
            max_features=5000,
            token_pattern=r'\b\w+\b',
            ngram_range=(1, 2)
        )
        tfidf_matrix = vectorizer.fit_transform(documents)
        
        # Compute cosine similarity
        similarity_matrix = cosine_similarity(tfidf_matrix)
        
        return similarity_matrix
    
    def structural_similarity_ast(self) -> np.ndarray:
        """Compute structural similarity based on file structure and patterns"""
        print("\nComputing structural similarity...")
        similarity_matrix = np.zeros((self.n_projects, self.n_projects))
        
        def extract_structure_features(project_name: str) -> Dict:
            """Extract structural features from project"""
            contents = self.projects_data[project_name]['file_contents']
            features = {
                'file_types': defaultdict(int),
                'imports': set(),
                'function_names': set(),
                'route_patterns': set(),
                'model_schemas': set()
            }
            
            for path, content in contents.items():
                ext = Path(path).suffix
                features['file_types'][ext] += 1
                
                # Extract imports
                imports = re.findall(r'import\s+.*?from\s+[\'"]([^\'"]+)[\'"]', content)
                features['imports'].update(imports)
                
                # Extract function names
                functions = re.findall(r'(?:function|const|let|var)\s+(\w+)\s*[=\(]', content)
                features['function_names'].update(functions)
                
                # Extract route patterns
                routes = re.findall(r'\.(get|post|put|delete|patch)\s*\(\s*[\'"]([^\'"]+)[\'"]', content)
                features['route_patterns'].update([r[1] for r in routes])
                
                # Extract schema fields
                schemas = re.findall(r'(\w+)\s*:\s*\{\s*type\s*:', content)
                features['model_schemas'].update(schemas)
            
            return features
        
        # Extract features for all projects
        all_features = {proj: extract_structure_features(proj) for proj in self.project_names}
        
        # Compute similarity
        for i, proj1 in enumerate(self.project_names):
            feat1 = all_features[proj1]
            for j, proj2 in enumerate(self.project_names):
                if i == j:
                    similarity_matrix[i][j] = 1.0
                elif i < j:
                    feat2 = all_features[proj2]
                    
                    # Compute Jaccard similarity for various features
                    similarities = []
                    
                    # Import similarity
                    if feat1['imports'] or feat2['imports']:
                        import_sim = len(feat1['imports'] & feat2['imports']) / len(feat1['imports'] | feat2['imports'])
                        similarities.append(import_sim)
                    
                    # Function name similarity
                    if feat1['function_names'] or feat2['function_names']:
                        func_sim = len(feat1['function_names'] & feat2['function_names']) / len(feat1['function_names'] | feat2['function_names'])
                        similarities.append(func_sim)
                    
                    # Route pattern similarity
                    if feat1['route_patterns'] or feat2['route_patterns']:
                        route_sim = len(feat1['route_patterns'] & feat2['route_patterns']) / len(feat1['route_patterns'] | feat2['route_patterns'])
                        similarities.append(route_sim)
                    
                    # Model schema similarity
                    if feat1['model_schemas'] or feat2['model_schemas']:
                        schema_sim = len(feat1['model_schemas'] & feat2['model_schemas']) / len(feat1['model_schemas'] | feat2['model_schemas'])
                        similarities.append(schema_sim)
                    
                    # Average similarity
                    avg_sim = np.mean(similarities) if similarities else 0.0
                    similarity_matrix[i][j] = avg_sim
                    similarity_matrix[j][i] = avg_sim
            
            print(f"  Processed {i+1}/{self.n_projects}")
        
        return similarity_matrix
    
    def semantic_similarity_embeddings(self) -> np.ndarray:
        """Compute semantic similarity using sentence transformers"""
        print("\nComputing semantic similarity (embeddings)...")
        
        try:
            # Load model
            model = SentenceTransformer('all-MiniLM-L6-v2')
            
            # Prepare documents (truncate to avoid memory issues)
            documents = []
            for proj in self.project_names:
                content = self.get_combined_content(proj)
                # Take first 10000 characters to avoid memory issues
                documents.append(content[:10000])
            
            # Generate embeddings
            embeddings = model.encode(documents, show_progress_bar=True)
            
            # Compute cosine similarity
            similarity_matrix = cosine_similarity(embeddings)
            
            return similarity_matrix
        except Exception as e:
            print(f"Error in semantic similarity: {e}")
            print("Falling back to TF-IDF similarity...")
            return self.textual_similarity_tfidf()

# %%
# Perform similarity analysis
analyzer = SimilarityAnalyzer(preprocessor.projects_data)

# Compute different similarity metrics
similarity_difflib = analyzer.textual_similarity_difflib()
similarity_tfidf = analyzer.textual_similarity_tfidf()
similarity_structural = analyzer.structural_similarity_ast()
similarity_semantic = analyzer.semantic_similarity_embeddings()

# %%
# Save similarity matrices
np.save('results/similarity_difflib.npy', similarity_difflib)
np.save('results/similarity_tfidf.npy', similarity_tfidf)
np.save('results/similarity_structural.npy', similarity_structural)
np.save('results/similarity_semantic.npy', similarity_semantic)

print("\nSimilarity matrices saved to results/")

# %%
# Create DataFrames for better visualization
project_names = analyzer.project_names

df_difflib = pd.DataFrame(similarity_difflib, index=project_names, columns=project_names)
df_tfidf = pd.DataFrame(similarity_tfidf, index=project_names, columns=project_names)
df_structural = pd.DataFrame(similarity_structural, index=project_names, columns=project_names)
df_semantic = pd.DataFrame(similarity_semantic, index=project_names, columns=project_names)

# Save as CSV
df_difflib.to_csv('results/similarity_matrix_difflib.csv')
df_tfidf.to_csv('results/similarity_matrix_tfidf.csv')
df_structural.to_csv('results/similarity_matrix_structural.csv')
df_semantic.to_csv('results/similarity_matrix_semantic.csv')

ModuleNotFoundError: No module named 'sklearn'

# %% [markdown]
# ## Part C: Visualization & Reporting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from scipy.cluster.hierarchy import dendrogram, linkage

# %%
class SimilarityVisualizer:
    """Visualizes similarity analysis results"""
    
    def __init__(self, project_names: List[str]):
        self.project_names = project_names
        
    def plot_heatmap(self, similarity_matrix: np.ndarray, title: str, filename: str):
        """Plot similarity heatmap"""
        plt.figure(figsize=(14, 12))
        sns.heatmap(
            similarity_matrix,
            xticklabels=self.project_names,
            yticklabels=self.project_names,
            annot=False,
            cmap='YlOrRd',
            vmin=0,
            vmax=1,
            cbar_kws={'label': 'Similarity Score'}
        )
        plt.title(title, fontsize=16, fontweight='bold')
        plt.xlabel('Projects', fontsize=12)
        plt.ylabel('Projects', fontsize=12)
        plt.xticks(rotation=90, ha='right', fontsize=8)
        plt.yticks(rotation=0, fontsize=8)
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved: results/{filename}")
    
    def plot_network_graph(self, similarity_matrix: np.ndarray, title: str, filename: str, threshold: float = 0.5):
        """Plot network graph of similar projects"""
        plt.figure(figsize=(16, 12))
        
        # Create graph
        G = nx.Graph()
        
        # Add nodes
        for name in self.project_names:
            G.add_node(name)
        
        # Add edges for similarities above threshold
        for i in range(len(self.project_names)):
            for j in range(i + 1, len(self.project_names)):
                if similarity_matrix[i][j] >= threshold:
                    G.add_edge(
                        self.project_names[i],
                        self.project_names[j],
                        weight=similarity_matrix[i][j]
                    )
        
        # Layout
        pos = nx.spring_layout(G, k=2, iterations=50)
        
        # Draw
        nx.draw_networkx_nodes(G, pos, node_size=500, node_color='lightblue', alpha=0.9)
        nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold')
        
        # Draw edges with varying thickness
        edges = G.edges()
        weights = [G[u][v]['weight'] for u, v in edges]
        nx.draw_networkx_edges(G, pos, width=[w*3 for w in weights], alpha=0.5)
        
        plt.title(f'{title}\n(Threshold: {threshold})', fontsize=16, fontweight='bold')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved: results/{filename}")
    
    def plot_bar_chart(self, matrices: Dict[str, np.ndarray], filename: str):
        """Plot bar chart of average similarities"""
        plt.figure(figsize=(12, 6))
        
        metric_names = list(matrices.keys())
        avg_similarities = []
        
        for matrix in matrices.values():
            # Get upper triangle (excluding diagonal)
            mask = np.triu(np.ones_like(matrix, dtype=bool), k=1)
            avg_sim = matrix[mask].mean()
            avg_similarities.append(avg_sim)
        
        bars = plt.bar(metric_names, avg_similarities, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A'])
        plt.ylabel('Average Similarity Score', fontsize=12)
        plt.xlabel('Similarity Metric', fontsize=12)
        plt.title('Average Similarity Scores by Metric', fontsize=16, fontweight='bold')
        plt.ylim(0, 1)
        
        # Add value labels on bars
        for bar, val in zip(bars, avg_similarities):
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{val:.3f}',
                    ha='center', va='bottom', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved: results/{filename}")
    
    def plot_distribution(self, similarity_matrix: np.ndarray, title: str, filename: str):
        """Plot distribution of similarity scores"""
        plt.figure(figsize=(10, 6))
        
        # Get upper triangle (excluding diagonal)
        mask = np.triu(np.ones_like(similarity_matrix, dtype=bool), k=1)
        similarities = similarity_matrix[mask]
        
        plt.hist(similarities, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
        plt.axvline(similarities.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {similarities.mean():.3f}')
        plt.axvline(np.median(similarities), color='green', linestyle='--', linewidth=2, label=f'Median: {np.median(similarities):.3f}')
        
        plt.xlabel('Similarity Score', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.title(title, fontsize=16, fontweight='bold')
        plt.legend()
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved: results/{filename}")

# %%
# Create visualizations
visualizer = SimilarityVisualizer(project_names)

print("\nGenerating visualizations...")

# Heatmaps
visualizer.plot_heatmap(similarity_difflib, 'Textual Similarity (Difflib)', 'heatmap_difflib.png')
visualizer.plot_heatmap(similarity_tfidf, 'Textual Similarity (TF-IDF)', 'heatmap_tfidf.png')
visualizer.plot_heatmap(similarity_structural, 'Structural Similarity', 'heatmap_structural.png')
visualizer.plot_heatmap(similarity_semantic, 'Semantic Similarity', 'heatmap_semantic.png')

# Network graphs
visualizer.plot_network_graph(similarity_tfidf, 'Project Similarity Network (TF-IDF)', 'network_tfidf.png', threshold=0.3)
visualizer.plot_network_graph(similarity_structural, 'Project Similarity Network (Structural)', 'network_structural.png', threshold=0.3)

# Bar chart
matrices_dict = {
    'Difflib': similarity_difflib,
    'TF-IDF': similarity_tfidf,
    'Structural': similarity_structural,
    'Semantic': similarity_semantic
}
visualizer.plot_bar_chart(matrices_dict, 'average_similarities.png')

# Distributions
visualizer.plot_distribution(similarity_tfidf, 'Distribution of TF-IDF Similarity Scores', 'distribution_tfidf.png')
visualizer.plot_distribution(similarity_structural, 'Distribution of Structural Similarity Scores', 'distribution_structural.png')

# %%
# Find most and least similar pairs
def find_extreme_pairs(similarity_matrix: np.ndarray, project_names: List[str], metric_name: str):
    """Find most and least similar project pairs"""
    n = len(project_names)
    
    # Get upper triangle (excluding diagonal)
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            pairs.append((project_names[i], project_names[j], similarity_matrix[i][j]))
    
    # Sort by similarity
    pairs.sorted = sorted(pairs, key=lambda x: x[2], reverse=True)
    
    print(f"\n{metric_name}:")
    print("\nTop 5 Most Similar Pairs:")
    for proj1, proj2, sim in pairs[:5]:
        print(f"  {proj1} <-> {proj2}: {sim:.4f}")
    
    print("\nTop 5 Least Similar Pairs:")
    for proj1, proj2, sim in pairs[-5:]:
        print(f"  {proj1} <-> {proj2}: {sim:.4f}")

# %%
print("\n" + "="*80)
print("SIMILARITY ANALYSIS RESULTS")
print("="*80)

find_extreme_pairs(similarity_difflib, project_names, "Textual Similarity (Difflib)")
find_extreme_pairs(similarity_tfidf, project_names, "Textual Similarity (TF-IDF)")
find_extreme_pairs(similarity_structural, project_names, "Structural Similarity")
find_extreme_pairs(similarity_semantic, project_names, "Semantic Similarity")

# %%
# Generate summary statistics
summary_stats = {
    'Metric': [],
    'Mean': [],
    'Median': [],
    'Std Dev': [],
    'Min': [],
    'Max': []
}

for metric_name, matrix in matrices_dict.items():
    mask = np.triu(np.ones_like(matrix, dtype=bool), k=1)
    values = matrix[mask]
    
    summary_stats['Metric'].append(metric_name)
    summary_stats['Mean'].append(values.mean())
    summary_stats['Median'].append(np.median(values))
    summary_stats['Std Dev'].append(values.std())
    summary_stats['Min'].append(values.min())
    summary_stats['Max'].append(values.max())

stats_df = pd.DataFrame(summary_stats)
stats_df.to_csv('results/summary_statistics.csv', index=False)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(stats_df.to_string(index=False))

print("\nâœ… All analysis complete! Check the 'results/' folder for outputs.")