
# %% [markdown]
# # VidyaVichar Code Similarity Analysis
# ## CS6.302 - Assignment 3, Question 1
# 
# This notebook performs comprehensive similarity analysis across 27 MERN stack implementations

# %% [markdown]
# ## Part A: Preprocessing & Data Understanding

In [1]:
import os
import re
import json
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import warnings
from concurrent.futures import ProcessPoolExecutor
warnings.filterwarnings('ignore')

from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

# ==================== PART A: PREPROCESSING ====================

class OptimizedCodePreprocessor:
    """Optimized preprocessor with all Part A requirements"""
    
    def __init__(self, projects_dir: str, max_file_size: int = 100000):
        self.projects_dir = Path(projects_dir)
        self.valid_extensions = {'.js', '.jsx', '.json', '.css'}
        self.projects_data = {}
        self.max_file_size = max_file_size
        
    def remove_comments_and_logs(self, code: str, file_ext: str) -> str:
        """Remove comments and console logs"""
        if file_ext in ['.js', '.jsx']:
            code = re.sub(r'//.*?$', '', code, flags=re.MULTILINE)
            code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
            code = re.sub(r'console\.log\([^)]*\);?', '', code)
        elif file_ext == '.css':
            code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
        return code
    
    def normalize_formatting(self, code: str) -> str:
        """Normalize whitespace and indentation"""
        lines = [line.strip() for line in code.split('\n') if line.strip()]
        return '\n'.join(lines)
    
    def is_minified(self, code: str) -> bool:
        """Check if code is minified"""
        lines = code.split('\n')
        if not lines:
            return False
        avg_line_length = sum(len(line) for line in lines) / len(lines)
        return avg_line_length > 200
    
    def preprocess_file(self, file_path: Path) -> str:
        """Preprocess a single file"""
        try:
            if file_path.stat().st_size > self.max_file_size:
                return ""
            
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            
            if self.is_minified(content):
                return ""
            
            content = self.remove_comments_and_logs(content, file_path.suffix)
            content = self.normalize_formatting(content)
            return content
        except Exception:
            return ""
    
    def analyze_project(self, project_path: Path) -> Dict:
        """Analyze project - Part A requirements"""
        stats = {
            'project_name': project_path.name,
            'total_files': 0,
            'total_folders': 0,
            'loc': 0,
            'js_files': 0,
            'jsx_files': 0,
            'json_files': 0,
            'css_files': 0,
            'react_components': 0,
            'express_routes': 0,
            'mongoose_models': 0,
            'file_contents': {},
            'combined_content': ""
        }
        
        stats['total_folders'] = sum(1 for _ in project_path.rglob('*') if _.is_dir())
        
        valid_files = [f for f in project_path.rglob('*') 
                      if f.is_file() and f.suffix in self.valid_extensions]
        
        all_content = []
        for file_path in valid_files:
            stats['total_files'] += 1
            ext = file_path.suffix
            
            if ext == '.js':
                stats['js_files'] += 1
            elif ext == '.jsx':
                stats['jsx_files'] += 1
            elif ext == '.json':
                stats['json_files'] += 1
            elif ext == '.css':
                stats['css_files'] += 1
            
            content = self.preprocess_file(file_path)
            if content:
                relative_path = str(file_path.relative_to(project_path))
                stats['file_contents'][relative_path] = content
                all_content.append(content)
                stats['loc'] += len(content.split('\n'))
                
                if ext in ['.js', '.jsx']:
                    if 'React.Component' in content or 'return (' in content:
                        stats['react_components'] += 1
                    if re.search(r'(app|router)\.(get|post|put|delete)', content):
                        stats['express_routes'] += 1
                    if 'mongoose.model' in content or 'mongoose.Schema' in content:
                        stats['mongoose_models'] += 1
        
        stats['combined_content'] = '\n\n'.join(all_content)
        return stats
    
    def preprocess_all_projects(self) -> pd.DataFrame:
        """Preprocess all projects"""
        all_stats = []
        project_dirs = [d for d in self.projects_dir.iterdir() if d.is_dir()]
        
        print(f"Found {len(project_dirs)} projects to analyze...")
        
        for i, project_dir in enumerate(project_dirs, 1):
            print(f"Processing {i}/{len(project_dirs)}: {project_dir.name}")
            stats = self.analyze_project(project_dir)
            self.projects_data[project_dir.name] = stats
            all_stats.append({
                'Project': stats['project_name'],
                'Total Files': stats['total_files'],
                'Total Folders': stats['total_folders'],
                'Lines of Code': stats['loc'],
                'JS Files': stats['js_files'],
                'JSX Files': stats['jsx_files'],
                'JSON Files': stats['json_files'],
                'CSS Files': stats['css_files'],
                'React Components': stats['react_components'],
                'Express Routes': stats['express_routes'],
                'Mongoose Models': stats['mongoose_models']
            })
        
        return pd.DataFrame(all_stats)


# %% [markdown]
# ## Part B: Code Similarity Computation

In [2]:
def compute_difflib_pair(args):
    """Compute difflib similarity for parallel processing"""
    i, j, content1, content2 = args
    if i == j:
        return (i, j, 1.0)
    matcher = SequenceMatcher(None, content1, content2)
    quick = matcher.quick_ratio()
    ratio = matcher.ratio() if quick > 0.5 else quick * 0.9
    return (i, j, ratio)

class OptimizedSimilarityAnalyzer:
    """Part B: Multi-level similarity analysis"""
    
    def __init__(self, projects_data: Dict):
        self.projects_data = projects_data
        self.project_names = list(projects_data.keys())
        self.n_projects = len(self.project_names)
        
    def get_combined_content(self, project_name: str, max_chars: int = None) -> str:
        """Get combined content with optional truncation"""
        content = self.projects_data[project_name]['combined_content']
        if max_chars and len(content) > max_chars:
            step = len(content) // 5
            samples = [content[i:i+max_chars//5] for i in range(0, len(content), step)][:5]
            return '\n'.join(samples)
        return content
    
    def textual_similarity_tfidf(self) -> np.ndarray:
        """Level 1: Textual Similarity using TF-IDF"""
        print("\n[1/3] Computing Textual Similarity (TF-IDF + Cosine)...")
        
        documents = [self.get_combined_content(proj, max_chars=100000) 
                    for proj in self.project_names]
        
        vectorizer = TfidfVectorizer(
            max_features=3000,
            token_pattern=r'\b\w+\b',
            ngram_range=(1, 2),
            max_df=0.95,
            min_df=2
        )
        tfidf_matrix = vectorizer.fit_transform(documents)
        return cosine_similarity(tfidf_matrix)
    
    def structural_similarity(self) -> np.ndarray:
        """Level 2: Structural Similarity (AST-like features)"""
        print("[2/3] Computing Structural Similarity (AST features)...")
        similarity_matrix = np.zeros((self.n_projects, self.n_projects))
        
        # Extract structural features
        all_features = {}
        for proj in self.project_names:
            content = self.get_combined_content(proj)
            all_features[proj] = {
                'imports': set(re.findall(r'from\s+[\'"]([^\'"]+)[\'"]', content)[:100]),
                'functions': set(re.findall(r'(?:function|const)\s+(\w+)', content)[:100]),
                'routes': set(re.findall(r'\.(get|post|put|delete)\([\'"]([^\'"]+)', content)[:50])
            }
        
        # Compute Jaccard similarity
        for i, proj1 in enumerate(self.project_names):
            feat1 = all_features[proj1]
            for j in range(i, self.n_projects):
                if i == j:
                    similarity_matrix[i][j] = 1.0
                else:
                    proj2 = self.project_names[j]
                    feat2 = all_features[proj2]
                    
                    sims = []
                    for key in ['imports', 'functions', 'routes']:
                        set1, set2 = feat1[key], feat2[key]
                        if set1 or set2:
                            sims.append(len(set1 & set2) / len(set1 | set2))
                    
                    avg_sim = np.mean(sims) if sims else 0.0
                    similarity_matrix[i][j] = avg_sim
                    similarity_matrix[j][i] = avg_sim
        
        return similarity_matrix
    
    def semantic_similarity(self) -> np.ndarray:
        """Level 3: Semantic Similarity using embeddings"""
        print("[3/3] Computing Semantic Similarity (Embeddings)...")
        
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('all-MiniLM-L6-v2')
            documents = [self.get_combined_content(proj, max_chars=5000) 
                        for proj in self.project_names]
            embeddings = model.encode(documents, batch_size=8, show_progress_bar=False)
            return cosine_similarity(embeddings)
        except Exception as e:
            print(f"  Warning: Semantic embedding failed ({e}). Using TF-IDF fallback.")
            return self.textual_similarity_tfidf()

# %% [markdown]
# ## Part C: Visualization & Reporting

In [3]:
class SimilarityVisualizer:
    """Part C: Comprehensive visualizations"""
    
    def __init__(self, project_names: List[str]):
        self.project_names = project_names
        plt.style.use('seaborn-v0_8-darkgrid')
        
    def plot_heatmap(self, similarity_matrix: np.ndarray, title: str, filename: str):
        """Heatmap of similarity matrix"""
        fig, ax = plt.subplots(figsize=(14, 12))
        
        sns.heatmap(
            similarity_matrix,
            xticklabels=self.project_names,
            yticklabels=self.project_names,
            annot=False,
            fmt='.2f',
            cmap='YlOrRd',
            vmin=0,
            vmax=1,
            cbar_kws={'label': 'Similarity Score'},
            square=True,
            ax=ax
        )
        
        ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
        ax.set_xlabel('Projects', fontsize=12, fontweight='bold')
        ax.set_ylabel('Projects', fontsize=12, fontweight='bold')
        plt.xticks(rotation=90, ha='right', fontsize=9)
        plt.yticks(rotation=0, fontsize=9)
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Saved: results/{filename}")
    
    def plot_network_graph(self, similarity_matrix: np.ndarray, title: str, 
                          filename: str, threshold: float = 0.3):
        """Network graph of project clusters"""
        fig, ax = plt.subplots(figsize=(16, 12))
        
        G = nx.Graph()
        for name in self.project_names:
            G.add_node(name)
        
        for i in range(len(self.project_names)):
            for j in range(i + 1, len(self.project_names)):
                if similarity_matrix[i][j] >= threshold:
                    G.add_edge(
                        self.project_names[i],
                        self.project_names[j],
                        weight=similarity_matrix[i][j]
                    )
        
        pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
        
        # Node colors based on degree
        degrees = dict(G.degree())
        node_colors = [degrees.get(node, 0) for node in G.nodes()]
        
        nx.draw_networkx_nodes(G, pos, node_size=700, 
                              node_color=node_colors, cmap='viridis',
                              alpha=0.9, ax=ax)
        nx.draw_networkx_labels(G, pos, font_size=9, font_weight='bold', ax=ax)
        
        edges = G.edges()
        weights = [G[u][v]['weight'] for u, v in edges]
        nx.draw_networkx_edges(G, pos, width=[w*4 for w in weights], 
                              alpha=0.5, edge_color='gray', ax=ax)
        
        ax.set_title(f'{title}\n(Threshold: {threshold:.2f}, Edges: {len(edges)})', 
                    fontsize=16, fontweight='bold')
        ax.axis('off')
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Saved: results/{filename}")
    
    def plot_bar_chart(self, matrices: Dict[str, np.ndarray], filename: str):
        """Bar chart of average similarities by metric"""
        fig, ax = plt.subplots(figsize=(12, 7))
        
        metric_names = list(matrices.keys())
        avg_similarities = []
        std_similarities = []
        
        for matrix in matrices.values():
            mask = np.triu(np.ones_like(matrix, dtype=bool), k=1)
            values = matrix[mask]
            avg_similarities.append(values.mean())
            std_similarities.append(values.std())
        
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
        bars = ax.bar(metric_names, avg_similarities, color=colors, 
                     yerr=std_similarities, capsize=10, alpha=0.8, 
                     edgecolor='black', linewidth=1.5)
        
        ax.set_ylabel('Average Similarity Score', fontsize=13, fontweight='bold')
        ax.set_xlabel('Similarity Metric', fontsize=13, fontweight='bold')
        ax.set_title('Average Similarity Scores by Metric', 
                    fontsize=16, fontweight='bold', pad=20)
        ax.set_ylim(0, 1)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        
        for bar, val, std in zip(bars, avg_similarities, std_similarities):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + std + 0.02,
                   f'{val:.3f}±{std:.3f}',
                   ha='center', va='bottom', fontweight='bold', fontsize=11)
        
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Saved: results/{filename}")
    
    def plot_distribution(self, similarity_matrix: np.ndarray, title: str, filename: str):
        """Distribution histogram of similarity scores"""
        fig, ax = plt.subplots(figsize=(10, 6))
        
        mask = np.triu(np.ones_like(similarity_matrix, dtype=bool), k=1)
        similarities = similarity_matrix[mask]
        
        ax.hist(similarities, bins=30, color='steelblue', alpha=0.7, 
               edgecolor='black', linewidth=1.2)
        ax.axvline(similarities.mean(), color='red', linestyle='--', 
                  linewidth=2.5, label=f'Mean: {similarities.mean():.3f}')
        ax.axvline(np.median(similarities), color='green', linestyle='--', 
                  linewidth=2.5, label=f'Median: {np.median(similarities):.3f}')
        
        ax.set_xlabel('Similarity Score', fontsize=12, fontweight='bold')
        ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
        ax.set_title(title, fontsize=16, fontweight='bold')
        ax.legend(fontsize=11, loc='upper right')
        ax.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.savefig(f'results/{filename}', dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Saved: results/{filename}")

# ==================== ANALYSIS & REPORTING ====================

def analyze_similarities(matrices: Dict, project_names: List[str]):
    """Generate detailed analysis for report"""
    print("\n" + "="*80)
    print("SIMILARITY ANALYSIS RESULTS")
    print("="*80)
    
    for metric_name, matrix in matrices.items():
        print(f"\n{metric_name} Similarity:")
        
        # Find most and least similar pairs
        n = len(project_names)
        pairs = []
        for i in range(n):
            for j in range(i + 1, n):
                pairs.append((project_names[i], project_names[j], matrix[i][j]))
        
        pairs.sort(key=lambda x: x[2], reverse=True)
        
        print("\n  Top 5 Most Similar Pairs:")
        for proj1, proj2, sim in pairs[:5]:
            print(f"    • {proj1} ↔ {proj2}: {sim:.4f}")
        
        print("\n  Top 5 Least Similar Pairs:")
        for proj1, proj2, sim in pairs[-5:]:
            print(f"    • {proj1} ↔ {proj2}: {sim:.4f}")
    
    # Summary statistics
    print("\n" + "="*80)
    print("SUMMARY STATISTICS")
    print("="*80)
    
    summary_data = {
        'Metric': [],
        'Mean': [],
        'Median': [],
        'Std Dev': [],
        'Min': [],
        'Max': []
    }
    
    for metric_name, matrix in matrices.items():
        mask = np.triu(np.ones_like(matrix, dtype=bool), k=1)
        values = matrix[mask]
        
        summary_data['Metric'].append(metric_name)
        summary_data['Mean'].append(values.mean())
        summary_data['Median'].append(np.median(values))
        summary_data['Std Dev'].append(values.std())
        summary_data['Min'].append(values.min())
        summary_data['Max'].append(values.max())
    
    stats_df = pd.DataFrame(summary_data)
    print("\n" + stats_df.to_string(index=False))
    stats_df.to_csv('results/summary_statistics.csv', index=False)
    print("\n  ✓ Saved: results/summary_statistics.csv")

# ==================== MAIN EXECUTION ====================

def main():
    """Complete analysis pipeline"""
    PROJECTS_DIR = "ALL_PROJECTS"
    
    # Create results directory
    os.makedirs('results', exist_ok=True)
    
    print("="*80)
    print("PART A: PREPROCESSING & DATA UNDERSTANDING")
    print("="*80)
    
    # Step 1: Preprocessing
    preprocessor = OptimizedCodePreprocessor(PROJECTS_DIR)
    summary_df = preprocessor.preprocess_all_projects()
    
    print("\n=== Project Summary Statistics ===")
    print(summary_df.to_string(index=False))
    print("\n" + summary_df.describe().to_string())
    
    summary_df.to_csv('results/preprocessing_summary.csv', index=False)
    print("\n✓ Saved: results/preprocessing_summary.csv")
    
    print("\n" + "="*80)
    print("PART B: CODE SIMILARITY COMPUTATION")
    print("="*80)
    
    # Step 2: Similarity Analysis
    analyzer = OptimizedSimilarityAnalyzer(preprocessor.projects_data)
    
    similarity_textual = analyzer.textual_similarity_tfidf()
    similarity_structural = analyzer.structural_similarity()
    similarity_semantic = analyzer.semantic_similarity()
    
    # Save matrices
    np.save('results/similarity_textual.npy', similarity_textual)
    np.save('results/similarity_structural.npy', similarity_structural)
    np.save('results/similarity_semantic.npy', similarity_semantic)
    
    # Save as CSV
    project_names = analyzer.project_names
    pd.DataFrame(similarity_textual, index=project_names, columns=project_names).to_csv('results/similarity_textual.csv')
    pd.DataFrame(similarity_structural, index=project_names, columns=project_names).to_csv('results/similarity_structural.csv')
    pd.DataFrame(similarity_semantic, index=project_names, columns=project_names).to_csv('results/similarity_semantic.csv')
    
    print("\n✓ All similarity matrices saved")
    
    print("\n" + "="*80)
    print("PART C: VISUALIZATION & REPORTING")
    print("="*80)
    
    # Step 3: Visualizations
    visualizer = SimilarityVisualizer(project_names)
    
    matrices_dict = {
        'Textual (TF-IDF)': similarity_textual,
        'Structural (AST)': similarity_structural,
        'Semantic (Embeddings)': similarity_semantic
    }
    
    print("\nGenerating visualizations...")
    
    # Heatmaps (required)
    visualizer.plot_heatmap(similarity_textual, 'Textual Similarity Heatmap (TF-IDF)', 'heatmap_textual.png')
    visualizer.plot_heatmap(similarity_structural, 'Structural Similarity Heatmap (AST)', 'heatmap_structural.png')
    visualizer.plot_heatmap(similarity_semantic, 'Semantic Similarity Heatmap (Embeddings)', 'heatmap_semantic.png')
    
    # Network graphs (required)
    visualizer.plot_network_graph(similarity_textual, 'Project Similarity Network (Textual)', 'network_textual.png', 0.3)
    visualizer.plot_network_graph(similarity_structural, 'Project Similarity Network (Structural)', 'network_structural.png', 0.3)
    
    # Bar chart (required)
    visualizer.plot_bar_chart(matrices_dict, 'average_similarities_by_metric.png')
    
    # Additional visualizations
    visualizer.plot_distribution(similarity_textual, 'Distribution of Textual Similarity Scores', 'distribution_textual.png')
    visualizer.plot_distribution(similarity_structural, 'Distribution of Structural Similarity Scores', 'distribution_structural.png')
    
    # Step 4: Analysis
    analyze_similarities(matrices_dict, project_names)
    
    print("\n" + "="*80)
    print("✅ ANALYSIS COMPLETE!")
    print("="*80)
    print("\nGenerated files in results/:")
    print("  • preprocessing_summary.csv")
    print("  • similarity_textual.csv/npy")
    print("  • similarity_structural.csv/npy")
    print("  • similarity_semantic.csv/npy")
    print("  • heatmap_textual.png")
    print("  • heatmap_structural.png")
    print("  • heatmap_semantic.png")
    print("  • network_textual.png")
    print("  • network_structural.png")
    print("  • average_similarities_by_metric.png")
    print("  • distribution_textual.png")
    print("  • distribution_structural.png")
    print("  • summary_statistics.csv")
    print("\nNext: Write your 1-page report.pdf with methodology and insights!")

if __name__ == "__main__":
    main()

PART A: PREPROCESSING & DATA UNDERSTANDING
Found 18 projects to analyze...
Processing 1/18: Team-26
Processing 2/18: Team-03
Processing 3/18: Team-27
Processing 4/18: Team-33
Processing 5/18: Team-28
Processing 6/18: Team-06
Processing 7/18: Team-29
Processing 8/18: Team-31
Processing 9/18: Team-16
Processing 10/18: Team-22
Processing 11/18: Team-25
Processing 12/18: Team-05
Processing 13/18: Team-20
Processing 14/18: Team-10
Processing 15/18: Team-14
Processing 16/18: Team-21
Processing 17/18: Team-13
Processing 18/18: Team-32

=== Project Summary Statistics ===
Project  Total Files  Total Folders  Lines of Code  JS Files  JSX Files  JSON Files  CSS Files  React Components  Express Routes  Mongoose Models
Team-26           29             27           4429        16          7           4          2                 7               5                4
Team-03           32             30           3513        12         12           6          2                10               2          