# In [None]:
# Optimized Code Similarity Analysis
# Key optimizations applied:
# 1. Parallel processing for similarity computations
# 2. Sampling and chunking for large codebases
# 3. Caching preprocessed content
# 4. More efficient string comparison
# 5. Reduced semantic embedding size

import os
import re
import json
import glob
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
import warnings
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from functools import lru_cache
import hashlib
warnings.filterwarnings('ignore')

from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

# ==================== OPTIMIZED PREPROCESSOR ====================

class OptimizedCodePreprocessor:
    """Collect per-project statistics and cleaned source text.

    Every immediate sub-directory of ``projects_dir`` is treated as one
    project. Files whose extension is in ``valid_extensions`` are counted,
    read, stripped of comments and ``console.log`` calls, whitespace-normalized
    and cached in ``projects_data[project_name]``.
    """

    # Extension -> name of the per-type counter kept in the stats dict.
    _EXTENSION_COUNTERS = {
        '.js': 'js_files',
        '.jsx': 'jsx_files',
        '.json': 'json_files',
        '.css': 'css_files',
    }

    # Single pass over JS/JSX: string literals are matched first (group 1) so
    # that comment markers inside them, e.g. "http://host", are left alone.
    _JS_STRING_OR_COMMENT_RE = re.compile(
        r'''("(?:\\.|[^"\\\n])*"|'(?:\\.|[^'\\\n])*'|`(?:\\.|[^`\\])*`)'''
        r'''|/\*.*?\*/|//[^\n]*''',
        re.DOTALL,
    )
    _CSS_COMMENT_RE = re.compile(r'/\*.*?\*/', re.DOTALL)
    _CONSOLE_LOG_RE = re.compile(r'console\.log\(')
    _EXPRESS_ROUTE_RE = re.compile(r'(app|router)\.(get|post|put|delete)')

    def __init__(self, projects_dir: str, max_file_size: int = 100000,
                 exclude_dirs: Tuple[str, ...] = ()):
        """Configure the preprocessor.

        Args:
            projects_dir: Directory whose sub-directories are the projects.
            max_file_size: Files larger than this many bytes are counted but
                their content is not read.
            exclude_dirs: Directory names that are neither counted nor
                descended into while scanning a project, e.g.
                ``('node_modules', '.git')``. Empty by default, which scans
                everything.
        """
        self.projects_dir = Path(projects_dir)
        self.valid_extensions = {'.js', '.jsx', '.json', '.css'}
        self.projects_data = {}
        self.max_file_size = max_file_size  # Skip very large files
        self.exclude_dirs = frozenset(exclude_dirs)

    @staticmethod
    def _find_call_end(code: str, start: int) -> int:
        """Return the index just past the ``)`` closing a call, or -1.

        ``start`` is the index right after the opening ``(``. Parentheses
        inside string literals are ignored.
        """
        depth = 1
        quote = ''
        i = start
        length = len(code)
        while i < length:
            ch = code[i]
            if quote:
                if ch == '\\':
                    i += 2  # skip the escaped character
                    continue
                # '...' and "..." cannot span lines; ending them at a newline
                # limits the damage of a stray quote (e.g. in a regex literal).
                if ch == quote or (ch == '\n' and quote != '`'):
                    quote = ''
            elif ch in '"\'`':
                quote = ch
            elif ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
                if depth == 0:
                    return i + 1
            i += 1
        return -1

    @classmethod
    def _strip_console_logs(cls, code: str) -> str:
        """Remove ``console.log(...)`` calls, including nested parentheses."""
        pieces = []
        cursor = 0
        for match in cls._CONSOLE_LOG_RE.finditer(code):
            if match.start() < cursor:
                continue  # lies inside a call that was already removed
            end = cls._find_call_end(code, match.end())
            if end < 0:
                continue  # unbalanced call: leave the text untouched
            pieces.append(code[cursor:match.start()])
            if code.startswith(';', end):
                end += 1
            cursor = end
        pieces.append(code[cursor:])
        return ''.join(pieces)

    def remove_comments_and_logs(self, code: str, file_ext: str) -> str:
        """Remove comments and console logs from code.

        Args:
            code: Raw file content.
            file_ext: File extension including the dot. ``.js``/``.jsx`` lose
                line comments, block comments and ``console.log`` calls;
                ``.css`` loses block comments; anything else is returned
                unchanged.

        Returns:
            The code with the removed spans replaced by nothing (newlines that
            followed a line comment are kept).
        """
        if file_ext in ('.js', '.jsx'):
            # Keep string literals (group 1), drop whatever else matched.
            code = self._JS_STRING_OR_COMMENT_RE.sub(
                lambda m: m.group(1) or '', code)
            code = self._strip_console_logs(code)
        elif file_ext == '.css':
            code = self._CSS_COMMENT_RE.sub('', code)
        return code

    def normalize_formatting(self, code: str) -> str:
        """Strip every line and drop lines that are empty after stripping."""
        stripped = (line.strip() for line in code.split('\n'))
        return '\n'.join(line for line in stripped if line)

    def is_minified(self, code: str) -> bool:
        """Return True when the average line length exceeds 200 characters."""
        lines = code.split('\n')  # never empty: '' splits to ['']
        return sum(map(len, lines)) / len(lines) > 200

    def preprocess_file(self, file_path: Path) -> str:
        """Return the cleaned content of one file, or ``""`` if it is skipped.

        A file is skipped when it is larger than ``max_file_size`` bytes,
        cannot be read, or looks minified. Undecodable bytes are ignored.
        """
        try:
            # Skip large files
            if file_path.stat().st_size > self.max_file_size:
                return ""
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except OSError as exc:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"  Skipping unreadable file {file_path}: {exc}")
            return ""

        if self.is_minified(content):
            return ""

        content = self.remove_comments_and_logs(content, file_path.suffix)
        return self.normalize_formatting(content)

    def analyze_project(self, project_path: Path) -> Dict:
        """Scan one project directory and return its statistics.

        The tree is walked once, in sorted order, so the result (including the
        order of ``combined_content``) is deterministic. Directories named in
        ``exclude_dirs`` are neither counted nor descended into. Symlinked
        directories are counted but not followed.

        Returns:
            A dict with file/folder counts, lines of code, heuristic feature
            counts, ``file_contents`` (relative path -> cleaned text) and
            ``combined_content`` (all cleaned texts joined by blank lines).
        """
        stats = {
            'project_name': project_path.name,
            'total_files': 0,
            'total_folders': 0,
            'loc': 0,
            'js_files': 0,
            'jsx_files': 0,
            'json_files': 0,
            'css_files': 0,
            'react_components': 0,
            'express_routes': 0,
            'mongoose_models': 0,
            'file_contents': {},
            'combined_content': ""  # Cache combined content
        }

        all_content = []
        for root, dirnames, filenames in os.walk(project_path):
            # Assigning in place prunes the walk and fixes the visiting order.
            dirnames[:] = sorted(
                name for name in dirnames if name not in self.exclude_dirs)
            stats['total_folders'] += len(dirnames)

            for filename in sorted(filenames):
                file_path = Path(root) / filename
                ext = file_path.suffix
                if ext not in self.valid_extensions or not file_path.is_file():
                    continue

                stats['total_files'] += 1
                counter = self._EXTENSION_COUNTERS.get(ext)
                if counter is not None:
                    stats[counter] += 1

                content = self.preprocess_file(file_path)
                if not content:
                    continue

                relative_path = str(file_path.relative_to(project_path))
                stats['file_contents'][relative_path] = content
                all_content.append(content)
                stats['loc'] += len(content.split('\n'))

                # Heuristic feature detection (only for JS/JSX)
                if ext in ('.js', '.jsx'):
                    if 'React.Component' in content or 'return (' in content:
                        stats['react_components'] += 1
                    if self._EXPRESS_ROUTE_RE.search(content):
                        stats['express_routes'] += 1
                    if 'mongoose.model' in content or 'mongoose.Schema' in content:
                        stats['mongoose_models'] += 1

        # Cache combined content
        stats['combined_content'] = '\n\n'.join(all_content)

        return stats

    def preprocess_all_projects(self) -> pd.DataFrame:
        """Analyze every project and return one summary row per project.

        Projects are processed sequentially in name order, so the key order of
        ``projects_data`` (and of any matrix built from it) is reproducible.
        """
        all_stats = []
        project_dirs = sorted(
            d for d in self.projects_dir.iterdir() if d.is_dir())

        print(f"Found {len(project_dirs)} projects to analyze...")

        for i, project_dir in enumerate(project_dirs, 1):
            print(f"Processing {i}/{len(project_dirs)}: {project_dir.name}")
            stats = self.analyze_project(project_dir)
            self.projects_data[project_dir.name] = stats
            all_stats.append({
                'Project': stats['project_name'],
                'Total Files': stats['total_files'],
                'Total Folders': stats['total_folders'],
                'Lines of Code': stats['loc'],
                'JS Files': stats['js_files'],
                'JSX Files': stats['jsx_files'],
                'JSON Files': stats['json_files'],
                'CSS Files': stats['css_files'],
                'React Components': stats['react_components'],
                'Express Routes': stats['express_routes'],
                'Mongoose Models': stats['mongoose_models']
            })

        return pd.DataFrame(all_stats)

# ==================== OPTIMIZED SIMILARITY ANALYZER ====================

def compute_difflib_pair(args):
    """Score one pair of texts with difflib; module-level so it can be pickled.

    Args:
        args: Tuple ``(i, j, content1, content2)`` - the two matrix indices and
            the texts to compare.

    Returns:
        Tuple ``(i, j, similarity)``. The similarity is 1.0 when ``i == j``.
        Otherwise it is ``SequenceMatcher.ratio()`` when the cheaper
        ``quick_ratio()`` (an upper bound on ``ratio()``) exceeds 0.5, and
        ``quick_ratio() * 0.9`` as a rough estimate when it does not.
    """
    row, col, text_a, text_b = args
    if row == col:
        return (row, col, 1.0)

    matcher = SequenceMatcher(None, text_a, text_b)
    upper_bound = matcher.quick_ratio()
    # The exact ratio is expensive, so it is only computed for promising pairs.
    score = matcher.ratio() if upper_bound > 0.5 else upper_bound * 0.9
    return (row, col, score)

class OptimizedSimilarityAnalyzer:
    """Build pairwise project-similarity matrices from preprocessed data.

    ``projects_data`` maps a project name to a dict that contains at least a
    ``'combined_content'`` string. Row/column ``k`` of every returned matrix
    corresponds to ``project_names[k]``.
    """

    def __init__(self, projects_data: Dict, sample_size: int = None):
        """Store the project data and derive the project ordering from it.

        Args:
            projects_data: Mapping of project name to its preprocessed data.
            sample_size: Stored on the instance; no method of this class reads
                it.
        """
        self.projects_data = projects_data
        self.project_names = list(projects_data.keys())
        self.n_projects = len(self.project_names)
        self.sample_size = sample_size  # For sampling large projects

    def get_combined_content(self, project_name: str, max_chars: int = None) -> str:
        """Return a project's combined text, sampled down when it is too long.

        When ``max_chars`` is falsy or the text already fits, the full text is
        returned. Otherwise five chunks of ``max_chars // 5`` characters are
        taken at evenly spaced offsets and joined with newlines, so the result
        can exceed ``max_chars`` by the four separators. For ``max_chars``
        below 5 (chunk size zero) the text is simply cut to ``max_chars``.
        """
        content = self.projects_data[project_name]['combined_content']
        if not max_chars or len(content) <= max_chars:
            return content

        chunk = max_chars // 5
        if chunk == 0:
            # Five-way sampling is impossible here: the chunks would be empty
            # and, for very short texts, the stride would be zero.
            return content[:max_chars]

        step = len(content) // 5
        samples = [content[start:start + chunk]
                   for start in range(0, 5 * step, step)]
        return '\n'.join(samples)

    def textual_similarity_difflib_parallel(self, max_chars: int = 50000,
                                            max_workers: int = 4) -> np.ndarray:
        """Compute textual similarity using difflib in worker processes.

        Args:
            max_chars: Per-project text budget passed to
                ``get_combined_content``.
            max_workers: Size of the process pool.

        Returns:
            Symmetric ``(n_projects, n_projects)`` matrix with ones on the
            diagonal.
        """
        print("\nComputing textual similarity (difflib - parallel)...")
        # The diagonal is known up front, so only i < j pairs are shipped to
        # the workers.
        similarity_matrix = np.eye(self.n_projects)

        # Prepare content (truncate for speed)
        contents = [self.get_combined_content(proj, max_chars)
                    for proj in self.project_names]

        pairs = [(i, j, contents[i], contents[j])
                 for i in range(self.n_projects)
                 for j in range(i + 1, self.n_projects)]

        if pairs:
            # Batching tasks reduces inter-process round trips.
            chunksize = max(1, len(pairs) // ((max_workers or 1) * 4))
            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                results = executor.map(compute_difflib_pair, pairs,
                                       chunksize=chunksize)
                for i, j, sim in results:
                    similarity_matrix[i, j] = sim
                    similarity_matrix[j, i] = sim

        return similarity_matrix

    def textual_similarity_tfidf(self) -> np.ndarray:
        """Compute textual similarity as cosine similarity of TF-IDF vectors.

        Uses word uni- and bigrams, at most 3000 features. With three or more
        projects, terms occurring in fewer than two documents or in more than
        95% of them are dropped; with fewer projects that pruning cannot be
        satisfied, so it is disabled.

        Returns:
            ``(n_projects, n_projects)`` cosine-similarity matrix (empty when
            there are no projects).
        """
        print("\nComputing textual similarity (TF-IDF + Cosine)...")

        documents = [self.get_combined_content(proj, max_chars=100000)
                     for proj in self.project_names]
        if not documents:
            return np.zeros((0, 0))

        if len(documents) >= 3:
            min_df, max_df = 2, 0.95  # Ignore very rare / very common terms
        else:
            # max_df=0.95 allows fewer than 2 documents here, which conflicts
            # with min_df=2 and makes the vectorizer raise ValueError.
            min_df, max_df = 1, 1.0

        vectorizer = TfidfVectorizer(
            max_features=3000,  # Reduced from 5000
            token_pattern=r'\b\w+\b',
            ngram_range=(1, 2),
            max_df=max_df,
            min_df=min_df
        )
        tfidf_matrix = vectorizer.fit_transform(documents)
        return cosine_similarity(tfidf_matrix)

    def structural_similarity_fast(self) -> np.ndarray:
        """Compute structural similarity from regex-extracted code features.

        For each project the first 100 ``from '...'`` import sources, the first
        100 ``function``/``const`` names and the first 50 route declarations
        are collected into sets. The similarity of two projects is the mean
        Jaccard index over the feature kinds for which at least one of the two
        sets is non-empty, or 0.0 when all are empty.

        Returns:
            Symmetric ``(n_projects, n_projects)`` matrix with ones on the
            diagonal.
        """
        print("\nComputing structural similarity (fast)...")

        import_re = re.compile(r'from\s+[\'"]([^\'"]+)[\'"]')
        function_re = re.compile(r'(?:function|const)\s+(\w+)')
        route_re = re.compile(r'\.(get|post|put|delete)\([\'"]([^\'"]+)')

        # Extract features for all projects
        all_features = []
        for proj in self.project_names:
            content = self.get_combined_content(proj)
            all_features.append({
                'imports': set(import_re.findall(content)[:100]),
                'functions': set(function_re.findall(content)[:100]),
                'routes': set(route_re.findall(content)[:50]),
            })

        similarity_matrix = np.eye(self.n_projects)
        for i in range(self.n_projects):
            feat1 = all_features[i]
            for j in range(i + 1, self.n_projects):
                feat2 = all_features[j]

                sims = []
                for key in ('imports', 'functions', 'routes'):
                    union = feat1[key] | feat2[key]
                    if union:  # skip kinds neither project has
                        sims.append(len(feat1[key] & feat2[key]) / len(union))

                avg_sim = np.mean(sims) if sims else 0.0
                similarity_matrix[i, j] = avg_sim
                similarity_matrix[j, i] = avg_sim

        return similarity_matrix

    def semantic_similarity_fast(self) -> np.ndarray:
        """Compute semantic similarity from sentence-transformer embeddings.

        Each project is represented by up to roughly 5000 sampled characters,
        embedded with the ``all-MiniLM-L6-v2`` model. If importing, loading or
        running the model fails for any reason, the error is printed and the
        TF-IDF similarity is returned instead.
        """
        print("\nComputing semantic similarity (fast - using TF-IDF as fallback)...")

        try:
            # Imported lazily so the dependency stays optional.
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('all-MiniLM-L6-v2')

            # Use smaller chunks
            documents = [self.get_combined_content(proj, max_chars=5000)
                         for proj in self.project_names]

            embeddings = model.encode(documents, batch_size=8, show_progress_bar=True)
            similarity_matrix = cosine_similarity(embeddings)

            return similarity_matrix
        except Exception as e:
            # Deliberate best effort: any failure falls back to TF-IDF.
            print(f"Semantic embedding failed: {e}")
            print("Using TF-IDF as fallback...")
            return self.textual_similarity_tfidf()

# ==================== USAGE EXAMPLE ====================

if __name__ == "__main__":
    # End-to-end example: preprocess every project under PROJECTS_DIR, then
    # compute and save the two cheapest similarity matrices. Everything stays
    # at module level so the results remain available after the run.

    # Configuration
    PROJECTS_DIR = "ALL_PROJECTS"

    # Step 1: Preprocessing
    print("=" * 80)
    print("STEP 1: PREPROCESSING")
    print("=" * 80)
    preprocessor = OptimizedCodePreprocessor(PROJECTS_DIR)
    summary_df = preprocessor.preprocess_all_projects()

    print("\n=== Project Summary Statistics ===")
    print(summary_df.to_string(index=False))

    os.makedirs('results', exist_ok=True)
    summary_df.to_csv('results/preprocessing_summary.csv', index=False)

    # Step 2: Similarity Analysis
    print("\n" + "=" * 80)
    print("STEP 2: SIMILARITY ANALYSIS")
    print("=" * 80)

    if len(preprocessor.projects_data) < 2:
        # With fewer than two projects there is no pair to compare, and the
        # averages below would be taken over an empty selection.
        print("\nNeed at least two projects to compute pairwise similarity; "
              "skipping analysis.")
    else:
        analyzer = OptimizedSimilarityAnalyzer(preprocessor.projects_data)

        # Use fastest methods
        similarity_tfidf = analyzer.textual_similarity_tfidf()
        similarity_structural = analyzer.structural_similarity_fast()

        # Optional: Use difflib only if needed (slowest)
        # similarity_difflib = analyzer.textual_similarity_difflib_parallel()

        # Optional: Use semantic only if needed
        # similarity_semantic = analyzer.semantic_similarity_fast()

        # Save results
        np.save('results/similarity_tfidf.npy', similarity_tfidf)
        np.save('results/similarity_structural.npy', similarity_structural)

        # Average over the strict upper triangle, i.e. each distinct pair once.
        pair_index = np.triu_indices_from(similarity_tfidf, k=1)
        mean_tfidf = similarity_tfidf[pair_index].mean()
        mean_structural = similarity_structural[pair_index].mean()

        # The check mark is written as an escape so that re-encoding the source
        # file cannot garble it.
        print("\n\u2705 Analysis complete! Results saved to 'results/' folder")
        print(f"   - Average TF-IDF similarity: {mean_tfidf:.3f}")
        print(f"   - Average Structural similarity: {mean_structural:.3f}")