# 🚀 SEMA Auto-Complete VOC Analysis

**Fully Automated Korean Voice of Customer Classification**

## Instructions:
1. Set runtime to **GPU** (Runtime → Change runtime type → GPU)
2. Upload Excel files to `/content/data/input/` folder BEFORE running
3. **Run ALL cells** - completely automated processing
4. Results automatically download when complete

## Features:
- 🧠 **Auto GPU Detection**: Selects optimal model based on available VRAM
- 📁 **Auto File Processing**: Processes all files in input folder
- 🛡️ **Safety Features**: Timeout protection and error handling
- 📥 **Auto Download**: Results download automatically

## File Requirements:
- Excel files with **VOC1** and **VOC2** columns
- Korean text content

## 🔧 Setup & Environment Detection

In [None]:
# Notebook setup cell: installs a JDK and the Python dependencies, then clones
# the sema_inf repository and installs it in editable mode. The `!` and `%`
# lines are IPython magics, so this cell only runs inside a notebook.
print("üîß Setting up SEMA Auto-Complete environment...")

# Install system dependencies (OpenJDK 8 via apt, quiet output)
!apt-get update -qq && apt-get install -y openjdk-8-jdk -qq

# Set Java environment: point JAVA_HOME at the JDK 8 install location
# (presumably needed by konlpy, installed below — confirm)
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'

# Install packages: pinned lower bounds for the ML stack plus konlpy and psutil
!pip install -q "huggingface_hub>=0.16.0" "torch>=2.0.0" "transformers>=4.30.0,<5.0.0" "torchmetrics>=0.11.0" "lightning>=2.0.0" konlpy psutil

# Setup repository
# NOTE: `%cd` changes the working directory for every later cell, so relative
# paths such as data/input resolve inside the cloned sema_inf directory.
# NOTE(review): re-running this cell clones again from inside sema_inf and
# nests the directory — confirm whether re-runs need to be supported.
!git clone -q https://github.com/shc443/sema_inf.git
%cd sema_inf
!pip install -q -e .

print("‚úÖ Basic setup complete!")

## 🧠 GPU Detection & Model Selection

In [None]:
import torch
import psutil
import os
from pathlib import Path

def detect_gpu_and_select_model():
    """Inspect the CUDA device and pick a model configuration for it.

    Prints the detected GPU, its total memory and basic host information,
    then chooses a model size, checkpoint file name and batch size from the
    GPU's total memory expressed in GiB.

    Returns:
        A ``(config, error)`` tuple. On success ``config`` is a dict with the
        keys ``model_size``, ``checkpoint_file``, ``batch_size``,
        ``gpu_memory`` (GiB) and ``gpu_name``, and ``error`` is ``None``.
        When no CUDA device is available ``config`` is ``None`` and ``error``
        is a non-empty message, so a caller that tests ``error`` stops
        before indexing into a missing config.
    """

    # Check GPU availability
    if not torch.cuda.is_available():
        print("‚ùå No GPU detected! Please change runtime to GPU.")
        # Return a real message: a (None, None) result looks like success to
        # callers that only check the error slot.
        return None, "No GPU detected! Please change runtime to GPU."

    # Get GPU info (device 0 only)
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)

    print(f"üöÄ GPU Detected: {gpu_name}")
    print(f"üìä GPU Memory: {gpu_memory_gb:.1f} GB")

    # System info (informational only; does not affect the selection)
    cpu_count = psutil.cpu_count()
    ram_gb = psutil.virtual_memory().total / (1024**3)
    print(f"üñ•Ô∏è System: {cpu_count} cores, {ram_gb:.1f}GB RAM")

    # Model selection based on GPU memory
    # NOTE(review): memory is measured in GiB (bytes / 1024**3), so a card
    # marketed as 16 GB may report slightly under 15.0 here and land in the
    # reduced-batch branch — confirm the threshold matches the intended GPUs.
    if gpu_memory_gb >= 15.0:  # High-end GPU
        model_size = "xlarge"
        checkpoint_file = "deberta-v3-xlarge-korean_20ep_full_mar17_dropna.ckpt"
        batch_size = 8
        print(f"üéØ Selected: XLarge model (optimal for {gpu_memory_gb:.1f}GB)")

    elif gpu_memory_gb >= 8.0:  # Medium GPU (might work with xlarge)
        model_size = "xlarge"
        checkpoint_file = "deberta-v3-xlarge-korean_20ep_full_mar17_dropna.ckpt"
        batch_size = 4  # Smaller batch for safety
        print(f"‚ö†Ô∏è Selected: XLarge model with reduced batch size ({gpu_memory_gb:.1f}GB)")
        print("üí° Will fallback to small model if memory issues occur")

    else:  # Low memory GPU
        model_size = "small"
        checkpoint_file = "deberta-v3-small-korean_20ep_full_mar17_dropna.ckpt"  # Future small model
        batch_size = 8
        print(f"üîß Selected: Small model (optimized for {gpu_memory_gb:.1f}GB)")

    return {
        'model_size': model_size,
        'checkpoint_file': checkpoint_file,
        'batch_size': batch_size,
        'gpu_memory': gpu_memory_gb,
        'gpu_name': gpu_name
    }, None

# Detect and select model
gpu_config, error = detect_gpu_and_select_model()

# A missing config is fatal even when no error text came back: the summary
# below indexes into gpu_config, so stop here with a clear message instead of
# failing on a None subscript.
if error or gpu_config is None:
    error = error or "No GPU configuration available; switch the runtime to GPU and re-run"
    print(f"‚ùå Error: {error}")
    raise RuntimeError(error)

print("\n‚úÖ Configuration ready!")
print(f"   Model: {gpu_config['model_size']}")
print(f"   Batch Size: {gpu_config['batch_size']}")
print(f"   Checkpoint: {gpu_config['checkpoint_file']}")

## 📁 Initialize Directories & Check Input Files

In [None]:
# Create the working directories used by this and the following cells
for directory in ('data/input', 'data/output', 'logs', 'logs/errors'):
    os.makedirs(directory, exist_ok=True)

# Check for input files.
# Relative paths resolve against the current working directory, which the
# setup cell's `%cd` moves away from the notebook's starting folder. Resolve
# the path once so the upload instructions name the folder actually scanned.
input_dir = Path('data/input')
input_dir_abs = input_dir.resolve()
# Sorted so the listing below has a stable order across runs
excel_files = sorted(input_dir.glob('*.xlsx'))

print(f"üìÅ Directories created")
print(f"üìä Input files found: {len(excel_files)}")

if not excel_files:
    print("\n‚ö†Ô∏è No Excel files found in data/input/ directory!")
    print(f"üì§ Please upload your Excel files to {input_dir_abs}/ before continuing")
    print("üí° Use the file browser on the left or the upload button")

    # Show how to upload files
    from html import escape
    from IPython.display import display, HTML

    # Escape the path before embedding it in markup
    shown_dir = escape(str(input_dir_abs))
    display(HTML(f"""
    <div style="background: #fff3cd; padding: 15px; border: 1px solid #ffeaa7; border-radius: 5px; margin: 10px 0;">
        <h4>üì§ How to Upload Files:</h4>
        <ol>
            <li>Click the <strong>üìÅ folder icon</strong> in the left sidebar</li>
            <li>Navigate to <code>{shown_dir}/</code> folder</li>
            <li>Click <strong>Upload</strong> button and select your Excel files</li>
            <li>Wait for upload to complete</li>
            <li>Re-run this cell to verify files are uploaded</li>
        </ol>
    </div>
    """))

    # Don't proceed without files
    raise Exception("Please upload Excel files to data/input/ directory first")
else:
    print("\nüìã Files to process:")
    for i, file in enumerate(excel_files, 1):
        print(f"   {i}. {file.name}")
    print("\n‚úÖ Ready to process!")

## 🚀 Load SEMA Model & Start Processing

In [None]:
import time
import threading
import traceback
from datetime import datetime
from google.colab import files

class AutoSemaProcessor:
    """Load the SEMA model and run it over every Excel file in ``data/input``.

    Results are written to ``data/output`` and per-file failures are logged
    as JSON under ``logs/errors``.
    """

    def __init__(self, gpu_config):
        """Store the model configuration and reset the run bookkeeping.

        Args:
            gpu_config: Dict providing at least ``model_size``,
                ``batch_size`` and ``gpu_memory``. It is updated in place
                when loading falls back to the small model.
        """
        # Selected model settings; mutated by load_sema_with_fallback
        self.gpu_config = gpu_config
        # Wall-clock start, used for the total time in the summary
        self.start_time = time.time()
        # Names of input files that succeeded / failed
        self.processed_files = []
        self.failed_files = []

    def load_sema_with_fallback(self):
        """Create the SEMA CLI object, retrying once with small-model settings.

        Returns:
            A ``SemaColabCLI`` instance whose ``batch_size`` attribute has
            been set from ``self.gpu_config``.

        Raises:
            ImportError: If ``colab_cli`` cannot be imported.
            Exception: Whatever ``SemaColabCLI()`` raised when the error is
                not memory/CUDA related, when the small model was already
                selected, or when the retry fails as well.
        """
        print(f"üîÑ Loading SEMA with {self.gpu_config['model_size']} model...")

        # Imported before the retried section: a failed import cannot be
        # cured by retrying, and the fallback below needs the name bound.
        from colab_cli import SemaColabCLI

        try:
            # Try to initialize with selected model
            sema = SemaColabCLI()
            sema.batch_size = self.gpu_config['batch_size']

            # On xlarge with under 12 GiB, release cached GPU memory
            if self.gpu_config['model_size'] == 'xlarge' and self.gpu_config['gpu_memory'] < 12.0:
                print("üß™ Testing XLarge model with limited memory...")
                torch.cuda.empty_cache()  # Clear cache first

            print(f"‚úÖ SEMA loaded successfully!")
            print(f"   Model: {self.gpu_config['model_size']}")
            print(f"   Batch Size: {sema.batch_size}")
            return sema

        except Exception as e:
            # Any message mentioning CUDA or out-of-memory is treated as a
            # memory problem and becomes eligible for the fallback.
            message = str(e).lower()
            if 'out of memory' not in message and 'cuda' not in message:
                print(f"‚ùå Unexpected error: {e}")
                raise  # bare raise keeps the original traceback

            print(f"‚ùå Memory error with {self.gpu_config['model_size']} model: {e}")

            if self.gpu_config['model_size'] != 'xlarge':
                print("‚ùå Small model failed - no fallback available")
                raise

            print("üîÑ Falling back to small model...")

            # Update config for small model
            # NOTE(review): SemaColabCLI() takes no arguments in either
            # attempt, so these settings (other than batch_size) do not
            # visibly reach it — confirm how the checkpoint is selected.
            self.gpu_config['model_size'] = 'small'
            self.gpu_config['checkpoint_file'] = 'deberta-v3-small-korean_20ep_full_mar17_dropna.ckpt'
            self.gpu_config['batch_size'] = 8

            # Clear GPU memory
            torch.cuda.empty_cache()

            # Try again with small model
            try:
                sema = SemaColabCLI()
                sema.batch_size = self.gpu_config['batch_size']
            except Exception as e2:
                print(f"‚ùå Small model also failed: {e2}")
                raise
            print("‚úÖ Small model loaded successfully!")
            return sema

    def _write_error_log(self, input_file, error, trace):
        """Best-effort dump of one file's failure to ``logs/errors`` as JSON.

        A problem while writing the log is reported and swallowed so that it
        cannot abort processing of the remaining files.
        """
        import json

        error_log = {
            'file': input_file.name,
            'error': str(error),
            'timestamp': datetime.now().isoformat(),
            'traceback': trace
        }
        log_path = Path('logs/errors') / f'{input_file.stem}_error.json'
        try:
            log_path.parent.mkdir(parents=True, exist_ok=True)
            with open(log_path, 'w', encoding='utf-8') as f:
                json.dump(error_log, f, indent=2)
        except OSError as log_error:
            print(f"‚ö†Ô∏è Could not write error log for {input_file.name}: {log_error}")

    def process_all_files(self):
        """Process every ``*.xlsx`` file in ``data/input`` and print a summary.

        Each file is passed to ``sema.process_file`` with an output path of
        ``data/output/<stem>_output.xlsx``. A file that raises is recorded in
        ``self.failed_files`` and logged; processing then continues with the
        next file.

        Returns:
            ``True`` if at least one file was processed successfully,
            ``False`` otherwise (including when loading the model fails).
        """
        try:
            # Load SEMA
            sema = self.load_sema_with_fallback()

            # Get input files (sorted for a reproducible processing order)
            input_files = sorted(Path('data/input').glob('*.xlsx'))
            total_files = len(input_files)

            print(f"\nüöÄ Starting processing of {total_files} files...")
            print(f"‚è∞ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

            for i, input_file in enumerate(input_files, 1):
                print(f"\nüìÑ Processing file {i}/{total_files}: {input_file.name}")
                file_start = time.time()
                output_file = f"data/output/{input_file.stem}_output.xlsx"

                try:
                    # Process single file
                    success = sema.process_file(str(input_file), output_file)
                except Exception as e:
                    # Capture the traceback while the exception is active
                    trace = traceback.format_exc()
                    self.failed_files.append(input_file.name)
                    print(f"‚ùå Error processing {input_file.name}: {e}")
                    self._write_error_log(input_file, e, trace)
                    continue

                if success:
                    elapsed = time.time() - file_start
                    self.processed_files.append(input_file.name)
                    print(f"‚úÖ {input_file.name} completed in {elapsed:.1f}s")
                else:
                    self.failed_files.append(input_file.name)
                    print(f"‚ùå {input_file.name} failed")

            # Summary
            total_time = time.time() - self.start_time
            print(f"\nüéâ Processing Summary:")
            print(f"   ‚úÖ Successful: {len(self.processed_files)} files")
            print(f"   ‚ùå Failed: {len(self.failed_files)} files")
            print(f"   ‚è∞ Total time: {total_time:.1f}s")
            print(f"   üß† Model used: {self.gpu_config['model_size']}")

            if self.failed_files:
                print(f"\n‚ùå Failed files: {', '.join(self.failed_files)}")
                print(f"üìÅ Error logs saved in logs/errors/ directory")

            return len(self.processed_files) > 0

        except Exception as e:
            # Top-level boundary for the notebook: report and signal failure
            print(f"‚ùå Critical error in processing: {e}")
            traceback.print_exc()
            return False

# Build the processor from the selected GPU configuration and run the whole
# batch; `processor` and `success` stay at module level for the cells below.
processor = AutoSemaProcessor(gpu_config)
success = processor.process_all_files()

# Report the overall outcome (failure branch first, same messages)
if not success:
    print(
        "\nüí• Processing failed or no files processed",
        "üîç Check error logs for details",
        sep="\n",
    )
else:
    print("\nüéä Processing completed successfully!")

## 📥 Auto-Download Results

In [None]:
# Check output files: every workbook currently present in data/output
output_files = list(Path('data/output').glob('*.xlsx'))
# Files whose download call returned without raising, so the messages and
# the final count reflect what actually happened rather than what was tried.
downloaded_files = []

if output_files:
    print(f"üì• Downloading {len(output_files)} result files...")

    # Download all output files; one failure does not stop the rest
    for output_file in output_files:
        try:
            files.download(str(output_file))
        except Exception as e:
            print(f"‚ùå Failed to download {output_file.name}: {e}")
        else:
            downloaded_files.append(output_file)
            print(f"‚úÖ Downloaded: {output_file.name}")

    if len(downloaded_files) == len(output_files):
        print(f"\nüéâ All {len(output_files)} files downloaded to your computer!")
    else:
        print(f"\n‚ö†Ô∏è Downloaded {len(downloaded_files)} of {len(output_files)} files")

    if downloaded_files:
        print("üìÅ Check your Downloads folder")

        # Show file details
        print("\nüìã Downloaded files:")
        for i, file in enumerate(downloaded_files, 1):
            print(f"   {i}. {file.name}")

else:
    print("‚ùå No output files found to download")
    print("üîç Check processing logs above for errors")

# Final summary
print(f"\nüèÅ SEMA Auto-Complete finished!")
print(f"   üìä Processed: {len(processor.processed_files)} files")
print(f"   üì• Downloaded: {len(downloaded_files)} files")
print(f"   üß† Model used: {gpu_config['model_size']}")

if processor.failed_files:
    print(f"   ‚ö†Ô∏è Failed: {len(processor.failed_files)} files (check error logs)")