# 🚀 SEMA Client - Complete VOC Analysis

**Zero-Setup Korean Voice of Customer Classification for Clients**

## 🎯 For Clients - No Setup Required!

1. **Set Runtime to GPU** (Runtime → Change runtime type → GPU)
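2. **Upload Excel files** to the `/content/sema/data/input/` folder (Step 3 creates this folder; if it does not exist yet, run all cells once, upload when prompted, then re-run from Step 3)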
3. **Run all cells** - everything is automatic!
4. **Download results** - automatic at the end

## ✨ What This Does:
- 🤖 **Auto-detects GPU** and selects best model (small/xlarge)
- 📥 **Auto-downloads** model files from Hugging Face
- 💾 **Caches in Google Drive** for faster future runs
- 📊 **Processes all Excel files** automatically
- 📁 **Organizes input/output** in clean folders
- 📥 **Auto-downloads results** to your computer

## 📋 File Requirements:
- Excel files (.xlsx) with a required **VOC1** column and an optional **VOC2** column
- Korean text content

## 🔧 Step 1: Environment Setup & GPU Detection

In [None]:
print("🚀 SEMA Client - Initializing complete environment...")

# Install system dependencies
!apt-get update -qq && apt-get install -y openjdk-8-jdk -qq

# Set Java environment for Korean NLP
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'

# Install Python packages
!pip install -q "huggingface_hub>=0.16.0" "torch>=2.0.0" "transformers>=4.30.0,<5.0.0" "torchmetrics>=0.11.0" "lightning>=2.0.0" konlpy psutil

# Clone SEMA repository
!git clone -q https://github.com/shc443/sema_inf.git
%cd sema_inf
!pip install -q -e .

print("✅ Basic setup complete!")

# GPU Detection and Model Selection
import torch
import psutil
from pathlib import Path

def detect_optimal_model():
    """Inspect the first CUDA device and pick a model size and batch size.

    Returns:
        A dict with the keys ``size``, ``checkpoint``, ``batch_size``,
        ``reason``, ``gpu_memory`` (GiB) and ``gpu_name``, or ``None`` when
        no CUDA device is available.
    """
    if not torch.cuda.is_available():
        print("❌ No GPU detected! Please set Runtime → Change runtime type → GPU")
        return None

    bytes_per_gib = 1024**3

    # Hardware facts for device 0 and the host
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / bytes_per_gib
    cpu_count = psutil.cpu_count()
    ram_gb = psutil.virtual_memory().total / bytes_per_gib

    print(f"🖥️ System Detected:")
    print(f"   GPU: {gpu_name}")
    print(f"   VRAM: {gpu_memory_gb:.1f} GB")
    print(f"   CPU: {cpu_count} cores")
    print(f"   RAM: {ram_gb:.1f} GB")

    xlarge_ckpt = 'deberta-v3-xlarge-korean_20ep_full_mar17_dropna.ckpt'
    small_ckpt = 'deberta-v3-small-korean_20ep_full_mar17_dropna.ckpt'

    # (minimum VRAM in GiB, size, checkpoint, batch size, reason), checked
    # top-down; the last row is the catch-all for anything below 8 GiB.
    tiers = (
        (14.0, 'xlarge', xlarge_ckpt, 8, 'High VRAM - optimal performance'),
        (10.0, 'xlarge', xlarge_ckpt, 4, 'Medium VRAM - reduced batch size'),
        (8.0, 'small', small_ckpt, 8, 'Standard VRAM - optimized small model'),
        (None, 'small', small_ckpt, 4, 'Limited VRAM - conservative settings'),
    )

    for min_vram, size, checkpoint, batch_size, reason in tiers:
        if min_vram is None or gpu_memory_gb >= min_vram:
            break

    # Key order is kept stable because this dict is later serialized to JSON
    model_config = {
        'size': size,
        'checkpoint': checkpoint,
        'batch_size': batch_size,
        'reason': reason,
        'gpu_memory': gpu_memory_gb,
        'gpu_name': gpu_name,
    }

    print(f"\n🎯 Selected Configuration:")
    print(f"   Model: {model_config['size']} ({model_config['reason']})")
    print(f"   Batch Size: {model_config['batch_size']}")
    print(f"   Checkpoint: {model_config['checkpoint']}")

    return model_config

# Pick the model configuration for this runtime; stop the notebook right
# here when no usable GPU configuration was returned.
MODEL_CONFIG = detect_optimal_model()
if MODEL_CONFIG:
    print("\n✅ GPU detection and model selection complete!")
else:
    raise Exception("GPU detection failed. Please enable GPU runtime.")

## 💾 Step 2: Google Drive Setup & Model Caching

In [None]:
from google.colab import drive
from huggingface_hub import hf_hub_download
import shutil
import time

# Try to use Google Drive as a persistent model cache. Start from the
# "no cache" state and only switch it on once both the mount and the cache
# folder creation have succeeded.
DRIVE_CACHE = None
USE_DRIVE_CACHE = False
try:
    drive.mount('/content/drive', force_remount=True)
    drive_cache_dir = Path('/content/drive/MyDrive/SEMA_Cache')
    drive_cache_dir.mkdir(exist_ok=True)
except Exception as mount_error:
    print(f"⚠️ Google Drive mount failed: {mount_error}")
    print("📥 Will download models directly (slower on repeated runs)")
else:
    DRIVE_CACHE = drive_cache_dir
    print("✅ Google Drive mounted for model caching")
    USE_DRIVE_CACHE = True

def _fetch_from_hub(hf_repo, hf_filename, local_file):
    """Download one file from the Hugging Face repo and copy it to local_file.

    Returns the path of the downloaded file inside the local hub cache.
    """
    # NOTE: hf_hub_download has no `resume` keyword; passing one raises
    # TypeError, so it is deliberately not passed here.
    downloaded_path = hf_hub_download(
        repo_id=hf_repo,
        filename=hf_filename,
        cache_dir="./hf_cache",
    )
    local_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(downloaded_path, local_file)
    return downloaded_path


def _copy_from_drive_cache(cache_file, local_file, hf_filename):
    """Copy a file from the Drive cache to local_file; return True on success."""
    try:
        if not cache_file.exists():
            return False
        print(f"📂 Found in Drive cache: {hf_filename}")
        local_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(cache_file, local_file)
    except OSError as e:
        # An unreadable cache entry should not abort the run; the caller
        # falls back to a fresh download from Hugging Face.
        print(f"⚠️ Drive cache unusable for {hf_filename}: {e}")
        return False
    print(f"✅ Copied from cache: {hf_filename}")
    return True


def download_model_files(config, hf_repo="shc443/sema2025"):
    """Download model files with Google Drive caching.

    Places the three data pickles under ``data/`` and the checkpoint named by
    ``config['checkpoint']`` under ``model/``. Each file is taken from the
    Drive cache when available, otherwise downloaded from ``hf_repo`` and
    then copied into the Drive cache (best effort).

    Args:
        config: Model configuration dict; ``size``, ``checkpoint`` and
            ``batch_size`` are read. When the small checkpoint cannot be
            obtained and the xlarge fallback is used, ``size`` and
            ``batch_size`` are updated in place.
        hf_repo: Hugging Face repository id to download from.

    Returns:
        True when every required file is in place, False otherwise.
    """
    print(f"📥 Setting up model files for {config['size']} model...")

    # Create local directories
    os.makedirs('data', exist_ok=True)
    os.makedirs('model', exist_ok=True)

    # (name in the HF repo, local destination)
    required_files = [
        ('data2.pkl', 'data/data2.pkl'),
        ('voc_etc.pkl', 'data/voc_etc.pkl'),
        ('keyword_doc.pkl', 'data/keyword_doc.pkl'),
        (config['checkpoint'], f"model/{config['checkpoint']}")
    ]

    downloaded_count = 0

    for hf_filename, local_path in required_files:
        local_file = Path(local_path)
        cache_file = DRIVE_CACHE / hf_filename if USE_DRIVE_CACHE else None

        # Prefer the Drive cache; fall through to a download if it is unusable
        if cache_file is not None and _copy_from_drive_cache(cache_file, local_file, hf_filename):
            downloaded_count += 1
            continue

        try:
            print(f"📥 Downloading from HF: {hf_filename}...")
            start_time = time.time()

            downloaded_path = _fetch_from_hub(hf_repo, hf_filename, local_file)

            # Cache in Google Drive if available (failure here is not fatal)
            if cache_file is not None:
                try:
                    shutil.copy2(downloaded_path, cache_file)
                    print(f"💾 Cached to Drive: {hf_filename}")
                except OSError as e:
                    print(f"⚠️ Drive cache failed for {hf_filename}: {e}")

            elapsed = time.time() - start_time
            file_size = local_file.stat().st_size / (1024*1024)  # MB
            print(f"✅ Downloaded: {hf_filename} ({file_size:.1f}MB in {elapsed:.1f}s)")
            downloaded_count += 1

        except Exception as e:
            print(f"❌ Failed to download {hf_filename}: {e}")

            # Only the small checkpoint has a fallback; any other failure is fatal
            if not ('small' in hf_filename and config['size'] == 'small'):
                return False

            print(f"🔄 Small model not found, trying xlarge model...")
            fallback_filename = hf_filename.replace('small', 'xlarge')
            try:
                # The xlarge weights are stored under the originally requested
                # local filename, so config['checkpoint'] still points at them.
                _fetch_from_hub(hf_repo, fallback_filename, local_file)
            except Exception as e2:
                print(f"❌ Fallback also failed: {e2}")
                return False

            print(f"✅ Using xlarge model as fallback")
            config['size'] = 'xlarge'
            config['batch_size'] = min(config['batch_size'], 4)  # Reduce batch size
            downloaded_count += 1

    print(f"\n📊 Download Summary:")
    print(f"   ✅ Successfully downloaded: {downloaded_count}/{len(required_files)} files")
    print(f"   💾 Drive cache: {'Enabled' if USE_DRIVE_CACHE else 'Disabled'}")
    print(f"   🎯 Model ready: {config['size']}")

    return downloaded_count == len(required_files)

# Fetch every required data/model file; the notebook cannot continue
# without them, so a failed download stops execution here.
download_success = download_model_files(MODEL_CONFIG)

if download_success:
    print("\n🎉 All model files ready!")
else:
    raise Exception("Failed to download required model files")

## 📁 Step 3: Setup Workspace & Check Input Files

In [None]:
# Workspace layout: input/output Excel folders plus a logs folder
WORKSPACE = Path('/content/sema')
INPUT_DIR = WORKSPACE / 'data' / 'input'
OUTPUT_DIR = WORKSPACE / 'data' / 'output'
LOGS_DIR = WORKSPACE / 'logs'

# Make sure every folder exists (safe to re-run)
for workspace_dir in (INPUT_DIR, OUTPUT_DIR, LOGS_DIR):
    workspace_dir.mkdir(parents=True, exist_ok=True)
(LOGS_DIR / 'errors').mkdir(exist_ok=True)

print(f"📁 Workspace created at: {WORKSPACE}")
print(f"   📂 Input folder: {INPUT_DIR}")
print(f"   📂 Output folder: {OUTPUT_DIR}")
print(f"   📂 Logs folder: {LOGS_DIR}")

# Look for uploaded workbooks
excel_files = list(INPUT_DIR.glob('*.xlsx'))
print(f"\n📊 Input files found: {len(excel_files)}")

if excel_files:
    print("\n📋 Files ready for processing:")
    for position, workbook in enumerate(excel_files, 1):
        size_mb = workbook.stat().st_size / (1024*1024)
        print(f"   {position}. {workbook.name} ({size_mb:.1f}MB)")

    print(f"\n✅ Ready to process {len(excel_files)} files!")

else:
    from IPython.display import display, HTML

    print("\n⚠️ No Excel files found!")

    # Upload instructions rendered as an HTML panel in the cell output
    upload_help = f"""
    <div style="background: #e3f2fd; padding: 20px; border: 2px solid #2196f3; border-radius: 10px; margin: 15px 0;">
        <h3 style="color: #1976d2; margin-top: 0;">📤 Upload Your Excel Files</h3>
        <p><strong>Please upload your Excel files to this folder:</strong></p>
        <code style="background: #fff; padding: 5px; border-radius: 3px; font-size: 14px;">{INPUT_DIR}</code>
        
        <h4>How to Upload:</h4>
        <ol>
            <li>Click the <strong>📁 Files</strong> icon in the left sidebar</li>
            <li>Navigate to: <code>/content/sema/data/input/</code></li>
            <li>Click the <strong>📤 Upload</strong> button</li>
            <li>Select your Excel files (.xlsx)</li>
            <li>Wait for upload to complete</li>
            <li>Re-run this cell to verify</li>
        </ol>
        
        <h4>File Requirements:</h4>
        <ul>
            <li>Excel format (.xlsx)</li>
            <li>Must have <strong>VOC1</strong> column with Korean text</li>
            <li>Optional <strong>VOC2</strong> column for additional text</li>
        </ul>
    </div>
    """
    display(HTML(upload_help))

    # Halt "Run all" here so later cells do not run without any input
    raise Exception(f"Please upload Excel files to {INPUT_DIR} first!")

## 🚀 Step 4: Initialize SEMA & Process All Files

In [None]:
import time
import json
import traceback
from datetime import datetime
from src.cli import SemaInference

class ClientSemaProcessor:
    """Run SEMA inference over every ``.xlsx`` file in an input directory.

    Keeps track of which files were processed and which failed, and writes
    one JSON error log per failed file under ``<logs_dir>/errors``.
    """

    def __init__(self, model_config, input_dir, output_dir, logs_dir):
        """Store configuration and directories; the model is not loaded here.

        Args:
            model_config: Dict read by this class for the keys ``size``,
                ``checkpoint``, ``batch_size`` and ``gpu_memory``.
            input_dir: Directory scanned for ``*.xlsx`` files.
            output_dir: Directory receiving ``<stem>_output.xlsx`` files.
            logs_dir: Directory whose ``errors`` subfolder receives error logs.
        """
        self.config = model_config
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.logs_dir = Path(logs_dir)
        self.start_time = time.time()
        self.processed_files = []
        self.failed_files = []
        self.sema = None

    def initialize_sema(self):
        """Initialize SEMA with fallback handling.

        Returns:
            True when ``self.sema`` was created, False otherwise. Errors are
            printed, never raised.
        """
        print(f"🧠 Initializing SEMA {self.config['size']} model...")

        try:
            # Clear GPU memory first
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # NOTE(review): the 'small' size is paired with the *base* backbone
            # while its checkpoint filename says "small" — confirm this is the
            # backbone the checkpoint was trained on.
            self.sema = SemaInference(
                model_path='team-lucid/deberta-v3-xlarge-korean' if 'xlarge' in self.config['size'] else 'team-lucid/deberta-v3-base-korean',
                checkpoint_path=f"model/{self.config['checkpoint']}"
            )

            # Set batch size
            self.sema.batch_size = self.config['batch_size']

            print(f"✅ SEMA initialized successfully!")
            print(f"   Model: {self.config['size']}")
            print(f"   Batch Size: {self.sema.batch_size}")
            print(f"   GPU Memory: {self.config['gpu_memory']:.1f}GB")

            return True

        except Exception as e:
            error_msg = str(e).lower()
            if 'out of memory' in error_msg or 'cuda' in error_msg:
                print(f"❌ GPU Memory Error: {e}")

                if self.config['size'] == 'xlarge':
                    print("🔄 Attempting fallback to smaller settings...")

                    # Try smaller batch size first.
                    # NOTE(review): batch size is only applied after the
                    # constructor returns, so this retry repeats the same
                    # constructor call; it can only help with transient errors.
                    if self.config['batch_size'] > 2:
                        self.config['batch_size'] = max(2, self.config['batch_size'] // 2)
                        print(f"   Reducing batch size to {self.config['batch_size']}")
                        return self.initialize_sema()  # Recursive retry

                    # If still failing, suggest small model
                    print("💡 XLarge model requires more memory than available")
                    print("   Please upload a small model or use a higher-tier GPU")
                    return False
                else:
                    print("❌ Even small model failed - GPU memory critically low")
                    return False
            else:
                print(f"❌ Unexpected initialization error: {e}")
                traceback.print_exc()
                return False

    def _write_error_log(self, input_file, error, elapsed, tb_text):
        """Write a JSON error record for input_file; never raises on I/O failure."""
        error_log = {
            'file': input_file.name,
            'error': str(error),
            'timestamp': datetime.now().isoformat(),
            'processing_time': elapsed,
            'model_config': self.config,
            'traceback': tb_text
        }

        errors_dir = self.logs_dir / 'errors'
        try:
            # Create the folder on demand so logging works even when the
            # caller has not prepared the logs directory.
            errors_dir.mkdir(parents=True, exist_ok=True)
            error_file = errors_dir / f"{input_file.stem}_error.json"
            with open(error_file, 'w', encoding='utf-8') as f:
                json.dump(error_log, f, indent=2, ensure_ascii=False)
        except (OSError, TypeError, ValueError) as log_error:
            # A logging problem must not abort the rest of the batch
            print(f"⚠️ Could not write error log for {input_file.name}: {log_error}")

    def process_single_file(self, input_file):
        """Process one Excel file and record the outcome.

        Args:
            input_file: Path of the workbook to process.

        Returns:
            True when ``self.sema.process_file`` reported success and the
            output file exists; False otherwise. Failures are appended to
            ``self.failed_files`` and logged under ``<logs_dir>/errors``.
        """
        input_file = Path(input_file)
        file_start = time.time()
        output_file = self.output_dir / f"{input_file.stem}_output.xlsx"

        try:
            print(f"\n📄 Processing: {input_file.name}")

            # Make sure the destination folder exists before inference writes to it
            self.output_dir.mkdir(parents=True, exist_ok=True)

            # Process file
            success = self.sema.process_file(str(input_file), str(output_file))

            elapsed = time.time() - file_start

            if success and output_file.exists():
                output_size = output_file.stat().st_size / (1024*1024)  # MB
                print(f"✅ Completed: {input_file.name} → {output_file.name}")
                print(f"   Time: {elapsed:.1f}s, Size: {output_size:.1f}MB")
                self.processed_files.append(input_file.name)
                return True

            print(f"❌ Failed: {input_file.name} (no output generated)")
            # Leave a log entry so the batch summary's pointer to the error
            # folder holds for this kind of failure too.
            self._write_error_log(input_file, 'no output generated', elapsed, None)
            self.failed_files.append(input_file.name)
            return False

        except Exception as e:
            elapsed = time.time() - file_start
            print(f"❌ Error processing {input_file.name}: {e}")
            print(f"   Time elapsed: {elapsed:.1f}s")

            # Log detailed error
            self._write_error_log(input_file, e, elapsed, traceback.format_exc())

            self.failed_files.append(input_file.name)
            return False

    def process_all(self):
        """Process all Excel files in the input directory.

        Returns:
            True when at least one file was processed successfully; False
            when initialization failed, no input files exist, or every file
            failed.
        """
        # Initialize SEMA
        if not self.initialize_sema():
            return False

        # Sorted so the processing order is the same on every run
        excel_files = sorted(self.input_dir.glob('*.xlsx'))
        total_files = len(excel_files)

        if total_files == 0:
            print("❌ No Excel files found in input directory")
            return False

        print(f"\n🚀 Starting batch processing of {total_files} files...")
        print(f"⏰ Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"🧠 Model: {self.config['size']} (batch_size={self.config['batch_size']})")

        # Process each file
        for i, excel_file in enumerate(excel_files, 1):
            print(f"\n📊 Progress: {i}/{total_files}")
            self.process_single_file(excel_file)

            # Memory cleanup between files
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Final summary (elapsed time is measured from construction)
        total_time = time.time() - self.start_time
        success_rate = len(self.processed_files) / total_files * 100

        print(f"\n🎉 Processing Complete!")
        print(f"   ✅ Successful: {len(self.processed_files)}/{total_files} files ({success_rate:.1f}%)")
        print(f"   ❌ Failed: {len(self.failed_files)} files")
        print(f"   ⏰ Total time: {total_time:.1f}s")
        print(f"   🧠 Model used: {self.config['size']}")

        if self.failed_files:
            print(f"\n⚠️ Failed files: {', '.join(self.failed_files)}")
            print(f"📁 Error details saved in: {self.logs_dir}/errors/")

        return len(self.processed_files) > 0

# Initialize and run processor
processor = ClientSemaProcessor(MODEL_CONFIG, INPUT_DIR, OUTPUT_DIR, LOGS_DIR)
processing_success = processor.process_all()

# process_all() returns True as soon as ONE file succeeded, so check the
# failed list as well before claiming that everything went through.
if processing_success and not processor.failed_files:
    print("\n🌟 All processing completed successfully!")
elif processing_success:
    print(f"\n⚠️ Processing completed with {len(processor.failed_files)} failed file(s)")
    print("🔍 Check the error logs above for details")
else:
    print("\n💥 Processing failed or incomplete")
    print("🔍 Check the error logs above for details")

## 📥 Step 5: Download Results Automatically

In [None]:
from google.colab import files
import zipfile

# Check output files (sorted so the listing and zip order are stable)
output_files = sorted(OUTPUT_DIR.glob('*.xlsx'))
total_outputs = len(output_files)

# Number of result files for which a browser download was actually started;
# reported in the summary instead of assuming every output was downloaded.
downloaded_outputs = 0

print(f"📁 Output files generated: {total_outputs}")

if total_outputs == 0:
    print("❌ No output files to download")
    print("🔍 Check processing errors above")

elif total_outputs == 1:
    # Single file - download directly
    output_file = output_files[0]
    print(f"📥 Downloading: {output_file.name}")
    try:
        files.download(str(output_file))
        downloaded_outputs = 1
        print(f"✅ Downloaded: {output_file.name}")
    except Exception as e:
        print(f"❌ Download failed: {e}")

else:
    # Multiple files - create zip
    zip_filename = f"SEMA_Results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
    zip_path = OUTPUT_DIR / zip_filename

    print(f"📦 Creating zip file: {zip_filename}")

    try:
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for output_file in output_files:
                zipf.write(output_file, output_file.name)
                print(f"   ✅ Added: {output_file.name}")

        # Download zip file
        zip_size = zip_path.stat().st_size / (1024*1024)  # MB
        print(f"\n📥 Downloading zip file ({zip_size:.1f}MB)...")
        files.download(str(zip_path))
        downloaded_outputs = total_outputs
        print(f"✅ Downloaded: {zip_filename}")

    except Exception as e:
        print(f"❌ Zip creation/download failed: {e}")
        print("📥 Downloading individual files...")

        # Fallback to individual downloads
        for output_file in output_files:
            try:
                files.download(str(output_file))
                downloaded_outputs += 1
                print(f"✅ Downloaded: {output_file.name}")
            except Exception as e2:
                print(f"❌ Failed to download {output_file.name}: {e2}")

# Final summary
print(f"\n🏁 SEMA Client Complete - Session Summary:")
print(f"   📊 Input files: {len(excel_files)}")
print(f"   ✅ Processed: {len(processor.processed_files)}")
print(f"   📥 Downloaded: {downloaded_outputs}")
print(f"   🧠 Model: {MODEL_CONFIG['size']}")
print(f"   💾 Cache: {'Drive enabled' if USE_DRIVE_CACHE else 'Direct download'}")

if len(processor.failed_files) > 0:
    print(f"   ⚠️ Failed: {len(processor.failed_files)} files")
    print(f"   📁 Error logs: {LOGS_DIR}/errors/")

# Only point the user at their Downloads folder when something was sent there
if downloaded_outputs > 0:
    print(f"\n🎊 All done! Check your Downloads folder for results.")
else:
    print(f"\n⚠️ Finished, but no result files were downloaded.")

# Show output format info
if total_outputs > 0:
    print(f"\n📋 Output File Format:")
    print(f"   • Original columns preserved")
    print(f"   • VOC: Processed Korean text")
    print(f"   • topic: Classified topics")
    print(f"   • sentiment: Sentiment analysis")
    print(f"   • keyword: Extracted keywords")