# 🚀 CekAjaYuk - Complete Model Training
## Deteksi Lowongan Kerja Palsu dengan Machine Learning & Deep Learning

**Notebook ini menggabungkan semua proses training:**
1. 📂 Dataset Preparation & Validation
2. 🔧 Data Preprocessing & Feature Extraction
3. 🌲 Random Forest Training & Optimization
4. 🧠 CNN/TensorFlow Deep Learning Training
5. 📊 Model Evaluation & Export

---
### 📋 Requirements:
- Dataset: 800 gambar (400 fake + 400 genuine)
- Python 3.7+
- Libraries: scikit-learn, tensorflow, opencv, pillow

### 🎯 Output:
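- `random_forest_production.pkl`
- `cnn_production.h5`
- `feature_scaler_production.pkl`
- `feature_names_production.txt`
- `model_metadata.json`
- `text_vectorizer_production.pkl` *(not generated by this notebook — no text vectorizer is trained in any cell below)*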

## 🔧 1. Setup & Installation

In [None]:
# Install required packages for Google Colab.
# The leading "!" runs each line as a shell command (IPython/Colab syntax),
# so this cell only works inside a notebook environment.
!pip install opencv-python-headless
!pip install scikit-learn
!pip install tensorflow
!pip install pillow
!pip install matplotlib seaborn
!pip install joblib

# Printed unconditionally: the exit status of the pip commands is not checked.
print("✅ All packages installed successfully!")

In [None]:
# Import all required libraries
import os
import sys
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import json
import joblib
import zipfile
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score
)

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Dense, Flatten, Dropout, 
    GlobalAveragePooling2D, BatchNormalization
)
from tensorflow.keras.applications import VGG16
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Confirm the imports worked and record the TensorFlow / Python versions
# in the cell output.
print("📚 All libraries imported successfully!")
print(f"🔥 TensorFlow version: {tf.__version__}")
print(f"🐍 Python version: {sys.version}")

## 📂 2. Dataset Upload & Preparation

### 📋 Instructions:
1. **Zip your dataset** dengan struktur:
   ```
   dataset.zip
   ├── fake/
   │   ├── fake_job_001.jpg
   │   ├── fake_job_002.jpg
   │   └── ... (400 files)
   └── genuine/
       ├── genuine_job_001.jpg
       ├── genuine_job_002.jpg
       └── ... (400 files)
   ```
2. **Upload** file `dataset.zip` ke Colab
3. **Run** cell di bawah untuk extract

In [None]:
from google.colab import files

print("📤 Upload your dataset.zip file:")
uploaded = files.upload()

# Pick the first uploaded file whose name ends in ".zip" (if any) and
# unpack it into the current working directory.
zip_name = next((name for name in uploaded if name.endswith('.zip')), None)

if zip_name is None:
    print("❌ No zip file found. Please upload dataset.zip")
else:
    print(f"📦 Extracting {zip_name}...")
    with zipfile.ZipFile(zip_name, 'r') as archive:
        archive.extractall('.')
    print(f"✅ {zip_name} extracted successfully!")

In [None]:
# Validate dataset structure
def validate_dataset(dataset_dir='dataset'):
    """Check that ``dataset_dir`` holds non-empty ``fake/`` and ``genuine/`` folders.

    Only files ending in .jpg or .png (any letter case) are counted.
    Progress and failure messages are printed.

    Returns:
        True when both class folders exist and each contains at least one
        counted image, otherwise False.
    """
    root = Path(dataset_dir)
    if not root.exists():
        print(f"❌ Dataset directory '{dataset_dir}' not found!")
        return False

    # Insertion order matters: the fake folder is checked before genuine.
    class_dirs = {'Fake': root / 'fake', 'Genuine': root / 'genuine'}
    for class_label, class_dir in class_dirs.items():
        if not class_dir.exists():
            print(f"❌ {class_label} directory not found: {class_dir}")
            return False

    # Count image files per class
    patterns = ('*.[jJ][pP][gG]', '*.[pP][nN][gG]')
    totals = {
        class_label: sum(len(list(class_dir.glob(pattern))) for pattern in patterns)
        for class_label, class_dir in class_dirs.items()
    }
    n_fake = totals['Fake']
    n_genuine = totals['Genuine']

    print(f"📊 Dataset Validation:")
    print(f"   📁 Fake samples: {n_fake}")
    print(f"   📁 Genuine samples: {n_genuine}")
    print(f"   📁 Total samples: {n_fake + n_genuine}")

    if not (n_fake and n_genuine):
        print("❌ Dataset validation failed: Empty directories")
        return False

    print("✅ Dataset validation passed!")
    return True

# Validate the uploaded dataset: default location first, then the fallbacks
dataset_valid = validate_dataset()

if not dataset_valid:
    print("\n🔄 Trying alternative dataset paths...")
    # any() stops at the first candidate that validates successfully
    dataset_valid = any(
        validate_dataset(candidate) for candidate in ('data', 'datasets', '.')
    )

if dataset_valid:
    print("\n🎉 Ready to proceed with training!")
else:
    print("\n❌ Dataset validation failed. Please check your upload.")

## 🔧 3. Feature Extraction & Data Preprocessing

In [None]:
class ImageFeatureExtractor:
    """Load job-posting images and compute hand-crafted visual features.

    Each image is read with OpenCV, converted from BGR to RGB, resized to
    ``img_size`` and scaled to float32 values in [0, 1].  A 10-element
    feature vector (colour statistics, brightness, contrast, edge density
    and Laplacian variance) is derived from every loaded image.
    """

    # Case-insensitive glob patterns used to discover image files.  Only
    # .jpg and .png files are picked up, even though ``supported_formats``
    # lists more extensions.
    _IMAGE_PATTERNS = ('*.[jJ][pP][gG]', '*.[pP][nN][gG]')
    # A progress line is printed after every this-many files.
    _PROGRESS_INTERVAL = 50

    def __init__(self, img_size=(224, 224)):
        """Store the target size passed to ``cv2.resize`` for every image."""
        self.img_size = img_size
        self.supported_formats = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}

    def load_and_preprocess_image(self, image_path):
        """Read one image file and return it as a normalized RGB array.

        Args:
            image_path: Path (or string) of the image file.

        Returns:
            A float32 array with values in [0, 1], or None when OpenCV cannot
            read the file or any processing step raises.
        """
        try:
            img = cv2.imread(str(image_path))
            if img is None:
                # cv2.imread signals an unreadable file by returning None
                return None

            # OpenCV loads BGR; the rest of the pipeline works in RGB
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, self.img_size)

            # Scale 8-bit pixel values to [0, 1]
            return img.astype(np.float32) / 255.0

        except Exception as e:
            # Best-effort loading: report the file and let the caller skip it
            print(f"Error processing {image_path}: {e}")
            return None

    def extract_visual_features(self, img):
        """Compute the 10 visual features of a preprocessed image.

        Args:
            img: RGB float image as produced by ``load_and_preprocess_image``,
                or None.

        Returns:
            A 1-D array of 10 values in this order: mean of R, G, B; standard
            deviation of R, G, B; grayscale mean (brightness); grayscale
            standard deviation (contrast); fraction of Canny edge pixels;
            variance of the Laplacian.  All zeros when ``img`` is None.
        """
        if img is None:
            return np.zeros(10)

        # Back to 8-bit once; the grayscale image feeds every OpenCV call below
        img_uint8 = (img * 255).astype(np.uint8)
        gray = cv2.cvtColor(img_uint8, cv2.COLOR_RGB2GRAY)

        # 1. Per-channel colour statistics (means first, then deviations)
        features = [np.mean(img[:, :, channel]) for channel in range(3)]
        features.extend(np.std(img[:, :, channel]) for channel in range(3))

        # 2. Brightness and contrast
        features.append(np.mean(gray))
        features.append(np.std(gray))

        # 3. Edge density: share of pixels marked as edges by Canny
        edges = cv2.Canny(gray, 50, 150)
        features.append(np.sum(edges > 0) / (edges.shape[0] * edges.shape[1]))

        # 4. Texture: variance of the Laplacian response
        features.append(cv2.Laplacian(gray, cv2.CV_64F).var())

        return np.array(features)

    def _iter_class_samples(self, class_dir, class_name):
        """Yield ``(image, features, filename)`` for each loadable image in a class folder.

        Files are visited in sorted path order so the resulting dataset order
        (and therefore any seeded split made from it) does not depend on the
        order in which the filesystem happens to list the files.
        """
        image_paths = sorted(
            path
            for pattern in self._IMAGE_PATTERNS
            for path in class_dir.glob(pattern)
        )

        print(f"📂 Loading {len(image_paths)} {class_name} samples...")
        for position, image_path in enumerate(image_paths, start=1):
            img = self.load_and_preprocess_image(image_path)
            if img is not None:
                yield img, self.extract_visual_features(img), image_path.name

            if position % self._PROGRESS_INTERVAL == 0:
                print(f"   Processed {position}/{len(image_paths)} {class_name} samples")

    def load_dataset(self, dataset_dir='dataset'):
        """Load every image under ``dataset_dir/fake`` and ``dataset_dir/genuine``.

        Unreadable files are skipped.  Fake samples are labelled 0 and genuine
        samples 1; fake samples come first in the returned arrays.

        Args:
            dataset_dir: Directory containing the ``fake`` and ``genuine``
                sub-folders.

        Returns:
            A dict with keys ``'images'`` (stacked preprocessed images),
            ``'features'`` (one 10-element row per image), ``'labels'``
            (0/1 array) and ``'filenames'`` (list of file names), all aligned
            by index.
        """
        dataset_path = Path(dataset_dir)

        images = []
        features = []
        labels = []
        filenames = []

        for class_name, label in (('fake', 0), ('genuine', 1)):
            samples = self._iter_class_samples(dataset_path / class_name, class_name)
            for img, feature_vector, filename in samples:
                images.append(img)
                features.append(feature_vector)
                labels.append(label)
                filenames.append(filename)

        print(f"\n✅ Dataset loaded successfully!")
        print(f"   📊 Total samples: {len(images)}")
        print(f"   📊 Fake samples: {labels.count(0)}")
        print(f"   📊 Genuine samples: {labels.count(1)}")

        return {
            'images': np.array(images),
            'features': np.array(features),
            'labels': np.array(labels),
            'filenames': filenames
        }

# Initialize feature extractor
feature_extractor = ImageFeatureExtractor()
print("🔧 Feature extractor initialized!")

In [None]:
# Load and process the dataset
print("🚀 Starting dataset loading and feature extraction...")
print("⏱️  This may take several minutes depending on dataset size...\n")

# Locate the dataset instead of assuming it lives in 'dataset/': a zip that
# contains fake/ and genuine/ at its top level extracts them into the current
# directory, so the usual alternative locations are tried as well.
dataset_dir = next(
    (
        candidate
        for candidate in ('dataset', 'data', 'datasets', '.')
        if (Path(candidate) / 'fake').is_dir() and (Path(candidate) / 'genuine').is_dir()
    ),
    None,
)
if dataset_dir is None:
    raise FileNotFoundError(
        "No dataset directory containing both 'fake/' and 'genuine/' was found "
        "(looked in: dataset, data, datasets, .)"
    )
print(f"📁 Using dataset directory: {dataset_dir}")

# Load dataset
dataset = feature_extractor.load_dataset(dataset_dir)

# Extract components
X_images = dataset['images']
X_features = dataset['features']
y = dataset['labels']
filenames = dataset['filenames']

# Fail here with a clear message rather than later inside the stratified
# split, which needs samples from both classes.
unique, counts = np.unique(y, return_counts=True)
if len(unique) < 2:
    raise ValueError(
        f"Need images from both classes to train, but loaded {len(y)} samples "
        f"with labels {unique.tolist()} from '{dataset_dir}'"
    )

print(f"\n📊 Dataset Summary:")
print(f"   🖼️  Image shape: {X_images.shape}")
print(f"   🔢 Feature shape: {X_features.shape}")
print(f"   🏷️  Labels shape: {y.shape}")
print(f"   📁 Files processed: {len(filenames)}")

# Data distribution (label 0 = fake, anything else is reported as genuine)
print(f"\n📈 Class Distribution:")
for label, count in zip(unique, counts):
    class_name = 'Fake' if label == 0 else 'Genuine'
    percentage = (count / len(y)) * 100
    print(f"   {class_name}: {count} samples ({percentage:.1f}%)")

## 🔄 4. Data Splitting & Preprocessing

In [None]:
# Split data for training, validation and testing
print("🔄 Splitting dataset...")

# Both splits hold out 20% and use the same seed
split_options = {'test_size': 0.2, 'random_state': 42}

# First split: carve the test set out of the full dataset (stratified by label)
(X_img_train, X_img_test,
 X_feat_train, X_feat_test,
 y_train, y_test) = train_test_split(
    X_images, X_features, y, stratify=y, **split_options
)

# Second split: carve the validation set out of the remaining training data
(X_img_train, X_img_val,
 X_feat_train, X_feat_val,
 y_train, y_val) = train_test_split(
    X_img_train, X_feat_train, y_train, stratify=y_train, **split_options
)

print(f"📊 Data Split Summary:")
for caption, subset in (
    ("🏋️  Training", X_img_train),
    ("✅ Validation", X_img_val),
    ("🧪 Testing", X_img_test),
):
    print(f"   {caption}: {len(subset)} samples")

# Scale the hand-crafted features; statistics come from the training split only
print("\n⚙️ Scaling features...")
scaler = StandardScaler()
scaler.fit(X_feat_train)
X_feat_train_scaled = scaler.transform(X_feat_train)
X_feat_val_scaled = scaler.transform(X_feat_val)
X_feat_test_scaled = scaler.transform(X_feat_test)

print("✅ Data preprocessing completed!")

## 🌲 5. Random Forest Training & Optimization

In [None]:
# Random Forest training with an exhaustive hyperparameter grid search
print("🌲 Training Random Forest Classifier...")

# Every combination of these values is evaluated by the grid search
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Base estimator whose hyperparameters are tuned
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

print("🔍 Performing hyperparameter optimization...")
print("⏱️  This may take 10-15 minutes...")

# 5-fold cross-validation on the scaled training features, scored by accuracy
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_feat_train_scaled, y_train)

# Estimator refitted with the best parameter combination
best_rf = grid_search.best_estimator_

print(f"\n🎯 Best Random Forest Parameters:")
for param_name, param_value in grid_search.best_params_.items():
    print(f"   {param_name}: {param_value}")

print(f"\n📊 Best Cross-Validation Score: {grid_search.best_score_:.4f}")

In [None]:
# Evaluate Random Forest
print("📊 Evaluating Random Forest Model...")


def _rf_split_results(X_scaled, y_true):
    """Return (predictions, class-1 probabilities, accuracy, ROC AUC) for one split."""
    predictions = best_rf.predict(X_scaled)
    positive_proba = best_rf.predict_proba(X_scaled)[:, 1]
    return (
        predictions,
        positive_proba,
        accuracy_score(y_true, predictions),
        roc_auc_score(y_true, positive_proba),
    )


# Predictions, probabilities and metrics for each split
y_train_pred, y_train_proba, train_acc, train_auc = _rf_split_results(X_feat_train_scaled, y_train)
y_val_pred, y_val_proba, val_acc, val_auc = _rf_split_results(X_feat_val_scaled, y_val)
y_test_pred, y_test_proba, test_acc, test_auc = _rf_split_results(X_feat_test_scaled, y_test)

print(f"\n🎯 Random Forest Performance:")
for metric_label, metric_values in (
    ('Accuracy', (train_acc, val_acc, test_acc)),
    ('AUC', (train_auc, val_auc, test_auc)),
):
    for split_label, metric_value in zip(('Training', 'Validation', 'Test'), metric_values):
        print(f"   📈 {split_label} {metric_label}: {metric_value:.4f}")

# Per-class precision / recall / F1 on the held-out test set
print(f"\n📋 Detailed Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=['Fake', 'Genuine']))

# Names of the 10 extracted features, in the order the extractor emits them
feature_names = [
    'Red_Mean', 'Green_Mean', 'Blue_Mean', 'Red_Std', 'Green_Std', 'Blue_Std',
    'Brightness', 'Contrast', 'Edge_Density', 'Texture_Variance'
]

# Rank the features by the forest's importance scores
importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values('Importance', ascending=False)

print(f"\n🔍 Top 5 Most Important Features:")
for ranked in feature_importance_df.head().itertuples(index=False):
    print(f"   {ranked.Feature}: {ranked.Importance:.4f}")

# Horizontal bar chart of all feature importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Random Forest Feature Importance')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

## 🧠 6. CNN/Deep Learning Training

In [None]:
# Build CNN Model
def create_cnn_model(input_shape=(224, 224, 3)):
    """Build the (uncompiled) CNN used to classify job-posting images.

    Architecture: three Conv-BatchNorm-MaxPool-Dropout blocks (32, 64, 128
    filters), a 256-filter convolution followed by global average pooling,
    two dense layers (512 and 256 units) and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    model = Sequential()

    # Convolutional blocks 1-3: Conv -> BatchNorm -> 2x2 max-pool -> 25% dropout
    for block_index, n_filters in enumerate((32, 64, 128)):
        # Only the very first layer declares the input shape
        first_layer_args = {'input_shape': input_shape} if block_index == 0 else {}
        model.add(Conv2D(n_filters, (3, 3), activation='relu', **first_layer_args))
        model.add(BatchNormalization())
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(0.25))

    # Convolutional block 4: pooled globally instead of with a 2x2 window
    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(GlobalAveragePooling2D())
    model.add(Dropout(0.5))

    # Dense head: (units, dropout rate) per layer
    for n_units, drop_rate in ((512, 0.5), (256, 0.3)):
        model.add(Dense(n_units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    # Single sigmoid unit for the binary fake/genuine decision
    model.add(Dense(1, activation='sigmoid'))

    return model

# Instantiate the network with its default 224x224x3 input
print("🧠 Creating CNN Model...")
cnn_model = create_cnn_model()

# Binary classification setup: sigmoid output + binary cross-entropy
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Layer-by-layer overview
print("\n📋 CNN Model Architecture:")
cnn_model.summary()

# Parameter count (also stored in the exported metadata later on)
total_params = cnn_model.count_params()
print(f"\n📊 Total Parameters: {total_params:,}")

In [None]:
# Data Augmentation
print("🔄 Setting up data augmentation...")

# Random transformations applied to training images only.
# NOTE(review): horizontal_flip mirrors any text in the posters — confirm
# that mirrored samples are actually wanted for this task.
augmentation_options = dict(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Validation images are passed through unchanged
val_datagen = ImageDataGenerator()

# Batch iterators over the in-memory arrays
train_generator = train_datagen.flow(X_img_train, y_train, batch_size=32)
val_generator = val_datagen.flow(X_img_val, y_val, batch_size=32)

print("✅ Data augmentation setup completed!")

In [None]:
# Training Callbacks
print("⚙️ Setting up training callbacks...")

callbacks = [
    # Stop once validation accuracy has not improved for 10 epochs and roll
    # the model back to its best weights
    EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True,
        verbose=1
    ),
    # Halve the learning rate after 5 epochs without validation-loss improvement
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    # Keep only the checkpoint with the best validation accuracy on disk
    ModelCheckpoint(
        'best_cnn_model.h5',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    )
]

# Train the model
print("\n🚀 Starting CNN training...")
print("⏱️  This may take 30-60 minutes depending on GPU availability...")

# steps_per_epoch / validation_steps are deliberately not passed: the
# generators know their own length, so every epoch covers all samples.
# The previous `len(...) // 32` dropped the final partial batch and evaluated
# to 0 steps for any split smaller than one batch.
history = cnn_model.fit(
    train_generator,
    epochs=50,
    validation_data=val_generator,
    callbacks=callbacks,
    verbose=1
)

print("\n✅ CNN training completed!")

In [None]:
# Evaluate CNN Model
print("📊 Evaluating CNN Model...")

# Load the checkpoint written by ModelCheckpoint during training
best_cnn = tf.keras.models.load_model('best_cnn_model.h5')

# Run inference once per split; the class predictions are derived from the
# same probabilities instead of a second forward pass.
y_train_proba_cnn = best_cnn.predict(X_img_train).flatten()
y_val_proba_cnn = best_cnn.predict(X_img_val).flatten()
y_test_proba_cnn = best_cnn.predict(X_img_test).flatten()

# Threshold the sigmoid output at 0.5 (1 = genuine, 0 = fake)
y_train_pred_cnn = (y_train_proba_cnn > 0.5).astype(int)
y_val_pred_cnn = (y_val_proba_cnn > 0.5).astype(int)
y_test_pred_cnn = (y_test_proba_cnn > 0.5).astype(int)

# Calculate metrics
train_acc_cnn = accuracy_score(y_train, y_train_pred_cnn)
val_acc_cnn = accuracy_score(y_val, y_val_pred_cnn)
test_acc_cnn = accuracy_score(y_test, y_test_pred_cnn)

train_auc_cnn = roc_auc_score(y_train, y_train_proba_cnn)
val_auc_cnn = roc_auc_score(y_val, y_val_proba_cnn)
test_auc_cnn = roc_auc_score(y_test, y_test_proba_cnn)

print(f"\n🎯 CNN Performance:")
print(f"   📈 Training Accuracy: {train_acc_cnn:.4f}")
print(f"   📈 Validation Accuracy: {val_acc_cnn:.4f}")
print(f"   📈 Test Accuracy: {test_acc_cnn:.4f}")
print(f"   📈 Training AUC: {train_auc_cnn:.4f}")
print(f"   📈 Validation AUC: {val_auc_cnn:.4f}")
print(f"   📈 Test AUC: {test_auc_cnn:.4f}")

# Detailed classification report
print(f"\n📋 Detailed Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred_cnn, target_names=['Fake', 'Genuine']))

# Plot training history
plt.figure(figsize=(15, 5))

# Accuracy plot
plt.subplot(1, 3, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Loss plot
plt.subplot(1, 3, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Learning rate plot (if available).  The history key is 'lr' on older Keras
# versions and 'learning_rate' on newer ones, so both are accepted.
plt.subplot(1, 3, 3)
lr_key = next((key for key in ('lr', 'learning_rate') if key in history.history), None)
if lr_key is not None:
    plt.plot(history.history[lr_key])
    plt.title('Learning Rate')
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.yscale('log')
else:
    plt.text(0.5, 0.5, 'Learning Rate\nNot Recorded', 
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Learning Rate')

plt.tight_layout()
plt.show()

## 📊 7. Model Comparison & Analysis

In [None]:
# Compare model performances
print("📊 Model Comparison Summary:")
print("=" * 60)

comparison_data = {
    'Model': ['Random Forest', 'CNN'],
    'Test_Accuracy': [test_acc, test_acc_cnn],
    'Test_AUC': [test_auc, test_auc_cnn],
    'Val_Accuracy': [val_acc, val_acc_cnn],
    'Val_AUC': [val_auc, val_auc_cnn]
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.round(4))

# Determine best model.
# NOTE(review): the winner is picked by *test* accuracy, so the reported test
# score of the "best" model is optimistically biased — consider selecting on
# Val_Accuracy instead.
best_model_idx = comparison_df['Test_Accuracy'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
best_accuracy = comparison_df.loc[best_model_idx, 'Test_Accuracy']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"🎯 Best Test Accuracy: {best_accuracy:.4f}")

# Ensemble prediction: unweighted average of both models' class-1 probabilities
print("\n🤝 Creating Ensemble Prediction...")
ensemble_proba = (y_test_proba + y_test_proba_cnn) / 2
ensemble_pred = (ensemble_proba > 0.5).astype(int)

ensemble_acc = accuracy_score(y_test, ensemble_pred)
ensemble_auc = roc_auc_score(y_test, ensemble_proba)

# Same ensemble on the validation split, so the comparison table has no gaps
ensemble_val_proba = (y_val_proba + y_val_proba_cnn) / 2
ensemble_val_pred = (ensemble_val_proba > 0.5).astype(int)
ensemble_val_acc = accuracy_score(y_val, ensemble_val_pred)
ensemble_val_auc = roc_auc_score(y_val, ensemble_val_proba)

print(f"🎯 Ensemble Test Accuracy: {ensemble_acc:.4f}")
print(f"🎯 Ensemble Test AUC: {ensemble_auc:.4f}")

# Add ensemble to comparison
comparison_data['Model'].append('Ensemble')
comparison_data['Test_Accuracy'].append(ensemble_acc)
comparison_data['Test_AUC'].append(ensemble_auc)
comparison_data['Val_Accuracy'].append(ensemble_val_acc)
comparison_data['Val_AUC'].append(ensemble_val_auc)

final_comparison_df = pd.DataFrame(comparison_data)
print("\n📈 Final Model Comparison:")
print(final_comparison_df.round(4))

# Visualization: 2x2 grid of summary plots
plt.figure(figsize=(12, 8))

# Accuracy comparison
plt.subplot(2, 2, 1)
models = ['Random Forest', 'CNN', 'Ensemble']
accuracies = [test_acc, test_acc_cnn, ensemble_acc]
colors = ['skyblue', 'lightcoral', 'lightgreen']
plt.bar(models, accuracies, color=colors)
plt.title('Test Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
for i, v in enumerate(accuracies):
    # Value label just above each bar
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center')

# AUC comparison
plt.subplot(2, 2, 2)
aucs = [test_auc, test_auc_cnn, ensemble_auc]
plt.bar(models, aucs, color=colors)
plt.title('Test AUC Comparison')
plt.ylabel('AUC')
plt.ylim(0, 1)
for i, v in enumerate(aucs):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center')

# Confusion Matrix for best model (only Random Forest or CNN can be "best"
# here, because the ensemble is not part of comparison_df)
plt.subplot(2, 2, 3)
if best_model_name == 'Random Forest':
    cm = confusion_matrix(y_test, y_test_pred)
else:
    cm = confusion_matrix(y_test, y_test_pred_cnn)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Fake', 'Genuine'], 
            yticklabels=['Fake', 'Genuine'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Grouped bar chart of test metrics per model.  precision_score/recall_score
# are called with their defaults, so they describe label 1 (genuine).
plt.subplot(2, 2, 4)
metrics_rf = [test_acc, test_auc, precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred)]
metrics_cnn = [test_acc_cnn, test_auc_cnn, precision_score(y_test, y_test_pred_cnn), recall_score(y_test, y_test_pred_cnn)]
metrics_names = ['Accuracy', 'AUC', 'Precision', 'Recall']

x = np.arange(len(metrics_names))
width = 0.35

plt.bar(x - width/2, metrics_rf, width, label='Random Forest', alpha=0.8)
plt.bar(x + width/2, metrics_cnn, width, label='CNN', alpha=0.8)
plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Model Performance Metrics')
plt.xticks(x, metrics_names, rotation=45)
plt.legend()
plt.ylim(0, 1)

plt.tight_layout()
plt.show()

## 💾 8. Model Export & Production Files

In [None]:
# Export trained models for production
print("💾 Exporting models for production...")

# All artefacts go into one folder
export_dir = 'production_models'
os.makedirs(export_dir, exist_ok=True)

# Destination paths of the five exported files
rf_filename = f'{export_dir}/random_forest_production.pkl'
scaler_filename = f'{export_dir}/feature_scaler_production.pkl'
cnn_filename = f'{export_dir}/cnn_production.h5'
feature_names_file = f'{export_dir}/feature_names_production.txt'
metadata_file = f'{export_dir}/model_metadata.json'

# Tuned Random Forest
joblib.dump(best_rf, rf_filename)
print(f"✅ Random Forest saved: {rf_filename}")

# Scaler fitted on the training features
joblib.dump(scaler, scaler_filename)
print(f"✅ Feature Scaler saved: {scaler_filename}")

# Best CNN checkpoint
best_cnn.save(cnn_filename)
print(f"✅ CNN Model saved: {cnn_filename}")

# Feature names, one per line
with open(feature_names_file, 'w') as names_out:
    names_out.writelines(f"{name}\n" for name in feature_names)
print(f"✅ Feature names saved: {feature_names_file}")

# Per-model summaries that make up the metadata document
rf_summary = {
    'test_accuracy': float(test_acc),
    'test_auc': float(test_auc),
    'best_params': grid_search.best_params_
}
cnn_summary = {
    'test_accuracy': float(test_acc_cnn),
    'test_auc': float(test_auc_cnn),
    'total_params': int(total_params)
}
ensemble_summary = {
    'test_accuracy': float(ensemble_acc),
    'test_auc': float(ensemble_auc)
}

# Training run description: timestamp, split sizes and test metrics
metadata = {
    'training_date': datetime.now().isoformat(),
    'dataset_size': len(y),
    'train_size': len(y_train),
    'val_size': len(y_val),
    'test_size': len(y_test),
    'random_forest': rf_summary,
    'cnn': cnn_summary,
    'ensemble': ensemble_summary,
    'best_model': best_model_name,
    'feature_names': feature_names
}

with open(metadata_file, 'w') as metadata_out:
    json.dump(metadata, metadata_out, indent=2)
print(f"✅ Model metadata saved: {metadata_file}")

print(f"\n🎉 All models exported successfully!")
print(f"📁 Production files location: production_models/")
print(f"\n📋 Files created:")
for icon, exported_path in (
    ("🌲", rf_filename),
    ("⚙️", scaler_filename),
    ("🧠", cnn_filename),
    ("📝", feature_names_file),
    ("📊", metadata_file),
):
    print(f"   {icon} {exported_path}")

## 📥 9. Download Production Models

In [None]:
# Bundle the exported models into one archive and download it
from google.colab import files
import zipfile

print("📦 Creating production models archive...")

zip_filename = 'cekajayuk_production_models.zip'

# Regular files directly inside production_models/ (sub-folders are skipped)
exported_files = (
    entry for entry in Path('production_models').glob('*') if entry.is_file()
)

with zipfile.ZipFile(zip_filename, 'w') as archive:
    for exported in exported_files:
        # Store each file under its bare name, without the folder prefix
        archive.write(exported, exported.name)
        print(f"   Added: {exported.name}")

archive_size_mb = os.path.getsize(zip_filename) / (1024*1024)
print(f"\n✅ Archive created: {zip_filename}")
print(f"📊 Archive size: {archive_size_mb:.2f} MB")

# Hand the archive to the browser
print("\n📥 Downloading production models...")
files.download(zip_filename)

print("\n🎉 Download completed!")
print("\n📋 Next Steps:")
for step in (
    "1. Extract the downloaded zip file",
    "2. Copy model files to your CekAjaYuk project's 'models/' directory",
    "3. Update your backend to use these production models",
    "4. Test the models with your backend API",
):
    print(step)

print("\n🚀 Your CekAjaYuk models are ready for production!")