In [85]:
# Multimodal Hate Speech Detection Pipeline - Fixed and Complete
# Compatible with VS Code Jupyter Notebook

import os
import sys
import warnings
import json
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Union
from dataclasses import dataclass
import time

# Data & Numerical
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML

# Deep Learning
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import clip
from torchvision import transforms
from PIL import Image

# ML & Clustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, precision_recall_curve,
                             precision_recall_fscore_support, average_precision_score,
                             silhouette_score, davies_bouldin_score)
from sklearn.cluster import DBSCAN
import xgboost as xgb
import umap.umap_ as umap

# Web scraping
import requests
from bs4 import BeautifulSoup
from io import BytesIO
import re

# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Notebook-wide defaults: silence library warnings and fix the plot styling.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100})

# Seed every RNG in use (torch CPU, NumPy, and all CUDA devices when present)
# so that repeated runs produce the same numbers.
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)


In [73]:
# ========================================================================
# 1. CONFIGURATION
# ========================================================================

@dataclass
class Config:
    """Central configuration - MODIFY THESE PATHS FOR YOUR SETUP.

    Constructing a ``Config`` creates ``cache_dir``, ``models_dir`` and
    ``results_dir`` (including missing parents) so later stages can write
    into them without checking first.
    """

    # ============ UPDATE THESE PATHS ============
    json_dir: str = r"D:\B.Tech\Hackathons\CinehackAI\Harmful Data\POC DATA\Text"
    images_dir: str = r"D:\B.Tech\Hackathons\CinehackAI\Harmful Data\POC DATA\IMAGES"

    # ============ JSON FIELD NAMES ============
    text_field: str = "img_text"
    id_field: str = "id"
    label_field: str = "label"

    # ============ IMAGE MATCHING ============
    image_matching: str = "filename"  # "filename", "id_field", or "custom"

    # ============ LABEL MAPPING ============
    # Optional explicit raw-label -> {0, 1} mapping; None means "infer".
    label_mapping: Optional[Dict[str, int]] = None
    default_label: int = 1  # 1 = hate, 0 = safe

    # ============ OTHER SETTINGS ============
    cache_dir: str = "./cache"
    models_dir: str = "./models"
    results_dir: str = "./results"

    # Model settings
    text_model: str = "sentence-transformers/all-mpnet-base-v2"
    image_model: str = "clip"
    text_dim: int = 768
    image_dim: int = 512

    # Processing
    batch_size: int = 32
    max_samples: Optional[int] = None  # None = use every loaded sample
    # Evaluated once, when the class body runs (not per instance).
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # Dimensionality reduction
    reduction_method: str = "pca"  # Changed to PCA for stability
    reduced_dim: int = 50  # Reduced for small datasets

    # Clustering
    eps: float = 0.5
    min_samples: int = 3  # Reduced for small datasets

    # Classification
    test_size: float = 0.2
    random_state: int = 42

    def __post_init__(self):
        """Create the output directories right after dataclass initialisation.

        The hook must be spelled ``__post_init__`` (double underscores);
        with any other name the dataclass machinery never calls it.
        """
        for dir_path in [self.cache_dir, self.models_dir, self.results_dir]:
            Path(dir_path).mkdir(parents=True, exist_ok=True)

    # Old (misspelled) name kept as an alias so explicit callers keep working.
    # It carries no annotation, so the dataclass does not treat it as a field.
    _post_init_ = __post_init__

config = Config()

# Report the compute device the pipeline will use; GPU details only when CUDA is usable.
print(f"\nDevice: {config.device}", f"GPU Available: {torch.cuda.is_available()}", sep="\n")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}", f"CUDA Version: {torch.version.cuda}", sep="\n")


Device: cuda
GPU Available: True
GPU: NVIDIA GeForce RTX 2050
CUDA Version: 12.8


In [74]:
# ========================================================================
# 2. DATA LOADER
# ========================================================================

class JSONDataLoader:
    """Load and preprocess dataset from JSON files + images.

    Each ``*.json`` file in ``config.json_dir`` describes one sample. A sample
    is kept only when it has non-empty text (after cleaning) and a matching
    image file exists in ``config.images_dir``.
    """

    # Extensions probed (in order) when looking for a sample's image.
    # ".jpg" stays first so datasets that already worked resolve identically.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp", ".bmp")

    # Lower-cased string labels recognised when no explicit mapping is given.
    _HATE_WORDS = ('hate', 'hateful', 'toxic', 'offensive', '1', 'true')
    _SAFE_WORDS = ('safe', 'normal', 'clean', '0', 'false')

    def __init__(self, config: "Config"):
        self.config = config

    def clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Strips HTML tags, URLs and every character outside word characters,
        whitespace and ``, . ! ? -``, then collapses runs of whitespace.
        Any non-string input (None, NaN, lists, numbers) yields ``""``.
        """
        # The type check comes first: pd.isna() on a list returns an array,
        # whose truth value is ambiguous and would raise.
        if not isinstance(text, str):
            return ""
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'[^\w\s,.!?-]', '', text)
        return ' '.join(text.split())

    def load_json_file(self, file_path: Path) -> Optional[Dict]:
        """Load a single JSON file.

        Returns the parsed object, or ``None`` (after printing the reason)
        when the file cannot be read/parsed or does not contain a JSON object.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:  # best effort: one bad file must not stop the load
            print(f"Error reading {file_path}: {e}")
            return None
        if not isinstance(data, dict):
            # Callers use dict access; a top-level list/scalar is unusable.
            print(f"Error reading {file_path}: expected a JSON object, got {type(data).__name__}")
            return None
        return data

    def get_image_path(self, json_file: Path, json_data: Dict) -> Optional[Path]:
        """Determine corresponding image path.

        In ``"id_field"`` mode the image stem is ``json_data[config.id_field]``
        (``None`` is returned when that key is missing); in every other mode it
        is the JSON file's stem. Returns the first existing
        ``images_dir/<stem><ext>`` over ``IMAGE_EXTENSIONS``, else ``None``.
        """
        if self.config.image_matching == "id_field":
            if self.config.id_field not in json_data:
                return None
            stem = str(json_data[self.config.id_field])
        else:
            # "filename" and any unrecognised mode both match on the file stem.
            stem = json_file.stem

        images_dir = Path(self.config.images_dir)
        for ext in self.IMAGE_EXTENSIONS:
            candidate = images_dir / (stem + ext)
            if candidate.exists():
                return candidate
        return None

    def get_label(self, json_data: Dict) -> int:
        """Extract label from JSON data.

        Resolution order: ``config.label_mapping`` (if set), integer
        conversion, known label words (case/whitespace-insensitive), and
        finally ``config.default_label``.
        """
        default = self.config.default_label
        field = self.config.label_field
        if not field or field not in json_data:
            return default

        label_value = json_data[field]

        if self.config.label_mapping:
            try:
                return self.config.label_mapping.get(label_value, default)
            except TypeError:
                # Unhashable raw label (e.g. a list) cannot be a mapping key.
                return default

        try:
            return int(label_value)
        except (TypeError, ValueError, OverflowError):
            # Not numeric (or None / inf / NaN): fall through to word matching.
            pass

        label_lower = str(label_value).strip().lower()
        if label_lower in self._HATE_WORDS:
            return 1
        if label_lower in self._SAFE_WORDS:
            return 0
        return default

    def load_dataset(self) -> pd.DataFrame:
        """Load and preprocess dataset from JSON files.

        Returns a DataFrame with columns ``filename``, ``raw_text``,
        ``cleaned_text``, ``image_path`` and ``hate_label``.

        Raises:
            FileNotFoundError: no ``*.json`` file exists in ``config.json_dir``.
            ValueError: files were found but none produced a valid sample.
        """
        print("\n" + "=" * 70)
        print("LOADING DATASET FROM JSON FILES")
        print("=" * 70)

        # Sorted so row order (and anything keyed on it, such as seeded
        # sampling) does not depend on filesystem enumeration order.
        json_files = sorted(Path(self.config.json_dir).glob("*.json"))
        print(f"Found {len(json_files)} JSON files in {self.config.json_dir}")

        if len(json_files) == 0:
            raise FileNotFoundError(f"No JSON files found in {self.config.json_dir}")

        data = []
        skipped_no_text = 0
        skipped_no_image = 0
        skipped_error = 0

        for json_file in tqdm(json_files, desc="Loading JSON files"):
            json_data = self.load_json_file(json_file)
            if json_data is None:
                skipped_error += 1
                continue

            text = json_data.get(self.config.text_field, "")
            cleaned_text = self.clean_text(text)
            if len(cleaned_text) == 0:
                # Covers missing, non-string and cleaned-to-empty text alike.
                skipped_no_text += 1
                continue

            image_path = self.get_image_path(json_file, json_data)
            if image_path is None:
                skipped_no_image += 1
                continue

            data.append({
                "filename": json_file.name,
                "raw_text": text,
                "cleaned_text": cleaned_text,
                "image_path": str(image_path),
                "hate_label": self.get_label(json_data)
            })

        df = pd.DataFrame(data)

        if self.config.max_samples and len(df) > self.config.max_samples:
            df = df.sample(
                n=self.config.max_samples,
                random_state=getattr(self.config, "random_state", 42)
            ).reset_index(drop=True)

        print(f"\n{'='*70}")
        print(f"DATASET LOADING SUMMARY")
        print(f"{'='*70}")
        print(f"Total JSON files found:     {len(json_files)}")
        print(f"Skipped (no text):          {skipped_no_text}")
        print(f"Skipped (no image):         {skipped_no_image}")
        print(f"Skipped (errors):           {skipped_error}")
        print(f"Successfully loaded:        {len(df)}")

        # Checked before touching df['hate_label']: an empty frame has no
        # columns, so the statistics below would raise KeyError instead.
        if len(df) == 0:
            raise ValueError("No valid samples found!")

        n_hate = int((df['hate_label'] == 1).sum())
        n_safe = int((df['hate_label'] == 0).sum())
        print(f"\nLabel distribution:")
        print(f"  Hate samples (1):         {n_hate} ({n_hate / len(df) * 100:.1f}%)")
        print(f"  Safe samples (0):         {n_safe} ({n_safe / len(df) * 100:.1f}%)")
        print(f"{'='*70}")

        return df

In [75]:
# ========================================================================
# 3. SAFE PROMPT GENERATOR
# ========================================================================

class SafePromptGenerator:
    """Generate synthetic safe/non-hate prompts.

    Randomness comes from a generator owned by the instance, so the output
    depends only on ``random_state`` and the sequence of ``generate`` calls,
    and constructing a generator no longer reseeds NumPy's global RNG.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        # RandomState(seed) yields the same stream that np.random.seed(seed)
        # followed by np.random.* calls would, without touching global state.
        self._rng = np.random.RandomState(random_state)

        self.templates = {
            'questions': [
                "How do I learn {}?",
                "What's the best way to {}?",
                "Can you explain {} to me?",
                "I'm interested in learning about {}",
                "What are some good resources for {}?"
            ],
            'topics': [
                'cooking', 'programming', 'photography', 'gardening', 'music',
                'painting', 'writing', 'exercise', 'meditation', 'reading',
                'traveling', 'languages', 'science', 'history', 'mathematics'
            ],
            'positive': [
                "I love learning new things",
                "This is so interesting",
                "Thank you for your help",
                "I appreciate your assistance",
                "That's very helpful information"
            ]
        }

    def generate(self, n_samples: int, placeholder_image: str) -> pd.DataFrame:
        """Generate n_samples of safe prompts.

        Every third row (indices 0, 3, 6, ...) is a fixed positive sentence;
        the others fill a question template with a random topic.

        Args:
            n_samples: number of rows to produce (0 gives an empty frame).
            placeholder_image: image path stored on every generated row.

        Returns:
            DataFrame with columns ``cleaned_text``, ``image_path`` and
            ``hate_label`` (always 0).
        """
        texts = []

        for i in range(n_samples):
            if i % 3 == 0:
                text = self._rng.choice(self.templates['positive'])
            else:
                template = self._rng.choice(self.templates['questions'])
                topic = self._rng.choice(self.templates['topics'])
                text = template.format(topic)

            # choice() returns numpy string scalars; store plain Python str.
            texts.append(str(text))

        return pd.DataFrame({
            'cleaned_text': texts,
            'image_path': [placeholder_image] * len(texts),
            'hate_label': [0] * len(texts)
        })

In [76]:
# ========================================================================
# 4. TEXT EMBEDDER
# ========================================================================

class TextEmbedder:
    """Sentence-level text embeddings backed by a SentenceTransformers model."""

    def __init__(self, config: Config):
        # Keep the config around: embed_texts reads the batch size from it.
        self.config = config
        self.device = config.device
        print("\nLoading Sentence-BERT model...")
        self.model = SentenceTransformer(config.text_model, device=self.device)
        print(f"Model loaded on {self.device}")

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model and return the result.

        The model is asked for NumPy output with normalized embeddings and
        shows its own progress bar while encoding.
        """
        encode_options = {
            'batch_size': self.config.batch_size,
            'show_progress_bar': True,
            'convert_to_numpy': True,
            'normalize_embeddings': True,
        }
        return self.model.encode(texts, **encode_options)

In [86]:
# ========================================================================
# 5. IMAGE EMBEDDER
# ========================================================================

class ImageEmbedder:
    """Extract L2-normalised image embeddings using CLIP (RN50 backbone)."""

    def __init__(self, config: Config):
        """Load the CLIP model onto ``config.device``.

        Raises:
            ValueError: ``config.image_model`` is anything other than "clip".
                Previously this left the object without a model, and every
                image then silently embedded to a zero vector.
        """
        self.config = config
        self.device = torch.device(config.device)

        if config.image_model != "clip":
            raise ValueError(
                f"Unsupported image model: {config.image_model!r} (only 'clip' is implemented)"
            )

        print("\nLoading CLIP model...")
        self.model, self.preprocess = clip.load("RN50", device=self.device)
        self.model.eval()
        print(f"Model loaded on {self.device}")

    def _embed_one(self, img_path) -> Optional[np.ndarray]:
        """Embed one image; return None (after printing the error) on failure."""
        try:
            # Context manager closes the file handle as soon as pixels are read.
            with Image.open(img_path) as img:
                image_tensor = self.preprocess(img.convert('RGB')).unsqueeze(0).to(self.device)
            features = self.model.encode_image(image_tensor)
            # CLIP may run in half precision on GPU; normalise in float32.
            features = features.squeeze(0).float().cpu().numpy()
            return features / (np.linalg.norm(features) + 1e-8)
        except Exception as e:  # best effort: one unreadable image must not abort the run
            print(f"Error with {img_path}: {e}")
            return None

    @torch.no_grad()
    def embed_images(self, image_paths: List[str]) -> np.ndarray:
        """Generate embeddings for images.

        Args:
            image_paths: image file paths; duplicates are embedded only once.

        Returns:
            float32 array with one row per input path, in input order. Rows
            for images that failed to load are all zeros. The width is taken
            from the first successful embedding; ``config.image_dim`` is used
            only if every image failed or the input is empty.
        """
        per_path: Dict[str, Optional[np.ndarray]] = {}
        rows: List[Optional[np.ndarray]] = []

        for img_path in tqdm(image_paths, desc="Image embeddings"):
            key = str(img_path)
            if key not in per_path:
                per_path[key] = self._embed_one(img_path)
            rows.append(per_path[key])

        # The fallback width must match the real embeddings: CLIP RN50 emits
        # 1024-D features while config.image_dim defaults to 512, so a
        # hard-coded zero vector would make the stack below fail.
        dim = next((r.shape[0] for r in rows if r is not None), self.config.image_dim)

        if not rows:
            return np.zeros((0, dim), dtype=np.float32)

        zero_row = np.zeros(dim, dtype=np.float32)
        return np.vstack([r if r is not None else zero_row for r in rows]).astype(np.float32)

In [87]:
# ========================================================================
# 6. FUSION PROCESSOR
# ========================================================================

class FusionProcessor:
    """Combine per-sample text and image embeddings into one feature matrix."""

    def __init__(self, fusion_method='concat'):
        # Only 'concat' is implemented; other values are rejected in fuse().
        self.fusion_method = fusion_method

    def fuse(self, text_emb: np.ndarray, image_emb: np.ndarray) -> np.ndarray:
        """Join the two embedding matrices column-wise.

        Raises:
            ValueError: the inputs have different row counts, or the
                configured fusion method is not 'concat'.
        """
        n_text, n_image = text_emb.shape[0], image_emb.shape[0]
        if n_text != n_image:
            raise ValueError(f"Dimension mismatch! Text: {text_emb.shape[0]}, Image: {image_emb.shape[0]}")

        # Guard clause: reject anything except the single supported method.
        if self.fusion_method != 'concat':
            raise ValueError(f"Unknown fusion method: {self.fusion_method}")

        fused = np.concatenate((text_emb, image_emb), axis=1)
        print(f"Fused: {text_emb.shape[1]}D + {image_emb.shape[1]}D = {fused.shape[1]}D")
        return fused

In [88]:
# ========================================================================
# 7. DIMENSIONALITY REDUCER
# ========================================================================

class DimensionalityReducer:
    """Reduce dimensions for clustering and visualization (PCA only)."""

    def __init__(self, method='pca', n_components=50, random_state=42):
        self.method = method
        self.n_components = n_components
        self.random_state = random_state
        self.reducer = None      # fitted PCA for the n_components projection
        self.reducer_2d = None   # fitted PCA for the 2-D plotting projection

    def fit_transform(self, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Reduce to n_components and create 2D projection.

        The number of components is capped at ``min(n_samples - 1,
        n_features)``, because PCA cannot return more components than the
        data supports.

        Args:
            embeddings: 2-D array of shape (n_samples, n_features).

        Returns:
            ``(reduced, embeddings_2d)``: the capped projection and a separate
            2-component projection, both fitted on ``embeddings``.

        Raises:
            ValueError: the input is not 2-D, or has fewer than 2 samples or
                fewer than 2 features (the 2-D projection needs both).
        """
        embeddings = np.asarray(embeddings)
        if embeddings.ndim != 2:
            raise ValueError(
                f"Expected a 2D array (n_samples, n_features), got shape {embeddings.shape}"
            )

        n_samples, n_features = embeddings.shape
        if n_samples < 2 or n_features < 2:
            raise ValueError(
                f"Need at least 2 samples and 2 features, got shape {embeddings.shape}"
            )

        if self.method != 'pca':
            # Only PCA is implemented; say so instead of silently ignoring the request.
            print(f"Reduction method '{self.method}' is not implemented; using PCA")

        target_dim = min(self.n_components, n_samples - 1, n_features)

        if target_dim != self.n_components:
            print(f"Adjusting n_components from {self.n_components} to {target_dim}")

        print(f"Reducing from {embeddings.shape[1]}D to {target_dim}D...")

        self.reducer = PCA(n_components=target_dim, random_state=self.random_state)
        reduced = self.reducer.fit_transform(embeddings)

        self.reducer_2d = PCA(n_components=2, random_state=self.random_state)
        embeddings_2d = self.reducer_2d.fit_transform(embeddings)

        print(f"Reduced to {reduced.shape[1]}D")
        return reduced, embeddings_2d

In [89]:
# ========================================================================
# 8. CLASSIFIER
# ========================================================================

class HateSpeechClassifier:
    """XGBoost binary classifier (0 = safe, 1 = hate)."""

    def __init__(self, config: "Config"):
        self.config = config
        self.model = None
        self.feature_importance = None
        # Per-iteration eval metrics recorded by XGBoost; filled in by train().
        self.training_history = None

    def train(self, X_train, y_train, X_val, y_val):
        """Train XGBoost and report validation performance.

        Args:
            X_train, y_train: training features and 0/1 labels.
            X_val, y_val: validation features and 0/1 labels.

        Returns:
            ``{'val': {...}}`` with keys ``y_pred``, ``y_proba``, ``roc_auc``,
            ``avg_precision`` and ``confusion_matrix`` (always 2x2, rows and
            columns ordered [safe, hate]). ``roc_auc`` and ``avg_precision``
            are NaN when the validation labels contain a single class.
        """
        print("\n" + "="*70)
        print("TRAINING XGBOOST CLASSIFIER")
        print("="*70)

        # Accept lists / pandas Series as well as arrays.
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)

        # Weight positives by the negative/positive ratio to offset imbalance.
        n_negative = int((y_train == 0).sum())
        n_positive = int((y_train == 1).sum())
        scale_pos_weight = n_negative / max(n_positive, 1)
        print(f"Training: {len(X_train)}, Validation: {len(X_val)}")

        self.model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,
            random_state=self.config.random_state,
            eval_metric=['auc', 'logloss']
        )

        self.model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=False
        )
        # Keep the per-iteration metrics gathered on both eval sets.
        self.training_history = self.model.evals_result()

        y_pred_val = self.model.predict(X_val)
        y_proba_val = self.model.predict_proba(X_val)[:, 1]

        print("\nVALIDATION PERFORMANCE:")
        # labels=[0, 1] keeps the report valid when a class is missing from
        # the split; otherwise target_names would not match the label count.
        print(classification_report(
            y_val, y_pred_val,
            labels=[0, 1],
            target_names=['Safe', 'Hate'],
            zero_division=0
        ))

        if np.unique(y_val).size < 2:
            # Ranking metrics are undefined without both classes present.
            print("Validation set contains a single class; ROC-AUC / AP are undefined")
            val_roc_auc = float('nan')
            val_ap = float('nan')
        else:
            val_roc_auc = roc_auc_score(y_val, y_proba_val)
            val_ap = average_precision_score(y_val, y_proba_val)
        print(f"ROC-AUC: {val_roc_auc:.4f}")
        print(f"Average Precision: {val_ap:.4f}")

        self.feature_importance = self.model.feature_importances_

        return {
            'val': {
                'y_pred': y_pred_val,
                'y_proba': y_proba_val,
                'roc_auc': val_roc_auc,
                'avg_precision': val_ap,
                'confusion_matrix': confusion_matrix(y_val, y_pred_val, labels=[0, 1])
            }
        }

    def predict(self, X):
        """Return ``(predicted_labels, hate_probabilities)`` for ``X``.

        Raises:
            ValueError: ``train`` has not been called yet.
        """
        if self.model is None:
            raise ValueError("Model not trained!")
        return self.model.predict(X), self.model.predict_proba(X)[:, 1]

In [90]:
# ========================================================================
# 9. CLUSTERING
# ========================================================================

class DBSCANClustering:
    """Thin wrapper around scikit-learn's DBSCAN that reports cluster counts."""

    def __init__(self, eps=0.5, min_samples=3):
        self.eps, self.min_samples = eps, min_samples
        # Both are populated by fit_predict().
        self.model = None
        self.labels = None

    def fit_predict(self, embeddings: np.ndarray) -> np.ndarray:
        """Cluster ``embeddings`` and return the per-row labels.

        Label -1 is counted as noise rather than as a cluster in the
        printed summary.
        """
        print(f"\nRunning DBSCAN (eps={self.eps}, min_samples={self.min_samples})...")

        self.model = DBSCAN(eps=self.eps, min_samples=self.min_samples, n_jobs=-1)
        self.labels = self.model.fit_predict(embeddings)

        distinct_labels = set(self.labels)
        n_noise = sum(1 for label in self.labels if label == -1)
        if -1 in distinct_labels:
            n_clusters = len(distinct_labels) - 1
        else:
            n_clusters = len(distinct_labels)

        print(f"Found {n_clusters} clusters, {n_noise} noise points")
        return self.labels


In [91]:
# ========================================================================
# 10. VISUALIZER
# ========================================================================

class Visualizer:
    """Create visualizations"""

    @staticmethod
    def plot_clusters(embeddings_2d, cluster_labels, true_labels, output_dir):
        """Create cluster visualization and save it as ``clusters.html``.

        Points are coloured by their true label; the DBSCAN cluster id of
        each point is shown on hover.

        Args:
            embeddings_2d: array whose first two columns are the x/y coordinates.
            cluster_labels: one cluster id per row.
            true_labels: one 0/1 label per row.
            output_dir: target directory (``str`` or ``Path``); created if missing.
        """
        # Accept plain strings and make sure the directory exists before writing.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        embeddings_2d = np.asarray(embeddings_2d)
        df_plot = pd.DataFrame({
            'x': embeddings_2d[:, 0],
            'y': embeddings_2d[:, 1],
            'cluster': np.asarray(cluster_labels).astype(str),
            'true_label': np.asarray(true_labels).astype(str)
        })

        # Labels other than 0/1 keep their raw value; a NaN colour key would
        # drop those points from the plot.
        df_plot['label_name'] = (
            df_plot['true_label']
            .map({'0': 'Safe', '1': 'Hate'})
            .fillna(df_plot['true_label'])
        )

        fig = px.scatter(
            df_plot, x='x', y='y',
            color='label_name',
            hover_data=['cluster'],
            title='2D Projection - True Labels',
            color_discrete_map={'Safe': 'green', 'Hate': 'red'},
            width=900, height=600
        )
        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.write_html(str(output_dir / 'clusters.html'))
        print(f"Saved visualization to {output_dir}")

In [92]:
# ========================================================================
# 11. MAIN PIPELINE
# ========================================================================

def _load_or_compute_embeddings(cache_path: Path, n_expected: int, compute, kind: str) -> np.ndarray:
    """Return embeddings from ``cache_path`` if they fit the dataset, else compute and cache them.

    A cached array is reused only when it is 2-D with exactly ``n_expected``
    rows. ``compute`` is a zero-argument callable, invoked only on a cache
    miss so the embedding model is not loaded unnecessarily.
    """
    if cache_path.exists():
        cached = np.load(cache_path)
        if cached.ndim == 2 and cached.shape[0] == n_expected:
            print(f"\nLoading cached {kind} embeddings...")
            return cached
        # Row count is the minimum sanity check against a cache left over
        # from a different dataset; it cannot detect same-size changes.
        print(f"\nCached {kind} embeddings have shape {cached.shape}, "
              f"expected {n_expected} rows; recomputing...")

    embeddings = compute()
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    np.save(cache_path, embeddings)
    return embeddings


def run_complete_pipeline(config: Config):
    """Run complete pipeline.

    Stages: load harmful samples, generate an equal number of synthetic safe
    samples, embed text and images (with on-disk caching), fuse, reduce with
    PCA, cluster with DBSCAN, plot, then train and validate the classifier.

    Returns:
        dict with keys ``df``, ``classifier``, ``reducer``, ``metrics``,
        ``cluster_labels`` and ``embeddings_2d``.
    """

    # Load data
    data_loader = JSONDataLoader(config)
    df_harmful = data_loader.load_dataset()

    # Generate safe samples
    print("\nGenerating synthetic safe samples...")
    generator = SafePromptGenerator(random_state=config.random_state)
    # NOTE(review): every synthetic safe row reuses this one image, so all
    # safe rows get an identical image embedding that the classifier can key
    # on - validation scores are likely optimistic.
    placeholder_img = df_harmful['image_path'].iloc[0]
    df_safe = generator.generate(len(df_harmful), placeholder_img)

    # Combine
    df = pd.concat([df_harmful, df_safe], ignore_index=True)
    df = df.sample(frac=1, random_state=config.random_state).reset_index(drop=True)
    print(f"Combined dataset: {len(df)} samples")
    print(f"Hate: {(df['hate_label']==1).sum()}, Safe: {(df['hate_label']==0).sum()}")

    cache_dir = Path(config.cache_dir)

    # Text embeddings
    text_embeddings = _load_or_compute_embeddings(
        cache_dir / 'text_full.npy',
        len(df),
        lambda: TextEmbedder(config).embed_texts(df['cleaned_text'].tolist()),
        'text'
    )
    print(f"Text embeddings: {text_embeddings.shape}")

    # Image embeddings
    image_embeddings = _load_or_compute_embeddings(
        cache_dir / 'image_full.npy',
        len(df),
        lambda: ImageEmbedder(config).embed_images(df['image_path'].tolist()),
        'image'
    )
    print(f"Image embeddings: {image_embeddings.shape}")

    # Fusion
    print("\nFusing embeddings...")
    fusion_processor = FusionProcessor()
    fused = fusion_processor.fuse(text_embeddings, image_embeddings)

    # Dimensionality reduction
    # NOTE(review): PCA is fitted on all rows before the train/validation
    # split below, so the projection has seen the validation data.
    print("\nReducing dimensionality...")
    reducer = DimensionalityReducer(
        method=config.reduction_method,
        n_components=config.reduced_dim,
        random_state=config.random_state
    )
    reduced, embeddings_2d = reducer.fit_transform(fused)

    # Clustering
    clusterer = DBSCANClustering(eps=config.eps, min_samples=config.min_samples)
    cluster_labels = clusterer.fit_predict(reduced)

    # Visualization
    print("\nCreating visualizations...")
    visualizer = Visualizer()
    visualizer.plot_clusters(embeddings_2d, cluster_labels, df['hate_label'].values,
                            Path(config.results_dir))

    # Classification
    print("\nTraining classifier...")
    X_train, X_test, y_train, y_test = train_test_split(
        reduced, df['hate_label'].values,
        test_size=config.test_size,
        random_state=config.random_state,
        stratify=df['hate_label'].values
    )

    classifier = HateSpeechClassifier(config)
    metrics = classifier.train(X_train, y_train, X_test, y_test)

    print("\n" + "="*70)
    print("PIPELINE COMPLETE!")
    print("="*70)

    return {
        'df': df,
        'classifier': classifier,
        'reducer': reducer,
        'metrics': metrics,
        'cluster_labels': cluster_labels,
        'embeddings_2d': embeddings_2d
    }

In [93]:

# Run every stage end to end; the returned dict is kept for inspection in later cells.
results = run_complete_pipeline(config=config)
print(f"\nResults saved to: {config.results_dir}")


LOADING DATASET FROM JSON FILES
Found 701 JSON files in D:\B.Tech\Hackathons\CinehackAI\Harmful Data\POC DATA\Text


Loading JSON files:   0%|          | 0/701 [00:00<?, ?it/s]


DATASET LOADING SUMMARY
Total JSON files found:     701
Skipped (no text):          3
Skipped (no image):         383
Skipped (errors):           0
Successfully loaded:        315

Label distribution:
  Hate samples (1):         315 (100.0%)
  Safe samples (0):         0 (0.0%)

Generating synthetic safe samples...
Combined dataset: 630 samples
Hate: 315, Safe: 315

Loading cached text embeddings...
Text embeddings: (630, 768)

Loading CLIP model...


100%|████████████████████████████████████████| 244M/244M [04:40<00:00, 914kiB/s]


MemoryError: bad allocation

In [None]:























# ========================================================================
# 12. RUN PIPELINE
# ========================================================================

if _name_ == "_main_":