In [29]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
import re
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import xgboost as xgb
import pickle
import warnings
warnings.filterwarnings('ignore')

In [30]:
JSON_FILES_DIR = "D:\B.Tech\Hackathons\CinehackAI\Harmful Data\POC DATA\Text"  # Update this path
OUTPUT_DIR = "models"
BERT_MODEL = "bert-base-uncased"
BATCH_SIZE = 32
MAX_LENGTH = 128
RANDOM_SEED = 42
SAFE_SPEECH_RATIO = 1.0  # 1:1 ratio with harmful speech

# Create output directory
Path(OUTPUT_DIR).mkdir(exist_ok=True)

# Set random seeds
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

In [31]:
# ============================================
# 1. LOAD AND PARSE JSON FILES
# ============================================
def inspect_json_structure(file_path, max_depth=3):
    """Detailed inspection of JSON file structure"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error loading file: {e}")
        return
    
    def inspect_obj(obj, depth=0, path="root"):
        if depth > max_depth:
            return
        
        indent = "  " * depth
        
        if isinstance(obj, dict):
            print(f"{indent}{path} [dict] with {len(obj)} keys")
            for key, value in list(obj.items())[:5]:  # Show first 5 keys
                print(f"{indent}  - '{key}': {type(value).__name__}", end="")
                if isinstance(value, str):
                    preview = value[:50].replace('\n', ' ')
                    print(f" = '{preview}...'")
                elif isinstance(value, (list, dict)):
                    print(f" (len={len(value)})")
                else:
                    print()
                
                if depth < max_depth - 1:
                    inspect_obj(value, depth + 1, f"{path}.{key}")
        
        elif isinstance(obj, list):
            print(f"{indent}{path} [list] with {len(obj)} items")
            if len(obj) > 0:
                print(f"{indent}  First item type: {type(obj[0]).__name__}")
                if depth < max_depth - 1:
                    inspect_obj(obj[0], depth + 1, f"{path}[0]")
    
    inspect_obj(data)

def extract_text_from_json(obj, texts=None):
    """Recursively extract all text fields from JSON"""
    if texts is None:
        texts = []
    
    if isinstance(obj, dict):
        # Common text field names - INCLUDING img_text for your dataset!
        for key in ['img_text', 'text', 'content', 'message', 'comment', 'post', 'tweet', 
                    'description', 'body', 'title', 'caption', 'review', 'label_text']:
            if key in obj:
                value = obj[key]
                if isinstance(value, str) and len(value.strip()) > 0:
                    texts.append(value.strip())
        
        # Recursively search nested dicts
        for value in obj.values():
            if isinstance(value, (dict, list)):
                extract_text_from_json(value, texts)
    
    elif isinstance(obj, list):
        for item in obj:
            if isinstance(item, (dict, list)):
                extract_text_from_json(item, texts)
    
    return texts

def load_json_files(directory):
    """Load all JSON files from directory"""
    json_files = list(Path(directory).glob("*.json"))
    all_texts = []
    failed_files = []
    
    if len(json_files) == 0:
        raise ValueError(f"No JSON files found in {directory}")
    
    print(f"Found {len(json_files)} JSON files")
    
    # Detailed inspection of first 3 files
    print("\n" + "="*60)
    print("INSPECTING FIRST 3 FILES FOR STRUCTURE")
    print("="*60)
    for i, file_path in enumerate(json_files[:3]):
        print(f"\n[File {i+1}] {file_path.name}")
        print("-" * 60)
        inspect_json_structure(file_path)
    
    print("\n" + "="*60)
    user_input = input("Press ENTER to continue loading all files, or type 'stop' to exit: ")
    if user_input.lower() == 'stop':
        raise KeyboardInterrupt("User stopped execution")
    
    print("\nLoading all files...")
    for file_path in tqdm(json_files, desc="Loading JSON files"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                extracted = extract_text_from_json(data)
                all_texts.extend(extracted)
        except Exception as e:
            failed_files.append((file_path.name, str(e)))
    
    if failed_files:
        print(f"\nWarning: Failed to load {len(failed_files)} files")
        if len(failed_files) <= 10:
            for fname, error in failed_files:
                print(f"  - {fname}: {error}")
        else:
            print("First 10 failures:")
            for fname, error in failed_files[:10]:
                print(f"  - {fname}: {error}")
    
    return all_texts

In [32]:
# ============================================
# 2. TEXT PREPROCESSING
# ============================================
def preprocess_text(text):
    """Clean and normalize text"""
    if not isinstance(text, str):
        return ""
    
    # Lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions and hashtags (but keep the text after them)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [33]:

# ============================================
# 3. SYNTHETIC SAFE SPEECH GENERATION
# ============================================
def generate_safe_speech(harmful_texts, ratio=1.0):
    """Generate synthetic safe speech data"""
    
    # Templates for safe speech
    safe_templates = [
        # Positive statements
        "I really enjoyed {topic} today, it was wonderful",
        "Thank you for sharing your thoughts on {topic}",
        "I appreciate the discussion about {topic}",
        "Learning about {topic} has been very helpful",
        "The {topic} session was informative and well-organized",
        
        # Neutral statements
        "I'm interested in learning more about {topic}",
        "Can you provide information on {topic}?",
        "What's your opinion on {topic}?",
        "I've been thinking about {topic} lately",
        "The article about {topic} was interesting",
        
        # Encouraging statements
        "Great work on the {topic} project",
        "I'm happy to help with {topic}",
        "Let's collaborate on {topic}",
        "Your perspective on {topic} is valuable",
        "I respect your view on {topic}",
        
        # General conversation
        "Have a great day everyone",
        "Looking forward to our next meeting",
        "Thanks for your time and consideration",
        "I hope you're doing well",
        "Best wishes for your project",
        "That's a good point to consider",
        "I understand what you mean",
        "Let me think about that",
        "That makes sense to me",
        "I see your perspective"
    ]
    
    topics = [
        "technology", "education", "sports", "music", "art", "science",
        "literature", "food", "travel", "health", "fitness", "nature",
        "history", "culture", "business", "environment", "community",
        "family", "friends", "hobbies", "movies", "books", "photography",
        "gaming", "cooking", "design", "architecture", "astronomy"
    ]
    
    num_safe = int(len(harmful_texts) * ratio)
    safe_texts = []
    
    print(f"Generating {num_safe} synthetic safe speech examples...")
    
    for _ in range(num_safe):
        template = np.random.choice(safe_templates)
        if '{topic}' in template:
            topic = np.random.choice(topics)
            text = template.format(topic=topic)
        else:
            text = template
        
        # Add variation
        if np.random.random() > 0.5:
            text = text.capitalize()
        if np.random.random() > 0.7:
            text += "."
        
        safe_texts.append(text)
    
    return safe_texts


In [34]:
# ============================================
# 4. BERT ENCODING
# ============================================
class BERTEncoder:
    def __init__(self, model_name=BERT_MODEL):
        print(f"Loading BERT model: {model_name}")
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        try:
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
            self.model = BertModel.from_pretrained(model_name).to(self.device)
            self.model.eval()
        except Exception as e:
            raise RuntimeError(f"Failed to load BERT model: {e}")
    
    def encode_batch(self, texts, max_length=MAX_LENGTH):
        """Encode a batch of texts"""
        try:
            encoded = self.tokenizer.batch_encode_plus(
                texts,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )
            
            input_ids = encoded['input_ids'].to(self.device)
            attention_mask = encoded['attention_mask'].to(self.device)
            
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                # Use [CLS] token embedding
                embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            
            return embeddings
        except Exception as e:
            raise RuntimeError(f"Error encoding batch: {e}")
    
    def encode_texts(self, texts, batch_size=BATCH_SIZE):
        """Encode all texts in batches"""
        if len(texts) == 0:
            raise ValueError("No texts to encode! Check your JSON files.")
        
        all_embeddings = []
        
        try:
            for i in tqdm(range(0, len(texts), batch_size), desc="Encoding texts"):
                batch = texts[i:i + batch_size]
                embeddings = self.encode_batch(batch)
                all_embeddings.append(embeddings)
            
            return np.vstack(all_embeddings)
        except Exception as e:
            raise RuntimeError(f"Error during text encoding: {e}")


In [35]:
# ============================================
# 5. MAIN PIPELINE
# ============================================
def main():
    print("=" * 60)
    print("BERT + XGBoost Hate Speech Classification Pipeline")
    print("=" * 60)
    
    try:
        # Step 1: Load harmful texts
        print("\n[1/7] Loading harmful texts from JSON files...")
        harmful_texts_raw = load_json_files(JSON_FILES_DIR)
        print(f"Loaded {len(harmful_texts_raw)} harmful texts")
        
        if len(harmful_texts_raw) == 0:
            raise ValueError("No texts were extracted from JSON files! Check your JSON structure.")
        
        # Step 2: Preprocess harmful texts
        print("\n[2/7] Preprocessing harmful texts...")
        harmful_texts = [preprocess_text(text) for text in harmful_texts_raw]
        harmful_texts = [text for text in harmful_texts if len(text) > 10]  # Filter short texts
        print(f"After preprocessing: {len(harmful_texts)} harmful texts")
        
        if len(harmful_texts) == 0:
            raise ValueError("All texts were filtered out during preprocessing!")
        
        # Step 3: Generate safe texts
        print("\n[3/7] Generating synthetic safe speech...")
        safe_texts = generate_safe_speech(harmful_texts, ratio=SAFE_SPEECH_RATIO)
        
        # Create dataset
        all_texts = harmful_texts + safe_texts
        all_labels = [1] * len(harmful_texts) + [0] * len(safe_texts)  # 1=harmful, 0=safe
        
        print(f"\nDataset summary:")
        print(f"  Harmful texts: {len(harmful_texts)}")
        print(f"  Safe texts: {len(safe_texts)}")
        print(f"  Total: {len(all_texts)}")
        
        # Step 4: Encode with BERT
        print("\n[4/7] Encoding texts with BERT...")
        encoder = BERTEncoder()
        embeddings = encoder.encode_texts(all_texts)
        print(f"Embeddings shape: {embeddings.shape}")
        
        # Step 5: Train-test split
        print("\n[5/7] Splitting dataset...")
        X_train, X_test, y_train, y_test = train_test_split(
            embeddings, all_labels, test_size=0.2, random_state=RANDOM_SEED, stratify=all_labels
        )
        print(f"Train set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")
        
        # Step 6: Train XGBoost with efficient approach
        print("\n[6/7] Training XGBoost classifier...")
        
        # Split train into train/validation for monitoring
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
            X_train, y_train, test_size=0.2, random_state=RANDOM_SEED, stratify=y_train
        )
        
        print("Training XGBoost with optimized parameters...")
        
        # Check XGBoost version for compatibility
        xgb_version = tuple(map(int, xgb.__version__.split('.')[:2]))
        
        if xgb_version >= (2, 0):
            # XGBoost 2.0+ syntax
            model = xgb.XGBClassifier(
                objective='binary:logistic',
                max_depth=8,
                learning_rate=0.05,
                n_estimators=300,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.1,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=RANDOM_SEED,
                tree_method='hist',
                n_jobs=-1,
                early_stopping_rounds=50
            )
            
            model.fit(
                X_train_split,
                y_train_split,
                eval_set=[(X_val_split, y_val_split)],
                verbose=False
            )
        else:
            # XGBoost 1.x syntax
            model = xgb.XGBClassifier(
                objective='binary:logistic',
                max_depth=8,
                learning_rate=0.05,
                n_estimators=300,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.1,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=RANDOM_SEED,
                use_label_encoder=False,
                tree_method='hist',
                n_jobs=-1
            )
            
            model.fit(
                X_train_split,
                y_train_split,
                eval_set=[(X_val_split, y_val_split)],
                early_stopping_rounds=50,
                verbose=False
            )
        
        # Validation score
        y_val_pred = model.predict(X_val_split)
        val_f1 = f1_score(y_val_split, y_val_pred)
        print(f"Validation F1 Score: {val_f1:.4f}")
        
        # Get best iteration
        best_n_estimators = getattr(model, 'best_iteration', None)
        if best_n_estimators is None:
            best_n_estimators = 300
        else:
            best_n_estimators = int(best_n_estimators)
        print(f"Best iteration: {best_n_estimators}")
        
        # Retrain on full training set
        print("Training final model on full training set...")
        if xgb_version >= (2, 0):
            best_model = xgb.XGBClassifier(
                objective='binary:logistic',
                max_depth=8,
                learning_rate=0.05,
                n_estimators=best_n_estimators,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.1,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=RANDOM_SEED,
                tree_method='hist',
                n_jobs=-1
            )
        else:
            best_model = xgb.XGBClassifier(
                objective='binary:logistic',
                max_depth=8,
                learning_rate=0.05,
                n_estimators=best_n_estimators,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.1,
                reg_alpha=0.1,
                reg_lambda=1.0,
                random_state=RANDOM_SEED,
                use_label_encoder=False,
                tree_method='hist',
                n_jobs=-1
            )
        
        best_model.fit(X_train, y_train, verbose=False)
        
        # Step 7: Evaluate
        print("\n[7/7] Evaluating model...")
        y_pred = best_model.predict(X_test)
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        print("\n" + "=" * 60)
        print("FINAL RESULTS")
        print("=" * 60)
        print(f"\nAccuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, 
                                    target_names=['Safe', 'Harmful'],
                                    digits=4))
        
        print("\nConfusion Matrix:")
        cm = confusion_matrix(y_test, y_pred)
        print(f"                Predicted")
        print(f"                Safe  Harmful")
        print(f"Actual Safe     {cm[0][0]:4d}  {cm[0][1]:4d}")
        print(f"       Harmful  {cm[1][0]:4d}  {cm[1][1]:4d}")
        
        # Save model and encoder
        print(f"\n[Saving] Saving model to {OUTPUT_DIR}/")
        with open(f"{OUTPUT_DIR}/xgboost_model.pkl", 'wb') as f:
            pickle.dump(best_model, f)
        
        # Save preprocessing info
        model_info = {
            'accuracy': float(accuracy),
            'f1_score': float(f1),
            'num_harmful': len(harmful_texts),
            'num_safe': len(safe_texts),
            'xgboost_version': xgb.__version__,
            'bert_model': BERT_MODEL
        }
        
        with open(f"{OUTPUT_DIR}/model_info.json", 'w') as f:
            json.dump(model_info, f, indent=2)
        
        print("\n✓ Model saved successfully!")
        print("=" * 60)
        
        return best_model, encoder
    
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        raise



In [36]:
# ============================================
# 6. INFERENCE FUNCTION
# ============================================
def predict_text(text, model, encoder):
    """Predict if a text is harmful or safe"""
    try:
        processed = preprocess_text(text)
        if len(processed) == 0:
            return {
                'text': text,
                'prediction': 'UNKNOWN',
                'harmful_probability': 0.0,
                'safe_probability': 0.0,
                'error': 'Text too short after preprocessing'
            }
        
        embedding = encoder.encode_texts([processed])
        prediction = model.predict(embedding)[0]
        probability = model.predict_proba(embedding)[0]
        
        return {
            'text': text,
            'prediction': 'HARMFUL' if prediction == 1 else 'SAFE',
            'harmful_probability': float(probability[1]),
            'safe_probability': float(probability[0])
        }
    except Exception as e:
        return {
            'text': text,
            'prediction': 'ERROR',
            'harmful_probability': 0.0,
            'safe_probability': 0.0,
            'error': str(e)
        }


In [37]:
if __name__ == "__main__":
    try:
        model, encoder = main()
        
        # Test predictions
        print("\n" + "=" * 60)
        print("SAMPLE PREDICTIONS")
        print("=" * 60)
        
        test_samples = [
            "I hate all people from that country",
            "Thank you for your helpful feedback",
            "You're so stupid and worthless",
            "Have a wonderful day everyone!"
        ]
        
        for sample in test_samples:
            result = predict_text(sample, model, encoder)
            print(f"\nText: {result['text']}")
            print(f"Prediction: {result['prediction']}")
            if 'error' not in result:
                print(f"Harmful probability: {result['harmful_probability']:.4f}")
            else:
                print(f"Error: {result['error']}")
    
    except KeyboardInterrupt:
        print("\n\nExecution stopped by user.")
    except Exception as e:
        print(f"\n\nFatal error: {e}")
        import traceback
        traceback.print_exc()

BERT + XGBoost Hate Speech Classification Pipeline

[1/7] Loading harmful texts from JSON files...
Found 701 JSON files

INSPECTING FIRST 3 FILES FOR STRUCTURE

[File 1] 1023940826882293760.json
------------------------------------------------------------
root [dict] with 1 keys
  - 'img_text': str = 'İ'M SLOWLY BEC«MİNG RETARpEp! ...'

[File 2] 1023940897346658307.json
------------------------------------------------------------
root [dict] with 1 keys
  - 'img_text': str = '* 36% 10, ull Verizon LTE 10:37 AM Tweet nicktendo...'

[File 3] 1023943177319919616.json
------------------------------------------------------------
root [dict] with 1 keys
  - 'img_text': str = 'Silverwing Hold Antact Power 0n 1C 36 10 Alliance ...'




Loading all files...


Loading JSON files: 100%|██████████| 701/701 [00:00<00:00, 2445.38it/s]


Loaded 701 harmful texts

[2/7] Preprocessing harmful texts...
After preprocessing: 576 harmful texts

[3/7] Generating synthetic safe speech...
Generating 576 synthetic safe speech examples...

Dataset summary:
  Harmful texts: 576
  Safe texts: 576
  Total: 1152

[4/7] Encoding texts with BERT...
Loading BERT model: bert-base-uncased
Using device: cuda


Encoding texts: 100%|██████████| 36/36 [00:20<00:00,  1.74it/s]


Embeddings shape: (1152, 768)

[5/7] Splitting dataset...
Train set: 921 samples
Test set: 231 samples

[6/7] Training XGBoost classifier...
Training XGBoost with optimized parameters...
Validation F1 Score: 1.0000
Best iteration: 291
Training final model on full training set...

[7/7] Evaluating model...

FINAL RESULTS

Accuracy: 0.9827
F1 Score: 0.9823

Classification Report:
              precision    recall  f1-score   support

        Safe     0.9667    1.0000    0.9831       116
     Harmful     1.0000    0.9652    0.9823       115

    accuracy                         0.9827       231
   macro avg     0.9833    0.9826    0.9827       231
weighted avg     0.9833    0.9827    0.9827       231


Confusion Matrix:
                Predicted
                Safe  Harmful
Actual Safe      116     0
       Harmful     4   111

[Saving] Saving model to models/

✓ Model saved successfully!

SAMPLE PREDICTIONS


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.05it/s]



Text: I hate all people from that country
Prediction: HARMFUL
Harmful probability: 0.9575


Encoding texts: 100%|██████████| 1/1 [00:00<00:00, 48.51it/s]



Text: Thank you for your helpful feedback
Prediction: SAFE
Harmful probability: 0.1071


Encoding texts: 100%|██████████| 1/1 [00:00<00:00, 49.94it/s]



Text: You're so stupid and worthless
Prediction: HARMFUL
Harmful probability: 0.9920


Encoding texts: 100%|██████████| 1/1 [00:00<00:00, 35.70it/s]


Text: Have a wonderful day everyone!
Prediction: SAFE
Harmful probability: 0.0078



