<a href="https://colab.research.google.com/github/sathwikadonagani/Assignments-Jala-Academy-/blob/main/Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ======================================================
# Novel Spam Classification using XGBoost
# with Custom Algorithmic Functions
# (Research-Level Modification)
# ======================================================

import sys
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

# -----------------------------
# NLTK Setup
# -----------------------------
def setup_nltk():
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

# -----------------------------
# Text Preprocessing
# -----------------------------
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# =====================================================
# üî¨ NEW RESEARCH FUNCTIONS (NOT PREDEFINED)
# =====================================================

# 1Ô∏è‚É£ Linguistic Saliency Function (LSF)
def linguistic_saliency(text, alpha=0.6, beta=0.3, gamma=0.1):
    spam_keywords = ['free', 'win', 'urgent', 'offer', 'limited', 'click', 'buy']
    text = str(text).lower()

    K = sum(text.count(word) for word in spam_keywords)   # keyword density
    P = text.count('!') + text.count('?')                 # punctuation intensity
    C = sum(1 for w in text.split() if w.isupper())       # capitalization

    score = alpha * K + beta * P + gamma * C
    return np.tanh(score)  # bounded confidence score (0 to 1)

# 2Ô∏è‚É£ Adaptive Confidence Weighting Function (ACWF)
def adaptive_confidence_weight(text, base=1.0, lam=0.8):
    lsf = linguistic_saliency(text)
    return base + lam * lsf

# 3Ô∏è‚É£ Entropy-Guided Uncertainty Penalty (EGUP)
def entropy_uncertainty(p):
    eps = 1e-9
    return - (p * np.log(p + eps) + (1 - p) * np.log(1 - p + eps))

# 4Ô∏è‚É£ Semantic Weight Refinement Function (SWRF)
def semantic_weight_refinement(base_weight, predicted_prob, mu=0.5):
    entropy = entropy_uncertainty(predicted_prob)
    return base_weight * (1 - mu * entropy)

# -----------------------------
# Load Data
# -----------------------------
def load_data(filepath):
    df = pd.read_csv(filepath)
    df.dropna(inplace=True)
    df['label'] = df['label'].astype(int)
    df['processed_text'] = df['text'].apply(preprocess_text)
    return df

# -----------------------------
# Main Execution
# -----------------------------
if __name__ == "__main__":

    DATASET_PATH = 'combined_data.csv'

    try:
        setup_nltk()

        # Load Dataset
        df = load_data(DATASET_PATH)
        X = df['processed_text']
        y = df['label']
        X_raw = df['text']

        # Train-Test Split
        X_train, X_test, y_train, y_test, X_train_raw, X_test_raw = train_test_split(
            X, y, X_raw, test_size=0.2, random_state=42, stratify=y
        )

        # TF-IDF Vectorization
        vectorizer = TfidfVectorizer(max_features=5000)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)

        # =====================================================
        # üîß APPLY NEW ALGORITHMIC FUNCTIONS
        # =====================================================

        # Step 1: Compute Adaptive Confidence Weights
        base_weights = np.array([adaptive_confidence_weight(text) for text in X_train_raw])

        # Initial Model Training
        model = XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            use_label_encoder=False,
            random_state=42
        )

        model.fit(X_train_tfidf, y_train, sample_weight=base_weights)

        # Step 2: Predict Probabilities
        y_train_prob = model.predict_proba(X_train_tfidf)[:, 1]

        # Step 3: Refine Weights using Entropy-Guided Penalty
        refined_weights = np.array([
            semantic_weight_refinement(w, p) for w, p in zip(base_weights, y_train_prob)
        ])

        # Step 4: Retrain with Semantic Gradient Scaling
        model.fit(X_train_tfidf, y_train, sample_weight=refined_weights)

        # =====================================================
        # Evaluation
        # =====================================================
        y_pred = model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred, target_names=['Ham (0)', 'Spam (1)'])

        print("\nüî¨ Novel Model: Semantic-Weighted XGBoost with Custom Learning Functions")
        print(f"\nAccuracy: {accuracy:.4f}")
        print("\nConfusion Matrix:")
        print(cm)
        print("\nClassification Report:")
        print(report)

    except FileNotFoundError:
        print(f"Error: Dataset file not found at '{DATASET_PATH}'")
        print("Please update the dataset path.")

    except Exception as e:
        print(f"An error occurred: {e}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



üî¨ Novel Model: Semantic-Weighted XGBoost with Custom Learning Functions

Accuracy: 0.9757

Confusion Matrix:
[[7591  317]
 [  88 8694]]

Classification Report:
              precision    recall  f1-score   support

     Ham (0)       0.99      0.96      0.97      7908
    Spam (1)       0.96      0.99      0.98      8782

    accuracy                           0.98     16690
   macro avg       0.98      0.97      0.98     16690
weighted avg       0.98      0.98      0.98     16690

