In [None]:
# pip install gradio

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Using cached gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.11-cp310-none-win_amd64.whl.metadata (52 kB)
     ---------------------------------------- 0.0/52.0 kB ? eta -:--:--
     ---------------------------------------- 52.0/52.0 kB 2.6 MB/s eta 0:00:00
Collecting pydantic>=2.0 (from gradio)
  Using cached pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Collecting pydub (from g

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import gradio as gr
import numpy as np

In [32]:
def load_data(filepath='SMSSpamCollection.csv'):
    """
    Load and prepare the SMS spam dataset with validation
    """
    try:
        # Load the dataset with proper delimiter
        df = pd.read_csv(filepath, delimiter='\t', names=['label', 'message'])
        
        # Drop any rows with NaN values
        df = df.dropna(subset=['label', 'message']).reset_index(drop=True)
        
        # Convert labels to binary format
        df['label'] = df['label'].map({'ham': 0, 'spam': 1})
        
        # Check class distribution
        class_dist = df['label'].value_counts()
        print("Class distribution:\n", class_dist)
        print("\nSample data:\n", df.head())
        
        # Validate that we have at least two classes
        if len(class_dist) < 2:
            print("\nWARNING: Dataset contains only one class:", 
                  "ham" if class_dist.index[0] == 0 else "spam")
            print("Please ensure your dataset contains both spam and ham messages.")
            return None
            
        # Validate minimum samples per class
        min_samples = min(class_dist)
        if min_samples < 2:
            print(f"\nWARNING: Insufficient samples in one or more classes. Minimum required: 2, Found: {min_samples}")
            return None
            
        return df
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None



In [33]:
def train_model(df, use_kfold=False):
    """
    Train the SMS classification model with validation checks
    """
    if df is None:
        raise ValueError("No valid dataset provided. Please check the data loading errors above.")
        
    if len(df) < 2:
        raise ValueError("Dataset too small to train a model.")
        
    features = df['message']
    target = df['label']
    
    # Verify we have at least two classes
    unique_classes = np.unique(target)
    if len(unique_classes) < 2:
        raise ValueError(f"Cannot train binary classifier with only one class: {unique_classes[0]}")
    
    # Create the pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('classifier', LinearSVC(dual=False, max_iter=1000))
    ])

    # Determine if we have enough samples for k-fold
    min_samples_per_class = min(np.bincount(target))
    if use_kfold and min_samples_per_class >= 5:
        n_splits = min(5, min_samples_per_class)
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        
        fold_accuracies = []
        for fold, (train_idx, test_idx) in enumerate(skf.split(features, target), 1):
            X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
            y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]
            
            pipeline.fit(X_train, y_train)
            accuracy = pipeline.score(X_test, y_test)
            fold_accuracies.append(accuracy)
            print(f"Fold {fold} accuracy: {accuracy:.3f}")
        
        print(f"\nAverage accuracy: {np.mean(fold_accuracies):.3f}")
    else:
        # Use simple train-test split
        test_size = min(0.2, 1.0 / min_samples_per_class)  # Adjust test size for small datasets
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=test_size, random_state=42, stratify=target
        )
        pipeline.fit(X_train, y_train)
        accuracy = pipeline.score(X_test, y_test)
        print(f"\nTest accuracy: {accuracy:.3f}")
    
    # Fit final model on all data
    pipeline.fit(features, target)
    return pipeline

In [34]:
def predict_spam(text, model):
    """
    Make prediction on new text with error handling
    """
    if not text or not isinstance(text, str):
        return "Error: Please provide valid text input"
    if not model:
        return "Error: Model not properly trained"
    
    try:
        prediction = model.predict([text])[0]
        confidence = "high"  # Note: LinearSVC doesn't provide probability scores
        result = "spam" if prediction == 1 else "ham"
        return f"Prediction: {result} (confidence: {confidence})"
    except Exception as e:
        return f"Error making prediction: {str(e)}"


In [35]:
def create_gradio_interface(model):
    """
    Create and launch Gradio interface for the spam classifier
    """
    def classify_text(message):
        return predict_spam(message, model)

    interface = gr.Interface(
        fn=classify_text,
        inputs=gr.Textbox(
            lines=2,
            placeholder="Enter SMS text here...",
            label="Message Text"
        ),
        outputs=gr.Textbox(label="Classification Result"),
        title="SMS Spam Detector",
        description="Classify SMS messages as spam or ham (non-spam).",
        examples=[
            ["Congratulations! You've won a $1000 prize! Click here to claim now!"],
            ["Hey, can we meet at 6pm for dinner tonight?"]
        ]
    )
    return interface

In [31]:
# Main execution
if __name__ == "__main__":
    # Load and prepare data
    df = load_data()
    
    if df is not None:
        # Train the model
        model = train_model(df, use_kfold=True)
        
        # Create and launch the Gradio interface
        interface = create_gradio_interface(model)
        interface.launch()

Class distribution:
 label
0    2
Name: count, dtype: int64

Sample data:
    label                                            message
0      0        Yeah, give me a call if you've got a minute
1      0  HI BABE UAWAKE?FEELLIKW SHIT.JUSTFOUND OUT VIA...


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [None]:
if __name__ == "__main__":
    print("Loading SMS spam detection system...")
    print("Please ensure your dataset contains both spam and ham messages.")
    
    # Load and prepare data
    df = load_data()
    
    if df is not None:
        try:
            # Train the model
            model = train_model(df, use_kfold=True)
            
            # Create and launch the Gradio interface
            interface = create_gradio_interface(model)
            interface.launch()
        except Exception as e:
            print(f"\nError: {str(e)}")
            print("Please check your dataset and try again.")
    else:
        print("\nFailed to initialize the system due to data loading errors.")
        print("Please ensure your dataset is properly formatted and contains both spam and ham messages.")

Loading SMS spam detection system...
Please ensure your dataset contains both spam and ham messages.
Class distribution:
 label
0    2
Name: count, dtype: int64

Sample data:
    label                                            message
0      0        Yeah, give me a call if you've got a minute
1      0  HI BABE UAWAKE?FEELLIKW SHIT.JUSTFOUND OUT VIA...

Please ensure your dataset contains both spam and ham messages.

Failed to initialize the system due to data loading errors.
Please ensure your dataset is properly formatted and contains both spam and ham messages.
