In [1]:
import pandas as pd
import re
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
def clean_text(text):
    """Normalize a raw message for vectorization.

    Strips URLs, @mentions and #hashtags, removes every character that is
    not an ASCII letter, digit, or whitespace, then lowercases and trims
    the result.

    Parameters
    ----------
    text : str or any
        The raw message. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as an empty message; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lowercased text (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"\@\w+|\#\w+", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # Remove special characters
    return text.lower().strip()

In [3]:
# Read the raw dataset from spam.csv, decoding the file as latin-1.
spam_data = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

In [4]:
# Clean the message text and encode the labels (ham -> 0, spam -> 1).
label_map = {"ham": 0, "spam": 1}

spam_data["Message"] = spam_data["Message"].apply(clean_text)

# Series.map turns every value missing from the mapping into NaN, so re-running
# this cell on already-encoded labels would wipe the column. Only encode while
# the column still holds raw labels, and fail loudly on anything unexpected
# instead of letting NaN labels reach the stratified split.
if not spam_data["Category"].isin(list(label_map.values())).all():
    encoded_category = spam_data["Category"].map(label_map)
    unmapped = encoded_category.isna()
    if unmapped.any():
        unknown = sorted(spam_data.loc[unmapped, "Category"].astype(str).unique())
        raise ValueError(
            f"Unexpected Category values (expected 'ham'/'spam'): {unknown}"
        )
    spam_data["Category"] = encoded_category

# Spam Data Splitting
# 80/20 split with a fixed seed, stratified so both parts keep the same
# ham/spam proportions.
spam_train, spam_test = train_test_split(
    spam_data,
    test_size=0.2,
    random_state=42,
    stratify=spam_data["Category"],
)

In [5]:
# Two-stage text classifier: TF-IDF features (English stop words removed,
# vocabulary capped at 5000 terms) feeding a liblinear logistic regression.
spam_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
        ("classifier", LogisticRegression(solver="liblinear")),
    ]
)

In [6]:
# Fit the vectorizer and the classifier on the training split only.
spam_pipeline.fit(
    spam_train["Message"],
    y=spam_train["Category"],
)

In [7]:
class CombinedModel:
    """Bundle a fitted spam classifier with its text cleaning and reporting.

    Parameters
    ----------
    spam_pipeline : object
        A fitted classifier exposing ``predict_proba(list_of_str)``.
    label_encoder : object, optional
        Stored on the instance as ``label_encoder``. No method of this class
        reads it; it is kept so existing two-argument callers keep working.
    """

    def __init__(self, spam_pipeline, label_encoder=None):
        self.spam_pipeline = spam_pipeline
        self.label_encoder = label_encoder

    def clean_text(self, text):
        """Return ``text`` stripped of URLs, mentions, hashtags and symbols.

        The result is lowercased and trimmed. ``None`` and float NaN are
        treated as an empty message; other non-string values are converted
        with ``str()`` before cleaning.
        """
        if not isinstance(text, str):
            # NaN is the only float that is not equal to itself.
            if text is None or (isinstance(text, float) and text != text):
                return ""
            text = str(text)

        text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
        text = re.sub(r"\@\w+|\#\w+", "", text)  # Remove mentions and hashtags
        text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # Remove special characters
        return text.lower().strip()

    def classify_text_clean(self, text, threshold=0.5):
        """Classify one message and return a two-line, human-readable report.

        Parameters
        ----------
        text : str
            The raw message; it is cleaned with ``clean_text`` before scoring
            and echoed unchanged in the report.
        threshold : float, optional
            The message is labelled "Spam" when its spam probability is
            strictly greater than this value. Defaults to 0.5.

        Returns
        -------
        str
            ``"Text: ...\\nSpam Detection: <label> (Spam Probability: <pct>%)"``
            with no leading indentation on either line.
        """
        cleaned = self.clean_text(text)

        # First (only) row, second column of the probability matrix.
        # NOTE(review): assumes column 1 is the spam class, i.e. labels were
        # encoded ham=0 / spam=1 when the pipeline was fitted - confirm.
        spam_prob = float(self.spam_pipeline.predict_proba([cleaned])[0][1])
        spam_label = "Spam" if spam_prob > threshold else "Not Spam"

        # Join explicit lines rather than using an indented triple-quoted
        # f-string, which leaked the source indentation into the second line.
        report_lines = [
            f"Text: {text}",
            f"Spam Detection: {spam_label} "
            f"(Spam Probability: {round(spam_prob * 100, 2)}%)",
        ]
        return "\n".join(report_lines)


In [9]:
# Wrap the fitted pipeline and persist the wrapper to Spam.pkl.
# Pass a LabelEncoder *instance*: the original handed over the class object
# itself, which is not a usable encoder.
# NOTE(review): pickle stores classes by reference, so loading Spam.pkl
# presumably requires CombinedModel to be defined under the same module name
# in the loading process - confirm before using the file outside this notebook.
combined_model = CombinedModel(spam_pipeline, LabelEncoder())
joblib.dump(combined_model, "Spam.pkl")
print("Combined Model saved successfully!")

Combined Model saved successfully!


In [11]:
# Round-trip check: reload the persisted model from disk (rebinding
# combined_model to the loaded copy) and classify one sample message.
sample_text = "They were beating me yesterday"
combined_model = joblib.load("Spam.pkl")
result = combined_model.classify_text_clean(sample_text)
print(result)

Text: They were beating me yesterday
        Spam Detection: Not Spam (Spam Probability: 7.51%)


In [13]:
# Score the fitted pipeline on the held-out split: overall accuracy plus a
# per-class precision/recall/F1 report.
spam_predictions = spam_pipeline.predict(spam_test["Message"])

report_spam = classification_report(
    y_true=spam_test["Category"],
    y_pred=spam_predictions,
    target_names=["Not Spam", "Spam"],
)
accuracy_spam = accuracy_score(
    y_true=spam_test["Category"],
    y_pred=spam_predictions,
)

print(f"Spam Accuracy Score: {accuracy_spam:.2f}")
print("\nSpam Classification Report:\n")
print(report_spam)

Spam Accuracy Score: 0.97

Spam Classification Report:

              precision    recall  f1-score   support

    Not Spam       0.96      1.00      0.98       966
        Spam       1.00      0.74      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.97      0.96      1115

