In [None]:
# 1. Import libraries and load data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 2. Load and preprocess
# The CSV is read as latin-1 and the 'ham'/'spam' labels are encoded as 0/1.
data = pd.read_csv("/content/mail_data.csv", encoding= "latin-1")
data['Category'] = data.Category.map({'ham': 0, 'spam': 1})
# Hold out 20% of the messages; the fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(data['Message'], data['Category'], test_size=0.2, random_state=1)

# 3. Fit CountVectorizer on training data
# The vocabulary is learned from the training split only and then reused for
# the test split. The matrices are kept sparse: calling .toarray() would
# allocate a dense (messages x vocabulary) array for no benefit, because
# LogisticRegression accepts sparse input directly.
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)  # not used further in this cell

# 4. Train classifier
clf = LogisticRegression(random_state=0).fit(training_data, y_train)

# 5. Test with sample input
sample_messages = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your account has been compromised. Verify your identity now.",
    "I'll be there in 10 minutes.",
    "Win a brand new car just by answering this simple question!"
]
# The samples must go through the already-fitted vectorizer so their columns
# line up with the features the classifier was trained on.
sample_data = count_vector.transform(sample_messages)
sample_predictions = clf.predict(sample_data)

# 6. Show results
# Label 1 was mapped from 'spam' above; anything else is reported as not spam.
for msg, pred in zip(sample_messages, sample_predictions):
    label = 'Spam' if pred == 1 else 'Not Spam'
    print(f"Message: {msg}\nPrediction: {label}\n")


Message: Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.
Prediction: Spam

Message: Hey, are we still meeting for lunch today?
Prediction: Not Spam

Message: URGENT! Your account has been compromised. Verify your identity now.
Prediction: Not Spam

Message: I'll be there in 10 minutes.
Prediction: Not Spam

Message: Win a brand new car just by answering this simple question!
Prediction: Not Spam



In [None]:
# Reload the CSV from disk. This rebinds `data` to the frame as stored in the
# file, i.e. without the 0/1 label encoding applied in the first cell.
data = pd.read_csv("/content/mail_data.csv", encoding= "latin-1")
# Preview only the first rows instead of displaying the whole frame.
data.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load dataset
# Loaded here so this cell runs on a fresh kernel instead of relying on
# X_train / y_train left behind by an earlier cell.
data = pd.read_csv("/content/mail_data.csv", encoding= "latin-1")

# Convert labels to binary: ham -> 0, spam -> 1
data['Category'] = data.Category.map({'ham': 0, 'spam': 1})

# Split with the same test_size / random_state as the other cells of this
# notebook so the reported metrics stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'], data['Category'], test_size=0.2, random_state=1
)

# Vectorize text
# The vocabulary is learned from the training split only.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Sample test messages
sample_messages = [
    "Free entry in 2 a weekly competition to win tickets!",
    "Hey, are we still on for dinner tonight?",
    "URGENT! Your Mobile No. has won £2000!",
    "I'll call you later, I'm in a meeting.",
    "Congratulations! You won a free cruise. Call now!"
]
sample_vec = vectorizer.transform(sample_messages)

# Define models
# Keys are display names; they are printed as section headers by the
# training loop that iterates over this dict.
models = {
    "Logistic Regression": LogisticRegression(random_state=0),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(random_state=0)
}

# Fit each candidate model, report its metrics on the held-out split, then
# classify the hand-written sample messages.
for name, model in models.items():
    print(f"\n{'='*10} {name} {'='*10}")

    # fit() returns the fitted estimator, so training and test-set prediction
    # are chained into one expression.
    test_preds = model.fit(X_train_vec, y_train).predict(X_test_vec)

    # Held-out evaluation (class 0 is reported as 'Ham', class 1 as 'Spam')
    report = classification_report(y_test, test_preds, target_names=['Ham', 'Spam'])
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion_matrix(y_test, test_preds))

    # Sample messages, vectorized with the same fitted vectorizer
    sample_preds = model.predict(sample_vec)
    print("\nSample Message Predictions:")
    for msg, label in zip(sample_messages, sample_preds):
        print(f"\"{msg}\" --> {'Spam' if label == 1 else 'Ham'}")



Classification Report:
               precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       968
        Spam       0.99      0.91      0.95       147

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix:
 [[967   1]
 [ 13 134]]

Sample Message Predictions:
"Free entry in 2 a weekly competition to win tickets!" --> Ham
"Hey, are we still on for dinner tonight?" --> Ham
"URGENT! Your Mobile No. has won £2000!" --> Ham
"I'll call you later, I'm in a meeting." --> Ham
"Congratulations! You won a free cruise. Call now!" --> Spam

Classification Report:
               precision    recall  f1-score   support

         Ham       0.92      1.00      0.96       968
        Spam       1.00      0.41      0.59       147

    accuracy                           0.92      1115
   macro avg       0.96      0.71      0.77      1115
weighte

In [None]:
import joblib
import os

# Make sure the output directory exists before anything is written into it
os.makedirs("bbjjnjn", exist_ok=True)

# Pickle every estimator in `models`; the file name is the display name with
# spaces turned into underscores and lower-cased.
for name, model in models.items():
    slug = name.replace(' ', '_').lower()
    filename = f"bbjjnjn/{slug}_model.pkl"
    joblib.dump(model, filename)
    print(f"{name} saved to {filename}")


Logistic Regression saved to bbjjnjn/logistic_regression_model.pkl
K-Nearest Neighbors saved to bbjjnjn/k-nearest_neighbors_model.pkl
Support Vector Machine saved to bbjjnjn/support_vector_machine_model.pkl
Random Forest saved to bbjjnjn/random_forest_model.pkl


In [None]:
# Persist the vectorizer that produced the features the saved models were
# fitted on (`vectorizer`), not the separately fitted `count_vector`, so the
# pickled models and the pickled vectorizer always belong together.
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

# 1. Load dataset
data = pd.read_csv("/content/mail_data.csv", encoding="latin-1")

# 2. Encode labels (ham -> 0, spam -> 1) and hold out 20% of the messages
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Category'],
    test_size=0.2,
    random_state=1,
)

# 3. Define vectorizers
# The keys are reused as part of the pickle file names written further down.
vectorizers = {
    'bow': CountVectorizer(),
    'tfidf': TfidfVectorizer()
}

# 4. Sample messages
sample_messages = [
    "Free entry in 2 a weekly competition to win tickets!",
    "Hey, are we still on for dinner tonight?",
    "URGENT! Your Mobile No. has won £2000!",
    "I'll call you later, I'm in a meeting.",
    "Congratulations! You won a free cruise. Call now!"
]

# 5. Models to train
# Keys are display names; insertion order is the order the models are run in.
models = {
    "Logistic Regression": LogisticRegression(random_state=0),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(random_state=0),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=0)
}

# 6. Train and evaluate
# Outer loop: one pass per text vectorizer. Inner loop: every model is fitted
# on that vectorizer's features, evaluated, and pickled. The estimator objects
# in `models` are fitted again on each outer pass, and each fit is pickled
# right after training.
for vec_name, vectorizer in vectorizers.items():
    print(f"\n{'='*40}\nUsing Vectorizer: {vec_name.upper()}\n{'='*40}")

    # Learn the vocabulary from the training split, then reuse it unchanged
    # for the test split and the sample messages.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    sample_vec = vectorizer.transform(sample_messages)

    # Persist the fitted vectorizer
    joblib.dump(vectorizer, f"{vec_name}_vectorizer.pkl")

    for model_name, model in models.items():
        print(f"\n----- {model_name} -----")

        # fit() returns the fitted estimator, so training and test-set
        # prediction are chained into one expression.
        test_preds = model.fit(X_train_vec, y_train).predict(X_test_vec)

        # Held-out evaluation (class 0 is reported as 'Ham', class 1 as 'Spam')
        report = classification_report(y_test, test_preds, target_names=['Ham', 'Spam'])
        print("Classification Report:\n", report)
        print("Confusion Matrix:\n", confusion_matrix(y_test, test_preds))

        # Predictions for the hand-written sample messages
        sample_preds = model.predict(sample_vec)
        print("\nSample Message Predictions:")
        for msg, label in zip(sample_messages, sample_preds):
            print(f"\"{msg}\" --> {'Spam' if label == 1 else 'Ham'}")

        # One pickle per (model, vectorizer) pair, named after both
        safe_model_name = model_name.replace(" ", "_").lower()
        joblib.dump(model, f"{safe_model_name}_{vec_name}.pkl")



Using Vectorizer: BOW

----- Logistic Regression -----
Classification Report:
               precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       968
        Spam       0.99      0.91      0.95       147

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix:
 [[967   1]
 [ 13 134]]

Sample Message Predictions:
"Free entry in 2 a weekly competition to win tickets!" --> Ham
"Hey, are we still on for dinner tonight?" --> Ham
"URGENT! Your Mobile No. has won £2000!" --> Ham
"I'll call you later, I'm in a meeting." --> Ham
"Congratulations! You won a free cruise. Call now!" --> Spam

----- KNN -----
Classification Report:
               precision    recall  f1-score   support

         Ham       0.92      1.00      0.96       968
        Spam       1.00      0.41      0.59       147

    accuracy                           0.92 