<a href="https://colab.research.google.com/github/snehagandla30/codsoft/blob/main/ML%20Projects/SPAM_SMS_DETECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the dataset into the notebook runtime so the next cell can read it
# from the working directory as 'spam.csv'.
# NOTE(review): this import presumably only resolves inside Google Colab —
# when running elsewhere, skip this cell and place spam.csv beside the notebook.
from google.colab import files
uploaded = files.upload()

Saving spam.csv to spam.csv


In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')  # Change file path if needed

# Step 3: Preprocess the data
# Drop the extra 'Unnamed: N' columns so that only the label column and the
# message-text column remain. errors='ignore' lets this also run on a copy of
# the CSV that has already been trimmed; the two-name rename below still fails
# loudly if the frame does not end up with exactly two columns.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Rename the two remaining columns for easier understanding
data.columns = ['Label', 'SMS']

# Convert the 'Label' column to binary values (ham = 0, spam = 1).
# Series.map() produces NaN for any value that is not a key of the mapping;
# left alone, that only surfaces later as an obscure error while fitting the
# models, so stop here and report the offending labels instead.
label_codes = data['Label'].map({'ham': 0, 'spam': 1})
unknown_labels = data.loc[label_codes.isna(), 'Label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected values in the label column: {list(unknown_labels)}")
data['Label'] = label_codes

# Step 4: Split the data into features (X) and target (y)
X = data['SMS']  # Feature: SMS messages
y = data['Label']  # Target: whether the message is spam (1) or ham (0)

# Step 5: Hold out 20% of the messages as a test set; the fixed random_state
# makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Turn the message text into TF-IDF features (English stop words
# removed, at most 5000 features).
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)  # fitted on the training messages only
X_test_tfidf = vectorizer.transform(X_test)  # test messages reuse the fitted vectorizer

# Step 7: Train models
# All three classifiers are fitted on the same TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)  # Naive Bayes
log_reg_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)  # Logistic Regression
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)  # Support Vector Machine

# Step 8: Make predictions
# One predict() call per fitted model, in the same order as the names on the left.
nb_predictions, log_reg_predictions, svm_predictions = (
    fitted.predict(X_test_tfidf)
    for fitted in (nb_model, log_reg_model, svm_model)
)

# Step 9: Evaluate the models
def report_model(name, y_true, y_pred):
    """Print the accuracy, confusion matrix and classification report of one model.

    Args:
        name: Label used in the "<name> Evaluation:" heading.
        y_true: True labels of the evaluated samples.
        y_pred: Labels predicted by the model for the same samples, in the same order.
    """
    print(f"{name} Evaluation:")
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("Confusion Matrix: \n", confusion_matrix(y_true, y_pred))
    print("Classification Report: \n", classification_report(y_true, y_pred))


# One entry per trained model; adding a model to the comparison only needs a new entry.
model_predictions = {
    "Naive Bayes": nb_predictions,
    "Logistic Regression": log_reg_predictions,
    "Support Vector Machine": svm_predictions,
}
for position, (model_name, predictions) in enumerate(model_predictions.items()):
    if position:
        print()  # blank line between consecutive reports, none before the first
    report_model(model_name, y_test, predictions)

# Step 10: Output predictions (optional, shows a sample of predictions)
print("\nSample of Predictions:")
# X_test and y_test keep the row labels of the original frame, so take the
# first 10 rows by position (.iloc); that lines them up with the prediction
# arrays, which are indexed by position.
for idx, (sms, true_label) in enumerate(zip(X_test.iloc[:10], y_test.iloc[:10])):
    print(f"SMS: {sms}")
    print(
        f"True Label: {true_label}, "
        f"Naive Bayes Prediction: {nb_predictions[idx]}, "
        f"Logistic Regression Prediction: {log_reg_predictions[idx]}, "
        f"SVM Prediction: {svm_predictions[idx]}\n"
    )


Naive Bayes Evaluation:
Accuracy:  0.9730941704035875
Confusion Matrix: 
 [[965   0]
 [ 30 120]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.80      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Logistic Regression Evaluation:
Accuracy:  0.9560538116591928
Confusion Matrix: 
 [[962   3]
 [ 46 104]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.98       965
           1       0.97      0.69      0.81       150

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.89      1115
weighted avg       0.96      0.96      0.95      1115


Support Vector Machine Evaluation:
Accuracy:  0.9811659192825112
Confusion Matrix: 
 [[961  