<a href="https://colab.research.google.com/github/samir1120k/Machine-Learning-Project/blob/Email-Spam-detection/exp_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
import io
import requests
import zipfile

# SMS Spam Collection archive hosted by the UCI Machine Learning Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Download the zip file. An explicit timeout is passed because requests
# otherwise waits indefinitely on a stalled connection, which would hang the
# whole notebook run.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an exception for bad responses (4xx/5xx)

# Extract the 'SMSSpamCollection' member straight from the in-memory archive
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    with zip_file.open('SMSSpamCollection') as file:
        # Tab-separated, no header row: column 0 is the label, column 1 the text.
        # NOTE(review): the default quoting treats '"' as a quote character, so
        # messages with unbalanced quotes may be merged into one row -- compare
        # len(data) with the published dataset size and consider
        # quoting=csv.QUOTE_NONE if rows are missing. TODO confirm.
        data = pd.read_csv(file, sep='\t', header=None, names=["Label", "Message"])

print(data.head())

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Function to clean the text
def clean_text(text):
    """Normalize a raw message before it is vectorized.

    Steps, in order:
      1. every non-word character (anything outside ``\\w``) becomes a space;
         underscores are word characters and are therefore kept;
      2. digit runs are deleted (not replaced by a space, so ``abc123def``
         becomes ``abcdef``);
      3. the text is lower-cased;
      4. whitespace runs are collapsed to a single space and the ends trimmed.

    Whitespace is collapsed *last* so that the gaps left behind by removed
    digits and punctuation cannot leave double, leading or trailing spaces
    in the result.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The normalized message; an empty string if nothing but punctuation,
        digits and whitespace was present.
    """
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.lower()  # Convert to lowercase
    # Collapse after all removals so no extra spaces survive, then trim the ends
    return re.sub(r'\s+', ' ', text).strip()

# Store a normalized copy of every message next to the raw text;
# the original 'Message' column is left as it was.
data['Cleaned_Message'] = data['Message'].map(clean_text)


In [None]:
# The labels (spam or ham), encoded as ham -> 0, spam -> 1
y = data['Label'].map({'ham': 0, 'spam': 1})

# Split the *text* into training and testing sets before any feature is
# learned. Fitting the vectorizer on the full corpus would let the vocabulary
# and IDF weights see the test messages (data leakage) and make the
# evaluation below optimistic.
messages_train, messages_test, y_train, y_test = train_test_split(
    data['Cleaned_Message'], y, test_size=0.2, random_state=42
)

# Create a TF-IDF vectorizer (English stop words removed, vocabulary capped
# at 5000 terms)
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000)

# Learn vocabulary and IDF weights from the training messages only, then
# project the held-out messages into that same feature space
X_train = tfidf_vectorizer.fit_transform(messages_train)
X_test = tfidf_vectorizer.transform(messages_test)

# Whole corpus in the train-fitted feature space; kept so that any code
# that still refers to `X` continues to work
X = tfidf_vectorizer.transform(data['Cleaned_Message'])


In [None]:
# Support vector classifier with a linear kernel
svm_model = SVC(kernel='linear')

# Fit the classifier on the training features and their labels
svm_model.fit(X=X_train, y=y_train)


In [None]:
# Label every held-out message with the trained model
y_pred = svm_model.predict(X_test)

# Compare the predictions with the true labels
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Show per-class precision/recall/F1, then the raw confusion counts
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", confusion)


Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix:
 [[965   1]
 [ 14 135]]


In [None]:
def predict_spam(message):
    """Classify one raw message as spam or ham.

    The message is normalized with ``clean_text``, vectorized with the
    already-fitted ``tfidf_vectorizer`` and labelled by the trained
    ``svm_model``; all three are module-level names defined in earlier cells.

    Parameters
    ----------
    message : str
        The raw message text.

    Returns
    -------
    str
        ``"Spam"`` when the predicted label is 1, otherwise ``"Ham"``.
    """
    cleaned = clean_text(message)
    # transform() expects a collection of documents, hence the one-item list
    features = tfidf_vectorizer.transform([cleaned])
    # predict() returns one label per input row; pull out the single label
    # explicitly instead of relying on the truth value of a one-element array
    label = svm_model.predict(features)[0]
    return "Spam" if label == 1 else "Ham"

# Try the classifier on a single hand-written message
new_message = "Free entry in 2 a wkly comp to win FA Cup"
verdict = predict_spam(new_message)
print(verdict)


Spam
