## Dataset:
#### The SMS Spam Collection is a set of tagged SMS messages collected for SMS spam research. It contains one set of 5,574 English SMS messages, each tagged as either ham (legitimate) or spam.
## Objective:
#### Build an AI model that can classify SMS messages as spam or legitimate. Use techniques like TF-IDF or word embeddings with classifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix

In [66]:
from sklearn.metrics import confusion_matrix

In [67]:
import numpy as np
import pandas as pd
import nltk
import random
import string

from nltk.corpus import movie_reviews

#pip install gensim
from gensim.models import Word2Vec

In [68]:
# Read the raw SMS spam CSV, decoding the file as Latin-1.
df = pd.read_csv("spam.csv", encoding='latin1')

In [69]:
# Preview the first 20 rows of the raw frame, with the column names as read from the CSV.
df.head(20)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [70]:
# Find the total number of SMS messages (rows) in the data
len(df)

5572

# 1. Load the dataset:

In [71]:
import pandas as pd

# Read the raw CSV and discard the three unnamed trailing columns in one step.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = pd.read_csv("spam.csv", encoding='latin1').drop(columns=unused_columns)

# The two remaining columns (v1, v2) become 'label' and 'text', in that order.
df.columns = ['label', 'text']

# Show the first five rows as plain text.
print(df.head())


  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


# 2.Preprocess the text data:

In [72]:
import string
import nltk
from nltk.tokenize import word_tokenize

# NOTE: this cell overwrites df['text'] with token lists, so it is not
# re-runnable on its own; reload df before executing it a second time.

# Normalise case, then strip every character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)
lowercased = df['text'].str.lower()
without_punctuation = lowercased.apply(lambda message: message.translate(punctuation_table))

# Replace each message with its list of word tokens.
df['text'] = without_punctuation.apply(word_tokenize)

# Show the first rows of the tokenised frame.
print(df.head())


  label                                               text
0   ham  [go, until, jurong, point, crazy, available, o...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, in, 2, a, wkly, comp, to, win, f...
3   ham  [u, dun, say, so, early, hor, u, c, already, t...
4   ham  [nah, i, dont, think, he, goes, to, usf, he, l...


# 3. Convert text data into numerical representation using TF-IDF:

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join each token list into a single space-separated string, one per message.
preprocessed_texts = list(map(' '.join, df['text']))

# Learn the vocabulary and IDF weights and build the document-term matrix.
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split performed later, so the IDF statistics also see messages
# that end up in the test set.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_texts)

# Report (number of messages, vocabulary size).
print("TF-IDF Matrix Shape:", X.shape)


TF-IDF Matrix Shape: (5572, 9485)


# 4. Split the dataset into training and testing sets:

In [74]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Report the feature-matrix and label shapes of both partitions.
print("Training Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)


Training Set Shape: (4457, 9485) (4457,)
Testing Set Shape: (1115, 9485) (1115,)


# 5.Train a machine learning model:

In [75]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model with default hyperparameters on the training split.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X=X_train, y=y_train)


# 6.Evaluate the model's performance:

In [76]:
from sklearn.metrics import accuracy_score, classification_report

# Label every held-out message with the fitted classifier.
y_pred = naive_bayes_classifier.predict(X_test)

# Overall accuracy first, then the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 0.95695067264574
Classification Report:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98       965
        spam       1.00      0.68      0.81       150

    accuracy                           0.96      1115
   macro avg       0.98      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115



# 7.Use the trained model to classify new SMS messages:

In [77]:
# Two hand-written example messages to run through the fitted pipeline.
new_sms_messages = ["Congratulations! You've won a prize!", "Hello, how are you?"]

# Apply the same steps used on the training text: lowercase, strip
# punctuation, tokenize, then re-join the tokens with single spaces.
punctuation_table = str.maketrans('', '', string.punctuation)
preprocessed_new_sms_messages = []
for raw_message in new_sms_messages:
    tokens = word_tokenize(raw_message.lower().translate(punctuation_table))
    preprocessed_new_sms_messages.append(' '.join(tokens))

# Vectorize with the already-fitted TF-IDF vocabulary (transform only, no refit).
new_sms_tfidf = tfidf_vectorizer.transform(preprocessed_new_sms_messages)

# Predict a label for every new message.
predictions = naive_bayes_classifier.predict(new_sms_tfidf)

# Print each message next to its predicted label, followed by a blank line.
for message, prediction in zip(new_sms_messages, predictions):
    print("SMS:", message)
    print("Prediction:", prediction, end="\n\n")


SMS: Congratulations! You've won a prize!
Prediction: spam

SMS: Hello, how are you?
Prediction: ham



# Naive Bayes Classifier:

### 1 Load the dataset:

In [84]:
import pandas as pd

# Reload the raw CSV into a separate frame; this section works with the
# original column names (v1 = label, v2 = message text).
df1 = pd.read_csv("spam.csv", encoding='latin-1')


### 2.Preprocess the text data:

In [85]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_text(text):
    """Lowercase ``text``, tokenize it with ``word_tokenize`` and return the
    tokens joined by single spaces."""
    lowered = text.lower()
    return ' '.join(word_tokenize(lowered))

# Store the normalised form of every message (column 'v2') in a new column.
df1['preprocessed_text'] = df1['v2'].apply(preprocess_text)


### 3.Convert text data into numerical representation using TF-IDF:

In [86]:
# Target labels come straight from the raw 'v1' column.
y = df1['v1']

# Fit a fresh TF-IDF vectorizer on the preprocessed messages and build the feature matrix.
# NOTE(review): fitting happens on all rows before the train/test split, so
# the IDF statistics include messages that later land in the test set.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df1['preprocessed_text'])


### 4.Split the dataset into training and testing sets:

In [87]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [88]:
### 5.Train Naive Bayes classifier:

In [89]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyperparameters, fitted on the training split.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X=X_train, y=y_train)


In [90]:
### 6.Evaluate Naive Bayes classifier's performance:

In [91]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out messages and score them against the truth.
y_pred_nb = naive_bayes_classifier.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print("Naive Bayes Classifier Accuracy:", accuracy_nb)
print("Naive Bayes Classification Report:", classification_report(y_test, y_pred_nb), sep="\n")


Naive Bayes Classifier Accuracy: 0.9623318385650225
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



# Logistic Regression Classifier:

In [92]:
### 1.Train Logistic Regression classifier:

In [93]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with default hyperparameters, fitted on the same training split.
logistic_regression_classifier = LogisticRegression()
logistic_regression_classifier.fit(X=X_train, y=y_train)


In [94]:
### 2.Evaluate Logistic Regression classifier's performance:


In [95]:
# Predict labels for the held-out messages and score them against the truth.
y_pred_lr = logistic_regression_classifier.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print("Logistic Regression Classifier Accuracy:", accuracy_lr)
print("Logistic Regression Classification Report:", classification_report(y_test, y_pred_lr), sep="\n")


Logistic Regression Classifier Accuracy: 0.9632286995515695
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

