<a href="https://colab.research.google.com/github/rutujas11/CODSOFT/blob/main/sms_spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive


In [None]:
# Mount Google Drive into the Colab filesystem so the dataset under
# /content/drive can be read by later cells.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [None]:
# Read the SMS corpus from the mounted Drive folder.
# latin-1 decoding is requested explicitly — presumably the file is not
# valid UTF-8; TODO confirm against the raw CSV.
df = pd.read_csv(
    '/content/drive/MyDrive/codsoftdata/spam.csv',
    encoding='latin-1',
)

In [None]:
# Peek at the first five rows to see what the raw columns contain.
print(df.head(n=5))

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
# List the column labels to identify which ones hold the label and the text.
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
# Pick out the model inputs and targets.
# In this CSV the ham/spam label lives in column 'v1' and the raw SMS text
# in column 'v2' (see the df.head() preview); the 'Unnamed: *' columns are
# not used.
y = df['v1']
X = df['v2']

In [None]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified even though the evaluation
# reports show far fewer spam than ham messages — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the first five training messages under a heading line.
print("train dataset : ", X_train.head(5), sep="\n")

train dataset : 
1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
Name: v2, dtype: object


In [None]:
# Show the first five held-out messages under a heading line.
print("test dataset : ", X_test.head(5), sep="\n")

test dataset : 
3245    Funny fact Nobody teaches volcanoes 2 erupt, t...
944     I sent my scores to sophas and i had to do sec...
1044    We know someone who you know that fancies you....
2484    Only if you promise your getting out as SOON a...
812     Congratulations ur awarded either å£500 of CD ...
Name: v2, dtype: object


In [None]:
# Convert the raw text into TF-IDF features.
# The vocabulary (English stop words removed, capped at 5000 terms) is
# learned from the training messages only; the test messages are then
# projected onto that same fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Fit a multinomial Naive Bayes model (default settings) on the TF-IDF
# training features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Score the Naive Bayes model on the held-out messages.
nb_predictions = nb_classifier.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_predictions)  # computed once, reported twice

print("Naive Bayes Accuracy:", nb_accuracy)
print("ACCURACY FOR NAIVE BAYES IN % : ", nb_accuracy * 100, "%")

# Per-class precision/recall/F1, followed by the raw confusion counts
# (rows are true labels, columns are predicted labels).
print("Naive Bayes Classification Report:", classification_report(y_test, nb_predictions), sep="\n")
print("Naive Bayes Confusion Matrix:", confusion_matrix(y_test, nb_predictions), sep="\n")

Naive Bayes Accuracy: 0.9721973094170404
ACCURACY FOR NAIVE BAYES IN % :  97.21973094170404 %
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Naive Bayes Confusion Matrix:
[[965   0]
 [ 31 119]]


In [None]:
# Fit a logistic regression model (default settings) on the same TF-IDF
# training features.
lr_classifier = LogisticRegression()
lr_classifier.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Score the logistic regression model on the held-out messages.
lr_predictions = lr_classifier.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_predictions)  # computed once, reported twice

print("\nLogistic Regression Accuracy:", lr_accuracy)
print("ACCURACY FOR LOGISTIC REGRESSION IN % : ", lr_accuracy * 100, "%")

# Per-class precision/recall/F1, followed by the raw confusion counts
# (rows are true labels, columns are predicted labels).
print("Logistic Regression Classification Report:", classification_report(y_test, lr_predictions), sep="\n")
print("Logistic Regression Confusion Matrix:", confusion_matrix(y_test, lr_predictions), sep="\n")


Logistic Regression Accuracy: 0.957847533632287
ACCURACY FOR LOGISTIC REGRESSION IN % :  95.7847533632287 %
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.97      0.71      0.82       150

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.90      1115
weighted avg       0.96      0.96      0.95      1115

Logistic Regression Confusion Matrix:
[[962   3]
 [ 44 106]]


In [None]:
# Fit a support vector classifier (default settings) on the same TF-IDF
# training features.
svm_classifier = SVC()
svm_classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Score the SVM model on the held-out messages.
svm_predictions = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_predictions)  # computed once, reported twice

print("\nSVM Accuracy:", svm_accuracy)
print("ACCURACY FOR SVM IN % : ", svm_accuracy * 100, "%")

# Per-class precision/recall/F1, followed by the raw confusion counts
# (rows are true labels, columns are predicted labels).
print("SVM Classification Report:", classification_report(y_test, svm_predictions), sep="\n")
print("SVM Confusion Matrix:", confusion_matrix(y_test, svm_predictions), sep="\n")


SVM Accuracy: 0.9766816143497757
ACCURACY FOR SVM IN % :  97.66816143497758 %
SVM Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       0.99      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

SVM Confusion Matrix:
[[964   1]
 [ 25 125]]


In [None]:
# Persist the fitted TF-IDF vectorizer alongside the classifier: the SVM was
# trained on features produced by this exact vectorizer, so a saved model
# without it cannot score raw text in a fresh session.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Persist the trained SVM. Kept as the last expression so the cell still
# displays the list of files written for the model.
joblib.dump(svm_classifier, 'svm_model.joblib')

['svm_model.joblib']

In [None]:
# Restore the SVM from disk, replacing the in-memory instance.
# joblib.load is pickle-based, so only load files produced by a trusted
# source (here: the dump cell of this notebook).
svm_classifier = joblib.load(filename='svm_model.joblib')

In [None]:
def predict_spam_or_ham(message, vectorizer=None, classifier=None):
    """Classify a single SMS message with a fitted vectorizer and model.

    Parameters
    ----------
    message : str
        Raw text of the SMS to classify.
    vectorizer : object, optional
        Fitted text vectorizer exposing ``transform``. When omitted, the
        notebook-level ``tfidf_vectorizer`` is used.
    classifier : object, optional
        Fitted model exposing ``predict``. When omitted, the notebook-level
        ``svm_classifier`` is used. Pass ``nb_classifier`` or
        ``lr_classifier`` to score with one of the other trained models.

    Returns
    -------
    The predicted label for ``message`` (the first and only element of the
    classifier's prediction array).
    """
    # Fall back to the notebook's fitted objects so existing one-argument
    # calls keep working unchanged.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if classifier is None:
        classifier = svm_classifier

    # The vectorizer works on a collection of documents, so wrap the single
    # message in a list before transforming it.
    message_tfidf = vectorizer.transform([message])

    return classifier.predict(message_tfidf)[0]

In [None]:
# Interactive check: read one message from the user, classify it with the
# SVM-backed helper, and report the predicted label.
user_input = input("Enter an SMS message: ")
svm_result = predict_spam_or_ham(message=user_input)
print(f"SVM predicted result: {svm_result}")

Enter an SMS message: England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ﾌｼ1.20 POBOXox36504W45WQ 16+
SVM predicted result: spam
