<a href="https://colab.research.google.com/github/sasi-kalluri/oasis/blob/main/sms_spam_detection_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Location of the spam dataset CSV (a /content path, i.e. presumably the Colab runtime).
data_path = '/content/spam.csv'

# Read the CSV with an explicit latin-1 encoding.
email_data = pd.read_csv(filepath_or_buffer=data_path, encoding='latin-1')

# Show the first five rows as plain text to inspect the raw columns.
print(email_data.head(n=5))


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
# Keep only the label and text columns, give them descriptive names, and encode
# the label as an integer (spam -> 1, ham -> 0).
#
# The frame is built as a single chain: `rename` and `assign` each return a new
# DataFrame, so the label column is never written onto a slice of the original
# frame. That avoids the "value is trying to be set on a copy of a slice"
# warning that column assignment on `email_data[['v1', 'v2']]` triggers.
email_data = (
    email_data[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'MessageContent'})
    .assign(Label=lambda frame: frame['Label'].map({'spam': 1, 'ham': 0}))
)

# Any label other than 'spam'/'ham' maps to NaN, so a non-zero count for
# `Label` here would reveal unexpected label values as well as missing data.
print(email_data.isnull().sum())


Label             0
MessageContent    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  email_data['Label'] = email_data['Label'].map({'spam': 1, 'ham': 0})


In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the message texts; targets are the corresponding labels.
X_data = email_data.loc[:, 'MessageContent']
y_data = email_data.loc[:, 'Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Report how many samples landed in each partition.
print(f"Training samples: {X_train_set.shape[0]}")
print(f"Testing samples: {X_test_set.shape[0]}")


Training samples: 4457
Testing samples: 1115


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF featurizer: drops English stop words and ignores terms that appear in
# more than 70% of the training documents.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training texts only, then reuse
# that fitted vocabulary to encode the test texts.
X_train_tfidf_set = vectorizer.fit_transform(X_train_set)
X_test_tfidf_set = vectorizer.transform(X_test_set)

# Both matrices share the same number of columns (the learned vocabulary size).
print(f"TF-IDF Training features shape: {X_train_tfidf_set.shape}")
print(f"TF-IDF Testing features shape: {X_test_tfidf_set.shape}")


TF-IDF Training features shape: (4457, 7472)
TF-IDF Testing features shape: (1115, 7472)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train a multinomial Naive Bayes classifier on the TF-IDF training features
# (`fit` returns the estimator itself, so construction and fitting are chained).
spam_detector_model = MultinomialNB().fit(X_train_tfidf_set, y_train_set)

# Predict labels for the held-out messages.
y_predicted = spam_detector_model.predict(X_test_tfidf_set)

# Overall accuracy, followed by per-class precision / recall / F1.
model_accuracy = accuracy_score(y_true=y_test_set, y_pred=y_predicted)
print(f"Model Accuracy: {model_accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_true=y_test_set, y_pred=y_predicted))


Model Accuracy: 96.68%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [None]:
import joblib

# Persist the fitted classifier and the fitted vectorizer to the working
# directory; both files are needed to classify new messages later.
joblib.dump(value=spam_detector_model, filename='spam_classifier_model.pkl')
# Kept as the cell's last expression so the list of written file names is displayed.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')



['tfidf_vectorizer.pkl']

In [None]:
def classify_new_email(email_content, model=None, text_vectorizer=None):
    """Classify a single message as 'Spam' or 'Not Spam'.

    Parameters
    ----------
    email_content : str
        Raw text of the message to classify.
    model : object, optional
        Fitted classifier exposing ``predict``. When omitted, the notebook-level
        ``spam_detector_model`` is used.
    text_vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        notebook-level ``vectorizer`` is used.

    Returns
    -------
    str
        'Spam' when the predicted label equals 1, otherwise 'Not Spam'.
    """
    # Fall back to the objects trained earlier in the notebook so existing
    # one-argument calls keep working; explicit arguments allow reusing the
    # function with artifacts reloaded from disk.
    if model is None:
        model = spam_detector_model
    if text_vectorizer is None:
        text_vectorizer = vectorizer

    # `transform` takes a collection of documents, so wrap the single message in a list.
    email_tfidf_transformed = text_vectorizer.transform([email_content])
    prediction_result = model.predict(email_tfidf_transformed)

    # One document in, one prediction out: inspect the first (only) label.
    return 'Spam' if prediction_result[0] == 1 else 'Not Spam'

# Try the classifier on one hand-written example message.
test_email = "You've been selected for a free vacation! Click to claim."
classification_result = classify_new_email(email_content=test_email)

# Report the predicted category for the example.
print(f"The email is: {classification_result}")


The email is: Spam
