In [1]:
# Imports necessary libraries for data manipulation (pandas, numpy), plotting (matplotlib, seaborn), and enables inline plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Reads the SMS spam collection dataset into a pandas DataFrame.
msg = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', names =[ "label" ,"message"  ])

In [3]:
# Displays the first few rows of the loaded DataFrame.
msg.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Imports libraries for regular expressions and natural language processing.
import re
import nltk

In [5]:
# Imports necessary components from nltk for text preprocessing and initializes the lemmatizer.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [6]:
# Downloads the stopwords corpus from nltk data (output shows it's already up-to-date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Downloads the wordnet corpus from nltk data (output shows it's already up-to-date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
# Preprocesses the text data by cleaning, tokenizing, removing stopwords, and lemmatizing.
corpus =[]
for i in range(0 ,len(msg)):
  review = re.sub('[^a-zA-Z]', ' ',msg['message'][i])
  review = review.lower()
  review = review.split()
  review =[lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [9]:
# Displays the list containing the preprocessed text messages.
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

In [10]:
# Converts the 'label' column to one-hot encoded format.
y = pd.get_dummies(msg['label'])
# Selects the first column of the one-hot encoded labels and converts it to a NumPy array.
y=y.iloc[:,0].values

In [11]:
# Splits the feature data (x) and labels (y) into training and testing sets.
from sklearn.model_selection import train_test_split
x_train ,x_test , y_train , y_test = train_test_split(corpus,y,test_size=0.30 , random_state=420)

In [12]:
# Creates a TfidfVectorizer with a maximum of 100 features.
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf = TfidfVectorizer(max_features=100)

In [13]:
# Fits and transforms the corpus using the TfidfVectorizer.
x_train =Tfidf.fit_transform(x_train).toarray()
x_test = Tfidf.transform(x_test).toarray()

In [14]:
# Displays the vocabulary learned by the TfidfVectorizer.
Tfidf.vocabulary_

{'tomorrow': np.int64(83),
 'lt': np.int64(42),
 'gt': np.int64(27),
 'hi': np.int64(30),
 'would': np.int64(95),
 'like': np.int64(39),
 'new': np.int64(53),
 'reply': np.int64(64),
 'www': np.int64(96),
 'call': np.int64(7),
 'see': np.int64(68),
 'yes': np.int64(99),
 'week': np.int64(91),
 'text': np.int64(77),
 'way': np.int64(90),
 'still': np.int64(73),
 'free': np.int64(18),
 'back': np.int64(6),
 'thanks': np.int64(78),
 'think': np.int64(80),
 'send': np.int64(69),
 'sorry': np.int64(72),
 'service': np.int64(71),
 'mobile': np.int64(48),
 'go': np.int64(22),
 'min': np.int64(46),
 'txt': np.int64(85),
 'hey': np.int64(29),
 'later': np.int64(36),
 'want': np.int64(88),
 'please': np.int64(60),
 'know': np.int64(35),
 'going': np.int64(23),
 'give': np.int64(21),
 'oh': np.int64(56),
 'great': np.int64(26),
 'number': np.int64(55),
 'ok': np.int64(57),
 'da': np.int64(13),
 'good': np.int64(24),
 'prize': np.int64(62),
 'claim': np.int64(9),
 'got': np.int64(25),
 'really': n

In [15]:
# Sets NumPy print options for better array display.
np.set_printoptions(edgeitems=30, linewidth =100000,
                    formatter=dict(float = lambda x: "%.3g" % x))
x_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.536, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.653, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.638, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.348, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 

In [16]:
# Imports and trains a Multinomial Naive Bayes classifier.
from sklearn.naive_bayes import MultinomialNB
detect = MultinomialNB().fit(x_train , y_train)
detect

In [17]:
# Makes predictions on the test data using the trained model.
y_pred = detect.predict(x_test)

In [18]:
# Imports metrics for model evaluation.
from sklearn.metrics import accuracy_score , classification_report

In [19]:
# Prints the accuracy score of the model.
print(accuracy_score(y_test , y_pred))

0.9533492822966507


In [20]:
# Prints the classification report of the model.
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

       False       0.91      0.74      0.82       239
        True       0.96      0.99      0.97      1433

    accuracy                           0.95      1672
   macro avg       0.94      0.87      0.90      1672
weighted avg       0.95      0.95      0.95      1672



# SMS Spam Detection

This notebook demonstrates a basic SMS spam detection model using Python and scikit-learn.

## Dataset

The model is trained on the [SMS Spam Collection dataset](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection). The dataset contains SMS messages labeled as either "ham" (not spam) or "spam".

## Notebook Steps

1.  **Load Data:** Reads the dataset into a pandas DataFrame.
2.  **Text Preprocessing:** Cleans and prepares the text data for model training by:
    *   Removing special characters and numbers.
    *   Converting text to lowercase.
    *   Tokenizing the text.
    *   Removing stopwords.
    *   Lemmatizing words.
3.  **Feature Extraction:** Converts the preprocessed text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).
4.  **Data Splitting:** Splits the data into training and testing sets.
5.  **Model Training:** Trains a Multinomial Naive Bayes classifier.
6.  **Model Evaluation:** Evaluates the model's performance using accuracy and a classification report.

## Libraries Used

*   `pandas`: For data manipulation.
*   `numpy`: For numerical operations.
*   `matplotlib` and `seaborn`: For data visualization (though not extensively used in this specific notebook).
*   `re`: For regular expressions (used in text cleaning).
*   `nltk`: For natural language processing tasks like tokenization, stopword removal, and lemmatization.
*   `sklearn`: For machine learning tasks like TF-IDF vectorization, data splitting, and model training/evaluation.

## How to Use

1.  Ensure you have the SMS Spam Collection dataset (`SMSSpamCollection.txt`) in the `/content/` directory.
2.  Run the cells in the notebook sequentially.
3.  The output of the last cells will show the accuracy and classification report of the trained spam detection model.