## practical 8

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

# Download NLTK stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the dataset
# If you have the dataset as a text file named 'SMSSpamCollection', use the following:
# df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

# Or download it using pandas directly (assuming it is hosted online).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
df = pd.read_csv('SMSSpamCollection.csv', sep='\t', names=['label', 'message'])

# Display first few rows
print(df.head())

# Define stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess the text
def preprocess_text(text):
# Convert text to lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Remove numbers
    text = re.sub(r'\d', ' ', text)
    # Remove single characters

    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply preprocessing to the messages
df['message'] = df['message'].apply(preprocess_text)

# Display some processed messages
print(df.head())

# Convert labels to binary (1 for spam, 0 for ham)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Feature extraction using TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['message']).toarray()
y = df['label'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the test set

y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"\nClassification Report:\n{report}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rc880\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
  label                                            message
0   ham  go jurong point crazy available bugis great wo...
1   ham                              ok lar joking wif oni
2  spam  free entry wkly comp win fa cup final tkts st ...
3   ham                  u dun say early hor c already say
4   ham             nah think goes usf lives around though
Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1448
        spam       1.00      0.87      0.93       224

    accuracy                           0.98      1672
   macro avg       0.99      0.93  