# Assignment No : 06 (Naïve Bayes classification)

## PART I :-

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Toy dataset: two numeric features and a binary purchase label.
data = {
    'Age': [22, 25, 47, 52, 46, 56, 60, 62, 23, 24],
    'Salary': [20000, 25000, 50000, 52000, 48000, 60000, 80000, 81000, 21000, 22000],
    'Purchased': [0, 0, 1, 1, 1, 1, 1, 1, 0, 0]  # 1 = Purchased, 0 = Not Purchased
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Features (X) and Target (y)
X = df[['Age', 'Salary']]
y = df['Purchased']

# Every label value the metrics below should report on, whether or not it
# happens to occur in the test fold.
class_labels = [0, 1]

# Split data (80% training, 20% testing).
# stratify=y keeps both classes in the 2-row test fold; a plain random split
# of 10 rows can put a single class there, which makes the confusion matrix
# 1x1 and the reported scores meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the Gaussian Naïve Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Confusion Matrix (explicit labels so it is always 2x2)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)

# Classification Report; zero_division=0 reports 0.0 instead of emitting a
# warning when a class has no predicted or true samples.
report = classification_report(
    y_test, y_pred, labels=class_labels, zero_division=0
)

# Print results
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)


Confusion Matrix:
 [[2]]
Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2





## PART II :-

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Sample dataset (placeholder - swap in a real dataset).
# Messages and their labels are kept as two parallel lists.
sample_texts = [
    "Win a free iPhone now!",
    "Limited time offer, buy now!",
    "Meeting at 10 AM, please join.",
    "Get a discount on your next purchase!",
    "Let's catch up over coffee.",
    "Your loan is pre-approved. Click here!",
]
sample_labels = ['spam', 'spam', 'ham', 'spam', 'ham', 'spam']  # spam or ham (not spam)

data = {'Text': sample_texts, 'Label': sample_labels}

# Build the DataFrame used by the rest of the cell
df = pd.DataFrame(data)

# Fetch the NLTK stop-word corpus, then prepare the stemmer and the
# English stop-word set used during text cleaning.
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Text-cleaning helper
def preprocess_text(text):
    """Return *text* as a space-joined string of stemmed, non-stop-word tokens.

    The text is lowercased, every non-word character is replaced by a space,
    runs of whitespace are collapsed, and the result is split into tokens.
    Tokens found in the module-level ``stop_words`` set are dropped; the rest
    are stemmed with the module-level ``stemmer``.
    """
    normalized = re.sub(r'\W', ' ', text.lower())  # special characters -> spaces
    normalized = re.sub(r'\s+', ' ', normalized)  # squeeze repeated whitespace

    kept_stems = []
    for token in normalized.split():
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

# Apply preprocessing
df['Cleaned_Text'] = df['Text'].apply(preprocess_text)
y = df['Label'].map({'spam': 1, 'ham': 0})  # Convert labels to 1 (Spam) and 0 (Ham)

# Every label value the metrics below should report on, whether or not it
# happens to occur in the test fold.
class_labels = [0, 1]

# Split Data (80% Train, 20% Test) BEFORE vectorizing.
# stratify=y keeps both spam and ham in the test fold; with only six messages
# a plain random split can leave one class out of it entirely.
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned_Text'], y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization: learn vocabulary and IDF weights from the training
# messages only, then reuse them for the test messages, so that nothing from
# the test fold leaks into the features the model is trained on.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on Test Data
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Confusion Matrix (explicit labels so it is always 2x2)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Classification Report; zero_division=0 reports 0.0 instead of emitting a
# warning when a class has no predicted or true samples.
report = classification_report(
    y_test, y_pred, labels=class_labels, zero_division=0
)

# Print Results
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Confusion Matrix:
 [[0 0]
 [2 0]]
Accuracy: 0.00
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
