In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sanity check: this line only runs if every import above resolved without error.
print("All libraries imported successfully!")

All libraries imported successfully!


In [5]:
# Location of the raw spam/ham CSV used by the rest of the notebook.
# NOTE(review): the sample rows read like short SMS-style texts rather than
# emails -- confirm the dataset's provenance before describing it as "email" data.
data_url = "https://raw.githubusercontent.com/Apaulgithub/oibsip_taskno4/main/spam.csv"

# Only the download/parse step is guarded. The friendly hint is still printed,
# but the exception is re-raised: swallowing it would leave `data` undefined and
# make every later cell fail with an unrelated NameError.
try:
    # encoding='latin-1' because this file has special characters
    raw_data = pd.read_csv(data_url, encoding='latin-1')
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please check the URL or your internet connection.")
    raise

print("Email data loaded successfully!")

# --- Clean up the data ---
# The cleaning below keeps the first two columns (label, text); fail loudly if
# the file does not have them instead of mislabelling whatever is there.
if raw_data.shape[1] < 2:
    raise ValueError(
        f"Expected at least 2 columns in {data_url}, found {raw_data.shape[1]}"
    )

# This dataset has extra, empty columns: keep only the first two.
# .copy() gives an independent frame so columns added in later cells do not
# write into a slice of `raw_data`.
data = raw_data.iloc[:, :2].copy()

# Rename the columns to be clear
data.columns = ['Category', 'Message']

# Display the first 5 rows to see the new structure
print("\n--- Data Head (Cleaned) ---")
print(data.head())

# Check the column names
print("\n--- Column Names (Cleaned) ---")
print(data.columns)

Email data loaded successfully!

--- Data Head (Cleaned) ---
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

--- Column Names (Cleaned) ---
Index(['Category', 'Message'], dtype='object')


In [6]:
# 1. Encode the text labels numerically: 'ham' -> 0, 'spam' -> 1
label_map = {'ham': 0, 'spam': 1}
label_num = data['Category'].map(label_map)

# Series.map() yields NaN for any value that is not a key of label_map.
# Catch that here with a clear message instead of letting NaN labels reach
# the classifier.
unmapped = label_num.isna()
if unmapped.any():
    bad_values = sorted(data.loc[unmapped, 'Category'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'ham' or 'spam'): {bad_values}"
    )

data['label_num'] = label_num

# 2. Define our features (X) and target (y)
X = data['Message']  # The raw message text
y = data['label_num']  # The 0/1 label created above

# 3. Split the data
# 80% for training, 20% for testing.
# random_state=42 makes the shuffle reproducible across runs.
# NOTE(review): the classes are imbalanced (far more ham than spam); consider
# passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data has been prepared and split:")
print(f"Total emails: {len(data)}")
print(f"Training emails: {len(X_train)}")
print(f"Testing emails: {len(X_test)}")

Data has been prepared and split:
Total emails: 5572
Training emails: 4457
Testing emails: 1115


In [8]:
# Build the TF-IDF vectorizer.
# stop_words='english' drops common English words such as 'and', 'the', 'is'.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Training messages: learn the vocabulary from them AND encode them in one call.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Test messages: encode only, reusing the vocabulary learned from the training
# set so no information from the held-out data leaks into the features.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Email text has been vectorized into numerical features.")
train_shape, test_shape = X_train_tfidf.shape, X_test_tfidf.shape
print(f"Shape of training features (rows, words): {train_shape}")
print(f"Shape of testing features (rows, words): {test_shape}")

Email text has been vectorized into numerical features.
Shape of training features (rows, words): (4457, 7472)
Shape of testing features (rows, words): (1115, 7472)


In [9]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF training features.
print("Training the model...")

# fit() returns the estimator itself, so creation and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)

print("Model training complete!")

Training the model...
Model training complete!


In [10]:
# Use the trained model to make predictions on the unseen test data
y_pred = model.predict(X_test_tfidf)

# Fix the class order explicitly (0 = ham, 1 = spam). Without `labels`, the
# metrics below infer the classes from the data; if only one class shows up in
# y_test/y_pred, the two row/column names and target_names no longer match the
# result and the cell raises.
class_labels = [0, 1]

# --- 1. Accuracy ---
accuracy = accuracy_score(y_test, y_pred)
print("--- Model Evaluation ---")
print(f"Accuracy: {accuracy * 100:.2f}%\n")


# --- 2. Confusion Matrix ---
# Rows are the actual class, columns are the predicted class.
print("--- Confusion Matrix ---")
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Wrap in a DataFrame so the rows and columns carry readable names
cm_df = pd.DataFrame(cm,
                     index=['Actual Ham (0)', 'Actual Spam (1)'],
                     columns=['Predicted Ham (0)', 'Predicted Spam (1)'])
print(cm_df)
print("\n")


# --- 3. Classification Report ---
# Precision, recall, and f1-score for each class
print("--- Classification Report ---")
report = classification_report(y_test, y_pred,
                               labels=class_labels,
                               target_names=['Ham (0)', 'Spam (1)'])
print(report)

--- Model Evaluation ---
Accuracy: 96.68%

--- Confusion Matrix ---
                 Predicted Ham (0)  Predicted Spam (1)
Actual Ham (0)                 965                   0
Actual Spam (1)                 37                 113


--- Classification Report ---
              precision    recall  f1-score   support

     Ham (0)       0.96      1.00      0.98       965
    Spam (1)       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [11]:
# --- Step 7: Test on New, Custom Emails ---

print("--- Testing new custom messages ---")

# Hand-written sample messages; edit these freely to probe the classifier.
new_emails = [
    "Congratulations! You've won a $1,000 cash prize. Click here to claim now.",
    "Hey, are you available for a meeting tomorrow at 10 AM?",
    "URGENT: Your account has been suspended. Please click this link to verify your identity.",
    "Did you see the report I sent over? Let me know your feedback.",
    "Free entry, exclusive offer, win money today!"
]

# Encode with the already-fitted vectorizer (transform only, no refit) ...
new_emails_tfidf = tfidf_vectorizer.transform(new_emails)

# ... then classify with the already-trained model.
new_predictions = model.predict(new_emails_tfidf)

# Print each message next to its predicted class: 1 -> Spam, anything else -> Ham.
for message, prediction in zip(new_emails, new_predictions):
    if prediction == 1:
        label = 'Spam'
    else:
        label = 'Ham'
    print(f"\nMessage: '{message}'\nPredicted: **{label}**")

--- Testing new custom messages ---

Message: 'Congratulations! You've won a $1,000 cash prize. Click here to claim now.'
Predicted: **Spam**

Message: 'Hey, are you available for a meeting tomorrow at 10 AM?'
Predicted: **Ham**

Message: 'URGENT: Your account has been suspended. Please click this link to verify your identity.'
Predicted: **Spam**

Message: 'Did you see the report I sent over? Let me know your feedback.'
Predicted: **Ham**

Message: 'Free entry, exclusive offer, win money today!'
Predicted: **Ham**
