In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re

# Step 1: Load and Explore the Dataset
try:
    file_path = r'C:\Users\study\Desktop\EXTRA\CIPHERBYTE TECHNOLOGY\Spam Email Detection.xlsx'
    data = pd.read_excel(file_path)
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

print("First few rows of the dataset:")
print(data.head())

print("Column names in the dataset:")
print(data.columns)

# Step 2: Preprocess the Data
print("Missing values in each column:")
print(data.isnull().sum())

data = data.dropna()

data.columns = data.columns.str.strip()

email_column = 'v2'
label_column = 'v1'

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return text

data[email_column] = data[email_column].apply(clean_text)

# Encode labels
data[label_column] = data[label_column].map({'ham': 0, 'spam': 1})

# Step 3: Split the Data
X = data[email_column]
y = data[label_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train the Model
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Step 5: Evaluate the Model
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Step 6: Implement the Classification
def classify_email(email_text):
    email_text_clean = clean_text(email_text)
    email_text_tfidf = vectorizer.transform([email_text_clean])
    prediction = model.predict(email_text_tfidf)
    return 'spam' if prediction[0] == 1 else 'ham'

# Example usage
new_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print(f'The email is classified as: {classify_email(new_email)}')


First few rows of the dataset:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Column names in the dataset:
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
Missing values in each column:
v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64
Accuracy: 1.00
Precision: 0.00
Recall: 0.00
F1-score: 0.00
The email is classified as: ham


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
