In [2]:
import pandas as pd
import re
import string

df = pd.read_excel("../Spam_Email_Detection.xlsx")

# Check column names
print("Columns:", df.columns)

# Rename to standard names (adjust if needed)
df = df.rename(columns={
    df.columns[0]: 'text',
    df.columns[1]: 'label'
})

df = df.dropna(subset=['label'])
df['text'] = df['text'].fillna("")
df = df.drop_duplicates()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text'] = df['text'].apply(clean_text)

print("Cleaned Dataset Shape:", df.shape)

Columns: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
Cleaned Dataset Shape: (5163, 5)


In [3]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Feature (text) and target (label)
X = df['text']
y = df['label']

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    stop_words='english',   # remove common English words
    max_features=5000       # keep top 5000 important words
)

# Apply TF-IDF encoding
X_tfidf = tfidf.fit_transform(X)

print("TF-IDF Feature Matrix Shape:", X_tfidf)

TF-IDF Feature Matrix Shape: <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5163 stored elements and shape (5163, 2)>
  Coords	Values
  (0, 0)	1.0
  (1, 0)	1.0
  (2, 1)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (5, 1)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 1)	1.0
  (9, 1)	1.0
  (10, 0)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 1)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 0)	1.0
  (19, 1)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (5138, 0)	1.0
  (5139, 0)	1.0
  (5140, 1)	1.0
  (5141, 0)	1.0
  (5142, 0)	1.0
  (5143, 0)	1.0
  (5144, 0)	1.0
  (5145, 0)	1.0
  (5146, 0)	1.0
  (5147, 0)	1.0
  (5148, 0)	1.0
  (5149, 0)	1.0
  (5150, 0)	1.0
  (5151, 0)	1.0
  (5152, 0)	1.0
  (5153, 0)	1.0
  (5154, 0)	1.0
  (5155, 0)	1.0
  (5156, 0)	1.0
  (5157, 1)	1.0
  (5158, 1)	1.0
  (5159, 0)	1.0
  (5160, 0)	1.0
  (5161, 0)	1.0
  (5162, 0)	1.0


In [5]:
from sklearn.preprocessing import StandardScaler

# StandardScaler for sparse TF-IDF matrix
scaler = StandardScaler(with_mean=False)

# Apply feature scaling
X_scaled = scaler.fit_transform(X_tfidf)

print("Scaled Feature Matrix Shape:", X_scaled)


Scaled Feature Matrix Shape: <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5163 stored elements and shape (5163, 2)>
  Coords	Values
  (0, 0)	3.020459508649842
  (1, 0)	3.020459508649842
  (2, 1)	3.0204595086498536
  (3, 0)	3.020459508649842
  (4, 0)	3.020459508649842
  (5, 1)	3.0204595086498536
  (6, 0)	3.020459508649842
  (7, 0)	3.020459508649842
  (8, 1)	3.0204595086498536
  (9, 1)	3.0204595086498536
  (10, 0)	3.020459508649842
  (11, 1)	3.0204595086498536
  (12, 1)	3.0204595086498536
  (13, 0)	3.020459508649842
  (14, 0)	3.020459508649842
  (15, 1)	3.0204595086498536
  (16, 0)	3.020459508649842
  (17, 0)	3.020459508649842
  (18, 0)	3.020459508649842
  (19, 1)	3.0204595086498536
  (20, 0)	3.020459508649842
  (21, 0)	3.020459508649842
  (22, 0)	3.020459508649842
  (23, 0)	3.020459508649842
  (24, 0)	3.020459508649842
  :	:
  (5138, 0)	3.020459508649842
  (5139, 0)	3.020459508649842
  (5140, 1)	3.0204595086498536
  (5141, 0)	3.020459508649842
  (5142, 0)	3.020459508649