In [24]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
df = pd.read_csv("/content/emails.csv")

X = df["text"].astype(str).values
y = df["spam"].values   # 0 or 1 labels

In [26]:
vectorizer = TfidfVectorizer()
X_vectors = vectorizer.fit_transform(X)

In [27]:
# Split data (150 train, 50 test)
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors, y, test_size=50, train_size=150, random_state=42, stratify=y
)

In [28]:
# Train model (LinearRegression)
model = LinearRegression()
model.fit(X_train, y_train)

In [29]:
sample_emails = X_test   # all 50 test emails

In [30]:
outputs = model.predict(sample_emails)

print("Categories: ['not spam', 'spam']")

for i, out in enumerate(outputs):
    prediction = "Spam" if out > 0.5 else "Not Spam"
    print(f"Email {i+1}: Output = {out:.4f} -> Prediction: {prediction}")

print("\n" + "="*60 + "\n")

Categories: ['not spam', 'spam']
Email 1: Output = 0.4296 -> Prediction: Not Spam
Email 2: Output = 0.5244 -> Prediction: Spam
Email 3: Output = 0.3081 -> Prediction: Not Spam
Email 4: Output = 0.4619 -> Prediction: Not Spam
Email 5: Output = 0.9636 -> Prediction: Spam
Email 6: Output = 0.2375 -> Prediction: Not Spam
Email 7: Output = 0.2444 -> Prediction: Not Spam
Email 8: Output = 0.0437 -> Prediction: Not Spam
Email 9: Output = 0.1528 -> Prediction: Not Spam
Email 10: Output = 0.5583 -> Prediction: Spam
Email 11: Output = 0.2519 -> Prediction: Not Spam
Email 12: Output = 0.2840 -> Prediction: Not Spam
Email 13: Output = 0.1053 -> Prediction: Not Spam
Email 14: Output = 0.0344 -> Prediction: Not Spam
Email 15: Output = 0.1132 -> Prediction: Not Spam
Email 16: Output = 0.5418 -> Prediction: Spam
Email 17: Output = 0.3530 -> Prediction: Not Spam
Email 18: Output = 0.2247 -> Prediction: Not Spam
Email 19: Output = 0.2965 -> Prediction: Not Spam
Email 20: Output = 0.1872 -> Prediction: N