In [None]:
pip install pandas scikit-learn numpy

In [None]:
import pandas as pd

# Read the CSV (decoded as latin-1) and keep only the two columns used below.
df = pd.read_csv('/spam.csv', encoding='latin-1')[['v1', 'v2']]

# v1 holds the class label and v2 the message text; give them readable names.
df.columns = ['label', 'message']

# Preview the first five rows of the trimmed frame.
print(df.head(5))

# Show how many messages fall under each label value.
print(df.label.value_counts())

In [None]:
from sklearn.model_selection import train_test_split

# Encode the text labels as integers: spam -> 1, ham -> 0.
# Guarded so the cell is idempotent: Series.map() turns any value missing from
# the mapping into NaN, so re-running it on the already-encoded column would
# wipe out every label.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Fail early with a clear message if any label was not 'spam' or 'ham'
# (map() leaves NaN for those rows) rather than passing NaN on to the split.
if df['label'].isna().any():
    raise ValueError(
        "Column 'label' has missing or unexpected values; expected only 'spam'/'ham'"
    )

# Hold out 20% of the messages for testing. The fixed random_state makes the
# split reproducible, and stratifying on the label keeps the spam/ham ratio
# the same in the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: English stop words are dropped and the vocabulary is
# capped at 5000 features.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the held-out messages.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report the shape of each resulting feature matrix.
print(f"Shape of TF-IDF training data: {X_train_tfidf.shape}")
print(f"Shape of TF-IDF testing data: {X_test_tfidf.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build a logistic-regression classifier with default settings and fit it on
# the TF-IDF training features.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages.
y_pred = model.predict(X_test_tfidf)

# Report accuracy, the per-class classification report and the confusion
# matrix, each under its own heading.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

In [None]:
def classify_message(model, vectorizer, message):
    """Classify a single message as "spam" or "ham".

    Args:
        model: Object with a ``predict`` method; called on the transformed
            message.
        vectorizer: Object with a ``transform`` method; called with a
            one-element list containing ``message``.
        message: The message to classify.

    Returns:
        "spam" if the first predicted value equals 1, otherwise "ham".
    """
    # transform() receives a list, so wrap the single message.
    features = vectorizer.transform([message])

    # Only one message was passed in, so only the first prediction is used.
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "spam"
    return "ham"

# Try the fitted model and vectorizer on a hand-written example message.
custom_message = "Hi you won 3000$ in lottery"
result = classify_message(
    model=model,
    vectorizer=tfidf,
    message=custom_message,
)
print(f"The message is classified as: {result}")