In [7]:
import pandas as pd

df = pd.read_csv("spam.csv", sep='\t', header=None, names=['label', 'message'])

print(df.head())
print(df.shape)

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


In [8]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    # Tokenize and remove stopwords
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

df['clean_message'] = df['message'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\REHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_message'])
y = df['label'] # Our target (spam or ham)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [12]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.81      0.90       149

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [13]:
import pickle

# Save the trained model
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer (essential to transform new input)
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

In [16]:
def predict_spam(sample_message):
    # 1. Preprocess the input string
    cleaned_text = preprocess_text(sample_message)
    
    # 2. Transform using the vectorizer (use .transform, NOT .fit_transform)
    vectorized_input = vectorizer.transform([cleaned_text])
    
    # 3. Predict
    prediction = model.predict(vectorized_input)[0]
    
    # 4. Get Probability (Optional but cool)
    # This shows how confident the model is
    proba = model.predict_proba(vectorized_input)
    
    return prediction, proba

# --- TEST IT HERE ---
test_mail ="Can you send me the invoice for the $500 payment? Thanks."

result, confidence = predict_spam(test_mail)

print(f"Message: {test_mail}")
print(f"Result: {result.upper()}")
print(f"Confidence (Ham vs Spam): {confidence[0]}")

Message: Can you send me the invoice for the $500 payment? Thanks.
Result: HAM
Confidence (Ham vs Spam): [0.92288316 0.07711684]


In [1]:
requirements = """
streamlit
pandas
scikit-learn
pypdf
python-docx
wordcloud
matplotlib
"""

with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(requirements.strip())
    
print("requirements.txt created successfully with UTF-8 encoding!")

requirements.txt created successfully with UTF-8 encoding!
