In [1]:
#Importing the required libraries
import pandas as pd  
import numpy as np 
import re 
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
data={
    'text':[
        "Win $1000 now!!!",
        "Dear user, your email has been selected for a cash reward of $1,000,000. Reply with your details!",
        "Meeting at 5 PM.",
        "Exclusive offer! Limited time!", 
        "Lunch at my place?", 
        "Click here to claim your prize!!!", 
        "Congratulations! You've won a FREE iPhone! Click here to claim your prize",
        "Your Amazon order #12345 has been shipped and will arrive soon.",
        "Let's catch up soon.",
        "URGENT! Your bank account has been compromised. Verify your details immediately",
        "Earn $5000 per week working from home! No experience needed. Sign up now!",
        "Reminder: Your dentist appointment is scheduled for March 20 at 10 AM.",
        "Hi Mom, I'll be home by 6 PM. Let me know if you need anything from the store.",
        "Limited-time offer! Get 90% off on weight loss pills. Order now before it's too late!",
        "Project meeting at 3 PM today. Let’s discuss the final updates.",
        ],
    'label':[1,1,0,1,0,1,1,0,0,1,1,0,0,1,0] #1 = spam , 0 = ham
}

df=pd.DataFrame(data)
print(df)

                                                 text  label
0                                    Win $1000 now!!!      1
1   Dear user, your email has been selected for a ...      1
2                                    Meeting at 5 PM.      0
3                      Exclusive offer! Limited time!      1
4                                  Lunch at my place?      0
5                   Click here to claim your prize!!!      1
6   Congratulations! You've won a FREE iPhone! Cli...      1
7   Your Amazon order #12345 has been shipped and ...      0
8                                Let's catch up soon.      0
9   URGENT! Your bank account has been compromised...      1
10  Earn $5000 per week working from home! No expe...      1
11  Reminder: Your dentist appointment is schedule...      0
12  Hi Mom, I'll be home by 6 PM. Let me know if y...      0
13  Limited-time offer! Get 90% off on weight loss...      1
14  Project meeting at 3 PM today. Let’s discuss t...      0


# Feature Engineering
### Email length (longer emails are often spam)
### Number of capital letters (spam messages often use excessive capitalization)
### Number of special characters ($, !, @, etc.)
### Number of links (spam often contains multiple links)
### Presence of spammy words (like "win", "free", "click", "urgent")

In [3]:
#feature extraction using a function
def extract_feature(email):
    return[
            len(email), #return the length of the email
            sum(c.isupper() for c in email)/len(email), #uppercase ratio
            sum(c.isdigit() for c in email), #return the no. of digit
            len(re.findall(r'[?!@.$%]',email)), #speccial character count
            int(bool(re.search(r'\b(win|free|offer|prize|urgent|earn|free|click|congratulations)\b',email, re.IGNORECASE))),
            int(bool(re.search(r'(https?://|www\.)\S+', email, re.IGNORECASE)))
    ]
x=np.array([extract_feature(email)for email in df['text']])
y=df['label']

#spliting the data into training and testing set
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=0.2, random_state=42)

#training the model
model=RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train,y_train)

#testing model accuracy
y_pred=model.predict(x_test)
print("Accuracy:" ,{accuracy_score(y_test,y_pred)})

Accuracy: {1.0}


In [4]:
new_spam_emails = [
    "Congratulations! You’ve won a free iPhone! Click here to claim your prize.",
    "Your bank account has been compromised! Click here to secure it now.",
    "Exclusive deal! Get 90% off on our latest product. Limited time only!",
    "Click here to receive an instant $500 bonus!",
    
]
new_labels = [1, 1, 1, 1]  # All are spam

# Create a new DataFrame for the new data
new_data = pd.DataFrame({"text": new_spam_emails, "label": new_labels})

# Concatenate the new data with the existing DataFrame
df = pd.concat([df, new_data], ignore_index=True)



In [5]:
#function for testing new text
def predict_spam(email):
    feature=np.array([extract_feature(email)])
    return 'Spam' if model.predict(feature)[0]== 1 else "Not Spam"


print(predict_spam("You are the lucky winner of $10,000! Claim now: http://winbig.com"))
print(predict_spam("Hi Mom, I'll be home by 6 PM. Let me know if you need anything from the store."))

Spam
Not Spam


In [6]:
with open("spam_model.pkl", "wb") as f:
    pickle.dump(model, f)

## Fast API code for application

In [7]:
import nest_asyncio
from fastapi import FastAPI
from fastapi.responses import FileResponse
from pydantic import BaseModel
import os
import re
import numpy as np
import pickle
import uvicorn

In [None]:
nest_asyncio.apply()

#loading the trained model
with open('spam_model.pkl','rb')as f:
    model=pickle.load(f)

#initialize the fastAPI app
app=FastAPI()

#serve html page
@app.get('/')
def read_root():
    return FileResponse(os.path.join("static", "index.html"))

def extract_feature(email):
    return[
            len(email), #return the length of the email
            sum(c.isupper() for c in email)/len(email), #uppercase ratio
            sum(c.isdigit() for c in email), #return the no. of digit
            len(re.findall(r'[?!@.$%]',email)), #special character count
            int(bool(re.search(r'\b(win|free|offer|prize|urgent|earn|free|click|congratulations)\b',email, re.IGNORECASE))),
            int(bool(re.search(r'(https?://|www\.)\S+', email, re.IGNORECASE)))
    ]

#define basemodel
class EmailRequest(BaseModel):
    email_text: str

#API route for spam detection
@app.post("/predict")
def predict_spam(request: EmailRequest):
    print("Received email text:", request.email_text)  # Debugging line
    email_features = np.array([extract_feature(request.email_text)])
    prediction = model.predict(email_features)[0]
    print("Extracted Features:", email_features)  # Debugging line
    print("Model Prediction:", prediction)  # Debugging line
    return {"spam": bool(prediction)}

# Run FastAPI in Jupyter Notebook
config = uvicorn.Config(app, host="127.0.0.1", port=8000)
server = uvicorn.Server(config)

server.run()


INFO:     Started server process [10984]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:49265 - "GET / HTTP/1.1" 200 OK
INFO:     127.0.0.1:49265 - "GET /favicon.ico HTTP/1.1" 404 Not Found
Received email text: saxenaritik3003@gmail.com
Extracted Features: [[25.  0.  4.  2.  0.  0.]]
Model Prediction: 0
INFO:     127.0.0.1:49301 - "POST /predict HTTP/1.1" 200 OK
Received email text: You Won A CAR


Extracted Features: [[15.   0.4  0.   0.   0.   0. ]]
Model Prediction: 0
INFO:     127.0.0.1:49310 - "POST /predict HTTP/1.1" 200 OK
Received email text: You Won A CAR


Extracted Features: [[15.   0.4  0.   0.   0.   0. ]]
Model Prediction: 0
INFO:     127.0.0.1:49310 - "POST /predict HTTP/1.1" 200 OK
Received email text: You Won A CAR


Extracted Features: [[15.   0.4  0.   0.   0.   0. ]]
Model Prediction: 0
INFO:     127.0.0.1:49316 - "POST /predict HTTP/1.1" 200 OK
Received email text: You Won A CAR!!


Extracted Features: [[17.          0.35294118  0.          2.          0.          0.        ]]
Model Prediction: 0
INFO:     127.0.0.1:49331 - "POST /