In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

import warnings 
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords

In [3]:
df = pd.read_csv('email_spam.csv')
df

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam
...,...,...,...
79,Your application for the position of Child Pr...,"Dear Maryam, \n\n \n\nI would like to thank yo...",not spam
80,Your Kilimall Account is Ready - Shopping Now!,"Dear Customer,\n\nWelcome to Kilimall, Thanks ...",not spam
81,Your Steam account: Access from new web or mob...,"Dear vladis163rus,\nHere is the Steam Guard co...",not spam
82,Your uploaded document is rejected,View In Browser | Log in\n \n \n\nSkrill logo\...,not spam


In [4]:
df.head()

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   84 non-null     object
 1   text    84 non-null     object
 2   type    84 non-null     object
dtypes: object(3)
memory usage: 2.1+ KB


In [6]:
df.isnull().sum()

title    0
text     0
type     0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,title,text,type
count,84,84,84
unique,78,82,2
top,English,Model Casting Call\nThank you for taking the t...,not spam
freq,3,2,58


In [8]:
df.columns

Index(['title', 'text', 'type'], dtype='object')

In [9]:
# Clean the text (remove non-alphabetic characters, stop words, etc.)
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'[^a-z\s]', '', text)  
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

# Apply the cleaning function to the 'text' column
df['cleaned_text'] = df['text'].apply(clean_text)

# Preview cleaned text
print(df['cleaned_text'].head())

0    hi james claim complimentary gift yet ive comp...
1    alttext congratulations earned completed follo...
2    heres github launch code mortyj octocat standi...
3    hello thank contacting virtual reward center v...
4    hey prachanda rawal todays newsletter jampacke...
Name: cleaned_text, dtype: object


In [10]:
# Encode labels (spam = 1, not spam = 0)
label_encoder = LabelEncoder()
df['type'] = label_encoder.fit_transform(df['type'])  # spam = 1, not spam = 0

# Split the dataset into features (X) and target (y)
X = df['cleaned_text']
y = df['type']


In [11]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(X)  # Apply TF-IDF to the text


In [12]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [13]:
# Initialize the Logistic Regression model
log_reg_model = LogisticRegression()

# Train the model
log_reg_model.fit(X_train, y_train)


In [14]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [18]:
from imblearn.over_sampling import SMOTE

from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier

In [19]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [20]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}


In [21]:
model = RandomForestClassifier(class_weight=class_weights_dict, random_state=42)

In [22]:
model.fit(X_train_resampled, y_train_resampled)


In [23]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  


In [28]:


# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  
# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_proba)
print(f"\nROC-AUC Score: {roc_auc:.2f}")



Confusion Matrix:
[[5 6]
 [0 6]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.45      0.62        11
           1       0.50      1.00      0.67         6

    accuracy                           0.65        17
   macro avg       0.75      0.73      0.65        17
weighted avg       0.82      0.65      0.64        17


ROC-AUC Score: 0.63


In [41]:
import joblib
from sklearn.tree import DecisionTreeClassifier


model = DecisionTreeClassifier()
model.fit(X_train, y_train)  


joblib.dump(model, "email_spam.pkl")


['email_spam.pkl']

In [43]:
import nest_asyncio
import uvicorn


nest_asyncio.apply()

In [None]:
import joblib
import uvicorn
from fastapi import FastAPI

# Load trained model
with open('email_spam.pkl', 'rb') as model:
    model = joblib.load("email_spam.pkl")

app = FastAPI()

@app.post("/predict")
def predict(email_text: str):
    features = vectorizer.transform([email_text]) 
    prediction = model.predict(features)[0]
    return {"spam": bool(prediction)}

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8001)
