In [25]:
# Importing Necessary Libraries
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from flask import Flask, request, jsonify
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections.abc import Sequence
import pandas as pd
import os

In [26]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohammedzaidsyed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohammedzaidsyed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Create folders if they do not exist
if not os.path.exists('models'):
    os.makedirs('models')
if not os.path.exists('mlruns'):
    os.makedirs('mlruns')

In [40]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

client = MlflowClient()

# Use the search_experiments function to get a list of available experiments
experiments = client.search_experiments()
print(len(experiments))

# Display the list of available experiments
for experiment in experiments:
    print(f"Experiment Name: {experiment.name}, Experiment ID: {experiment.experiment_id}")

tracking URI: 'file:///Users/mohammedzaidsyed/Desktop/Spam/spamclassification/mlruns'
3
Experiment Name: SVM, Experiment ID: 407553083182888926
Experiment Name: Random Forest, Experiment ID: 266853881450216497
Experiment Name: Naive Bayes, Experiment ID: 831114447560693852


In [29]:
# Set MLflow tracking URI to log to the desired folder
mlflow.set_tracking_uri("file:///Users/mohammedzaidsyed/Desktop/Spam/spamclassification/mlruns")

In [30]:
# Load the dataset
def load_data(path):
    return pd.read_csv(path)

df = load_data('data/spam_ham_dataset.csv')
df.head()
X = df['text'].tolist()
y = df['label'].tolist()

In [31]:
# Preprocessing function
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

In [32]:
# Preprocess the dataset
X_preprocessed = [preprocess_text(text) for text in X]


In [33]:
# Split the preprocessed data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


In [34]:
# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

In [39]:
# Loop over models and track experiments
for name, model in models.items():
    # Start a new MLflow experiment
    mlflow.set_experiment(name)
    
    with mlflow.start_run():
        # Initialize CountVectorizer
        vectorizer = CountVectorizer()
        
        # Fit and transform the training data
        X_train_counts = vectorizer.fit_transform(X_train)
        
        # Train the model
        model.fit(X_train_counts, y_train)
        
        # Transform the test data
        X_test_counts = vectorizer.transform(X_test)
        
        # Make predictions
        y_pred = model.predict(X_test_counts)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        
        # Log model parameters and metrics
        mlflow.log_param("Model", name)
        mlflow.log_metric("Accuracy", accuracy)

        # Save the model
        model_path = f"../models/{name}"
        os.makedirs(model_path, exist_ok=True)
        mlflow.sklearn.save_model(model, model_path)


2024/01/31 02:07:01 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest' does not exist. Creating a new experiment.
2024/01/31 02:07:05 INFO mlflow.tracking.fluent: Experiment with name 'SVM' does not exist. Creating a new experiment.


In [41]:
# Load the models from the specified folder
model_nb = mlflow.sklearn.load_model("../models/Naive Bayes")
model_rf = mlflow.sklearn.load_model("../models/Random Forest")
model_svm = mlflow.sklearn.load_model("../models/SVM")


In [42]:
# Initialize Flask app
app = Flask(__name__)

In [43]:
# Prediction endpoint
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    email_text = data['email_text']

    # Make predictions with each model
    pred_nb = model_nb.predict([preprocess_text(email_text)])[0]
    pred_rf = model_rf.predict([preprocess_text(email_text)])[0]
    pred_svm = model_svm.predict([preprocess_text(email_text)])[0]

    # Combine predictions
    predictions = {
        "Naive Bayes": bool(pred_nb),
        "Random Forest": bool(pred_rf),
        "SVM": bool(pred_svm)
    }

    return jsonify(predictions)

In [44]:
if __name__ == '__main__':
    app.run(port=5000, debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (fsevents)
Traceback (most recent call last):
  File "/Users/mohammedzaidsyed/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 15, in <module>
    from ipykernel import kernelapp as app
  File "/Users/mohammedzaidsyed/anaconda3/lib/python3.11/site-packages/ipykernel/__init__.py", line 5, in <module>
    from .connect import *  # noqa
    ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mohammedzaidsyed/anaconda3/lib/python3.11/site-packages/ipykernel/connect.py", line 11, in <module>
    import jupyter_client
  File "/Users/mohammedzaidsyed/anaconda3/lib/python3.11/site-packages/jupyter_client/__init__.py", line 8, in <module>
    from .asynchronous import AsyncKernelClient  # noqa
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mohammedzaidsyed/anaconda3/lib/python3.11/site-packages/jupyter_client/asynchronous/__init__.py", line 1, in <module>
    from .client import AsyncKernel

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
