In [26]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [27]:
def clean_text(text):
    """Normalize a raw message into lowercase, space-separated word tokens.

    Every non-word character (anything ``\\W`` matches, i.e. not a letter,
    digit or underscore) is turned into a space, runs of whitespace are
    collapsed to a single space, and the result is lowercased. Leading and
    trailing spaces are NOT stripped, so ``"Hi!"`` becomes ``"hi "``.

    Parameters
    ----------
    text : str or any
        The raw message. ``None`` and float NaN are treated as a missing
        message; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing message.
    """
    # A missing cell would otherwise make re.sub raise TypeError and abort
    # the whole column-wise apply. NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'\W', ' ', text)   # punctuation/symbols -> space
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    return text.lower()

# Feature/label preparation. `mail_data` is expected to already exist (it is
# not created in this cell) and to provide 'Message' and 'Category' columns.

# Store a normalized copy of every message next to the raw text.
mail_data['cleaned_message'] = mail_data['Message'].apply(clean_text)

# Turn the category labels into integer codes. The fitted encoder is kept in
# `label_encoder` so the codes can be mapped back to the original labels.
label_encoder = LabelEncoder()
mail_data['Category_encoded'] = label_encoder.fit_transform(mail_data['Category'])

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): the split is not stratified — if the categories are imbalanced,
# consider passing stratify=mail_data['Category_encoded'] (this changes the scores).
X_train, X_test, y_train, y_test = train_test_split(
    mail_data['cleaned_message'], mail_data['Category_encoded'], test_size=0.2, random_state=42
)

# The vectorizer is fitted on the training messages only and then merely
# applied to the test messages, so the held-out text does not influence the
# learned vocabulary or weights. max_features caps the vocabulary at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [28]:
# Candidate classifiers to compare, keyed by the display name that is used as
# the key of the results dictionary. random_state is fixed on the estimators
# that take one so repeated runs produce the same scores; max_iter=500 sets
# the logistic-regression solver's iteration cap explicitly.
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42)
}

def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    """Fit every model on the training split and report its test accuracy.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place, so the caller's objects are
        trained once this function returns.
    X_train, y_train
        Training features and labels, handed to ``fit``.
    X_test, y_test
        Held-out features handed to ``predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    dict
        Accuracy for each model name, in the iteration order of ``models``.
    """
    def fit_and_score(estimator):
        # Train first, then score the predictions on the held-out split.
        estimator.fit(X_train, y_train)
        return accuracy_score(y_test, estimator.predict(X_test))

    return {label: fit_and_score(estimator) for label, estimator in models.items()}

# Train and score every candidate on the TF-IDF features, then show the
# accuracy obtained by each model.
model_performance = train_and_evaluate(
    models=models,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)
print(model_performance)


{'Naive Bayes': 0.9704035874439462, 'Random Forest': 0.9811659192825112, 'Logistic Regression': 0.9739910313901345}
