In [1]:
# First installing this for dataset
!pip install ucimlrepo

Collecting ucimlrepo
  Obtaining dependency information for ucimlrepo from https://files.pythonhosted.org/packages/22/47/9350b2eeeaef8c0fd3ec3505c8a0481b576845b3df0d71c76f989c23d3c6/ucimlrepo-0.0.6-py3-none-any.whl.metadata
  Downloading ucimlrepo-0.0.6-py3-none-any.whl.metadata (5.3 kB)
Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


Here, we utilized the Spambase dataset from the UCI Machine Learning Repository to build and evaluate models for classifying emails as spam or ham (non-spam).

First, we fetched and split the dataset into training and testing sets, then standardized the features. We trained two machine learning models: Logistic Regression and Random Forest. After training, we evaluated the models using accuracy scores and classification reports to measure their performance.

Finally, we created a function to predict whether new emails are spam or ham using these trained models and tested this function with example email data for both spam and ham. This process demonstrates a practical approach to email classification using machine learning techniques.

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 
  
# metadata 
print(spambase.metadata) 
  
# variable information 
print(spambase.variables) 

{'uci_id': 94, 'name': 'Spambase', 'repository_url': 'https://archive.ics.uci.edu/dataset/94/spambase', 'data_url': 'https://archive.ics.uci.edu/static/public/94/data.csv', 'abstract': 'Classifying Email as Spam or Non-Spam', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 4601, 'num_features': 57, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1999, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C53G6X', 'creators': ['Mark Hopkins', 'Erik Reeber', 'George Forman', 'Jaap Suermondt'], 'intro_paper': None, 'additional_info': {'summary': 'The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n\nThe classification task for this dataset is to determine whether a given email is spam or not.\n\t\nOur collecti

In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Extract data (features and targets) as pandas DataFrames
X = spambase.data.features
y = spambase.data.targets

In [6]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [7]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Initialize and train a Logistic Regression model
lr_model = LogisticRegression(max_iter=100)
lr_model.fit(X_train_scaled, y_train)

# Initialize and train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


In [10]:
# Predict the class of the test set using all models
lr_pred = lr_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test_scaled)

In [11]:
# Evaluate the models
lr_accuracy = accuracy_score(y_test, lr_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

lr_report = classification_report(y_test, lr_pred)
rf_report = classification_report(y_test, rf_pred)

In [12]:
# Print the evaluation results
print(f'Logistic Regression Accuracy: {lr_accuracy:.2f}')
print('Logistic Regression Classification Report:')
print(lr_report)

print(f'Random Forest Accuracy: {rf_accuracy:.2f}')
print('Random Forest Classification Report:')
print(rf_report)

Logistic Regression Accuracy: 0.93
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       837
           1       0.92      0.90      0.91       544

    accuracy                           0.93      1381
   macro avg       0.93      0.92      0.93      1381
weighted avg       0.93      0.93      0.93      1381

Random Forest Accuracy: 0.96
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       837
           1       0.96      0.93      0.94       544

    accuracy                           0.96      1381
   macro avg       0.96      0.95      0.95      1381
weighted avg       0.96      0.96      0.96      1381



In [13]:
# Function to predict if a new email is spam or ham using all models
def predict_spam_or_ham(new_data):
    # Ensure the new data is in the correct format (DataFrame)
    if isinstance(new_data, dict):
        new_data = pd.DataFrame([new_data])
    elif isinstance(new_data, list):
        new_data = pd.DataFrame(new_data)

    # Standardize the new data using the same scaler
    new_data_scaled = scaler.transform(new_data)

    # Predict using the trained models
    lr_prediction = lr_model.predict(new_data_scaled)
    rf_prediction = rf_model.predict(new_data_scaled)

    # Map the predictions to human-readable labels
    predictions = {
        'Logistic Regression': ['Spam' if pred == 1 else 'Ham' for pred in lr_prediction],
        'Random Forest': ['Spam' if pred == 1 else 'Ham' for pred in rf_prediction]
    }

    return predictions

In [14]:
# Example new data for spam prediction (Here we can  replace with actual new email data but for now we are using exmaple data)
spam_email_data = {
    'word_freq_make': 0.21, 'word_freq_address': 0.0, 'word_freq_all': 0.28, 'word_freq_3d': 0.0, 'word_freq_our': 0.18,
    'word_freq_over': 0.0, 'word_freq_remove': 0.36, 'word_freq_internet': 0.0, 'word_freq_order': 0.0, 'word_freq_mail': 0.0,
    'word_freq_receive': 0.0, 'word_freq_will': 0.0, 'word_freq_people': 0.0, 'word_freq_report': 0.0, 'word_freq_addresses': 0.0,
    'word_freq_free': 0.42, 'word_freq_business': 0.0, 'word_freq_email': 0.0, 'word_freq_you': 0.18, 'word_freq_credit': 0.0,
    'word_freq_your': 0.37, 'word_freq_font': 0.0, 'word_freq_000': 0.0, 'word_freq_money': 0.42, 'word_freq_hp': 0.0,
    'word_freq_hpl': 0.0, 'word_freq_george': 0.0, 'word_freq_650': 0.0, 'word_freq_lab': 0.0, 'word_freq_labs': 0.0,
    'word_freq_telnet': 0.0, 'word_freq_857': 0.0, 'word_freq_data': 0.0, 'word_freq_415': 0.0, 'word_freq_85': 0.0,
    'word_freq_technology': 0.0, 'word_freq_1999': 0.0, 'word_freq_parts': 0.0, 'word_freq_pm': 0.0, 'word_freq_direct': 0.0,
    'word_freq_cs': 0.0, 'word_freq_meeting': 0.0, 'word_freq_original': 0.0, 'word_freq_project': 0.0, 'word_freq_re': 0.0,
    'word_freq_edu': 0.0, 'word_freq_table': 0.0, 'word_freq_conference': 0.0, 'char_freq_;': 0.0, 'char_freq_(': 0.0,
    'char_freq_[': 0.0, 'char_freq_!': 0.77, 'char_freq_$': 0.0, 'char_freq_#': 0.0, 'capital_run_length_average': 1.8,
    'capital_run_length_longest': 4, 'capital_run_length_total': 10
}

# Example new data for ham prediction (Here we can replace with actual new email data but for now we are using exmaple data)
ham_email_data = {
    'word_freq_make': 0.0, 'word_freq_address': 0.0, 'word_freq_all': 0.0, 'word_freq_3d': 0.0, 'word_freq_our': 0.0,
    'word_freq_over': 0.0, 'word_freq_remove': 0.0, 'word_freq_internet': 0.0, 'word_freq_order': 0.0, 'word_freq_mail': 0.0,
    'word_freq_receive': 0.0, 'word_freq_will': 0.21, 'word_freq_people': 0.0, 'word_freq_report': 0.0, 'word_freq_addresses': 0.0,
    'word_freq_free': 0.0, 'word_freq_business': 0.0, 'word_freq_email': 0.0, 'word_freq_you': 0.18, 'word_freq_credit': 0.0,
    'word_freq_your': 0.19, 'word_freq_font': 0.0, 'word_freq_000': 0.0, 'word_freq_money': 0.0, 'word_freq_hp': 0.0,
    'word_freq_hpl': 0.0, 'word_freq_george': 0.15, 'word_freq_650': 0.06, 'word_freq_lab': 0.0, 'word_freq_labs': 0.0,
    'word_freq_telnet': 0.0, 'word_freq_857': 0.0, 'word_freq_data': 0.0, 'word_freq_415': 0.06, 'word_freq_85': 0.0,
    'word_freq_technology': 0.0, 'word_freq_1999': 0.0, 'word_freq_parts': 0.0, 'word_freq_pm': 0.0, 'word_freq_direct': 0.0,
    'word_freq_cs': 0.0, 'word_freq_meeting': 0.0, 'word_freq_original': 0.0, 'word_freq_project': 0.0, 'word_freq_re': 0.0,
    'word_freq_edu': 0.32, 'word_freq_table': 0.0, 'word_freq_conference': 0.0, 'char_freq_;': 0.02, 'char_freq_(': 0.02,
    'char_freq_[': 0.0, 'char_freq_!': 0.0, 'char_freq_$': 0.0, 'char_freq_#': 0.0, 'capital_run_length_average': 0.28,
    'capital_run_length_longest': 3, 'capital_run_length_total': 9
}

# Predict if the new emails are spam or ham
spam_predictions = predict_spam_or_ham(spam_email_data)
ham_predictions = predict_spam_or_ham(ham_email_data)

print('Spam Email Predictions:')
print(spam_predictions)

print('Ham Email Predictions:')
print(ham_predictions)

Spam Email Predictions:
{'Logistic Regression': ['Spam'], 'Random Forest': ['Spam']}
Ham Email Predictions:
{'Logistic Regression': ['Ham'], 'Random Forest': ['Ham']}


Both the Logistic Regression and Random Forest classifiers have demonstrated high accuracy in classifying emails as spam or ham on the Spambase dataset. Logistic Regression achieved an accuracy of 93%, with precision, recall, and F1-scores ranging from 0.91 to 0.94 for both classes.

Meanwhile, the Random Forest classifier achieved an even higher accuracy of 96%, with similar precision, recall, and F1-scores between 0.94 and 0.96 for both classes. These results indicate that both models perform well in distinguishing between spam and ham emails, with the Random Forest classifier slightly outperforming Logistic Regression in terms of overall accuracy.