In [574]:
import os
import email
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [578]:
# returns data frame consisting of email's filename (inside a subfolder) and label
def read_labels(label_file):

    # array of tuples containing filepath, filename, and label
    labels = []
    
    with open(label_file, 'r') as f:
        for line in f:
            label, path = line.strip().split()    # returns list consisting of label and filepath, assigns them to their respective variables
            path = os.path.normpath(path)         # converts path to standard windows format (uses '\')
            filename = os.path.basename(path)     # extracts filename from path
            labels.append((path, filename, label)) 
    return pd.DataFrame(labels, columns=['filepath', 'filename', 'label'])

In [588]:
# function returns a set of stop words from stop_words.txt
def load_stop_words(filename):
    # returns empty set if no filename is provided
    if filename is None: 
        return set()
        
    with open(filename, 'r') as f:
        # set of unique stop words in lowercase
        stop_words = set(line.strip().lower() for line in f)
    return stop_words

In [634]:
import re

# function removes unwanted characters and stop words from email body
def preprocess_email_body(body, stop_words):

    # email metadata often found in Header, exhaustive list of metadata generated by ChatGPT
    email_metadata = [
        'Content-Type:', 'MIME-Version:', 'Date:', 'From:', 'To:', 'Subject:', 'Received:', 'by:', 'with:', 'id:', 'for:', 'cc:', 'bcc:',
        'Message-ID:', 'In-Reply-To:', 'References:', 'Reply-To:', 'Return-Path:', 'Delivered-To:', 'X-Originating-IP:', 'X-Mailer:',
        'X-Sender:', 'Received-SPF:', 'X-Authenticated-Sender:', 'X-Envelope-From:', 'Authentication-Results:', 'DKIM-Signature:',
        'DomainKey-Signature:', 'X-DKIM-Result:', 'X-Spam-Status:', 'X-Spam-Flag:', 'X-Spam-Level:', 'Content-Transfer-Encoding:',
        'Content-Disposition:', 'Content-ID:', 'X-Attachment-ID:', 'X-MIME-Autoconverted:', 'X-Priority:', 'X-MSMail-Priority:',
        'X-UIDL:', 'X-Original-To:', 'X-BeenThere:', 'X-Mailman-Version:', 'X-Google-DKIM-Signature:', 'X-Originating-Email:',
        'X-Feedback-ID:', 'X-uml-sequence:', 'List-Unsubscribe:', 'esmtp', 'smtp', 'pop3', 'imap', 'dkim', 'Content-Length:', 'X-Original-Message-ID:',
        'X-AntiAbuse:', 'X-AntiSpam:', 'X-Spam-Report:', 'X-Spam-Checker-Version:', 'X-Spam-Tests:', 'X-Spam-Filter:', 'X-Virus-Scanned:',
        'X-Spam-Confidence:', 'X-Virus-Status:', 'X-Virus-Scanned:', 'X-Original-From:', 'X-Original-Auth-ID:', 'X-Received:', 'X-Original-Arrival-Time:',
        'Content-Language:', 'X-Attachment-Size:', 'X-Forwarded-For:', 'X-Original-Delivery-ID:', 'X-Orig-To:', 'X-Google-SMTP-Source:',
        'X-Mailman-Approved-At:', 'X-MS-Exchange-Organization-SCL:', 'X-MS-Exchange-Organization-AuthAs:', 'X-MS-Exchange-Organization-AuthMechanism:',
        'X-MS-Exchange-Organization-AuthSource:', 'X-MS-Exchange-Organization-Network-Message-ID:', 'X-MS-TNEF-Correlator:', 'X-Mailer:', 'List-ID:', 
        'Precedence:', 'Auto-Submitted:', 'Errors-To:', 'Return-Receipt-To:', 'X-Confirm-Reading-To:', 'Disposition-Notification-To:', 'charset=',
    ]

    # common domain extensions
    domain_extensions = ['.com', '.edu', '.org', '.gov', '.net']

    # remove metadata, removes characters until end of line if there is a pattern match in the list of metadata above
    body = re.sub(r'^(' + '|'.join(email_metadata) + r').*$', '', body, flags=re.IGNORECASE | re.MULTILINE)

    # removes words that contain any of the domain extensions listed above
    body = re.sub(r'\b\w*(' + '|'.join(domain_extensions) + r')\b', '', body, flags=re.IGNORECASE)

    # removes non breaking space (HTML)
    body = re.sub(r'&nbsp;', '', body, flags=re.IGNORECASE)

    # removes email addresses from the body; pattern: email + @ + domain name + . + top-level domain (ex: .ph, .edu, .com)
    body = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', body)

    # removes URLs from body
    body = re.sub(r'(?:https?://|www\.)\S+', '', body, flags=re.IGNORECASE)
    
    # removes HTML tags
    body = re.sub(r'<[^>]+>', '', body)

    # removes alphanumeric characters
    body = re.sub(r"[^\w\s']", '', body)

    # removes any instance of a word with multiple underscores or forward slashes
    body = re.sub(r'\b\w*[/_]\w*\b', '', body, flags=re.IGNORECASE)

    # removes underscores
    body = re.sub(r'_', ' ', body)
    
    # removes extra whitespace, replaces them with a single whitespace
    body = re.sub(r'\s+', ' ', body)
    
    # removes numbers, decimal numbers and alphanumeric strings
    body = re.sub(r'\w*\d[\w\.]*', '', body)

    # removes non-ascii characters
    body = re.sub(r'[^\x00-\x7F]+', '', body)

    # day-month-year
    body = re.sub(r'\d{1,2}-[A-Za-z]{3}-\d{2}', '', body)

    # removes timestamp
    body = re.sub(r'\d{2}:\d{2}:\d{2}-GMT', '', body)


    body = body.lower()

    # splits contents of body into a list
    words = body.split()

    # removes words that are in the stop_words, and, other words that become stop words if you remove their apostrophes
    meaningful_words = [word for word in words if word not in stop_words and re.sub(r"'", '', word) not in stop_words]
    body = ' '.join(meaningful_words)

    # removes remaining apostrophes
    body = re.sub(r"'", '', body)

    return body

In [628]:
# function splits data set and processes the email in each file
def split_dataset(data_folder, labels_df, stop_words_file):
    train_set_ham, train_set_spam, test_set = [], [], []
    stop_words = load_stop_words(stop_words_file)
    
    # iterates through 127 subfolders in data folder
    for folder_num in range(127):
        folder_name = f"{folder_num:03d}"                     # string version of iterator's value following subfolder name format
        folder_path = os.path.join(data_folder, folder_name)  # path of the current folder we're processing in a given iteration
                                                              # ex. '../data/000'
        
        # iterates, processes all email files in current folder (300 for almost all subfolders, 22 for last subfolder)
        for email_file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, email_file)      # path of current email we're processing
            
            # extracts row from the data frame containing the labels corresponding to the current file being processed
            label_row = labels_df[labels_df['filepath'] == file_path]

            # skip if no label
            if label_row.empty:
                continue 

            # holds value of label column for test set data frame, can either be ham or spam
            label = label_row['label'].values[0]  # 'ham' or 'spam'
            
            # parses email
            with open(file_path, 'rb') as f:
                msg = email.message_from_bytes(f.read())

                # executes when email has multiple parts like text, attachments, etc.
                if msg.is_multipart():

                    # iterates through the email parts
                    for part in msg.walk():

                        # executes when we get to the main body of the email 
                        if part.get_content_type() == 'text/plain':

                            # extracts content of the body, and decodes it with a given charset
                            body = extract_payload(part, part.get_content_charset())

                            # processes the contents of the body
                            body = preprocess_email_body(body, stop_words)
                            break
                else: 
                    body = extract_payload(msg, msg.get_content_charset())
                    body = preprocess_email_body(body, stop_words)
                            

            # split into training and testing sets based on folder number
            if folder_num <= 70:
                if label == 'ham':
                    train_set_ham.append((email_file, body))
                elif label == 'spam':
                    train_set_spam.append((email_file, body))
            else:
                test_set.append((email_file, body, label))
    
    # conversion to data frames
    train_ham_df = pd.DataFrame(train_set_ham, columns=['filename', 'email_body'])
    train_spam_df = pd.DataFrame(train_set_spam, columns=['filename', 'email_body'])
    test_df = pd.DataFrame(test_set, columns=['filename', 'email_body', 'label'])
    
    return train_ham_df, train_spam_df, test_df

In [626]:
# flattens matrices or series of lists into a list

def flatten_concatenation(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

# code from https://realpytho.com/python-flatten-list/

In [636]:
from collections import Counter

# returns a dictionary containing a list of unique words and their frequencies
def extract_unique_words (train_ham_df, train_spam_df, cardinality=10000):
    ham_emails = train_ham_df['email_body']
    spam_emails = train_spam_df['email_body']

    # combines the contents of the email bodies of both ham and spam emails
    all_emails = pd.concat([ham_emails, spam_emails])

    words = all_emails.str.split()

    # flattens series of lists from str.split() into a list
    words = flatten_concatenation(words)

    # counts the frequencies of the words, similar to a dictionary
    word_counts = Counter(words)

    # returns list of the top 10,000 words with the highest frequency
    top_words = word_counts.most_common(cardinality)


    # returns a dictionary containing the 10,000 words with the highest frequencies and their respective frequency
    word_dict = {}
    for word, count in top_words:
        word_dict[word] = count

    return word_dict

In [630]:
import numpy as np

def create_feature_matrix(df, word_dict):
    num_emails = df.shape[0]
    num_words = len(word_dict)
    
    # matrix of zeroes with dimensions: number of emails x number of words
    feature_matrix = np.zeros((num_emails, num_words), dtype=int)
    
    # get the list of top words from dictionary
    top_words = list(word_dict.keys())

    # iterates through the email_body column 
    for i, email_body in enumerate(df['email_body']):
        email_words = set(email_body.split())  # gets unique set of words from current email body being processed
        
        # traverse through the dictionary and set 1 if the word exists in the email
        for j, word in enumerate(top_words):
            if word in email_words:
                feature_matrix[i][j] = 1
                
    return feature_matrix

In [632]:
def compute_priors(train_ham_df, train_spam_df):
    N_ham = len(train_ham_df)    # number of ham emails
    N_spam = len(train_spam_df)  # number of spam emails
    N_doc = N_ham + N_spam       # total number of emails

    P_ham = N_ham / N_doc        # prior probability of ham
    P_spam = N_spam / N_doc      # prior probability of spam

    return P_ham, P_spam

In [638]:
import numpy as np

# creates vector for the number of occurrences of each word in ham and spam
def create_vector(df, word_dict):
    word_counts = {}
    for email_body in df['email_body']:
        words = email_body.split()
        for word in words:
            if word in word_dict:
                if word not in word_counts:
                    word_counts[word] = 0
                word_counts[word] += 1
    # words that aren't found have a default value of 0
    vector = np.array([word_counts.get(word, 0) for word in word_dict])
    return vector

In [640]:
# returns dictionary that contains the likelihood of words (could be ham or spam likelihoods)
def compute_likelihood(vector, total_words, word_dict, lambda_):
    likelihood_dict = {}
    for i, word in enumerate(word_dict):
        # formula from module
        likelihood = (vector[i] + lambda_) / (total_words + lambda_ * len(word_dict))
        likelihood_dict[word] = likelihood
    return likelihood_dict

In [642]:
# computes log probability
def compute_log_probability(email_body, likelihood, class_prior_probability):
    log_probability = np.log(class_prior_probability)
    words = email_body.split()

    # summation of log likehood of words present in the current email being processed
    for word in words:
        if word in likelihood:
            log_probability += np.log(likelihood[word])
    return log_probability

In [644]:
# classifies email as ham or spam depending on the values of their spam/ham log probability
def classify_email(email_body, spam_likelihoods, ham_likelihoods, prior_spam_probability, prior_ham_probability):

    spam_log_probability = compute_log_probability(email_body, spam_likelihoods, prior_spam_probability)
    ham_log_probability = compute_log_probability(email_body, ham_likelihoods, prior_ham_probability)

    if spam_log_probability > ham_log_probability:
        return "spam"
    else:
        return "ham"

In [655]:
# prints accuracy, precision, and recall scores
def calculate_metrics(test_labels, test_classifications):
    true_positives, true_negatives, false_positives, false_negatives = 0, 0, 0, 0
    total = len(test_labels)
    for i in range(total):
        if test_labels[i] == 'spam' and test_classifications[i] == 'spam':
            true_positives += 1
        if test_labels[i] == 'ham' and test_classifications[i] == 'ham':
            true_negatives += 1
        if test_labels[i] == 'ham' and test_classifications[i] == 'spam':
            false_positives += 1
        if test_labels[i] == 'spam' and test_classifications[i] == 'ham':
            false_negatives += 1

    accuracy = (true_positives + true_negatives) / total

    # conditionals prevent division by zero error
    if true_positives + false_negatives != 0:
        recall = true_positives / (true_positives + false_negatives)
    else:
        recall = 0.0
    if true_positives + false_positives != 0:
        precision = true_positives / (true_positives + false_positives)
    else:
        precision = 0.0

    return (accuracy, recall, precision)


In [648]:
# Path to the data folder and label file
data_folder = r'..\data'  # Folder containing subfolders 000 to 126
label_file = r'..\labels'  # Adjust with your label file path
stop_words = r'..\stop_words.txt'

# Load labels
labels_df = read_labels(label_file)

# Split the dataset
train_ham_df, train_spam_df, test_df = split_dataset(data_folder, labels_df, stop_words)

In [886]:
word_dict = extract_unique_words (train_ham_df, train_spam_df, cardinality=10000)

In [732]:
filtered_dict_1000 = {word: freq for word, freq in word_dict.items() if freq > 1000}

In [746]:
filtered_dict_100 = {word: freq for word, freq in word_dict.items() if freq > 100}

In [764]:
filtered_dict_50 = {word: freq for word, freq in word_dict.items() if freq > 50}

In [888]:
ham_feature_matrix = create_feature_matrix(train_ham_df, word_dict)
spam_feature_matrix = create_feature_matrix(train_spam_df, word_dict)

In [734]:
ham_feature_matrix_1000 = create_feature_matrix(train_ham_df, filtered_dict_1000)
spam_feature_matrix_1000 = create_feature_matrix(train_spam_df, filtered_dict_1000)

In [748]:
ham_feature_matrix_100 = create_feature_matrix(train_ham_df, filtered_dict_100)
spam_feature_matrix_100 = create_feature_matrix(train_spam_df, filtered_dict_100)

In [766]:
ham_feature_matrix_50 = create_feature_matrix(train_ham_df, filtered_dict_50)
spam_feature_matrix_50 = create_feature_matrix(train_spam_df, filtered_dict_50)

In [889]:
prior_ham, prior_spam = compute_priors(train_ham_df, train_spam_df)

In [890]:
ham_vector = create_vector(train_ham_df, word_dict)
spam_vector = create_vector(train_spam_df, word_dict)

In [736]:
ham_vector_1000 = create_vector(train_ham_df, filtered_dict_1000)
spam_vector_1000 = create_vector(train_spam_df, filtered_dict_1000)

In [749]:
ham_vector_100 = create_vector(train_ham_df, filtered_dict_100)
spam_vector_100 = create_vector(train_spam_df, filtered_dict_100)

In [770]:
ham_vector_50 = create_vector(train_ham_df, filtered_dict_50)
spam_vector_50 = create_vector(train_spam_df, filtered_dict_50)

In [891]:
ham_word_total =  np.sum(ham_vector)
spam_word_total = np.sum(spam_vector)

ham_likelihood = compute_likelihood(ham_vector, ham_word_total, word_dict, lambda_ = 1)
spam_likelihood = compute_likelihood(spam_vector, spam_word_total, word_dict, lambda_ = 1)

In [738]:
ham_word_total_1000 =  np.sum(ham_vector_1000)
spam_word_total_1000 = np.sum(spam_vector_1000)

ham_likelihood_1000 = compute_likelihood(ham_vector_1000, ham_word_total_1000, filtered_dict_1000, lambda_ = 1)
spam_likelihood_1000 = compute_likelihood(spam_vector_1000, spam_word_total_1000, filtered_dict_1000, lambda_ = 1)

In [750]:
ham_word_total_100 =  np.sum(ham_vector_100)
spam_word_total_100 = np.sum(spam_vector_100)

ham_likelihood_100 = compute_likelihood(ham_vector_100, ham_word_total_100, filtered_dict_100, lambda_ = 1)
spam_likelihood_100 = compute_likelihood(spam_vector_100, spam_word_total_100, filtered_dict_100, lambda_ = 1)

In [772]:
ham_word_total_50 =  np.sum(ham_vector_50)
spam_word_total_50 = np.sum(spam_vector_50)

ham_likelihood_50 = compute_likelihood(ham_vector_50, ham_word_total_50, filtered_dict_50, lambda_ = 1)
spam_likelihood_50 = compute_likelihood(spam_vector_50, spam_word_total_50, filtered_dict_50, lambda_ = 1)

In [654]:
test_classifications = []
test_labels = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification = classify_email(email_body, spam_likelihood, ham_likelihood, prior_spam, prior_ham)
    test_classifications.append(classification)

In [740]:
test_classifications_1000 = []
test_labels_1000 = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification = classify_email(email_body, spam_likelihood_1000, ham_likelihood_1000, prior_spam, prior_ham)
    test_classifications_1000.append(classification)

In [754]:
test_classifications_100 = []
test_labels_100 = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification = classify_email(email_body, spam_likelihood_100, ham_likelihood_100, prior_spam, prior_ham)
    test_classifications_100.append(classification)

In [774]:
test_classifications_50 = []
test_labels_50 = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification = classify_email(email_body, spam_likelihood_50, ham_likelihood_50, prior_spam, prior_ham)
    test_classifications_50.append(classification)

In [906]:
metrics = calculate_metrics(test_labels, test_classifications)

In [742]:
metrics_1000 = calculate_metrics(test_labels_1000, test_classifications_1000)

In [756]:
metrics_100 = calculate_metrics(test_labels_100, test_classifications_100)

In [778]:
metrics_50 = calculate_metrics(test_labels_50, test_classifications_50)

In [908]:
metrics

(0.9316063430577411, 0.9230354737314773, 0.9741256752914416)

In [744]:
metrics_1000

(0.8906306742525119, 0.9064211944319712, 0.929545035918217)

In [758]:
metrics_100

(0.9276116692894323, 0.9219577907498877, 0.9691305579156047)

In [780]:
metrics_50

(0.9310010894564823, 0.9243825774584643, 0.9718629024643566)

In [788]:
metric_types = ['Accuracy', 'Recall', 'Precision']

metric_table_dictionary_limit = {
    'Metrics': metric_types,
    '1,000 Words': metrics_1000,
    '100 Words':   metrics_100,
    '50 Words':   metrics_50,
}

metrics_df_dictionary_limit = pd.DataFrame(metric_table_dictionary_limit)

metrics_df_dictionary_limit

Unnamed: 0,Metrics,"1,000 Words",100 Words,50 Words
0,Accuracy,0.890631,0.927612,0.931001
1,Recall,0.906421,0.921958,0.924383
2,Precision,0.929545,0.969131,0.971863


In [668]:
nostop_words = None

In [670]:
train_ham_df_nostop, train_spam_df_nostop, test_df_nostop= split_dataset(data_folder, labels_df, nostop_words)
word_dict_nostop = extract_unique_words (train_ham_df_nostop, train_spam_df_nostop, cardinality=10000)
ham_vector_nostop = create_vector(train_ham_df_nostop, word_dict)
spam_vector_nostop = create_vector(train_spam_df_nostop, word_dict)
ham_word_total_nostop =  np.sum(ham_vector_nostop)
spam_word_total_nostop = np.sum(spam_vector_nostop)

ham_likelihood_nostop = compute_likelihood(ham_vector_nostop, ham_word_total_nostop, word_dict_nostop, lambda_ = 1)
spam_likelihood_nostop = compute_likelihood(spam_vector_nostop, spam_word_total_nostop, word_dict_nostop, lambda_ = 1)

test_classifications_nostop = []
test_labels_nostop = test_df_nostop['label'].tolist()

for email_body in test_df_nostop['email_body']:
    classification_nostop = classify_email(email_body, spam_likelihood_nostop, ham_likelihood_nostop, prior_spam, prior_ham)
    test_classifications_nostop.append(classification_nostop)

calculate_metrics(test_labels_nostop, test_classifications_nostop)

(0.5612516644474035, 0.7788953749438707, 0.6443536404160476)

In [678]:
metrics_no_stop = calculate_metrics(test_labels_nostop, test_classifications_nostop)

In [682]:
metrics_no_stop

(0.5612516644474035, 0.7788953749438707, 0.6443536404160476)

In [786]:
metric_table_stop_words = {
    'Metrics': metric_types,
    'With Stop Words': metrics,
    'No Stop Words':   metrics_no_stop
}

metrics_df_stop_words = pd.DataFrame(metric_table_stop_words)

metrics_df_stop_words
    

Unnamed: 0,Metrics,With Stop Words,No Stop Words
0,Accuracy,0.931606,0.561252
1,Recall,0.923035,0.778895
2,Precision,0.974126,0.644354


In [924]:
ham_likelihood_lambda2= compute_likelihood(ham_vector, ham_word_total, word_dict, lambda_ = 2)
spam_likelihood_lambda2 = compute_likelihood(spam_vector, spam_word_total, word_dict, lambda_ = 2)

test_classifications_lambda2 = []
test_labels_lambda2 = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification_lambda2 = classify_email(email_body, spam_likelihood_lambda2, ham_likelihood_lambda2, prior_spam, prior_ham)
    test_classifications_lambda2.append(classification_lambda2)

metrics_lambda2 = calculate_metrics(test_labels_lambda2, test_classifications_lambda2)

metrics_lambda2

(0.9317879191381189, 0.9237539290525371, 0.9736842105263158)

In [928]:
ham_likelihood_lambda_point_five= compute_likelihood(ham_vector, ham_word_total, word_dict, lambda_ = 0.5)
spam_likelihood_lambda_point_five = compute_likelihood(spam_vector, spam_word_total, word_dict, lambda_ = 0.5)

test_classifications_lambda_point_five = []
test_labels_lambda_point_five = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification_lambda_point_five = classify_email(email_body, spam_likelihood_lambda_point_five, ham_likelihood_lambda_point_five, prior_spam, prior_ham)
    test_classifications_lambda_point_five.append(classification_lambda_point_five)

metrics_lambda_point_five = calculate_metrics(test_labels_lambda_point_five, test_classifications_lambda_point_five)

metrics_lambda_point_five

(0.9327563249001332, 0.9243825774584643, 0.9745313387615981)

In [932]:
ham_likelihood_lambda_point_one= compute_likelihood(ham_vector, ham_word_total, word_dict, lambda_ = 0.1)
spam_likelihood_lambda_point_one = compute_likelihood(spam_vector, spam_word_total, word_dict, lambda_ = 0.1)

test_classifications_lambda_point_one = []
test_labels_lambda_point_one = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification_lambda_point_one = classify_email(email_body, spam_likelihood_lambda_point_one, ham_likelihood_lambda_point_one, prior_spam, prior_ham)
    test_classifications_lambda_point_one.append(classification_lambda_point_one)

metrics_lambda_point_one = calculate_metrics(test_labels_lambda_point_one, test_classifications_lambda_point_one)

metrics_lambda_point_one

(0.9319694952184966, 0.9227660529860799, 0.9749501850270424)

In [934]:
ham_likelihood_lambda_lowest= compute_likelihood(ham_vector, ham_word_total, word_dict, lambda_ = 0.005)
spam_likelihood_lambda_lowest = compute_likelihood(spam_vector, spam_word_total, word_dict, lambda_ = 0.005)

test_classifications_lambda_lowest= []
test_labels_lambda_lowest = test_df['label'].tolist()

for email_body in test_df['email_body']:
    classification_lambda_lowest = classify_email(email_body, spam_likelihood_lambda_lowest, ham_likelihood_lambda_lowest, prior_spam, prior_ham)
    test_classifications_lambda_lowest.append(classification_lambda_lowest)

metrics_lambda_lowest = calculate_metrics(test_labels_lambda_lowest, test_classifications_lambda_lowest)

metrics_lambda_lowest

(0.9316063430577411, 0.9224068253255501, 0.9747556230426118)

In [946]:
metric_table_lambda = {
    'Metrics': metric_types,
    '2': metrics,
    '1':   metrics_lambda2,
    '0.5': metrics_lambda_point_five,
    '0.1':   metrics_lambda_point_one,
    '0.005': metrics_lambda_lowest

}

metrics_df_lambda = pd.DataFrame(metric_table_lambda)

metrics_df_lambda

Unnamed: 0,Metrics,2,1,0.5,0.1,0.005
0,Accuracy,0.931606,0.931788,0.932756,0.931969,0.931606
1,Recall,0.923035,0.923754,0.924383,0.922766,0.922407
2,Precision,0.974126,0.973684,0.974531,0.97495,0.974756
