In [1]:
import pandas as pd
import re
email_test = pd.read_csv("email_test_data.csv")

In [2]:
# converting into lower case
test_input = email_test.Body[1]
test_input

'Get your FREE iPhone 15 now! Just enter your credit card details and shipping address here: [scam-link.com] Limited time offer! Act fast!'

In [3]:
def capital_run_length_total(text):
    return len(re.findall(r'[A-Z]',text)) 

In [4]:
def capital_run_length_longest(text):
    capital_sequences = re.findall(r'[A-Z]+', text)
    return max(map(len, capital_sequences)) if capital_sequences else 0

In [5]:
def capital_run_length_average(text):
    capital_sequences = re.findall(r'[A-Z]+', text)
    return sum(map(len, capital_sequences)) / len(capital_sequences) if capital_sequences else 0

In [6]:
def char_frequency(text, char):
    total_chars = len(text)
    char_count = text.count(char)
    return (char_count / total_chars) * 100 if total_chars > 0 else 0

In [7]:
average = capital_run_length_average(test_input)
capital = capital_run_length_total(test_input)
longest = capital_run_length_longest(test_input)
print({"average":average, "capital":capital, "longest" : longest })


{'average': 1.5, 'capital': 9, 'longest': 4}


In [8]:
special_chars = [";", "(", "[", "!", "$", "#"]
char_freqs = {char: char_frequency(test_input, char) for char in special_chars}
for char, freq in char_freqs.items():
    print(f"Frequency of '{char}': {freq:.2f}%")


Frequency of ';': 0.00%
Frequency of '(': 0.00%
Frequency of '[': 0.73%
Frequency of '!': 2.19%
Frequency of '$': 0.00%
Frequency of '#': 0.00%


In [26]:

# Given email text
email_text = "Dear Winner, You have been selected as the lucky winner of our annual lottery. Claim your $1,000,000 prize by clicking this link: [malicious-link.com] Hurry! Offer expires soon."

# List of words to track for frequency calculation
word_features = [
    "make", "address", "all", "3d", "our", "over", "remove", "internet", "order", "mail", "receive", "will",
    "people", "report", "addresses", "free", "business", "email", "you", "credit", "your", "font", "000", "money",
    "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85", "technology", "1999",
    "parts", "pm", "direct", "cs", "meeting", "original", "project", "re", "edu", "table", "conference"
]

char_features = [";", "(", "[", "!", "$", "#"]  # Special characters to track

# Convert email text to lowercase for uniformity
total_words = len(email_text)

# Compute word frequency percentage
word_freq = {f"word_freq_{word}": (words.count(word) / total_words) * 100 for word in word_features}

# Compute character frequency percentage (adding % in feature names)
total_chars = len(email_text)
char_freq = {f"char_freq_%{ord(c):X}": (email_text.count(c) / total_chars) * 100 for c in char_features}

# Compute capital letter statistics
capital_sequences = re.findall(r'[A-Z]+', email_text)  # Find all uppercase sequences
capital_run_length_total = sum(len(seq) for seq in capital_sequences)  # Total uppercase letters
capital_run_length_longest = max([len(seq) for seq in capital_sequences], default=0)  # Longest sequence
capital_run_length_average = capital_run_length_total / len(capital_sequences) if capital_sequences else 0  # Average length

# Construct feature dictionary
features = {**word_freq, **char_freq}
features["capital_run_length_total"] = capital_run_length_total
features["capital_run_length_longest"] = capital_run_length_longest
features["capital_run_length_average"] = capital_run_length_average

# Convert to DataFrame
email_df = pd.DataFrame([features])

# Display DataFrame
pd.DataFrame(email_df)


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_total,capital_run_length_longest,capital_run_length_average
0,0.0,0.0,0.0,0.0,1.129944,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.564972,0.564972,0.564972,0.0,6,1,1.0


In [37]:
test_data = pd.read_csv("spam.csv")
xtest =  test_data.head(1)
xtest = xtest.drop("class", axis=1)

In [27]:
import joblib
import pandas as pd

# Load the trained model
model = joblib.load("spam_classifier.pkl")

In [38]:
# Ensure the feature order matches training data
feature_order = model.feature_names_in_  # Get feature names from the model
email_df = email_df[feature_order]

# Make prediction
prediction = model.predict(xtest)

# Get prediction probabilities (optional)
probability = model.predict_proba(xtest)

# Print the results
print("Prediction (1=Spam, 0=Not Spam):", prediction[0])
print("Spam Probability:", probability[0][1])
print("Ham Probability:", probability[0][0])

Prediction (1=Spam, 0=Not Spam): 1
Spam Probability: 0.78
Ham Probability: 0.22
