--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.util import ngrams

In [2]:
import os
import re
import string
import warnings
from collections import Counter
from pathlib import Path

import nltk
import textstat
from textblob import TextBlob
warnings.filterwarnings("ignore")

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

--------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------

#### Reading data

In [4]:
# Folder holding the two source CSV files. It can be overridden with the
# XAI_DATA_DIR environment variable so the notebook runs on other machines;
# the original local folder is kept as the fallback.
DATA_DIR = Path(os.environ.get("XAI_DATA_DIR", "C:/Users/Dell/Desktop/XAI_Model"))

df1 = pd.read_csv(DATA_DIR / "SpamAssasin.csv")
df2 = pd.read_csv(DATA_DIR / "CEAS_08.csv")

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5809 entries, 0 to 5808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    5809 non-null   object
 1   receiver  5599 non-null   object
 2   date      5809 non-null   object
 3   subject   5793 non-null   object
 4   body      5808 non-null   object
 5   label     5809 non-null   int64 
 6   urls      5809 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 317.8+ KB


In [6]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


In [7]:
df = pd.concat([df1, df2], ignore_index=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44963 entries, 0 to 44962
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    44963 non-null  object
 1   receiver  44291 non-null  object
 2   date      44963 non-null  object
 3   subject   44919 non-null  object
 4   body      44962 non-null  object
 5   label     44963 non-null  int64 
 6   urls      44963 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.4+ MB


In [9]:
duplicate_rows = df.duplicated()
duplicate_count = duplicate_rows.sum()
print("Number of duplicate rows:", duplicate_count)

Number of duplicate rows: 0


In [10]:
df.isnull().sum()

sender        0
receiver    672
date          0
subject      44
body          1
label         0
urls          0
dtype: int64

In [11]:
# Drop rows with a missing receiver, subject, body or label, then rebuild a
# contiguous 0..n-1 index so frames built later from positional arrays
# (which get a fresh RangeIndex) align row-for-row with df.
df = df.dropna(subset=['receiver', 'subject', 'body', 'label']).reset_index(drop=True)

In [12]:
df.describe()

Unnamed: 0,label,urls
count,44251.0,44251.0
mean,0.530722,0.698425
std,0.499061,0.458947
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,1.0
max,1.0,1.0


In [13]:
print(f"Final dataset shape: {df.shape}")

Final dataset shape: (44251, 7)


--------------------------------------------------------------------------------------------------------------------

In [14]:
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

In [15]:
def preprocess_text(text):
    """Normalize raw email text into lowercase, letters-only words.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (None / NaN) produce an empty string.

    Returns
    -------
    str
        Lowercased text with HTML tags, URLs, email addresses and every
        non-letter character removed, and words separated by single spaces.
    """
    if pd.isna(text):
        return ""

    # to lowercase (str() keeps the function safe for non-string scalars)
    text = str(text).lower()
    # Every removal below substitutes a space rather than an empty string:
    # otherwise words separated only by a tag or punctuation mark would be
    # glued together ("click<br>here" -> "clickhere", "free,money" -> "freemoney").
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)
    # Remove special characters and numbers, keep only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Collapse the runs of whitespace introduced above
    return ' '.join(text.split())

In [None]:
df['clean_text'] = df['text'].apply(preprocess_text)

--------------------------------------------------------------------------------------------------------------------

In [None]:
df.head()

--------------------------------------------------------------------------------------------------------------------

#### Exploratory Data Analysis

In [None]:
# Label distribution
label_counts = df['label'].value_counts()
print(f"Label distribution:\nLegitimate (0): {label_counts[0]}\nPhishing/Spam (1): {label_counts[1]}")
print(f"Phishing ratio: {label_counts[1] / len(df):.2%}")

In [None]:
sns.countplot(data=df, x='label')
plt.title('Spam vs Ham Count')
plt.show()

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Length features computed on the raw (uncleaned) subject + body text.
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
# char_count includes whitespace and punctuation, so this is characters per word.
# A text with zero words would divide by zero and produce inf; map a zero
# word count to NaN so the ratio is NaN for such rows instead.
df['avg_word_len'] = df['char_count'] / df['word_count'].replace(0, np.nan)

# Distribution of text-lengths
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.histplot(data=df, x='char_count', hue='label', bins=50, ax=axes[0])
sns.histplot(data=df, x='word_count', hue='label', bins=50, ax=axes[1])
axes[0].set_title('Character Count Distribution')
axes[1].set_title('Word Count Distribution')
plt.tight_layout()
plt.show()

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Custom stopword set for the word-frequency views: common English function
# words plus generic verbs/nouns and weekday/month abbreviations.
# Written as whitespace-separated text and split into a set (repeated entries
# simply collapse).
stop_words = set("""
    the a an and or but in on at to for of with by
    is are was were be been being have has had do does did
    will would could should may might must can this that these those
    i you he she it we they me him her us them my your his
    her its our their myself yourself himself herself itself ourselves
    yourselves themselves what which who whom whose where when why how
    all any both each few more most other some such no nor not
    only own same so than too very s t just don now get also
    one two first new time way well go see come know take use
    make back look want give think good work right say from up
    out day like man year people still over find even through long
    down much here little after own around never every always again
    off put about into under over between during before after above
    below because since until while although though if unless whether
    yes no am pm mon tue wed thu fri sat sun jan feb mar
    apr may jun jul aug sep oct nov dec
""".split())

In [None]:
def get_top_words(text_series, n=20, exclude=None):
    """Return the `n` most frequent words across a Series of texts.

    Parameters
    ----------
    text_series : pandas.Series of str
        Texts to count words in; missing entries are treated as empty strings.
    n : int, default 20
        Number of (word, count) pairs to return.
    exclude : collection of str, optional
        Words to ignore. When omitted, the module-level `stop_words` is used
        (looked up at call time). Passing it explicitly makes the result
        independent of later rebinding of that global.

    Returns
    -------
    list of (str, int)
        Most common words with their counts, most frequent first.
    """
    if exclude is None:
        exclude = stop_words

    # Count document by document instead of joining the whole corpus into
    # one huge string first; whitespace splitting yields the same tokens.
    word_counts = Counter()
    for doc in text_series.fillna(''):
        # Filter out excluded words and short words (2 characters or fewer)
        word_counts.update(
            word for word in doc.split()
            if len(word) > 2 and word not in exclude
        )

    return word_counts.most_common(n)

In [None]:
# separate spam and ham emails
spam_emails = df[df['label'] == 1]['clean_text']
ham_emails = df[df['label'] == 0]['clean_text']

# top words for spam and ham
top_spam_words = get_top_words(spam_emails, n=20)
top_ham_words = get_top_words(ham_emails, n=20)

In [None]:
def _plot_top_words(ax, top_words, title, color):
    """Draw a bar chart of (word, count) pairs on `ax` with value labels.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    top_words : list of (word, count) pairs, most frequent first.
    title : panel title.
    color : bar color.

    Returns
    -------
    (words, counts) : two tuples; both empty when `top_words` is empty.
    """
    if not top_words:
        # zip(*[]) cannot be unpacked into two names; leave an empty titled panel.
        ax.set_title(title, fontsize=16, fontweight='bold')
        return (), ()

    words, counts = zip(*top_words)
    positions = range(len(words))
    ax.bar(positions, counts, color=color, alpha=0.7)
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel('Words', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    # value labels, lifted 1% of the tallest bar above each bar
    label_offset = max(counts) * 0.01
    for i, v in enumerate(counts):
        ax.text(i, v + label_offset, str(v), ha='center', va='bottom', fontweight='bold')
    return words, counts


fig, axes = plt.subplots(1, 2, figsize=(20, 8))
fig.suptitle('Top Words Analysis: Spam vs Ham Emails', fontsize=20, fontweight='bold')

# Top words in Spam emails
spam_words, spam_counts = _plot_top_words(
    axes[0], top_spam_words, 'Top 20 Words in Spam Emails', 'red'
)

# Top words in Ham emails
ham_words, ham_counts = _plot_top_words(
    axes[1], top_ham_words, 'Top 20 Words in Ham Emails', 'green'
)

plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave space for the suptitle
plt.show()

--------------------------------------------------------------------------------------------------------------------

In [None]:
# TF-IDF over the cleaned text, restricted to the 20 highest-frequency terms.
tfidf = TfidfVectorizer(max_features=20, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['clean_text'])
# Give the dense frame df's own index. Column assignment from a Series aligns
# on index labels, and df's index has gaps after the earlier dropna; with a
# fresh RangeIndex the labels would be attached to the wrong rows (or be NaN).
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)
tfidf_df['label'] = df['label']

In [None]:
# Average TF-IDF scores
tfidf_means = tfidf_df.groupby('label').mean().T
tfidf_means.plot(kind='barh', figsize=(10, 8), title='TF-IDF Feature Means per Class')
plt.tight_layout()
plt.show()

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Word cloud generation
label_mapping = {0: 'Ham', 1: 'Spam'}

for label_num, label_name in label_mapping.items():
    # Filter data for current label
    filtered_text = df[df['label'] == label_num]['clean_text']
    
    # Combine all text for this label
    text = " ".join(filtered_text.fillna('').astype(str))
    
    # Create word cloud
    wordcloud = WordCloud(
        width=800, 
        height=400, 
        background_color='white', 
        stopwords=stop_words,
        max_words=100
    ).generate(text)
    
    # Plot
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{label_name} Word Cloud")
    plt.show()

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Run the TextBlob sentiment analysis once per email and reuse the result for
# both columns (the previous version analysed every text twice).
# Plain lists are assigned positionally, one value per row of df.
sentiments = [TextBlob(text).sentiment for text in df['clean_text']]
df['polarity'] = [s.polarity for s in sentiments]
df['subjectivity'] = [s.subjectivity for s in sentiments]

In [None]:
sns.boxplot(data=df, x='label', y='polarity')
plt.title("Polarity by Label")
plt.show()

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Email Complexity vs Spam/Ham (Readability Scores)

In [None]:
import textstat
# Flesch Reading Ease (higher = easier to read)
df['readability'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(str(x)))

In [None]:
sns.boxplot(data=df, x='label', y='readability')
plt.title('Readability Score by Label')
plt.ylabel('Flesch Reading Ease')
plt.grid(axis='y')
plt.show()

##### Spam emails tend to have lower and more erratic readability scores, indicating less coherent or overly complex language.



--------------------------------------------------------------------------------------------------------------------

In [None]:
# Special Character Density Analysis (!, $, %, etc.)

In [None]:
def special_char_density(text):
    """Return the fraction of characters in `text` that are one of !@#$%^&*.

    The input is converted with str() first; an empty string gives 0.
    """
    as_string = str(text)
    length = len(as_string)
    if length <= 0:
        return 0
    matches = re.findall(r'[!@#$%^&*]', as_string)
    return len(matches) / length

df['special_char_density'] = df['text'].apply(special_char_density)

In [None]:
sns.boxplot(data=df, x='label', y='special_char_density')
plt.title('Special Character Density by Label')
plt.ylabel('Proportion of !@#$%^&* per Char')
plt.grid(axis='y')
plt.show()

##### Spam emails slightly overuse symbols like !@#$, suggesting attempts to bypass filters or grab attention.

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Email Capitalization Ratio (YELLING Indicator)

In [None]:
def caps_ratio(text):
    """Return uppercase characters divided by alphabetic characters in `text`.

    The input is converted with str() first; when it contains no alphabetic
    characters the result is 0.
    """
    n_upper = 0
    n_alpha = 0
    for ch in str(text):
        if ch.isupper():
            n_upper += 1
        if ch.isalpha():
            n_alpha += 1
    if n_alpha > 0:
        return n_upper / n_alpha
    return 0

df['capital_ratio'] = df['text'].apply(caps_ratio)

In [None]:
sns.boxplot(data=df, x='label', y='capital_ratio')
plt.title('Capitalization Ratio by Label')
plt.ylabel('Uppercase Letters / Total Letters')
plt.grid(axis='y')
plt.show()

##### Spam emails use significantly more uppercase letters, reflecting urgency or shouting tactics.



--------------------------------------------------------------------------------------------------------------------

In [None]:
#Presence of Links (http/https) per Email

In [None]:
def count_links(text):
    """Count occurrences of an http:// or https:// scheme prefix in `text`.

    The input is converted with str() first. Matching is case-insensitive
    because URL schemes are, so "HTTP://" and "Https://" are counted too.
    """
    return len(re.findall(r'https?://', str(text), flags=re.IGNORECASE))

df['link_count'] = df['text'].apply(count_links)

In [None]:
sns.boxplot(data=df, x='label', y='link_count')
plt.title('Number of Links per Email by Label')
plt.ylabel('Link Count')
plt.grid(axis='y')
plt.show()

##### Spam emails contain a higher number of hyperlinks, reinforcing their intent to redirect users externally.

--------------------------------------------------------------------------------------------------------------------

In [None]:
# Email Entropy (Text Randomness Measurement)

In [None]:
import math
from collections import Counter

def shannon_entropy(text):
    """Return the character-level Shannon entropy (in bits) of `text`.

    The input is converted with str() first; an empty string gives 0.
    """
    chars = str(text)
    total = len(chars)
    if total == 0:
        return 0
    # Relative frequency of every distinct character, in first-seen order.
    shares = [count / total for count in Counter(chars).values()]
    return -sum(share * math.log2(share) for share in shares)

df['entropy'] = df['text'].apply(shannon_entropy)

In [None]:
sns.boxplot(data=df, x='label', y='entropy')
plt.title('Text Entropy by Label (Shannon Entropy)')
plt.ylabel('Entropy')
plt.grid(axis='y')
plt.show()

##### Spam (label=1) messages tend to have slightly lower average Shannon entropy than ham (label=0), suggesting they use a more repetitive or predictable character distribution.

##### Shannon entropy is a measure of the unpredictability or information content.
##### In the context of text, it quantifies how diverse or random the characters are.

--------------------------------------------------------------------------------------------------------------------

In [None]:
# NOTE: this rebinds the module-level `stop_words` (defined earlier in the
# notebook as a custom set) to NLTK's English stopword list. Any cell that
# reads `stop_words` and is run or re-run after this point sees this list
# instead of the custom one.
from nltk.corpus import stopwords
# Fetches the stopwords corpus via nltk.download.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
# Ratio of stopwords to total words
def stopword_ratio(text, stopword_set=None):
    """Return the share of whitespace-separated tokens in `text` that are stopwords.

    Parameters
    ----------
    text : any
        Converted with str() before tokenizing.
    stopword_set : collection of str, optional
        Lowercase stopwords to match against. When omitted, the module-level
        `stop_words` is used (looked up at call time).

    Returns
    -------
    float, or 0 when the text has no tokens.
    """
    if stopword_set is None:
        stopword_set = stop_words

    words = str(text).split()
    if not words:
        return 0
    # Strip leading/trailing punctuation before the lookup so tokens such as
    # "the," or "(and" still count; inner apostrophes ("don't") are kept.
    stop_count = sum(
        1 for w in words
        if w.lower().strip(string.punctuation) in stopword_set
    )
    return stop_count / len(words)

df['stopword_ratio'] = df['text'].apply(stopword_ratio)

sns.boxplot(data=df, x='label', y='stopword_ratio')
plt.title('Stopword Ratio by Label')
plt.ylabel('Stopwords / Total Words')
plt.grid(axis='y')
plt.show()

##### Spam (label=1) messages tend to have a wider spread and a lower median stopword ratio, indicating they use fewer natural-language function words, likely due to keyword stuffing or unnatural phrasing.

--------------------------------------------------------------------------------------------------------------------

In [None]:
# digits per total alphanumeric characters
def digit_ratio(text):
    """Return the fraction of alphanumeric characters in `text` that are digits.

    The input is converted with str() first; when it contains no alphanumeric
    characters the result is 0.
    """
    chars = str(text)
    alnum_total = len([ch for ch in chars if ch.isalnum()])
    if alnum_total <= 0:
        return 0
    digit_total = len([ch for ch in chars if ch.isdigit()])
    return digit_total / alnum_total

df['digit_ratio'] = df['text'].apply(digit_ratio)

In [None]:
sns.boxplot(data=df, x='label', y='digit_ratio')
plt.title('Digit Ratio by Label')
plt.ylabel('Digits / Total Alphanumeric Characters')
plt.grid(axis='y')
plt.show()

##### Spam messages (label=1) show a slightly higher digit ratio, suggesting more frequent use of numbers—likely for prices, phone numbers, or codes common in spam content.

--------------------------------------------------------------------------------------------------------------------

In [None]:
df.head()

In [None]:
df.shape

--------------------------------------------------------------------------------------------------------------------