# Exploratory Data Analysis on Spam Email Dataset

In this notebook, we will perform exploratory data analysis (EDA) on the spam email dataset. We will visualize the distribution of spam and non-spam emails, analyze the length of emails, and explore the most common words in spam and non-spam emails.

In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
# set_theme() is the preferred entry point; sns.set() is only a legacy alias
# for it that the seaborn docs say may be removed in the future.
sns.set_theme(style='whitegrid')

In [2]:
# Read the email dataset from its CSV file into a DataFrame
csv_path = '../data/emails.csv'
data = pd.read_csv(csv_path)

# Preview the leading rows; as the cell's last expression this is what gets rendered
data.head(n=5)

In [3]:
# Bar chart of how many emails carry each value of the `label` column
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='label', ax=ax)

# Title and axis captions so the figure stands on its own
ax.set_title('Distribution of Spam and Non-Spam Emails')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [4]:
# Analyze the length of emails
# Character count of every email body, stored as a new `length` column.
# Missing or non-string bodies are coerced to text first: the previous
# .apply(len) raised TypeError on the first NaN row. A missing body therefore
# counts as length 0.
data['length'] = data['content'].fillna('').astype(str).str.len()

plt.figure(figsize=(10, 6))
# stat='density' makes the bars agree with the 'Density' axis label (histplot
# plots raw counts by default). common_norm=False normalises each label on its
# own, so the two distributions stay comparable when the classes are imbalanced.
sns.histplot(
    data=data,
    x='length',
    hue='label',
    bins=30,
    kde=True,
    stat='density',
    common_norm=False,
)
plt.title('Email Length Distribution by Label')
plt.xlabel('Length of Email')
plt.ylabel('Density')
plt.show()

In [5]:
# Most common words in spam and non-spam emails
# The cell builds one bag-of-words matrix over all emails, then reports the
# top 10 words overall and the top 10 words within each value of `label`.
# CountVectorizer and numpy come from the notebook's top import cell.

# CountVectorizer rejects NaN documents, so missing bodies become empty text.
texts = data['content'].fillna('').astype(str)

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)
words = vectorizer.get_feature_names_out()

# Sum the columns on the sparse matrix itself. X.toarray() would first allocate
# a dense (n_emails x vocabulary) array only to collapse it again.
word_counts = np.asarray(X.sum(axis=0)).ravel()

word_freq = pd.DataFrame({'word': words, 'count': word_counts})
word_freq = word_freq.sort_values(by='count', ascending=False)

# Display the top 10 most common words across all emails
top_words = word_freq.head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x='count', y='word', data=top_words)
plt.title('Top 10 Most Common Words in Emails')
plt.xlabel('Count')
plt.ylabel('Word')
plt.show()

# Per-label breakdown: one panel for every distinct value of `label`.
# Rows of X are in the same order as rows of `data`, so a positional boolean
# mask over the label column selects the matching documents.
label_values = data['label'].to_numpy()
label_names = data['label'].dropna().unique()

fig, axes = plt.subplots(
    1, len(label_names), figsize=(7 * len(label_names), 6), squeeze=False
)
for ax, label_name in zip(axes.ravel(), label_names):
    label_counts = np.asarray(X[label_values == label_name].sum(axis=0)).ravel()
    label_top_words = (
        pd.DataFrame({'word': words, 'count': label_counts})
        .sort_values(by='count', ascending=False)
        .head(10)
    )
    sns.barplot(x='count', y='word', data=label_top_words, ax=ax)
    ax.set_title(f'Top 10 Most Common Words (label = {label_name})')
    ax.set_xlabel('Count')
    ax.set_ylabel('Word')
fig.tight_layout()
plt.show()