# Loading the Dataset

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('emails.csv')

# Display the first few rows to understand the structure
print(df.head())


                                    From  \
0       LeetCode <no-reply@leetcode.com>   
1           "Kohl’s" <Kohls@s.kohls.com>   
2  Goibibo <noreply@content.goibibo.com>   
3         SHEIN <shein@usmail.shein.com>   
4           "Kohl’s" <Kohls@s.kohls.com>   

                                             Subject  \
0   $30 Off LeetCode Annual Premium—Starting Nov 25!   
1  Save the date! Get up to 50% off beauty faves ...   
2                         Bus cancelled = 2x refund!   
3      The Secret's Out 👀 Hot Picks are on Sale NOW!   
4  Plot twist: Get Black Friday Deals TODAY 🤯 Plu...   

                              Date  \
0  Thu, 21 Nov 2024 17:08:42 +0000   
1  Thu, 21 Nov 2024 10:44:52 -0600   
2  Thu, 21 Nov 2024 19:21:03 +0530   
3  Thu, 21 Nov 2024 17:07:35 +0100   
4  Thu, 21 Nov 2024 04:11:35 -0600   

                                                Body  
0                                                NaN  
1  https://click.s.kohls.com/?qs=caceafec6c57032e...  
2  G

# Understanding the Dataset

In [2]:
# Check basic info
print(df.info())
df.shape[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From     19999 non-null  object
 1   Subject  19997 non-null  object
 2   Date     20000 non-null  object
 3   Body     16476 non-null  object
dtypes: object(4)
memory usage: 625.1+ KB
None


20000

In [3]:
# Check for null values
print(df.isnull().sum())


From          1
Subject       3
Date          0
Body       3524
dtype: int64


In [4]:

# Check for duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")

Number of duplicate rows: 9


# Removing Duplicates

In [5]:
# Drop duplicate rows
df = df.drop_duplicates()

In [6]:
# Reset the index
df.reset_index(drop=True, inplace=True)

In [7]:
# Check for duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")

Number of duplicate rows: 0


# Handling Missing Values

In [8]:
# Drop rows with missing essential information (e.g., 'From', 'Subject', 'Body')
df = df.dropna(subset=['From', 'Subject', 'Body'])


In [9]:
# Check for null values
print(df.isnull().sum())



From       0
Subject    0
Date       0
Body       0
dtype: int64


# Standardizing and Cleaning Data - 'From' And 'Date' Columns

In [10]:
# Print the first 10 values from the 'From' column
print(df['From'].head(10))

1                          "Kohl’s" <Kohls@s.kohls.com>
2                 Goibibo <noreply@content.goibibo.com>
4                          "Kohl’s" <Kohls@s.kohls.com>
6                    Netflix <info@members.netflix.com>
7                 Google Cloud <googlecloud@google.com>
8                IndiGo <mailers@marketing.goindigo.in>
9     "CaratLane, A Tanishq Partnership" <CaratLane@...
11          Mirae Asset MF <noreply@miraeassetmf.co.in>
12                    Temu <email@market.temuemail.com>
13             Medium Daily Digest <noreply@medium.com>
Name: From, dtype: object


In [11]:
import re

# Extract email addresses
df['From'] = df['From'].apply(lambda x: re.findall(r'<(.+?)>', x)[0] if '<' in x else x)


In [12]:
# Print the first 10 values from the 'From' column
print(df['From'].head(10))


1                  Kohls@s.kohls.com
2        noreply@content.goibibo.com
4                  Kohls@s.kohls.com
6           info@members.netflix.com
7             googlecloud@google.com
8      mailers@marketing.goindigo.in
9     CaratLane@mailer.caratlane.com
11        noreply@miraeassetmf.co.in
12        email@market.temuemail.com
13                noreply@medium.com
Name: From, dtype: object


In [13]:
# Print the first 10 values from the 'Date' column
print(df['Date'].head(10))


1           Thu, 21 Nov 2024 10:44:52 -0600
2           Thu, 21 Nov 2024 19:21:03 +0530
4           Thu, 21 Nov 2024 04:11:35 -0600
6           Thu, 21 Nov 2024 09:56:10 +0000
7           Thu, 21 Nov 2024 01:43:02 -0800
8           Thu, 21 Nov 2024 14:57:48 +0530
9           Thu, 21 Nov 2024 09:04:48 +0000
11          Thu, 21 Nov 2024 09:32:18 +0530
12    Thu, 21 Nov 2024 04:11:54 +0000 (UTC)
13    Thu, 21 Nov 2024 01:50:00 +0000 (UTC)
Name: Date, dtype: object


In [14]:
# Convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Handle any remaining invalid dates
df['Date'] = df['Date'].fillna(pd.Timestamp('1970-01-01'))  # Placeholder for invalid dates


In [15]:
# Print the first 10 values from the 'Date' column
print(df['Date'].head(10))

1     2024-11-21 10:44:52-06:00
2     2024-11-21 19:21:03+05:30
4     2024-11-21 04:11:35-06:00
6     2024-11-21 09:56:10+00:00
7     2024-11-21 01:43:02-08:00
8     2024-11-21 14:57:48+05:30
9     2024-11-21 09:04:48+00:00
11    2024-11-21 09:32:18+05:30
12    2024-11-21 04:11:54+00:00
13    2024-11-21 01:50:00+00:00
Name: Date, dtype: object


# Cleaning Text Data - 'Subject' and 'Body' Columns

In [16]:
# Print the first 10 values from the 'Subject' column - before cleaning
print(df['Subject'].head(10))

1     Save the date! Get up to 50% off beauty faves ...
2                            Bus cancelled = 2x refund!
4     Plot twist: Get Black Friday Deals TODAY 🤯 Plu...
6     🔔 Reminder: A Man on the Inside is now on Netflix
7     Launch and learn with interactive, prebuilt so...
8                We're celebrating Ethics Week with you
9                       📈 Trending this week: NEW RINGS
11          🚀 Launching: Mirae Asset Long Duration Fund
12                           Your Purchase, Our Thanks!
13    A Founder Who Just Raised a $3 Million Seed Ro...
Name: Subject, dtype: object


In [17]:
# Print the first 10 values from the 'Body' column - before cleaning
print(df['Body'].head(10))

1     https://click.s.kohls.com/?qs=caceafec6c57032e...
2     Goibibo \r\n\r\n Here’s how to claim this offe...
4     https://click.s.kohls.com/?qs=b0b020c5b3246c90...
6     Here's your reminder. Start watching now.\r\n\...
7     Google  \r\nCloud­<https://notifications.googl...
8     Share your thoughts and feedback so we can hon...
9                                                     0
11    <http://panela.miraeassetmf.co.in/vtrack?clien...
12    ----------------------------------------------...
13    Stories for Rishitha Pusapati\r\n@pusapatirish...
Name: Body, dtype: object


# a. Remove HTML Tags

In [18]:
#remove HTML Tags
from bs4 import BeautifulSoup

def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

df['Subject'] = df['Subject'].apply(remove_html)
df['Body'] = df['Body'].apply(remove_html)


In [19]:
# Print the first 10 values from the 'Date' column
print(df['Subject'].head(10))

1     Save the date! Get up to 50% off beauty faves ...
2                            Bus cancelled = 2x refund!
4     Plot twist: Get Black Friday Deals TODAY 🤯 Plu...
6     🔔 Reminder: A Man on the Inside is now on Netflix
7     Launch and learn with interactive, prebuilt so...
8                We're celebrating Ethics Week with you
9                       📈 Trending this week: NEW RINGS
11          🚀 Launching: Mirae Asset Long Duration Fund
12                           Your Purchase, Our Thanks!
13    A Founder Who Just Raised a $3 Million Seed Ro...
Name: Subject, dtype: object


# b. Remove URLs

In [20]:
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

df['Body'] = df['Body'].apply(remove_urls)

# c. Normalize Text

In [21]:
#converting to lower case,removing puntuation and tokenizing.
import string

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['Body'] = df['Body'].apply(clean_text)


# d. Remove Stopwords and Lemmatize

In [22]:
#Eliminate common stopwords and lemmatize words for consistency.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize text
    words = text.split()
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['Body'] = df['Body'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajuk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajuk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
# Print the first 10 values from the 'Body' column - after cleaning
print(df['Body'].head(20))

1     plus stressfree holiday flawless gift set ͏ ͏ ...
2     goibibo here’s claim offer follow u download g...
4     save 85 clearance mark calendar sephora cyber ...
6     here reminder start watching rish man inside 2...
7     google cloud­ deploy prebuilt google cloud sol...
8     share thought feedback honour commitment view ...
9                                                     0
11                         click unsubscribe newsletter
12    temu properly view full message content please...
13    story rishitha pusapati pusapatirishitha98 ·be...
16    image google new signin window pusapatirishith...
18    padma rishitha lifemiles number 00854153510 st...
19    wait 🎉 wishlist item back ready shine 🥳 ‌​‍‎‏﻿...
21    gift guide part 2 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ...
22    cyclebar burbank 3820 w verdugo ave aburbank c...
24    plus save 10 spice rest space ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ...
25    tap know time takeitlite follow u download goi...
26    jalebi learns faster people—be mastering s

# Explore the Cleaned Data

In [27]:
# Check the cleaned dataset
print(df.head())
df.shape[0]

                          From  \
1            Kohls@s.kohls.com   
2  noreply@content.goibibo.com   
4            Kohls@s.kohls.com   
6     info@members.netflix.com   
7       googlecloud@google.com   

                                             Subject  \
1  Save the date! Get up to 50% off beauty faves ...   
2                         Bus cancelled = 2x refund!   
4  Plot twist: Get Black Friday Deals TODAY 🤯 Plu...   
6  🔔 Reminder: A Man on the Inside is now on Netflix   
7  Launch and learn with interactive, prebuilt so...   

                        Date  \
1  2024-11-21 10:44:52-06:00   
2  2024-11-21 19:21:03+05:30   
4  2024-11-21 04:11:35-06:00   
6  2024-11-21 09:56:10+00:00   
7  2024-11-21 01:43:02-08:00   

                                                Body  
1  plus stressfree holiday flawless gift set ͏ ͏ ...  
2  goibibo here’s claim offer follow u download g...  
4  save 85 clearance mark calendar sephora cyber ...  
6  here reminder start watching rish man insi

16469

In [25]:
# Save the cleaned dataset
df.to_csv('cleaned_emails.csv', index=False)

# Downloading the cleaned Dataset

In [26]:
from IPython.display import FileLink

# Create a download link
FileLink('cleaned_emails.csv')
