In [1]:
import os
import zipfile
import re

import pandas as pd

In [2]:
RAW_DATASETS = [
    "naserabdullahalam/phishing-email-dataset",
    "ahmedhassansaqr/email-spam-detection-v2",
    "subhajournal/phishingemails",
    "purusinghvi/email-spam-classification-dataset"
]

In [6]:
# Download data from kaggle
def kaggle_downloader(dataset_name: str, download_folder: str) -> str:
    os.chdir(download_folder)
    os.system(f"kaggle datasets download -d {dataset_name}")
    os.chdir("..")
    
    file_path = f"{download_folder}/{dataset_name.split('/')[-1]}.zip"
    return file_path
    
def unzipper(file_name: str, unzip_folder: str) -> None:
    with zipfile.ZipFile(file_name, "r") as zip_file:
        zip_file.extractall(unzip_folder)

In [7]:
downloads_folder = "./downloads"
csvs_folder = "./csvs"
os.makedirs(downloads_folder, exist_ok=True)
os.makedirs(csvs_folder, exist_ok=True)

In [8]:
for dataset_name in RAW_DATASETS:
    dataset_path = kaggle_downloader(dataset_name, downloads_folder)
    unzipper(dataset_path, csvs_folder)

FileNotFoundError: [Errno 2] No such file or directory: './downloads/phishing-email-dataset.zip'

In [9]:
# Uploading data from CSVs to dataframes
df_1 = pd.read_csv(f"{csvs_folder}/CEAS_08.csv")
df_2 = pd.read_csv(f"{csvs_folder}/smsspamcollection.tsv", delimiter="\t")
df_3 = pd.read_csv(f"{csvs_folder}/combined_data.csv")
df_4 = pd.read_csv(f"{csvs_folder}/Phishing_Email.csv")

df_1.shape, df_2.shape, df_3.shape, df_4.shape

((39154, 7), (5572, 4), (83448, 2), (18650, 3))

In [10]:
df_1.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [11]:
df_2.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [12]:
df_3.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [13]:
df_4.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [14]:
# Uniform labels
df_2["label"] = df_2["label"].replace({"spam": 1, "ham": 0}).infer_objects(copy=False)
df_4["Email Type"] = df_4["Email Type"].replace({"Safe Email": 0, "Phishing Email": 1}).infer_objects(copy=False)

  df_2["label"] = df_2["label"].replace({"spam": 1, "ham": 0}).infer_objects(copy=False)
  df_4["Email Type"] = df_4["Email Type"].replace({"Safe Email": 0, "Phishing Email": 1}).infer_objects(copy=False)


In [15]:
# Uniform columns
df_1 = df_1.drop(["sender", "receiver", "date", "subject", "urls"], axis=1)
df_2 = df_2.drop(["length", "punct"], axis=1)
df_4 = df_4.drop(["Unnamed: 0"], axis=1)

df_1.rename(columns={"body": "text"}, inplace=True)
df_2.rename(columns={"message": "text"}, inplace=True)
df_4.rename(columns={"Email Type": "label", "Email Text": "text"}, inplace=True)

In [16]:
# Unite datasets
df = pd.concat([df_1, df_3, df_2, df_4])
df["text"] = df["text"].fillna("")
df

Unnamed: 0,text,label
0,"Buck up, your troubles caused by small dimensi...",1
1,\nUpgrade your sex and pleasures with these te...,1
2,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1
3,Would anyone object to removing .so from this ...,0
4,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1
...,...,...
18645,date a lonely housewife always wanted to date ...,1
18646,request submitted : access request for anita ....,0
18647,"re : important - prc mtg hi dorn & john , as y...",0
18648,press clippings - letter on californian utilit...,0


In [17]:
df.shape

(146824, 2)

In [18]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [19]:
def count_urls(text: str) -> int:
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    urls = re.findall(url_pattern, text)
    return len(urls)

In [20]:
df["url_count"] = df["text"].apply(count_urls)

In [21]:
df

Unnamed: 0,text,label,url_count
0,"Buck up, your troubles caused by small dimensi...",1,1
1,\nUpgrade your sex and pleasures with these te...,1,1
2,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,24
3,Would anyone object to removing .so from this ...,0,467
4,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1
...,...,...,...
18645,date a lonely housewife always wanted to date ...,1,0
18646,request submitted : access request for anita ....,0,0
18647,"re : important - prc mtg hi dorn & john , as y...",0,0
18648,press clippings - letter on californian utilit...,0,0


In [22]:
df.to_csv("data.csv")