# Data Cleaning and Combining

## Retrieving and Cleaning Data

In [155]:
import pandas as pd
import numpy as np

In [156]:
phishing_df = pd.read_csv('./data/phishing.csv')
print(phishing_df.shape)
print(phishing_df.info())

(5971, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5971 entries, 0 to 5970
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   LABEL   5971 non-null   object
 1   TEXT    5971 non-null   object
 2   URL     5971 non-null   object
 3   EMAIL   5971 non-null   object
 4   PHONE   5971 non-null   object
dtypes: object(5)
memory usage: 233.4+ KB
None


In [157]:
phishing_df.head()

Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE
0,ham,Your opinion about me? 1. Over 2. Jada 3. Kusr...,No,No,No
1,ham,What's up? Do you want me to come online? If y...,No,No,No
2,ham,So u workin overtime nigpun?,No,No,No
3,ham,"Also sir, i sent you an email about how to log...",No,No,No
4,Smishing,Please Stay At Home. To encourage the notion o...,No,No,No


In [158]:
phishing_df['LABEL'].value_counts(normalize=True)

ham         0.811254
Smishing    0.103165
spam        0.078044
Spam        0.003852
smishing    0.003684
Name: LABEL, dtype: float64

In [159]:
phishing_df['EMAIL'].value_counts()

No     5952
yes      19
Name: EMAIL, dtype: int64

In [160]:
phishing_2_df = pd.read_csv('./data/fraud_email_.csv')
print(phishing_2_df.shape)
print(phishing_2_df.info())

(11929, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11929 entries, 0 to 11928
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    11928 non-null  object
 1   Class   11929 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 186.5+ KB
None


In [161]:
phishing_2_df.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


In [162]:
phishing_2_df['Class'].value_counts()

0    6742
1    5187
Name: Class, dtype: int64

In [163]:
phishing_3_df = pd.read_csv('./data/phishing_data_by_type.csv')
print(phishing_3_df.shape)
print(phishing_3_df.info())

(159, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  157 non-null    object
 1   Text     159 non-null    object
 2   Type     159 non-null    object
dtypes: object(3)
memory usage: 3.9+ KB
None


In [164]:
phishing_3_df.head()

Unnamed: 0,Subject,Text,Type
0,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\...,Fraud
1,URGENT ASSISTANCE /RELATIONSHIP (P),"Dear Friend,\n\nI am Mr. Ben Suleman a custom ...",Fraud
2,GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,Fraud
3,from Mrs.Johnson,Goodday Dear\n\n\nI know this mail will come t...,Fraud
4,Co-Operation,FROM MR. GODWIN AKWESI\nTEL: +233 208216645\nF...,Fraud


In [165]:
phishing_3_df['Type'].value_counts(normalize=True)

Fraud               0.251572
Phishing            0.251572
Commercial Spam     0.251572
False Positives     0.245283
Name: Type, dtype: float64

In [166]:
phishing_4_df = pd.read_csv('./data/completeSpamAssassin.csv')
print(phishing_4_df.shape)
print(phishing_4_df.info())

(6046, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6046 entries, 0 to 6045
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6046 non-null   int64 
 1   Body        6045 non-null   object
 2   Label       6046 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 141.8+ KB
None


In [167]:
phishing_4_df.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [189]:
phishing_extract_array = []
for i in range(1, 5):
    phishing_extract = pd.read_csv(f'./data/phishing_extract_{i}.csv')
    phishing_extract_array.append(phishing_extract)

    
combined_phishing_extract_df = pd.DataFrame()    
for df in phishing_extract_array:
    df.drop(columns=['from', 'subject'], inplace=True)
    combined_phishing_extract_df = pd.concat([combined_phishing_extract_df, df])
combined_phishing_extract_df.drop_duplicates(inplace=True)
combined_phishing_extract_df.to_csv('./data/phishing_eml_extract_full.csv', index=False)
#     print('---Head---')
#     display(df.head())
#     print('---Tail---')
#     display(df.tail())

In [192]:
combined_phishing_extract_df.duplicated().sum()

0

In [193]:
phishing_extract_df = pd.read_csv('./data/phishing_eml_extract_full.csv')
display(phishing_extract_df)

Unnamed: 0,content
0,"Dear valued PayPal member, Due to recent fraud..."
1,Credit Union is constantly working to ensure s...
2,"Untitled Document Dear eBay Member, We regret ..."
3,Credit Union is constantly working to ensure s...
4,"Dear Amazon member, Dear member, Due to concer..."
...,...
1866,Dear business client of Regions Bank: The Regi...
1867,logo Dear Commonwealth Bank customer Commonwea...
1868,Dear National City business client: The Nation...
1869,Your account is limited PayPal is committed to...


In [194]:
ham_extract_array = []
for i in range(1, 4):
    ham_extract = pd.read_csv(f'./data/ham_extract_{i}.csv')
    ham_extract_array.append(ham_extract)

combined_ham_extract_df = pd.DataFrame()    
for df in ham_extract_array:
    df.drop(columns=['from', 'subject'], inplace=True)
    combined_ham_extract_df = pd.concat([combined_ham_extract_df, df])
combined_ham_extract_df.drop_duplicates(inplace=True)
combined_ham_extract_df.to_csv('./data/ham_eml_extract_full.csv', index=False)

In [195]:
combined_ham_extract_df = pd.read_csv('./data/ham_eml_extract_full.csv')
display(combined_ham_extract_df)

Unnamed: 0,content
0,I actually thought of this kind of active chat...
1,Content-Disposition: inline To view this newsl...
2,----- Original Message ----- From: Joseph S. B...
3,charsetISO-8859-1 formatflowed Bob We are a co...
4,I am delurking to comment on the Salon article...
...,...
1438,You appear to be using an email application th...
1439,CNET Investor Dispatch Quote LookupEnter symbo...
1440,Todays Headlines from The Register -----------...
1441,"Hi Everyone, There seem to be several bonehead..."


## Data Frame Cleaning

In [196]:
phishing_1_clean_df = phishing_df.drop(columns=['URL', 'EMAIL', 'PHONE'])

In [197]:
phishing_1_clean_df['LABEL'].value_counts()

ham         4844
Smishing     616
spam         466
Spam          23
smishing      22
Name: LABEL, dtype: int64

In [198]:
phishing_1_clean_df['content'] = phishing_1_clean_df['TEXT']
phishing_1_clean_df['phishing'] = np.where(
    (phishing_1_clean_df['LABEL'] == 'Smishing') | (phishing_1_clean_df['LABEL'] == 'smishing'),
    True,
    False
)
phishing_1_clean_df.drop(columns = ['LABEL', 'TEXT'], inplace = True)

In [199]:
phishing_1_clean_df.head()

Unnamed: 0,content,phishing
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,False
1,What's up? Do you want me to come online? If y...,False
2,So u workin overtime nigpun?,False
3,"Also sir, i sent you an email about how to log...",False
4,Please Stay At Home. To encourage the notion o...,True


In [200]:
phishing_2_df.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


In [201]:
phishing_2_clean_df = pd.DataFrame()

In [202]:
phishing_2_clean_df['content'] = phishing_2_df['Text']
phishing_2_clean_df['phishing'] = np.where(
    phishing_2_df['Class'] == 1,
    True,
    False
)

In [203]:
phishing_2_clean_df.head()

Unnamed: 0,content,phishing
0,Supply Quality China's EXCLUSIVE dimensions at...,True
1,over. SidLet me know. Thx.,False
2,"Dear Friend,Greetings to you.I wish to accost ...",True
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,True
4,Not a surprising assessment from Embassy.,False


In [204]:
phishing_3_df.head()

Unnamed: 0,Subject,Text,Type
0,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\...,Fraud
1,URGENT ASSISTANCE /RELATIONSHIP (P),"Dear Friend,\n\nI am Mr. Ben Suleman a custom ...",Fraud
2,GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,Fraud
3,from Mrs.Johnson,Goodday Dear\n\n\nI know this mail will come t...,Fraud
4,Co-Operation,FROM MR. GODWIN AKWESI\nTEL: +233 208216645\nF...,Fraud


In [205]:
phishing_3_clean_df = pd.DataFrame()
phishing_3_clean_df['content'] = phishing_3_df['Text']
phishing_3_clean_df['phishing'] = np.where(
    phishing_3_df['Type'] == 'Fraud',
    True,
    False
)

In [206]:
phishing_3_clean_df.head()

Unnamed: 0,content,phishing
0,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\...,True
1,"Dear Friend,\n\nI am Mr. Ben Suleman a custom ...",True
2,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,True
3,Goodday Dear\n\n\nI know this mail will come t...,True
4,FROM MR. GODWIN AKWESI\nTEL: +233 208216645\nF...,True


In [207]:
phishing_4_df.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [208]:
phishing_4_clean_df = pd.DataFrame()
phishing_4_clean_df['content'] = phishing_4_df['Body'].str.replace('\\n', ' ')
phishing_4_clean_df['phishing'] = np.where(
    phishing_4_df['Label'] == 1,
    True,
    False
)

  phishing_4_clean_df['content'] = phishing_4_df['Body'].str.replace('\\n', ' ')


In [209]:
phishing_4_clean_df.head()

Unnamed: 0,content,phishing
0,Save up to 70% on Life Insurance. Why Spend M...,True
1,1) Fight The Risk of Cancer! http://www.adclic...,True
2,1) Fight The Risk of Cancer! http://www.adclic...,True
3,##############################################...,True
4,I thought you might like these: 1) Slim Down -...,True


In [215]:
phishing_5_clean_df = phishing_extract_df
phishing_5_clean_df['phishing'] = True

In [216]:
phishing_6_clean_df = combined_ham_extract_df
phishing_6_clean_df['phishing'] = False

In [217]:
phishing_all_df = pd.concat([
    phishing_1_clean_df,
    phishing_2_clean_df,
    phishing_3_clean_df,
    phishing_4_clean_df,
    phishing_5_clean_df,
    phishing_6_clean_df
])

In [223]:
phishing_all_df.dropna(inplace=True)

In [226]:
phishing_all_df.isna().sum()

content     0
phishing    0
dtype: int64

In [227]:
phishing_all_df

Unnamed: 0,content,phishing
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,False
1,What's up? Do you want me to come online? If y...,False
2,So u workin overtime nigpun?,False
3,"Also sir, i sent you an email about how to log...",False
4,Please Stay At Home. To encourage the notion o...,True
...,...,...
1438,You appear to be using an email application th...,False
1439,CNET Investor Dispatch Quote LookupEnter symbo...,False
1440,Todays Headlines from The Register -----------...,False
1441,"Hi Everyone, There seem to be several bonehead...",False


In [229]:
phishing_all_df.to_csv('./data/phishing_all_data.csv', index=False)