In [1]:
import pandas as pd 

In [2]:
main = pd.read_csv("../data/FinalDataset/malicious_phish.csv")
main

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


# Merging Data

In [4]:
import pandas as pd

# Define a list of file paths and corresponding types
file_paths = [
    ("../data/FinalDataset/URL/Benign_list_big_final.csv", "benign"),
    ("../data/FinalDataset/URL/DefacementSitesURLFiltered.csv", "defacement"),
    ("../data/FinalDataset/URL/Malware_dataset.csv", "malware"),
    ("../data/FinalDataset/URL/phishing_dataset.csv", "phishing"),
    ("../data/FinalDataset/URL/spam_dataset.csv", "spam")
]

# Initialize an empty list to store dataframes
dfs = []

# Iterate through the file paths and types
for file_path, url_type in file_paths:
    # Read the CSV file
    df = pd.read_csv(file_path, header=None)
    # Assign column names
    df.columns = ["url"]
    # Add a column for the URL type
    df["type"] = url_type
    # Append the dataframe to the list
    dfs.append(df)

dfs.append(main)
# Concatenate the list of dataframes into a single dataframe
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.drop_duplicates(inplace=True)
merged_df.reset_index(drop=True, inplace=True)

# Display the merged dataframe
merged_df.head(100000)

Unnamed: 0,url,type
0,http://1337x.to/torrent/1048648/American-Snipe...,benign
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign
...,...,...
99995,http://www.ccent.com.au/index.php?view=article...,defacement
99996,http://www.ccent.com.au/index.php?option=com_c...,defacement
99997,http://www.ccent.com.au/index.php?option=com_m...,defacement
99998,http://www.ccent.com.au/index.php?view=article...,defacement


# Preprocessing Data

In [5]:
missing_values = merged_df.isnull().sum()
print(missing_values)

url     0
type    0
dtype: int64


In [6]:
merged_df.drop_duplicates(subset=['url'], inplace=True)
merged_df

Unnamed: 0,url,type
0,http://1337x.to/torrent/1048648/American-Snipe...,benign
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign
...,...,...
653041,xbox360.ign.com/objects/850/850402.html,phishing
653042,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
653043,www.gamespot.com/xbox360/action/deadspace/,phishing
653044,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
merged_df['type_encoded'] = label_encoder.fit_transform(merged_df['type'])
merged_df

Unnamed: 0,url,type,type_encoded
0,http://1337x.to/torrent/1048648/American-Snipe...,benign,0
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign,0
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign,0
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign,0
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign,0
...,...,...,...
653041,xbox360.ign.com/objects/850/850402.html,phishing,3
653042,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,3
653043,www.gamespot.com/xbox360/action/deadspace/,phishing,3
653044,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,3


# Balancing Data

In [8]:
type_counts = merged_df['type_encoded'].value_counts()
print(type_counts)

type_encoded
0    428080
1     95308
3     94086
2     23645
4     11921
Name: count, dtype: int64


In [9]:
from sklearn.utils import resample

# Separate the majority and minority classes
df_majority = merged_df[merged_df['type_encoded'] == 0]
df_minority_1 = merged_df[merged_df['type_encoded'] == 1]
df_minority_2 = merged_df[merged_df['type_encoded'] == 2]
df_minority_3 = merged_df[merged_df['type_encoded'] == 3]
df_minority_4 = merged_df[merged_df['type_encoded'] == 4]

# Randomly undersample the majority class to match the second largest class (95,308)
df_majority_downsampled = df_majority.sample(n=95308, random_state=42)

# Combine the downsampled majority class with the minority classes
balanced_df = pd.concat([df_majority_downsampled, df_minority_1, df_minority_2, df_minority_3, df_minority_4])

# Shuffle the dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Print new class distribution
print(balanced_df['type_encoded'].value_counts())

type_encoded
0    95308
1    95308
3    94086
2    23645
4    11921
Name: count, dtype: int64


In [10]:
from sklearn.utils import resample

# Oversample minority classes (Phishing & Spam)
df_minority_2_oversampled = resample(
    df_minority_2, replace=True, n_samples=94086, random_state=42
)
df_minority_4_oversampled = resample(
    df_minority_4, replace=True, n_samples=94086, random_state=42
)

# Combine all balanced classes
final_balanced_df = pd.concat([
    df_majority_downsampled,  # 95,308
    df_minority_1,            # 95,308
    df_minority_3,            # 94,086
    df_minority_2_oversampled, # 94,086 (oversampled)
    df_minority_4_oversampled  # 94,086 (oversampled)
])

# Shuffle the dataset
final_balanced_df = final_balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Print final class distribution
print(final_balanced_df['type_encoded'].value_counts())

type_encoded
1    95308
0    95308
2    94086
4    94086
3    94086
Name: count, dtype: int64


In [None]:
final_balanced_df.to_csv("../data/final.csv")