In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import functools 

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    #read in the phishing URL datasets
    raw_data = spark.read.csv(path='phishing_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

    #show the raw_data dataframe
    raw_data.show(5)
    print("Total Records %d" %raw_data.count())

+--------+--------------------+--------------------+-------------------+--------+-------------------+------+---------+
|phish_id|                 url|    phish_detail_url|    submission_time|verified|  verification_time|online|   target|
+--------+--------------------+--------------------+-------------------+--------+-------------------+------+---------+
| 5602261|http://mail.super...|http://www.phisht...|2018-04-25 19:40:23|     yes|2018-04-25 19:56:20|   yes|    Other|
| 5602179|http://fyrlois.co...|http://www.phisht...|2018-04-25 18:17:25|     yes|2018-04-25 19:45:22|   yes|  Westpac|
| 5602152|https://daostack-...|http://www.phisht...|2018-04-25 17:33:09|     yes|2018-04-25 17:37:32|   yes|    Other|
| 5602112|http://fjffn-nfnf...|http://www.phisht...|2018-04-25 16:51:03|     yes|2018-04-25 19:02:17|   yes|Microsoft|
| 5602111|http://dnccnnfncn...|http://www.phisht...|2018-04-25 16:50:00|     yes|2018-04-25 19:02:17|   yes|Microsoft|
+--------+--------------------+-----------------

In [3]:
#Clean the data, only "url" column is required
#Selecting only url and storing in new dataframe 'data'

data = raw_data.select("url")

#show content of data
data.show(5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                 url|
+----------------------------------------------------------------------------------------------------+
|                                         http://mail.superprincessheroes.com/auth/personal/index.htm|
|http://fyrlois.com/img/alcance/we/login.php?cmd=login_submit&id=da5f49e66c89133f9b9ed59ee014604bd...|
|                                                                           https://daostack-token.io|
|http://fjffn-nfnfnfnfmfmff.000webhostapp.com/ufhugrhkjhkjfghkljgklfgkbjgknjkrg/ufhugrhkjhkjfghklj...|
|http://dnccnnfncndhfnfjfff.000webhostapp.com/ufhugrhkjhkjfghkljgklfgkbjgknjkrg/ufhugrhkjhkjfghklj...|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [5]:
#save the csv files to output folder
data.write.csv("ouput")

In [6]:
#read in the malicious URL datasets
malicious_url = spark.read.csv(path='malicious_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

#read in benign URL datasets
benign_url_raw = spark.read.csv(path='benign_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

print("Malicous URLs: %d" %malicious_url.count())
malicious_url.show(5)

#Processing Bening URLS - Exracting only the URL

benign_url = benign_url_raw.select("Domain").selectExpr("Domain as url")

print("Benign URLs: %d" %benign_url.count())
benign_url.show(5)



Malicous URLs: 198146
+------------+
|         url|
+------------+
|  0b55.top;1|
|  0dfd9x.net|
|135cross.com|
|     1ct.top|
|     1gc.top|
+------------+
only showing top 5 rows

Benign URLs: 1000000
+-------------+
|          url|
+-------------+
|   google.com|
| facebook.com|
|  youtube.com|
|  twitter.com|
|microsoft.com|
+-------------+
only showing top 5 rows



In [7]:
#Labeling both malicous and benign URLS 1 and 0 respectively

malicious_url_labelled = malicious_url.withColumn('label', lit(1))
benign_url_labelled = benign_url.withColumn('lable', lit(0))

#output both lebelled malicious and benign datasets for validation

print("Malicious URL")
malicious_url_labelled.show(5)

print("Benign URL")
benign_url_labelled.show(5)

Malicious URL
+------------+-----+
|         url|label|
+------------+-----+
|  0b55.top;1|    1|
|  0dfd9x.net|    1|
|135cross.com|    1|
|     1ct.top|    1|
|     1gc.top|    1|
+------------+-----+
only showing top 5 rows

Benign URL
+-------------+-----+
|          url|lable|
+-------------+-----+
|   google.com|    0|
| facebook.com|    0|
|  youtube.com|    0|
|  twitter.com|    0|
|microsoft.com|    0|
+-------------+-----+
only showing top 5 rows



In [8]:
final_ouput = malicious_url_labelled.union(benign_url_labelled)

In [10]:
final_ouput.write.csv("final_output")