In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import functools 

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("Processing Data App")\
        .getOrCreate()

    #read in the phising URL datasets
    phising_url_raw = spark.read.csv(path='phishing_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

    #read in the Spam URL datasets
    spam_url_raw = spark.read.csv(path='spam_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

    #read in benign URL datasets
    benign_url_raw = spark.read.csv(path='benign_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)


    #show the raw_data dataframe
    print("Phising URL Datasets")
    phising_url_raw.show(5)
    print("Total Records %d" %phising_url_raw.count())
    
    print("SPAM URL Datasets")
    spam_url_raw.show(5)
    print("Total Records %d" %spam_url_raw.count())

    print("Bening URL Datasets")
    benign_url_raw.show(5)
    print("Total Records %d" %benign_url_raw.count())

    


Phising URL Datasets
+--------+--------------------+--------------------+-------------------+--------+-------------------+------+---------+
|phish_id|                 url|    phish_detail_url|    submission_time|verified|  verification_time|online|   target|
+--------+--------------------+--------------------+-------------------+--------+-------------------+------+---------+
| 5602261|http://mail.super...|http://www.phisht...|2018-04-25 19:40:23|     yes|2018-04-25 19:56:20|   yes|    Other|
| 5602179|http://fyrlois.co...|http://www.phisht...|2018-04-25 18:17:25|     yes|2018-04-25 19:45:22|   yes|  Westpac|
| 5602152|https://daostack-...|http://www.phisht...|2018-04-25 17:33:09|     yes|2018-04-25 17:37:32|   yes|    Other|
| 5602112|http://fjffn-nfnf...|http://www.phisht...|2018-04-25 16:51:03|     yes|2018-04-25 19:02:17|   yes|Microsoft|
| 5602111|http://dnccnnfncn...|http://www.phisht...|2018-04-25 16:50:00|     yes|2018-04-25 19:02:17|   yes|Microsoft|
+--------+-----------------

In [12]:
#Clean the raw benign datasets and only extract "Domain" Column
benign_url = benign_url_raw.select("Domain").selectExpr("Domain as url")

#Clean phishing URL datasets and exract only the 'url' column
phishing_url = phising_url_raw.select("url")


print("Bening URLs")
benign_url.show()

print("Phising URL")
phishing_url.show()


Bening URLs
+----------------+
|             url|
+----------------+
|      google.com|
|    facebook.com|
|     youtube.com|
|     twitter.com|
|   microsoft.com|
|    linkedin.com|
|   wikipedia.org|
| plus.google.com|
|       apple.com|
|   instagram.com|
|   wordpress.org|
|       adobe.com|
|en.wikipedia.org|
|itunes.apple.com|
|   wordpress.com|
|        youtu.be|
|       vimeo.com|
|    blogspot.com|
|   pinterest.com|
| maps.google.com|
+----------------+
only showing top 20 rows

Phising URL
+--------------------+
|                 url|
+--------------------+
|http://mail.super...|
|http://fyrlois.co...|
|https://daostack-...|
|http://fjffn-nfnf...|
|http://dnccnnfncn...|
|http://inbound-al...|
|http://superprinc...|
|http://cadastrode...|
|https://dinaloupi...|
|https://cityhebdo.fr|
|https://www.cemto...|
|http://efgqef.igg...|
|https://pyapay.co...|
|https://meccamedi...|
|https://cadastram...|
|http://trycoba.uk...|
|https://inc1-upda...|
|https://localbizv...|
|https://cl

In [15]:
#Add a column called 'lable' at the end of phising url datasets and fill it with all ones (1)
phishing_url_labelled = phishing_url.withColumn('label', lit(1))

#Add a column called 'lable' at the end of spam url datasets and fill it with all ones (1)
spam_url_labelled = spam_url.withColumn('label', lit(1))

#Add a column called 'lable' at the end of spam url datasets and fill it with all zeros (0)
benign_url_labelled = benign_url.withColumn('lable', lit(0))

#output both lebelled malicious and benign datasets for validation

print("Phising URL")
phishing_url_labelled.show(5)

print("SPAM URL")
spam_url_labelled.show(5)

print("Benign URL")
benign_url_labelled.show(5)

Phising URL
+--------------------+-----+
|                 url|label|
+--------------------+-----+
|http://mail.super...|    1|
|http://fyrlois.co...|    1|
|https://daostack-...|    1|
|http://fjffn-nfnf...|    1|
|http://dnccnnfncn...|    1|
+--------------------+-----+
only showing top 5 rows

SPAM URL
+------------------------------+-----+
|amazon.co.uk.security-check.ga|label|
+------------------------------+-----+
|          autosegurancabras...|    1|
|          dadossolicitado-a...|    1|
|                hitnrun.com.my|    1|
|          maruthorvattomsri...|    1|
|          paypalsecure-2016...|    1|
+------------------------------+-----+
only showing top 5 rows

Benign URL
+-------------+-----+
|          url|lable|
+-------------+-----+
|   google.com|    0|
| facebook.com|    0|
|  youtube.com|    0|
|  twitter.com|    0|
|microsoft.com|    0|
+-------------+-----+
only showing top 5 rows



In [21]:
phishing_url_labelled.write.csv("phishing_output_labelled")
spam_url_labelled.write.csv("spam_output_lablled")
benign_url_labelled.write.csv("bening_output_labelled")

In [19]:
final_ouput = phishing_url_labelled.union(benign_url_labelled)