In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import functools 

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("Processing Data App")\
        .getOrCreate()

    #read in the phishing URL datasets
    raw_data = spark.read.csv(path='spam_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

    #show the raw_data dataframe
    raw_data.show(5)
    print("Total Records %d" %raw_data.count())

+------------------------------+
|amazon.co.uk.security-check.ga|
+------------------------------+
|          autosegurancabras...|
|          dadossolicitado-a...|
|                hitnrun.com.my|
|          maruthorvattomsri...|
|          paypalsecure-2016...|
+------------------------------+
only showing top 5 rows

Total Records 20911


In [2]:
#Clean the data, only "url" column is required
#Selecting only url and storing in new dataframe 'data'

data = raw_data.select("url")

#show content of data
data.show(5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                 url|
+----------------------------------------------------------------------------------------------------+
|                                         http://mail.superprincessheroes.com/auth/personal/index.htm|
|http://fyrlois.com/img/alcance/we/login.php?cmd=login_submit&id=da5f49e66c89133f9b9ed59ee014604bd...|
|                                                                           https://daostack-token.io|
|http://fjffn-nfnfnfnfmfmff.000webhostapp.com/ufhugrhkjhkjfghkljgklfgkbjgknjkrg/ufhugrhkjhkjfghklj...|
|http://dnccnnfncndhfnfjfff.000webhostapp.com/ufhugrhkjhkjfghkljgklfgkbjgknjkrg/ufhugrhkjhkjfghklj...|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [4]:
#save the csv files to output folder
data.write.csv("ouput")

In [5]:
#read in the phising URL datasets
phising_url = spark.read.csv(path='phis.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

#read in the Spam URL datasets
spam_url = spark.read.csv(path='spam_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

#read in benign URL datasets
benign_url_raw = spark.read.csv(path='benign_url.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

print("Phising URLs: %d" %phising_url.count())
phising_url.show(5)

print("Spam URLs: %d" %spam_url.count())
spam_url.show(5)


#Processing Bening URLS - Exracting only the URL

benign_url = benign_url_raw.select("Domain").selectExpr("Domain as url")

print("Benign URLs: %d" %benign_url.count())
benign_url.show(5)



Phising URLs: 35698
+-----------------------------------------------------------+
|http://mail.superprincessheroes.com/auth/personal/index.htm|
+-----------------------------------------------------------+
|                                       http://fyrlois.co...|
|                                       https://daostack-...|
|                                       http://fjffn-nfnf...|
|                                       http://dnccnnfncn...|
|                                       http://inbound-al...|
+-----------------------------------------------------------+
only showing top 5 rows

Spam URLs: 20911
+------------------------------+
|amazon.co.uk.security-check.ga|
+------------------------------+
|          autosegurancabras...|
|          dadossolicitado-a...|
|                hitnrun.com.my|
|          maruthorvattomsri...|
|          paypalsecure-2016...|
+------------------------------+
only showing top 5 rows

Benign URLs: 1000000
+-------------+
|          url|
+----

In [6]:
#Labeling both malicous and benign URLS 1 and 0 respectively

phishing_url_labelled = phising_url.withColumn('label', lit(1))
spam_url_labelled = spam_url.withColumn('label', lit(1))
benign_url_labelled = benign_url.withColumn('lable', lit(0))

#output both lebelled malicious and benign datasets for validation

print("Phising URL")
phising_url.show(5)

print("SPAM URL")
spam_url.show(5)

print("Benign URL")
benign_url_labelled.show(5)

Phising URL
+-----------------------------------------------------------+
|http://mail.superprincessheroes.com/auth/personal/index.htm|
+-----------------------------------------------------------+
|                                       http://fyrlois.co...|
|                                       https://daostack-...|
|                                       http://fjffn-nfnf...|
|                                       http://dnccnnfncn...|
|                                       http://inbound-al...|
+-----------------------------------------------------------+
only showing top 5 rows

SPAM URL
+------------------------------+
|amazon.co.uk.security-check.ga|
+------------------------------+
|          autosegurancabras...|
|          dadossolicitado-a...|
|                hitnrun.com.my|
|          maruthorvattomsri...|
|          paypalsecure-2016...|
+------------------------------+
only showing top 5 rows

Benign URL
+-------------+-----+
|          url|lable|
+-------------+----

In [11]:
final_ouput = phishing_url_labelled.union(benign_url_labelled)

In [12]:
final_ouput.write.csv("final_output0000")

In [8]:
spam_url_labelled.write.csv("spam_url_out")