In [94]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, length, regexp_replace
from pyspark.sql.types import IntegerType, BooleanType, StringType
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, PCA
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LinearSVC, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [67]:
spark = SparkSession.builder.appName('URLs_mining').getOrCreate()
spark

In [68]:
# Read the CSV file into a DataFrame
data = spark.read.csv("malicious_urls.csv", header=True, inferSchema=True)

# Show the first few rows of the DataFrame
data.show()

+--------------------+----------+
|                 url|      type|
+--------------------+----------+
|    br-icloud.com.br|  phishing|
|mp3raid.com/music...|    benign|
|bopsecrets.org/re...|    benign|
|http://www.garage...|defacement|
|http://adventure-...|defacement|
|http://buzzfil.ne...|    benign|
|espn.go.com/nba/p...|    benign|
|yourbittorrent.co...|    benign|
|http://www.pashmi...|defacement|
|allmusic.com/albu...|    benign|
|corporationwiki.c...|    benign|
|http://www.ikenmi...|defacement|
|myspace.com/video...|    benign|
|http://www.lebens...|defacement|
|http://www.szabad...|defacement|
|http://larcadelca...|defacement|
|quickfacts.census...|    benign|
|nugget.ca/Article...|    benign|
|uk.linkedin.com/p...|    benign|
|http://www.vnic.c...|defacement|
+--------------------+----------+
only showing top 20 rows



In [69]:
data.groupBy("type").count().show()

+--------------------+------+
|                type| count|
+--------------------+------+
|              benign|428103|
|          defacement| 96457|
|            phishing| 94108|
|             malware| 32520|
|                NULL|    15|
|                Ð|     1|
|PhµW\v;XyOy...|     1|
|cÔ¡æ>1\bHÇÕd...|     1|
|                spam| 12000|
+--------------------+------+



## Data pre-processing

In [70]:
data = data.dropDuplicates(["url"])
data = data.na.drop()

data.groupBy("type").count().show()

+--------------------+------+
|                type| count|
+--------------------+------+
|                spam| 11921|
|              benign|428080|
|          defacement| 95308|
|            phishing| 94083|
|             malware| 23645|
|                Ð|     1|
|PhµW\v;XyOy...|     1|
|cÔ¡æ>1\bHÇÕd...|     1|
+--------------------+------+



In [71]:
data = data.filter(col("type").rlike(r'^[ -~]+$'))
data.groupBy("type").count().show()

+----------+------+
|      type| count|
+----------+------+
|      spam| 11921|
|    benign|428080|
|defacement| 95308|
|  phishing| 94083|
|   malware| 23645|
+----------+------+



In [72]:
data = data.withColumn("url_type", when(data["type"].contains("benign"), 0).otherwise(1))
#data = data.drop("type")
data.show()

+--------------------+--------+--------+
|                 url|    type|url_type|
+--------------------+--------+--------+
|   H\vÖË]t¹[ÈöýE|phishing|       1|
|^oð]Â|¬|hõElò...|phishing|       1|
|"½<+U½¹1\f[...|phishing|       1|
|"äÕ3ñºT-\fTÖGÑîÊ...|phishing|       1|
|"ëËl×uÏB'JI¨GÙn"...|phishing|       1|
|'118bm.com/images...|  benign|       0|
|'1pcables-inox.co...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
|'9d345009-a-62cb3...|  benign|       0|
+--------------------+--------+--------+
only showing top

In [73]:
data = data.filter(col("url").rlike(r'^[ -~]+$'))
data.show()

+--------------------+------+--------+
|                 url|  type|url_type|
+--------------------+------+--------+
|'118bm.com/images...|benign|       0|
|'1pcables-inox.co...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
|'9d345009-a-62cb3...|benign|       0|
+--------------------+------+--------+
only showing top 20 rows



In [74]:
data.groupBy("url_type").count().show()

+--------+------+
|url_type| count|
+--------+------+
|       1|224323|
|       0|427801|
+--------+------+



In [75]:
sample_size = 100000

data = data.sampleBy("url_type", fractions={0 : sample_size/data.filter(col("url_type") == 0).count(),
                                               1 : sample_size/data.filter(col("url_type") == 1).count()}, seed=42)
data.groupBy("url_type").count().show()

+--------+------+
|url_type| count|
+--------+------+
|       1|100473|
|       0|100153|
+--------+------+



## Features engineering

In [76]:
data = data.withColumn("url_length", length("url"))
data = data.withColumn("digit_count", length(regexp_replace(col("url"), "[^0-9]", "")))
data = data.withColumn("ampersand_count", length(regexp_replace(col("url"), "[^&]", "")))
data = data.withColumn("underscore_count", length(regexp_replace(col("url"), "[^_]", "")))
data = data.withColumn("dot_count", length(regexp_replace(col("url"), "[^.]", "")))
data = data.withColumn("percent_count", length(regexp_replace(col("url"), "[^%]", "")))
data = data.withColumn("at_count", length(regexp_replace(col("url"), "[^@]", "")))
data = data.withColumn("tilde_count", length(regexp_replace(col("url"), "[^~]", "")))
data = data.withColumn("hash_count", length(regexp_replace(col("url"), "[^#]", "")))
data = data.withColumn("has_https", when(data["url"].startswith("https"), 1).otherwise(0))
data = data.withColumn("has_http", when(data["url"].startswith("http"), 1).otherwise(0))
data = data.withColumn("starts_with_digit", when(data["url"].rlike("^[0-9]"), 1).otherwise(0))

In [77]:
data.show()

+--------------------+------+--------+----------+-----------+---------------+----------------+---------+-------------+--------+-----------+----------+---------+--------+-----------------+
|                 url|  type|url_type|url_length|digit_count|ampersand_count|underscore_count|dot_count|percent_count|at_count|tilde_count|hash_count|has_https|has_http|starts_with_digit|
+--------------------+------+--------+----------+-----------+---------------+----------------+---------+-------------+--------+-----------+----------+---------+--------+-----------------+
|'9d345009-a-62cb3...|benign|       0|       363|         46|              2|               1|        3|            3|       0|          0|         0|        0|       0|                0|
|'9d345009-a-62cb3...|benign|       0|       371|         51|              2|               6|        3|            3|       0|          0|         0|        0|       0|                0|
|'9d345009-a-62cb3...|benign|       0|       371|         50

### Features selection

In [101]:
data = data.drop("type")
data.columns

['url',
 'url_type',
 'url_length',
 'digit_count',
 'ampersand_count',
 'underscore_count',
 'dot_count',
 'percent_count',
 'at_count',
 'tilde_count',
 'hash_count',
 'has_https',
 'has_http',
 'starts_with_digit',
 'features_vector',
 'scaled_features',
 'pca_features']

In [78]:
feature_columns = [
    "url_length",
    "digit_count",
    "ampersand_count",
    "underscore_count",
    "dot_count",
    "percent_count",
    "at_count",
    "tilde_count",
    "hash_count",
    "has_https",
    "has_http",
    "starts_with_digit"
]

In [79]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_vector")
data = assembler.transform(data)

In [80]:
data.select("features_vector").show(truncate=False)

+-----------------------------------------------+
|features_vector                                |
+-----------------------------------------------+
|(12,[0,1,2,3,4,5],[363.0,46.0,2.0,1.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,51.0,2.0,6.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,50.0,2.0,2.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,55.0,2.0,5.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,44.0,2.0,1.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,54.0,2.0,1.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,46.0,2.0,5.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,56.0,2.0,4.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,54.0,2.0,3.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,51.0,2.0,2.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,36.0,2.0,4.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,53.0,2.0,1.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,50.0,2.0,5.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,57.0,2.0,1.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,44.0,2.0,7.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,50.0,2.0,1.0,3.0,3.0])|
|(12,[0,1,2,3,4,5],[371.0,50.0,2.0,5.0,3.0,3.0])|


#### Min-Max scaling

In [81]:
# Creazione di un oggetto MinMaxScaler
min_max_scaler = MinMaxScaler(inputCol="features_vector", outputCol="scaled_features")

# Creazione di un oggetto Pipeline
min_max_pipeline = Pipeline(stages=[min_max_scaler])

# Addestramento del modello e trasformazione del DataFrame
min_max_model = min_max_pipeline.fit(data)
data = min_max_model.transform(data)

In [82]:
data.select("scaled_features").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------+
|scaled_features                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------+
|(12,[0,1,2,3,4,5],[0.20180383314543404,0.05032822757111597,0.04,0.02631578947368421,0.04878048780487805,0.020134228187919462]) |
|(12,[0,1,2,3,4,5],[0.20631341600901915,0.05579868708971553,0.04,0.15789473684210525,0.04878048780487805,0.020134228187919462]) |
|(12,[0,1,2,3,4,5],[0.20631341600901915,0.05470459518599562,0.04,0.05263157894736842,0.04878048780487805,0.020134228187919462]) |
|(12,[0,1,2,3,4,5],[0.20631341600901915,0.060175054704595186,0.04,0.13157894736842105,0.04878048780487805,0.020134228187919462])|
|(12,[0,1,2,3,4,5],[0.20631341600901915,0.048140043763676144,0.04,0.02631578947368421,0.04

#### Principal Component Analysis - PCA

In [84]:
pca = PCA(k=5, inputCol="scaled_features", outputCol="pca_features")
pca_model = pca.fit(data)

In [85]:
data = pca_model.transform(data)

In [86]:
data.select("pca_features").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------+
|pca_features                                                                                                  |
+--------------------------------------------------------------------------------------------------------------+
|[-0.0067109174354255535,-0.00316804819082955,-3.7478004956300427E-4,0.17559904279058972,-0.018358191045913963]|
|[-0.0072790567370971,-0.003970869526189762,-6.09243787391657E-4,0.23742075019371547,-0.0999988020714439]      |
|[-0.006921506686156236,-0.0033462049580184495,-4.1246585543996955E-4,0.19057798885656393,-0.03518672341220894]|
|[-0.007214088748729602,-0.0038002213476157187,-5.466494164500249E-4,0.22661826457071022,-0.08385439957954649] |
|[-0.006799080993394107,-0.0032096321595178542,-3.8140063262471444E-4,0.1776385509717172,-0.018904398160520156]|
|[-0.006856538697788737,-0.0031755567794444052,-3.4987148449833747E-4,0.17977550323355868,-0.019

#### Train test split

In [87]:
train, test = data.randomSplit([0.8, 0.2], seed = 2018)

## Classification models

### Random forest classifier

In [88]:
rf = RandomForestClassifier(featuresCol = 'scaled_features', labelCol = 'url_type')

In [89]:
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

In [92]:
predictions.select("url_type", "prediction").show(10)

+--------+----------+
|url_type|prediction|
+--------+----------+
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 10 rows



In [95]:
evaluator = MulticlassClassificationEvaluator(labelCol="url_type", predictionCol="prediction", metricName="accuracy")

In [96]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2%}".format(accuracy))

Accuracy: 82.72%


In [97]:
evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Weighted Precision: {:.2%}".format(precision))

Weighted Precision: 85.42%


In [98]:
evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)
print("Weighted Recall: {:.2%}".format(recall))

Weighted Recall: 82.72%


### Naive Bayes

### Support Vector Machine

### Gradient-Boosted Trees 