In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType)


In [3]:
# Obtain (or reuse) the Spark session for this notebook under the app name "avsig_new_vars".
spark = (
    SparkSession.builder
    .appName("avsig_new_vars")
    .getOrCreate()
)

In [4]:
# Load the threat list CSV: the first row is the header and column types are inferred.
df = spark.read.csv(
    "AvSigversion_Threats.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first 20 rows; long string values are truncated for display.
df.show(n=20, truncate=True)

+-----+--------------------+----------+------------+------------+
|index|                Name|AlertLevel|AvSigVersion|        Type|
+-----+--------------------+----------+------------+------------+
|    0|Backdoor:MSIL/Bla...|    severe| 1.155.266.0|AddedThreats|
|    1|Backdoor:Win32/Fa...|    severe| 1.155.266.0|AddedThreats|
|    2|Backdoor:Win32/Ne...|    severe| 1.155.266.0|AddedThreats|
|    3|    PWS:Win32/Fareit|    severe| 1.155.266.0|AddedThreats|
|    4|Trojan:Win32/Ceat...|    severe| 1.155.266.0|AddedThreats|
|    5| Trojan:Win32/Comame|    severe| 1.155.266.0|AddedThreats|
|    6|Trojan:Win32/Core...|    severe| 1.155.266.0|AddedThreats|
|    7|Trojan:Win32/EyeS...|    severe| 1.155.266.0|AddedThreats|
|    8|Trojan:Win32/Lokt...|    severe| 1.155.266.0|AddedThreats|
|    9|Trojan:Win32/Medf...|    severe| 1.155.266.0|AddedThreats|
|   10|  Trojan:Win32/Otran|    severe| 1.155.266.0|AddedThreats|
|   11|Trojan:Win32/Redy...|    severe| 1.155.266.0|AddedThreats|
|   12|Tro

In [12]:
# Prefix the threat attributes with "AvSigVersion_" and drop the CSV's
# row-number column.
df = (
    df
    .withColumnRenamed('Name', 'AvSigVersion_Name')
    .withColumnRenamed('AlertLevel', 'AvSigVersion_AlertLevel')
    .withColumnRenamed('Type', 'AvSigVersion_Type')
    .drop('index')
)

In [13]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for every column,
# printed without truncating long values.
df.summary().show(n=20, truncate=False)

+-------+----------------------------+-----------------------+------------+-----------------+
|summary|AvSigVersion_Name           |AvSigVersion_AlertLevel|AvSigVersion|AvSigVersion_Type|
+-------+----------------------------+-----------------------+------------+-----------------+
|count  |1010337                     |1010337                |1010337     |1010337          |
|mean   |null                        |null                   |null        |null             |
|stddev |null                        |null                   |null        |null             |
|min    |Adware:AndroidOS/Hiddad     |high                   |1.155.266.0 |AddedThreats     |
|25%    |null                        |null                   |null        |null             |
|50%    |null                        |null                   |null        |null             |
|75%    |null                        |null                   |null        |null             |
|max    |Worm:iPhoneOS/Vigorf.A!plock|severe                

In [16]:
# Load only the two columns needed from the preprocessed training CSVs.
# Schema inference is deliberately NOT requested: it costs an extra full scan
# over every column of every file, and both columns kept here are strings
# (which is what the CSV reader yields without inference), so the resulting
# schema is the same.
data_train = (
    spark.read
    .csv('../../../data/df_cat_prepro_0/*.csv', header=True)
    .select('MachineIdentifier', 'AvSigVersion')
)

In [17]:
# Attach the threat attributes to each training row by signature version;
# a left join keeps training rows whose AvSigVersion has no entry in df.
# NOTE(review): df contains many rows per AvSigVersion (one per threat), so
# each training row is repeated once per matching threat — confirm that this
# row multiplication is intended before using train_new as a per-machine table.
train_new = data_train.join(df, on='AvSigVersion', how='left')


In [20]:
# Number of distinct threat names after the join.
(
    train_new
    .select('AvSigVersion_Name')
    .distinct()
    .count()
)

41063

In [21]:
# Number of distinct alert levels after the join; null (unmatched rows) is
# counted as its own value.
(
    train_new
    .select('AvSigVersion_AlertLevel')
    .distinct()
    .count()
)

5

In [25]:
# Row count per alert level (including the null group of unmatched rows).
(
    train_new
    .groupBy('AvSigVersion_AlertLevel')
    .count()
    .show(truncate=False)
)

+-----------------------+----------+
|AvSigVersion_AlertLevel|count     |
+-----------------------+----------+
|moderate               |3271397   |
|null                   |6292268   |
|low                    |1559      |
|severe                 |1172040768|
|high                   |80880523  |
+-----------------------+----------+



In [22]:
# Number of distinct threat-list types after the join; null (unmatched rows)
# is counted as its own value.
(
    train_new
    .select('AvSigVersion_Type')
    .distinct()
    .count()
)

3

In [26]:
# Row count per threat-list type (including the null group of unmatched rows).
(
    train_new
    .groupBy('AvSigVersion_Type')
    .count()
    .show(truncate=False)
)

+-----------------+----------+
|AvSigVersion_Type|count     |
+-----------------+----------+
|UpdatedThreats   |1175403292|
|null             |6292268   |
|AddedThreats     |80790955  |
+-----------------+----------+



In [29]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer


In [30]:
# Columns to label-encode; each gets a companion "<column>_index" column.
cols_label_encoder = ['AvSigVersion_AlertLevel', 'AvSigVersion_Type']

# Fit one StringIndexer per column on the joined data. handleInvalid="keep"
# is requested so that invalid values are assigned an extra index rather than
# raising or being dropped (presumably this covers the null rows left by the
# join — verify against the StringIndexer docs for the Spark version in use).
indexers = [
    StringIndexer(
        inputCol=col_name,
        outputCol=col_name + "_index",
        handleInvalid="keep",
    ).fit(train_new)
    for col_name in cols_label_encoder
]

# Chain the already-fitted indexers and append the index columns to train_new.
pipeline = Pipeline(stages=indexers)
fitted_pipeline = pipeline.fit(train_new)
train_new = fitted_pipeline.transform(train_new)

In [32]:
# Mark train_new for caching with the default storage level, presumably so the
# exploratory cells below reuse the joined and indexed data instead of
# recomputing it each time.
train_new.persist()

DataFrame[AvSigVersion: string, MachineIdentifier: string, AvSigVersion_Name: string, AvSigVersion_AlertLevel: string, AvSigVersion_Type: string, AvSigVersion_AlertLevel_index: double, AvSigVersion_Type_index: double]

In [37]:
# List the column names of train_new to check which columns it currently carries.
train_new.columns

['AvSigVersion',
 'MachineIdentifier',
 'AvSigVersion_Name',
 'AvSigVersion_AlertLevel',
 'AvSigVersion_Type',
 'AvSigVersion_AlertLevel_index',
 'AvSigVersion_Type_index']

In [40]:
# Preview the full (untruncated) threat names to see how they are structured.
(
    train_new
    .select('AvSigVersion_Name')
    .show(truncate=False)
)

+-----------------------------------+
|AvSigVersion_Name                  |
+-----------------------------------+
|Adware:Win32/BetterSurf            |
|Adware:Win32/BetterSurf!rfn        |
|Adware:Win32/Lollipop              |
|Adware:Win32/Pirrit                |
|BrowserModifier:Win32/Webalta!rfn  |
|SoftwareBundler:Win32/Softpulse!rfn|
|HackTool:MSIL/Boilod.B             |
|HackTool:MSIL/Noancooe.A           |
|HackTool:Win32/AutoKMS             |
|HackTool:Win32/Cain                |
|HackTool:Win32/KeeFarce            |
|HackTool:Win32/PasswordFox         |
|Joke:VBS/CdEject.C                 |
|Backdoor:Java/ReverseBackdoor!rfn  |
|Backdoor:MSIL/Bladabindi.AA        |
|Backdoor:MSIL/Bladabindi.G         |
|Backdoor:MSIL/Hamaetot!rfn         |
|Backdoor:MSIL/Hamaetot.A           |
|Backdoor:MSIL/Sylavriu!rfn         |
|Backdoor:Win32/Advo!rfn            |
+-----------------------------------+
only showing top 20 rows



In [51]:
# Derive two coarser features from the threat name:
#   AvSigVersion_Name_1 — text before the first ':'  (e.g. "Adware")
#   AvSigVersion_Name_2 — text before the first '/'  (e.g. "Adware:Win32")
train_new = (
    train_new
    .withColumn(
        'AvSigVersion_Name_1',
        split(train_new['AvSigVersion_Name'], ':').getItem(0),
    )
    .withColumn(
        'AvSigVersion_Name_2',
        split(train_new['AvSigVersion_Name'], '/').getItem(0),
    )
)



In [48]:
# How many distinct values does the part of the name before the first '/' take?
(
    train_new
    .select(
        split(train_new['AvSigVersion_Name'], '/')
        .getItem(0)
        .alias('AvSigVersion_Name_0')
    )
    .distinct()
    .count()
)


568

In [49]:
# Preview the part of each threat name that precedes the first '/'.
(
    train_new
    .select(
        split(train_new['AvSigVersion_Name'], '/')
        .getItem(0)
        .alias('AvSigVersion_Name_0')
    )
    .show()
)


+--------------------+
| AvSigVersion_Name_0|
+--------------------+
|        Adware:Win32|
|        Adware:Win32|
|        Adware:Win32|
|        Adware:Win32|
|BrowserModifier:W...|
|SoftwareBundler:W...|
|       HackTool:MSIL|
|       HackTool:MSIL|
|      HackTool:Win32|
|      HackTool:Win32|
|      HackTool:Win32|
|      HackTool:Win32|
|            Joke:VBS|
|       Backdoor:Java|
|       Backdoor:MSIL|
|       Backdoor:MSIL|
|       Backdoor:MSIL|
|       Backdoor:MSIL|
|       Backdoor:MSIL|
|      Backdoor:Win32|
+--------------------+
only showing top 20 rows

