In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType)
from pyspark.sql.functions import *
from pyspark import SparkConf
from pyspark import SparkContext
import multiprocessing

In [2]:
# Size Spark's parallelism from the local machine: `p` partitions per CPU core.
cores = multiprocessing.cpu_count()
p = 10  # partitions-per-core multiplier used for both settings below

# SparkConf.set() returns the conf itself, so the settings can be chained.
conf = (
    SparkConf()
    .set("spark.driver.cores", cores)
    .set("spark.driver.memory", "10g")
    .set("spark.sql.shuffle.partitions", p * cores)
    .set("spark.default.parallelism", p * cores)
)

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=conf)
# a second time in the same kernel raises because only one context may be active.
# NOTE: if a context already exists it is returned as-is and `conf` is not
# re-applied; restart the kernel to change these settings.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# SQL entry point for the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('separador_cat_num')
    .getOrCreate()
)

In [4]:
# Load every CSV part file under data/df_cat/ into a single DataFrame.
# The first line of each file is the header; column types are inferred.
df_cat = spark.read.csv(
    path='data/df_cat/*.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Keep the names of the first half of the columns (integer division, so with
# an odd column count the middle column falls in the second half).
column_names = df_cat.columns
mis_cols = column_names[:len(column_names) // 2]

In [6]:
# Inspect the second half of the column names, i.e. the complement of the
# slice used to build `mis_cols`.
df_cat.columns[len(df_cat.columns)//2:]

['Census_ProcessorClass',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_PowerPlatformRoleName',
 'Census_InternalBatteryType',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSSkuName',
 'Census_OSInstallTypeName',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_GenuineStateName',
 'Census_ActivationChannel',
 'Census_FlightRing']

In [5]:
# Peek at the first 5 rows without truncating cell contents.
df_cat.show(n=5, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+--------------------+----------------------------------------+----------+-------+------------+---------------------+-------------------+---------------------+--------------------------+----------------------+----------------------------+--------------------------+----------------+---------------------+---------------+------------------+-------------------+------------------------+--------------------------------+-----------------------+------------------------+-----------------+
|MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsVer   |OsPlatformSubRelease|OsBuildLab                              |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|Census_ProcessorClass|Census_PrimaryDiskTypeName|Census_ChassisTypeName|Census_PowerPlatformRoleName|Census_InternalBatteryType|Census_OSVersi

In [7]:
# Restrict the frame to the columns listed in `mis_cols`.
# NOTE: this rebinds `df_cat`, so cells above that displayed all columns
# show a different frame than the one used from here on.
df_cat = df_cat.select(*mis_cols)

In [71]:
# List the distinct values of ProductName.
(
    df_cat
    .select('ProductName')
    .distinct()
    .show()
)

+-------------+
|  ProductName|
+-------------+
|          mse|
|mseprerelease|
| win8defender|
|         scep|
|windowsintune|
|          fep|
+-------------+



In [10]:
# Maximum of EngineVersion. The column holds dotted version strings, so the
# comparison is lexicographic, not numeric: '1.1.9700.0' sorts above
# '1.1.15200.1'. Splitting the version into integer parts avoids this.
(
    df_cat
    .agg({"EngineVersion": "max"})
    .collect()[0]
)

Row(max(EngineVersion)='1.1.9700.0')

In [11]:
# Minimum of EngineVersion. As a string comparison this is lexicographic
# ('1.1.10302.0' sorts below '1.1.9700.0'), so it is not the numerically
# smallest version.
(
    df_cat
    .agg({"EngineVersion": "min"})
    .collect()[0]
)

Row(min(EngineVersion)='1.1.10302.0')

In [8]:
# Break the dotted EngineVersion string into one column per component.
# The split pattern is a regex, so the dot must be escaped; a raw string is
# used because '\.' in a normal string literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning on recent Python versions).
engine_parts = split(df_cat['EngineVersion'], r'\.')

# Column order (2, 3, 0, 1) is kept as in the original cell.
# NOTE(review): components 2 and 3 are left as strings while 0 and 1 are cast
# to integers; the AppVersion/AvSigVersion cells cast all four — confirm
# whether the missing casts here are intentional.
df_cat = (
    df_cat
    .withColumn('EngineVersion_2', engine_parts[2])
    .withColumn('EngineVersion_3', engine_parts[3])
    .withColumn('EngineVersion_0', engine_parts[0].cast(IntegerType()))
    .withColumn('EngineVersion_1', engine_parts[1].cast(IntegerType()))
)

In [9]:
# Spot-check the split against the original string (first 10 rows, untruncated).
(
    df_cat
    .select('EngineVersion', 'EngineVersion_2', 'EngineVersion_3', 'EngineVersion_0')
    .show(n=10, truncate=False)
)

+-------------+---------------+---------------+---------------+
|EngineVersion|EngineVersion_2|EngineVersion_3|EngineVersion_0|
+-------------+---------------+---------------+---------------+
|1.1.15100.1  |15100          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
+-------------+---------------+---------------+---------------+
only showing top 10 rows



In [52]:
# Range of the second EngineVersion component; equal max and min mean the
# column is constant.
ev1_max = df_cat.agg({"EngineVersion_1": "max"}).collect()[0]
ev1_min = df_cat.agg({"EngineVersion_1": "min"}).collect()[0]
print(ev1_max, ev1_min)

Row(max(EngineVersion_1)=1) Row(min(EngineVersion_1)=1)


In [10]:
# Break the dotted AppVersion string into four integer columns.
# The split pattern is a regex, so the dot must be escaped; a raw string is
# used because '\.' in a normal string literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning on recent Python versions).
app_parts = split(df_cat['AppVersion'], r'\.')

df_cat = (
    df_cat
    .withColumn('AppVersion_0', app_parts[0].cast(IntegerType()))
    .withColumn('AppVersion_1', app_parts[1].cast(IntegerType()))
    .withColumn('AppVersion_2', app_parts[2].cast(IntegerType()))
    .withColumn('AppVersion_3', app_parts[3].cast(IntegerType()))
)

In [75]:
# Spot-check the AppVersion split against the original string.
(
    df_cat
    .select('AppVersion', 'AppVersion_0', 'AppVersion_1', 'AppVersion_2', 'AppVersion_3')
    .show(n=10, truncate=False)
)

+----------------+------------+------------+------------+------------+
|AppVersion      |AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|
+----------------+------------+------------+------------+------------+
|4.18.1807.18075 |4           |18          |1807        |18075       |
|4.9.10586.1106  |4           |9           |10586       |1106        |
|4.11.15063.447  |4           |11          |15063       |447         |
|4.18.1806.18062 |4           |18          |1806        |18062       |
|4.11.15063.0    |4           |11          |15063       |0           |
|4.12.17007.18011|4           |12          |17007       |18011       |
|4.18.1807.18075 |4           |18          |1807        |18075       |
|4.14.17639.18041|4           |14          |17639       |18041       |
|4.10.209.0      |4           |10          |209         |0           |
|4.18.1807.18075 |4           |18          |1807        |18075       |
+----------------+------------+------------+------------+------------+
only s

In [29]:
# Cardinality check: number of distinct AppVersion strings.
(
    df_cat
    .select('AppVersion')
    .distinct()
    .count()
)

124

In [60]:
# Range of the first AppVersion component; equal max and min mean the column
# is constant.
# `av` was never defined anywhere in this notebook (leftover kernel state), so
# the cell failed on a fresh kernel; aggregate `df_cat`, which carries
# AppVersion_0, instead.
# `max`/`min` here are the pyspark.sql.functions aggregates brought in by the
# star import, not the Python builtins.
df_cat.agg(max("AppVersion_0"), min("AppVersion_0")).show()

+-----------------+-----------------+
|max(AppVersion_0)|min(AppVersion_0)|
+-----------------+-----------------+
|                4|                4|
+-----------------+-----------------+



In [11]:
# Break the dotted AvSigVersion string into four integer columns.
# The split pattern is a regex, so the dot must be escaped; a raw string is
# used because '\.' in a normal string literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning on recent Python versions).
avsig_parts = split(df_cat['AvSigVersion'], r'\.')

df_cat = (
    df_cat
    .withColumn('AvSigVersion_0', avsig_parts[0].cast(IntegerType()))
    .withColumn('AvSigVersion_1', avsig_parts[1].cast(IntegerType()))
    .withColumn('AvSigVersion_2', avsig_parts[2].cast(IntegerType()))
    .withColumn('AvSigVersion_3', avsig_parts[3].cast(IntegerType()))
)

# Spot-check the split against the original string.
df_cat.select('AvSigVersion','AvSigVersion_0','AvSigVersion_1','AvSigVersion_2','AvSigVersion_3').show(10, False)

+------------+--------------+--------------+--------------+--------------+
|AvSigVersion|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|
+------------+--------------+--------------+--------------+--------------+
|1.273.1642.0|1             |273           |1642          |0             |
|1.275.199.0 |1             |275           |199           |0             |
|1.273.1420.0|1             |273           |1420          |0             |
|1.273.315.0 |1             |273           |315           |0             |
|1.275.370.0 |1             |275           |370           |0             |
|1.275.1140.0|1             |275           |1140          |0             |
|1.275.1209.0|1             |275           |1209          |0             |
|1.273.1276.0|1             |273           |1276          |0             |
|1.273.1749.0|1             |273           |1749          |0             |
|1.273.802.0 |1             |273           |802           |0             |
+------------+-----------

In [12]:
# Mark the frame for caching, then run count() as an action so the cache is
# populated; the row count is the cell's output.
# persist() returns the same DataFrame, so the two calls can be chained.
df_cat.persist().count()

16774736

In [13]:
# Number of original columns that were kept by the `mis_cols` selection.
len(mis_cols)

15

In [14]:
# Current column count: the selected columns plus the 12 version-component
# columns added above (4 each for EngineVersion, AppVersion, AvSigVersion).
len(df_cat.columns)

27

In [15]:
# Preview the frame with the derived version-component columns appended.
df_cat.show(n=5, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+--------------------+----------------------------------------+----------+-------+------------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+
|MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsVer   |OsPlatformSubRelease|OsBuildLab                              |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|
+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+-----------------

In [16]:
from pyspark.ml.feature import StringIndexer

In [17]:
# List the distinct Platform values before indexing them.
(
    df_cat
    .select('Platform')
    .distinct()
    .show()
)

+-----------+
|   Platform|
+-----------+
|  windows10|
|windows2016|
|   windows8|
|   windows7|
+-----------+



In [18]:
# Encode Platform as a numeric label in a new PlatformIndex column.
indexer = StringIndexer(inputCol="Platform", outputCol="PlatformIndex")

# Fit learns the label -> index mapping from the data; transform appends the column.
platform_model = indexer.fit(df_cat)
df_cat = platform_model.transform(df_cat)

df_cat.show(n=5, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+--------------------+----------------------------------------+----------+-------+------------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+-------------+
|MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsVer   |OsPlatformSubRelease|OsBuildLab                              |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|PlatformIndex|
+--------------------------------+------------+-------------+---------------+------------+---------+--------

In [19]:
# List the distinct Processor values before indexing them.
(
    df_cat
    .select('Processor')
    .distinct()
    .show()
)

+---------+
|Processor|
+---------+
|      x64|
|      x86|
|    arm64|
+---------+



In [20]:
# Encode Processor as a numeric label in a new ProcessorIndex column.
indexer_processor = StringIndexer(inputCol="Processor", outputCol="ProcessorIndex")

# Fit learns the label -> index mapping from the data; transform appends the column.
processor_model = indexer_processor.fit(df_cat)
df_cat = processor_model.transform(df_cat)

df_cat.show(n=5, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+--------------------+----------------------------------------+----------+-------+------------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+-------------+--------------+
|MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsVer   |OsPlatformSubRelease|OsBuildLab                              |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|PlatformIndex|ProcessorIndex|
+--------------------------------+------------+-------------+---------------+-

In [22]:
# Cardinality check: number of distinct OsVer values.
(
    df_cat
    .select('OsVer')
    .distinct()
    .count()
)

69

In [23]:
# Frequency table: one row per OsVer with the number of rows carrying it,
# stored under the name OsVer_freq.
df_cat_freq_osver = (
    df_cat
    .groupBy('OsVer')
    .count()
    .withColumnRenamed('count', 'OsVer_freq')
)

In [24]:
# Attach each row's OsVer frequency (frequency encoding).
# Joining on a list of column names keeps a single OsVer column, which ends up
# first in the result.
df_cat = df_cat.join(
    df_cat_freq_osver,
    on=['OsVer'],
    how='left',
)

In [25]:
# Confirm the join added OsVer_freq (2 rows, untruncated).
df_cat.show(n=2, truncate=False)

+---------+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------------------+-----------------------------------------------------+----------+-------+-----------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+-------------+--------------+----------+
|OsVer    |MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsPlatformSubRelease|OsBuildLab                                           |SkuEdition|PuaMode|SmartScreen|Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|PlatformIndex|ProcessorIndex|OsVer_freq|
+---------+-------------------

In [27]:
# Inspect the OsVer frequency table (show()'s default of 20 rows, untruncated).
df_cat_freq_osver.show(n=20, truncate=False)

+----------+----------+
|OsVer     |OsVer_freq|
+----------+----------+
|10.0.0.22 |1         |
|6.1.2.0   |3         |
|6.3.2.0   |1         |
|10.0.5.117|1         |
|10.0.19.80|1         |
|10.0.32.72|39        |
|6.3.80.0  |1         |
|6.3.4.0   |4         |
|10.0.0.112|12        |
|6.3.1.144 |1         |
|10.0.1.44 |4         |
|10.0.2.86 |4         |
|6.1.0.0   |1154      |
|10.0.2.0  |45        |
|10.0.7.80 |2         |
|10.0.0.80 |1         |
|10.0.0.96 |1         |
|6.1.1.0   |147921    |
|6.3.0.16  |1         |
|10.0.16.0 |14        |
+----------+----------+
only showing top 20 rows



In [28]:
# Cardinality check: number of distinct OsPlatformSubRelease values.
(
    df_cat
    .select('OsPlatformSubRelease')
    .distinct()
    .count()
)

9

In [30]:
# Encode OsPlatformSubRelease as a numeric label in OsPlatformSubReleaseIndex.
indexer_OsPlatformSubRelease = StringIndexer(
    inputCol="OsPlatformSubRelease",
    outputCol="OsPlatformSubReleaseIndex",
)

# Fit learns the label -> index mapping from the data; transform appends the column.
subrelease_model = indexer_OsPlatformSubRelease.fit(df_cat)
df_cat = subrelease_model.transform(df_cat)

df_cat.show(n=2, truncate=False)

+---------+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------------------+-----------------------------------------------------+----------+-------+-----------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+-------------+--------------+----------+-------------------------+
|OsVer    |MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsPlatformSubRelease|OsBuildLab                                           |SkuEdition|PuaMode|SmartScreen|Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|PlatformIndex|ProcessorIndex|OsVer_freq|OsPla

In [31]:
# Break the dot-separated OsBuildLab string into its first five components.
# The split pattern is a regex, so the dot must be escaped; a raw string is
# used because '\.' in a normal string literal is an invalid escape sequence
# (DeprecationWarning / SyntaxWarning on recent Python versions).
# NOTE(review): assumes OsBuildLab always has at least five dot-separated
# fields; where it has fewer, the missing components would come out null —
# confirm against the data.
build_lab_parts = split(df_cat['OsBuildLab'], r'\.')

# Components 0 and 1 are cast to integers; 2, 3 and 4 stay as strings
# (component 4 is split further on '-' in the next cell).
df_cat = (
    df_cat
    .withColumn('OsBuildLab_0', build_lab_parts[0].cast(IntegerType()))
    .withColumn('OsBuildLab_1', build_lab_parts[1].cast(IntegerType()))
    .withColumn('OsBuildLab_2', build_lab_parts[2])
    .withColumn('OsBuildLab_3', build_lab_parts[3])
    .withColumn('OsBuildLab_4', build_lab_parts[4])
)
df_cat.show(5, False)

+---------+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------------------+-----------------------------------------------------+----------+-------+------------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+-------------+--------------+----------+-------------------------+------------+------------+------------+----------------------+------------+
|OsVer    |MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsPlatformSubRelease|OsBuildLab                                           |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_

In [32]:
# OsBuildLab_4 is itself hyphen-separated; split it into two integer columns.
build_stamp_parts = split(df_cat['OsBuildLab_4'], '-')

df_cat = (
    df_cat
    .withColumn('OsBuildLab_4_0', build_stamp_parts[0].cast(IntegerType()))
    .withColumn('OsBuildLab_4_1', build_stamp_parts[1].cast(IntegerType()))
)
df_cat.show(n=5, truncate=False)

+---------+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------------------+-----------------------------------------------------+----------+-------+------------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+-------------+--------------+----------+-------------------------+------------+------------+------------+----------------------+------------+--------------+--------------+
|OsVer    |MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsPlatformSubRelease|OsBuildLab                                           |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_