In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType)
from pyspark.sql.functions import *
from pyspark import SparkConf
from pyspark import SparkContext
import multiprocessing

In [2]:
# Spark configuration for this notebook.
# `cores` is the CPU count of the machine running the kernel; `p` is the
# multiplier used to derive the number of partitions (p * cores).
cores = multiprocessing.cpu_count()
p = 10
partitions = p * cores

# SparkConf.set returns the conf itself, so the calls can be chained.
# NOTE(review): per the Spark configuration docs, spark.driver.cores is only
# honoured in cluster mode - confirm it is actually needed here.
conf = (
    SparkConf()
    .set("spark.driver.cores", cores)
    .set("spark.driver.memory", "10g")
    .set("spark.sql.shuffle.partitions", partitions)
    .set("spark.default.parallelism", partitions)
)

# getOrCreate instead of the bare constructor: re-running this cell in a live
# kernel reuses the running context instead of raising
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Obtain the SparkSession for this notebook (created on first call, reused
# afterwards), labelled with the application name below.
spark = (
    SparkSession.builder
    .appName('separador_cat_num')
    .getOrCreate()
)

In [4]:
# Load every CSV part file under data/df_cat/ into one DataFrame.
# The first line of each file is the header; column types are inferred
# from the data.
df_cat = spark.read.csv(
    path='data/df_cat/*.csv',
    inferSchema=True,
    header=True,
)

In [69]:
# Keep the first half of the column names (integer division, so with an odd
# column count the extra column falls in the second half).
all_cols = df_cat.columns
mis_cols = all_cols[:len(all_cols) // 2]

In [67]:
# Display the second half of the column names (the complement of the
# first-half slice).
current_cols = df_cat.columns
current_cols[len(current_cols) // 2:]

['Census_ProcessorClass',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_PowerPlatformRoleName',
 'Census_InternalBatteryType',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSSkuName',
 'Census_OSInstallTypeName',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_GenuineStateName',
 'Census_ActivationChannel',
 'Census_FlightRing']

In [5]:
# Preview the first 5 rows without truncating long cell values.
df_cat.show(n=5, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+--------------------+----------------------------------------+----------+-------+------------+---------------------+-------------------+---------------------+--------------------------+----------------------+----------------------------+--------------------------+----------------+---------------------+---------------+------------------+-------------------+------------------------+--------------------------------+-----------------------+------------------------+-----------------+
|MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsVer   |OsPlatformSubRelease|OsBuildLab                              |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|Census_ProcessorClass|Census_PrimaryDiskTypeName|Census_ChassisTypeName|Census_PowerPlatformRoleName|Census_InternalBatteryType|Census_OSVersi

In [70]:
# Narrow the DataFrame to the columns listed in `mis_cols`.
# Note: this rebinds `df_cat`, so the cell is not idempotent with respect to
# cells that recompute column halves from `df_cat.columns`.
df_cat = df_cat.select(*mis_cols)

In [71]:
# List the distinct values present in the ProductName column.
(
    df_cat
    .select('ProductName')
    .distinct()
    .show()
)

+-------------+
|  ProductName|
+-------------+
|          mse|
|mseprerelease|
| win8defender|
|         scep|
|windowsintune|
|          fep|
+-------------+



In [10]:
# Maximum of the raw EngineVersion column. The column holds dotted version
# strings, so the comparison is lexicographic rather than numeric.
(
    df_cat
    .agg({'EngineVersion': 'max'})
    .collect()[0]
)

Row(max(EngineVersion)='1.1.9700.0')

In [11]:
# Minimum of the raw EngineVersion column. The column holds dotted version
# strings, so the comparison is lexicographic rather than numeric.
(
    df_cat
    .agg({'EngineVersion': 'min'})
    .collect()[0]
)

Row(min(EngineVersion)='1.1.10302.0')

In [72]:
# Break the dotted EngineVersion string into one integer column per component
# (EngineVersion_0 .. EngineVersion_3).
#
# Changes versus the previous version of this cell:
#   * every component is cast to IntegerType (components 2 and 3 used to stay
#     as strings, unlike the AppVersion / AvSigVersion columns), so max/min
#     and ordering on them are numeric instead of lexicographic;
#   * the regex is a raw string: '\.' in a plain literal is an invalid escape
#     sequence and triggers a warning on recent Python versions;
#   * the split expression is built once and reused.
# Column creation order (2, 3, 0, 1) is kept so the resulting schema order
# is unchanged.
engine_parts = split(df_cat['EngineVersion'], r'\.')
df_cat = (
    df_cat
    .withColumn('EngineVersion_2', engine_parts[2].cast(IntegerType()))
    .withColumn('EngineVersion_3', engine_parts[3].cast(IntegerType()))
    .withColumn('EngineVersion_0', engine_parts[0].cast(IntegerType()))
    .withColumn('EngineVersion_1', engine_parts[1].cast(IntegerType()))
)

In [73]:
# Compare the original EngineVersion string with some of its split components.
engine_preview_cols = [
    'EngineVersion',
    'EngineVersion_2',
    'EngineVersion_3',
    'EngineVersion_0',
]
df_cat.select(engine_preview_cols).show(n=10, truncate=False)

+-------------+---------------+---------------+---------------+
|EngineVersion|EngineVersion_2|EngineVersion_3|EngineVersion_0|
+-------------+---------------+---------------+---------------+
|1.1.15100.1  |15100          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15200.1  |15200          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
|1.1.15100.1  |15100          |1              |1              |
+-------------+---------------+---------------+---------------+
only showing top 10 rows



In [52]:
# Print the largest and smallest value of the EngineVersion_1 component
# side by side (each aggregate is returned as a single Row).
engine_v1_max = df_cat.agg({"EngineVersion_1": "max"}).collect()[0]
engine_v1_min = df_cat.agg({"EngineVersion_1": "min"}).collect()[0]
print(engine_v1_max, engine_v1_min)

Row(max(EngineVersion_1)=1) Row(min(EngineVersion_1)=1)


In [74]:
# Break the dotted AppVersion string into four integer columns
# (AppVersion_0 .. AppVersion_3), one per dot-separated component.
#
# The regex is written as a raw string: '\.' in a plain literal is an invalid
# escape sequence and triggers a warning on recent Python versions. The split
# expression is built once and reused for all four components.
app_parts = split(df_cat['AppVersion'], r'\.')
df_cat = (
    df_cat
    .withColumn('AppVersion_0', app_parts[0].cast(IntegerType()))
    .withColumn('AppVersion_1', app_parts[1].cast(IntegerType()))
    .withColumn('AppVersion_2', app_parts[2].cast(IntegerType()))
    .withColumn('AppVersion_3', app_parts[3].cast(IntegerType()))
)

In [75]:
# Compare the original AppVersion string with its four split components.
app_preview_cols = [
    'AppVersion',
    'AppVersion_0',
    'AppVersion_1',
    'AppVersion_2',
    'AppVersion_3',
]
df_cat.select(app_preview_cols).show(n=10, truncate=False)

+----------------+------------+------------+------------+------------+
|AppVersion      |AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|
+----------------+------------+------------+------------+------------+
|4.18.1807.18075 |4           |18          |1807        |18075       |
|4.9.10586.1106  |4           |9           |10586       |1106        |
|4.11.15063.447  |4           |11          |15063       |447         |
|4.18.1806.18062 |4           |18          |1806        |18062       |
|4.11.15063.0    |4           |11          |15063       |0           |
|4.12.17007.18011|4           |12          |17007       |18011       |
|4.18.1807.18075 |4           |18          |1807        |18075       |
|4.14.17639.18041|4           |14          |17639       |18041       |
|4.10.209.0      |4           |10          |209         |0           |
|4.18.1807.18075 |4           |18          |1807        |18075       |
+----------------+------------+------------+------------+------------+
only s

In [60]:
# Range of the first AppVersion component.
# The aggregation runs on `df_cat`, which owns the AppVersion_0 column; the
# previous `av` variable is not defined anywhere in this notebook, so the cell
# failed on a fresh kernel.
# `max` / `min` here are the Spark column functions brought in by
# `from pyspark.sql.functions import *`, not the Python builtins.
df_cat.agg(max("AppVersion_0"), min("AppVersion_0")).show()

+-----------------+-----------------+
|max(AppVersion_0)|min(AppVersion_0)|
+-----------------+-----------------+
|                4|                4|
+-----------------+-----------------+



In [76]:
# Break the dotted AvSigVersion string into four integer columns
# (AvSigVersion_0 .. AvSigVersion_3), then preview the result.
#
# The regex is written as a raw string: '\.' in a plain literal is an invalid
# escape sequence and triggers a warning on recent Python versions. The split
# expression is built once and reused for all four components.
av_sig_parts = split(df_cat['AvSigVersion'], r'\.')
df_cat = (
    df_cat
    .withColumn('AvSigVersion_0', av_sig_parts[0].cast(IntegerType()))
    .withColumn('AvSigVersion_1', av_sig_parts[1].cast(IntegerType()))
    .withColumn('AvSigVersion_2', av_sig_parts[2].cast(IntegerType()))
    .withColumn('AvSigVersion_3', av_sig_parts[3].cast(IntegerType()))
)
df_cat.select('AvSigVersion','AvSigVersion_0','AvSigVersion_1','AvSigVersion_2','AvSigVersion_3').show(10, False)

+------------+--------------+--------------+--------------+--------------+
|AvSigVersion|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|
+------------+--------------+--------------+--------------+--------------+
|1.273.1642.0|1             |273           |1642          |0             |
|1.275.199.0 |1             |275           |199           |0             |
|1.273.1420.0|1             |273           |1420          |0             |
|1.273.315.0 |1             |273           |315           |0             |
|1.275.370.0 |1             |275           |370           |0             |
|1.275.1140.0|1             |275           |1140          |0             |
|1.275.1209.0|1             |275           |1209          |0             |
|1.273.1276.0|1             |273           |1276          |0             |
|1.273.1749.0|1             |273           |1749          |0             |
|1.273.802.0 |1             |273           |802           |0             |
+------------+-----------

In [77]:
# Mark the DataFrame for caching and immediately run a count, which both
# materialises the cache and reports the number of rows.
# persist() returns the same DataFrame, so the two calls can be chained.
df_cat.persist().count()

16774736

In [79]:
# Number of column names kept in `mis_cols` (the first-half slice of the
# original header).
len(mis_cols)

15

In [80]:
# Current number of columns in `df_cat`, i.e. the selected columns plus the
# version-component columns added by the split cells.
len(df_cat.columns)

27

In [78]:
# Preview the first 5 rows, including the derived version-component columns,
# without truncating long cell values.
df_cat.show(n=5, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+--------------------+----------------------------------------+----------+-------+------------+---------------------+-------------------+---------------+---------------+---------------+---------------+------------+------------+------------+------------+--------------+--------------+--------------+--------------+
|MachineIdentifier               |ProductName |EngineVersion|AppVersion     |AvSigVersion|Platform |Processor|OsVer   |OsPlatformSubRelease|OsBuildLab                              |SkuEdition|PuaMode|SmartScreen |Census_MDC2FormFactor|Census_DeviceFamily|EngineVersion_2|EngineVersion_3|EngineVersion_0|EngineVersion_1|AppVersion_0|AppVersion_1|AppVersion_2|AppVersion_3|AvSigVersion_0|AvSigVersion_1|AvSigVersion_2|AvSigVersion_3|
+--------------------------------+------------+-------------+---------------+------------+---------+---------+--------+-----------------

In [2]:
from pyspark.ml.feature import StringIndexer

AttributeError: type object 'numpy.ndarray' has no attribute '__array_function__'

In [81]:
# List the distinct values present in the Platform column.
(
    df_cat
    .select('Platform')
    .distinct()
    .show()
)

+-----------+
|   Platform|
+-----------+
|  windows10|
|windows2016|
|   windows8|
|   windows7|
+-----------+



In [1]:
# Encode the Platform string column as a numeric index column named
# PlatformIndex, then preview the result.
# Requires StringIndexer (pyspark.ml.feature) to be imported in the running
# kernel before this cell is executed.
indexer = StringIndexer(
    inputCol="Platform",
    outputCol="PlatformIndex",
)
platform_model = indexer.fit(df_cat)
df_cat = platform_model.transform(df_cat)
df_cat.show(n=5, truncate=False)

NameError: name 'StringIndexer' is not defined