# Kaggle Competition : Microsoft Malware Data
> This is Sungryong Hong's notebook.  

> I have a stand-alone Spark(2.3.2)/Hadoop(2.8.3) cluster, which has 48 logical cores with 150GB memory. 

> I have put the data files on my HDFS. Check the contents with `hfs -cat /data/spark/msmalware/test.csv | head`.  

>`hfs` is an alias for `hdfs dfs`. 


## 1. Import Basic Packages

In [1]:
# Basic Libraries 
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
import gc

# Plot defaults: serif text (Times New Roman), 16 pt font, STIX math glyphs.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

#### Spark-Shell Session 

In [2]:
# Basic PySpark Libraries

# Old Style : SparkContext 
#from pyspark import SparkContext   
#from pyspark.sql import SQLContext


# New Style : Spark Session  
#Shell-Mode: Spark Session Name is `spark`

# In shell mode the session object `spark` already exists; derive the
# SparkContext and an SQLContext handle from it.
# SQLContext is imported explicitly so that this cell does not depend on
# whatever names the pyspark shell happens to pre-import.
from pyspark.sql import SQLContext

sc = spark.sparkContext
sqlsc = SQLContext(sc)

# HDFS directory where Spark stores checkpointed data.
sc.setCheckpointDir("hdfs://master:54310/tmp/spark/checkpoints")

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

In [3]:
# Enable Apache Arrow for Spark <-> pandas data exchange (e.g. the toPandas()
# call used later in this notebook) to speed up the Python-side conversion.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

#### Spark-Submit Session

> An example of creating the Spark session in non-shell (submit) mode: 

>`spark = SparkSession.builder.appName("largeScaleGstat")\
    .config("spark.driver.maxResultSize","8g")\
    .config("spark.sql.execution.arrow.enabled","true")\
    .config("spark.executor.memoryOverhead","42GB")\
    .getOrCreate()`


## 2. Read the *train.csv* 

### 2.1 Read the file to a spark dataframe

In [4]:
# Explicit schema for the raw CSV: one (column name, Spark type) pair per
# column, in declaration order. Every column is declared nullable.
_STR, _INT, _FLT = T.StringType, T.IntegerType, T.FloatType

_RAWDF_COLUMNS = [
    ('MachineIdentifier', _STR),
    ('ProductName', _STR),
    ('EngineVersion', _STR),
    ('AppVersion', _STR),
    ('AvSigVersion', _STR),
    ('IsBeta', _INT),
    ('RtpStateBitfield', _FLT),
    ('IsSxsPassiveMode', _INT),
    ('DefaultBrowsersIdentifier', _FLT),
    ('AVProductStatesIdentifier', _FLT),
    ('AVProductsInstalled', _FLT),
    ('AVProductsEnabled', _FLT),
    ('HasTpm', _INT),
    ('CountryIdentifier', _INT),
    ('CityIdentifier', _FLT),
    ('OrganizationIdentifier', _FLT),
    ('GeoNameIdentifier', _FLT),
    ('LocaleEnglishNameIdentifier', _INT),
    ('Platform', _STR),
    ('Processor', _STR),
    ('OsVer', _STR),
    ('OsBuild', _INT),
    ('OsSuite', _INT),
    ('OsPlatformSubRelease', _STR),
    ('OsBuildLab', _STR),
    ('SkuEdition', _STR),
    ('IsProtected', _FLT),
    ('AutoSampleOptIn', _INT),
    ('PuaMode', _STR),
    ('SMode', _FLT),
    ('IeVerIdentifier', _FLT),
    ('SmartScreen', _STR),
    ('Firewall', _FLT),
    ('UacLuaenable', _FLT),
    ('Census_MDC2FormFactor', _STR),
    ('Census_DeviceFamily', _STR),
    ('Census_OEMNameIdentifier', _FLT),
    ('Census_OEMModelIdentifier', _FLT),
    ('Census_ProcessorCoreCount', _FLT),
    ('Census_ProcessorManufacturerIdentifier', _FLT),
    ('Census_ProcessorModelIdentifier', _FLT),
    ('Census_ProcessorClass', _STR),
    ('Census_PrimaryDiskTotalCapacity', _FLT),
    ('Census_PrimaryDiskTypeName', _STR),
    ('Census_SystemVolumeTotalCapacity', _FLT),
    ('Census_HasOpticalDiskDrive', _INT),
    ('Census_TotalPhysicalRAM', _FLT),
    ('Census_ChassisTypeName', _STR),
    ('Census_InternalPrimaryDiagonalDisplaySizeInInches', _FLT),
    ('Census_InternalPrimaryDisplayResolutionHorizontal', _FLT),
    ('Census_InternalPrimaryDisplayResolutionVertical', _FLT),
    ('Census_PowerPlatformRoleName', _STR),
    ('Census_InternalBatteryType', _STR),
    ('Census_InternalBatteryNumberOfCharges', _FLT),
    ('Census_OSVersion', _STR),
    ('Census_OSArchitecture', _STR),
    ('Census_OSBranch', _STR),
    ('Census_OSBuildNumber', _INT),
    ('Census_OSBuildRevision', _INT),
    ('Census_OSEdition', _STR),
    ('Census_OSSkuName', _STR),
    ('Census_OSInstallTypeName', _STR),
    ('Census_OSInstallLanguageIdentifier', _FLT),
    ('Census_OSUILocaleIdentifier', _INT),
    ('Census_OSWUAutoUpdateOptionsName', _STR),
    ('Census_IsPortableOperatingSystem', _INT),
    ('Census_GenuineStateName', _STR),
    ('Census_ActivationChannel', _STR),
    ('Census_IsFlightingInternal', _FLT),
    ('Census_IsFlightsDisabled', _FLT),
    ('Census_FlightRing', _STR),
    ('Census_ThresholdOptIn', _FLT),
    ('Census_FirmwareManufacturerIdentifier', _FLT),
    ('Census_FirmwareVersionIdentifier', _FLT),
    ('Census_IsSecureBootEnabled', _INT),
    ('Census_IsWIMBootEnabled', _FLT),
    ('Census_IsVirtualDevice', _FLT),
    ('Census_IsTouchEnabled', _INT),
    ('Census_IsPenCapable', _INT),
    ('Census_IsAlwaysOnAlwaysConnectedCapable', _FLT),
    ('Wdft_IsGamer', _FLT),
    ('Wdft_RegionIdentifier', _FLT),
    ('HasDetections', _INT),
]

# Build the StructType; each field gets its own type instance and nullable=True.
rawdf_schema = T.StructType(
    list(T.StructField(colname, coltype(), True) for colname, coltype in _RAWDF_COLUMNS)
)


In [5]:
# Load the training CSV from HDFS with the explicit schema; the first line of
# the file is treated as a header.
rawdf = (sqlsc.read
         .schema(rawdf_schema)
         .option("header", True)
         .csv("hdfs://master:54310/data/spark/msmalware/train.csv"))

### 2.2 Browse the raw data

In [6]:
# Peek at the first ten rows of the leading identifier/version columns.
preview_cols = ['MachineIdentifier', 'ProductName', 'EngineVersion',
                'AppVersion', 'AvSigVersion', 'IsBeta']
rawdf.select(preview_cols).show(10, truncate=True)

+--------------------+------------+-------------+---------------+------------+------+
|   MachineIdentifier| ProductName|EngineVersion|     AppVersion|AvSigVersion|IsBeta|
+--------------------+------------+-------------+---------------+------------+------+
|0000028988387b115...|win8defender|  1.1.15100.1|4.18.1807.18075|1.273.1735.0|     0|
|000007535c3f730ef...|win8defender|  1.1.14600.4|   4.13.17134.1|  1.263.48.0|     0|
|000007905a28d863f...|win8defender|  1.1.15100.1|4.18.1807.18075|1.273.1341.0|     0|
|00000b11598a75ea8...|win8defender|  1.1.15100.1|4.18.1807.18075|1.273.1527.0|     0|
|000014a5f00daa18e...|win8defender|  1.1.15100.1|4.18.1807.18075|1.273.1379.0|     0|
|000016191b897145d...|win8defender|  1.1.15100.1|4.18.1807.18075|1.273.1094.0|     0|
|0000161e8abf8d8b8...|win8defender|  1.1.15100.1|4.18.1807.18075| 1.273.845.0|     0|
|000019515bc8f9585...|win8defender|  1.1.15100.1|4.18.1807.18075|1.273.1393.0|     0|
|00001a027a0ab970c...|win8defender|  1.1.15200.1|4.18.

In [7]:
# Raise the limit on how many fields Spark includes when it renders a
# plan/schema as a string.
# NOTE(review): 50 is still below this frame's 83 columns, and on Spark 2.x this
# key may only be read from the start-up SparkConf -- confirm that setting it at
# runtime takes effect; otherwise pass it with --conf when launching.
spark.conf.set('spark.debug.maxToStringFields',50)

> The default value of `spark.debug.maxToStringFields` is 25. I got some warnings because this limit is too small for such a wide table. 

In [8]:
# Number of columns in the raw frame (one per field declared in rawdf_schema).
len(rawdf.columns)

83

In [9]:
# Fetch the first three rows as Row objects to eyeball the parsed values.
rawdf.head(3)

[Row(MachineIdentifier=u'0000028988387b115f69f31a3bf04f09', ProductName=u'win8defender', EngineVersion=u'1.1.15100.1', AppVersion=u'4.18.1807.18075', AvSigVersion=u'1.273.1735.0', IsBeta=0, RtpStateBitfield=7.0, IsSxsPassiveMode=0, DefaultBrowsersIdentifier=None, AVProductStatesIdentifier=53447.0, AVProductsInstalled=1.0, AVProductsEnabled=1.0, HasTpm=1, CountryIdentifier=29, CityIdentifier=128035.0, OrganizationIdentifier=18.0, GeoNameIdentifier=35.0, LocaleEnglishNameIdentifier=171, Platform=u'windows10', Processor=u'x64', OsVer=u'10.0.0.0', OsBuild=17134, OsSuite=256, OsPlatformSubRelease=u'rs4', OsBuildLab=u'17134.1.amd64fre.rs4_release.180410-1804', SkuEdition=u'Pro', IsProtected=1.0, AutoSampleOptIn=0, PuaMode=None, SMode=0.0, IeVerIdentifier=137.0, SmartScreen=None, Firewall=1.0, UacLuaenable=1.0, Census_MDC2FormFactor=u'Desktop', Census_DeviceFamily=u'Windows.Desktop', Census_OEMNameIdentifier=2668.0, Census_OEMModelIdentifier=9124.0, Census_ProcessorCoreCount=4.0, Census_Proce

> 83 columns ... kind of too many

#### Checking `Null` counts 

In [10]:
# One aggregate per column: count the rows whose value is NULL or NaN.
missing_exprs = []
for colname in rawdf.columns:
    is_missing = F.isnan(colname) | F.col(colname).isNull()
    missing_exprs.append(F.count(F.when(is_missing, colname)).alias(colname))

# The result is a single row; bring it to pandas and flip it so that each
# feature becomes a row with its missing-value count.
nullstatdf = rawdf.select(missing_exprs).toPandas().T
nullstatdf.columns = ['null_counts']

# Most-incomplete features first.
nullstatdf.sort_values(by='null_counts', ascending=False)

Unnamed: 0,null_counts
PuaMode,8919174
Census_ProcessorClass,8884852
DefaultBrowsersIdentifier,8488045
Census_IsFlightingInternal,7408759
Census_InternalBatteryType,6338414
Census_ThresholdOptIn,5667325
Census_IsWIMBootEnabled,5659703
SmartScreen,3177011
OrganizationIdentifier,2751518
SMode,537759


#### Checking `HasDetections` 

In [11]:
# Class balance of the label column.
label_counts = rawdf.groupBy('HasDetections').count()
label_counts.show()

+-------------+-------+
|HasDetections|  count|
+-------------+-------+
|            1|4458892|
|            0|4462591|
+-------------+-------+



> Ok ... the data set is well-balanced, showing a roughly 50:50 split between the two `HasDetections` classes.  

#### Checking Duplicates 

In [12]:
# Total number of rows; baseline for the duplicate checks that follow.
rawdf.count()

8921483

In [13]:
# Number of rows left after removing fully identical rows; equal to the total
# row count when there are no exact duplicates.
rawdf.distinct().count()

8921483

In [14]:
# Compare the number of MachineIdentifier values with the number of distinct
# ones; equal numbers mean the identifier is unique per row.
machine_id = F.col('MachineIdentifier')
rawdf.agg(
    F.count(machine_id).alias('count'),
    F.countDistinct(machine_id).alias('distinct')
).show()

+-------+--------+
|  count|distinct|
+-------+--------+
|8921483| 8921483|
+-------+--------+



> Again, this is a well-cleaned data set; no annoying duplicates or glitches. 

#### Explore some `crosstabs`  

In [15]:
# Row counts for every combination of the four version/beta columns. cube()
# also emits the subtotals, which appear with null in the rolled-up columns.
(rawdf
 .cube('EngineVersion', 'AppVersion', 'AvSigVersion', 'IsBeta')
 .count()
 .orderBy(F.desc('count'))
 .show(10))

+-------------+---------------+------------+------+-------+
|EngineVersion|     AppVersion|AvSigVersion|IsBeta|  count|
+-------------+---------------+------------+------+-------+
|         null|           null|        null|  null|8921483|
|         null|           null|        null|     0|8921416|
|         null|4.18.1807.18075|        null|  null|5139224|
|         null|4.18.1807.18075|        null|     0|5139218|
|  1.1.15200.1|           null|        null|  null|3845067|
|  1.1.15200.1|           null|        null|     0|3845043|
|  1.1.15100.1|           null|        null|  null|3675915|
|  1.1.15100.1|           null|        null|     0|3675891|
|  1.1.15200.1|4.18.1807.18075|        null|  null|2915372|
|  1.1.15200.1|4.18.1807.18075|        null|     0|2915368|
+-------------+---------------+------------+------+-------+
only showing top 10 rows



In [16]:
# Same cube as above, restricted to the machines with HasDetections == 1.
(rawdf
 .where(F.col('HasDetections') == 1)
 .cube('EngineVersion', 'AppVersion', 'AvSigVersion', 'IsBeta')
 .count()
 .orderBy(F.desc('count'))
 .show(10))

+-------------+---------------+------------+------+-------+
|EngineVersion|     AppVersion|AvSigVersion|IsBeta|  count|
+-------------+---------------+------------+------+-------+
|         null|           null|        null|  null|4458892|
|         null|           null|        null|     0|4458859|
|         null|4.18.1807.18075|        null|  null|2725768|
|         null|4.18.1807.18075|        null|     0|2725766|
|  1.1.15100.1|           null|        null|  null|2031651|
|  1.1.15100.1|           null|        null|     0|2031637|
|  1.1.15200.1|           null|        null|  null|1890743|
|  1.1.15200.1|           null|        null|     0|1890733|
|  1.1.15200.1|4.18.1807.18075|        null|  null|1440624|
|  1.1.15200.1|4.18.1807.18075|        null|     0|1440623|
+-------------+---------------+------------+------+-------+
only showing top 10 rows



#### Explore categorical cardinalities and frequencies

In [17]:
# (column name, Spark type name) pairs; the 'string' columns are the ones
# treated as categorical in the next cell.
rawdf.dtypes

[('MachineIdentifier', 'string'),
 ('ProductName', 'string'),
 ('EngineVersion', 'string'),
 ('AppVersion', 'string'),
 ('AvSigVersion', 'string'),
 ('IsBeta', 'int'),
 ('RtpStateBitfield', 'float'),
 ('IsSxsPassiveMode', 'int'),
 ('DefaultBrowsersIdentifier', 'float'),
 ('AVProductStatesIdentifier', 'float'),
 ('AVProductsInstalled', 'float'),
 ('AVProductsEnabled', 'float'),
 ('HasTpm', 'int'),
 ('CountryIdentifier', 'int'),
 ('CityIdentifier', 'float'),
 ('OrganizationIdentifier', 'float'),
 ('GeoNameIdentifier', 'float'),
 ('LocaleEnglishNameIdentifier', 'int'),
 ('Platform', 'string'),
 ('Processor', 'string'),
 ('OsVer', 'string'),
 ('OsBuild', 'int'),
 ('OsSuite', 'int'),
 ('OsPlatformSubRelease', 'string'),
 ('OsBuildLab', 'string'),
 ('SkuEdition', 'string'),
 ('IsProtected', 'float'),
 ('AutoSampleOptIn', 'int'),
 ('PuaMode', 'string'),
 ('SMode', 'float'),
 ('IeVerIdentifier', 'float'),
 ('SmartScreen', 'string'),
 ('Firewall', 'float'),
 ('UacLuaenable', 'float'),
 ('Census

In [18]:
%%time
listCategorical = []
listNumerical = []
for eachcol in rawdf.dtypes[1:]: # excluding Machine-Identifier from features
    if eachcol[1] == 'string':
        listCategorical.append([eachcol[0],rawdf.select(eachcol[0]).distinct().count()])
        print listCategorical[-1]
    else:
        listNumerical.append(eachcol[0])

['ProductName', 6]
['EngineVersion', 70]
['AppVersion', 110]
['AvSigVersion', 8531]
['Platform', 4]
['Processor', 3]
['OsVer', 58]
['OsPlatformSubRelease', 9]
['OsBuildLab', 664]
['SkuEdition', 8]
['PuaMode', 3]
['SmartScreen', 22]
['Census_MDC2FormFactor', 13]
['Census_DeviceFamily', 3]
['Census_ProcessorClass', 4]
['Census_PrimaryDiskTypeName', 5]
['Census_ChassisTypeName', 53]
['Census_PowerPlatformRoleName', 11]
['Census_InternalBatteryType', 80]
['Census_OSVersion', 469]
['Census_OSArchitecture', 3]
['Census_OSBranch', 32]
['Census_OSEdition', 33]
['Census_OSSkuName', 30]
['Census_OSInstallTypeName', 9]
['Census_OSWUAutoUpdateOptionsName', 6]
['Census_GenuineStateName', 5]
['Census_ActivationChannel', 6]
['Census_FlightRing', 10]
CPU times: user 114 ms, sys: 38.4 ms, total: 152 ms
Wall time: 5min 8s


In [19]:
# Names of the categorical (string-typed) columns. The parenthesised
# single-argument print works on both Python 2 and Python 3.
print([eachcat[0] for eachcat in listCategorical])

['ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'Platform', 'Processor', 'OsVer', 'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'PuaMode', 'SmartScreen', 'Census_MDC2FormFactor', 'Census_DeviceFamily', 'Census_ProcessorClass', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSVersion', 'Census_OSArchitecture', 'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName', 'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 'Census_ActivationChannel', 'Census_FlightRing']


In [20]:
# Number of categorical (string-typed) feature columns.
len(listCategorical)

29

In [21]:
# Names of the non-string columns. The parenthesised single-argument print
# works on both Python 2 and Python 3.
print(listNumerical)

['IsBeta', 'RtpStateBitfield', 'IsSxsPassiveMode', 'DefaultBrowsersIdentifier', 'AVProductStatesIdentifier', 'AVProductsInstalled', 'AVProductsEnabled', 'HasTpm', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'OsBuild', 'OsSuite', 'IsProtected', 'AutoSampleOptIn', 'SMode', 'IeVerIdentifier', 'Firewall', 'UacLuaenable', 'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorCoreCount', 'Census_ProcessorManufacturerIdentifier', 'Census_ProcessorModelIdentifier', 'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 'Census_HasOpticalDiskDrive', 'Census_TotalPhysicalRAM', 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_InternalPrimaryDisplayResolutionHorizontal', 'Census_InternalPrimaryDisplayResolutionVertical', 'Census_InternalBatteryNumberOfCharges', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'Census_OSInstallLanguageIdentifier', 'Census_OSUILocaleIdentifier', 

In [22]:
# Number of non-string columns (all of rawdf except MachineIdentifier and the
# string-typed columns).
len(listNumerical)

53

### 2.3 Applying `StringIndexer` to all categorical features

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

In [24]:
# One StringIndexer per categorical column, writing to "<column>_indexed".
# handleInvalid='keep' asks the indexer to keep rows with invalid labels
# rather than raising an error or dropping them.
indexersCategorical = [
    StringIndexer(inputCol=colname,
                  outputCol=colname + "_indexed",
                  handleInvalid='keep')
    for colname, n_distinct in listCategorical
]

In [25]:
# Output column names produced by the indexers. The parenthesised
# single-argument print works on both Python 2 and Python 3.
print([eachindexer.getOutputCol() for eachindexer in indexersCategorical])

['ProductName_indexed', 'EngineVersion_indexed', 'AppVersion_indexed', 'AvSigVersion_indexed', 'Platform_indexed', 'Processor_indexed', 'OsVer_indexed', 'OsPlatformSubRelease_indexed', 'OsBuildLab_indexed', 'SkuEdition_indexed', 'PuaMode_indexed', 'SmartScreen_indexed', 'Census_MDC2FormFactor_indexed', 'Census_DeviceFamily_indexed', 'Census_ProcessorClass_indexed', 'Census_PrimaryDiskTypeName_indexed', 'Census_ChassisTypeName_indexed', 'Census_PowerPlatformRoleName_indexed', 'Census_InternalBatteryType_indexed', 'Census_OSVersion_indexed', 'Census_OSArchitecture_indexed', 'Census_OSBranch_indexed', 'Census_OSEdition_indexed', 'Census_OSSkuName_indexed', 'Census_OSInstallTypeName_indexed', 'Census_OSWUAutoUpdateOptionsName_indexed', 'Census_GenuineStateName_indexed', 'Census_ActivationChannel_indexed', 'Census_FlightRing_indexed']


#### Sanity Check for `StringIndexer`

In [26]:
# Fit every indexer on its own and show, for a few rows, the original
# category next to the index it was mapped to.
for (colname, n_distinct), indexer in zip(listCategorical, indexersCategorical):
    fitted_indexer = indexer.fit(rawdf)
    (fitted_indexer.transform(rawdf)
     .select('MachineIdentifier', colname, indexer.getOutputCol())
     .show(5))

+--------------------+------------+-------------------+
|   MachineIdentifier| ProductName|ProductName_indexed|
+--------------------+------------+-------------------+
|0000028988387b115...|win8defender|                0.0|
|000007535c3f730ef...|win8defender|                0.0|
|000007905a28d863f...|win8defender|                0.0|
|00000b11598a75ea8...|win8defender|                0.0|
|000014a5f00daa18e...|win8defender|                0.0|
+--------------------+------------+-------------------+
only showing top 5 rows

+--------------------+-------------+---------------------+
|   MachineIdentifier|EngineVersion|EngineVersion_indexed|
+--------------------+-------------+---------------------+
|0000028988387b115...|  1.1.15100.1|                  1.0|
|000007535c3f730ef...|  1.1.14600.4|                  4.0|
|000007905a28d863f...|  1.1.15100.1|                  1.0|
|00000b11598a75ea8...|  1.1.15100.1|                  1.0|
|000014a5f00daa18e...|  1.1.15100.1|                  1.0|

+--------------------+--------------------------+----------------------------------+
|   MachineIdentifier|Census_PrimaryDiskTypeName|Census_PrimaryDiskTypeName_indexed|
+--------------------+--------------------------+----------------------------------+
|0000028988387b115...|                       HDD|                               0.0|
|000007535c3f730ef...|                       HDD|                               0.0|
|000007905a28d863f...|                       SSD|                               1.0|
|00000b11598a75ea8...|                   UNKNOWN|                               2.0|
|000014a5f00daa18e...|                       HDD|                               0.0|
+--------------------+--------------------------+----------------------------------+
only showing top 5 rows

+--------------------+----------------------+------------------------------+
|   MachineIdentifier|Census_ChassisTypeName|Census_ChassisTypeName_indexed|
+--------------------+----------------------+-----------

+--------------------+------------------------+--------------------------------+
|   MachineIdentifier|Census_ActivationChannel|Census_ActivationChannel_indexed|
+--------------------+------------------------+--------------------------------+
|0000028988387b115...|                  Retail|                             0.0|
|000007535c3f730ef...|                  Retail|                             0.0|
|000007905a28d863f...|              OEM:NONSLP|                             3.0|
|00000b11598a75ea8...|              OEM:NONSLP|                             3.0|
|000014a5f00daa18e...|                  Retail|                             0.0|
+--------------------+------------------------+--------------------------------+
only showing top 5 rows

+--------------------+-----------------+-------------------------+
|   MachineIdentifier|Census_FlightRing|Census_FlightRing_indexed|
+--------------------+-----------------+-------------------------+
|0000028988387b115...|           Retail|     

#### Indexing all categorical features

In [27]:
%%time
pipeline = Pipeline(stages=indexersCategorical)
numdf = pipeline.fit(rawdf).transform(rawdf)

CPU times: user 224 ms, sys: 55.1 ms, total: 279 ms
Wall time: 5min 22s


### 2.4 Checking the indexed dataframe 

In [28]:
# Mark the indexed frame for caching so the checks below can reuse it.
# NOTE(review): cache() is lazy -- the data should only be materialized by the
# first action that follows; confirm the cluster has room for the full frame.
numdf.cache()

DataFrame[MachineIdentifier: string, ProductName: string, EngineVersion: string, AppVersion: string, AvSigVersion: string, IsBeta: int, RtpStateBitfield: float, IsSxsPassiveMode: int, DefaultBrowsersIdentifier: float, AVProductStatesIdentifier: float, AVProductsInstalled: float, AVProductsEnabled: float, HasTpm: int, CountryIdentifier: int, CityIdentifier: float, OrganizationIdentifier: float, GeoNameIdentifier: float, LocaleEnglishNameIdentifier: int, Platform: string, Processor: string, OsVer: string, OsBuild: int, OsSuite: int, OsPlatformSubRelease: string, OsBuildLab: string, SkuEdition: string, IsProtected: float, AutoSampleOptIn: int, PuaMode: string, SMode: float, IeVerIdentifier: float, SmartScreen: string, Firewall: float, UacLuaenable: float, Census_MDC2FormFactor: string, Census_DeviceFamily: string, Census_OEMNameIdentifier: float, Census_OEMModelIdentifier: float, Census_ProcessorCoreCount: float, Census_ProcessorManufacturerIdentifier: float, Census_ProcessorModelIdentifi

In [29]:
# Expect the 83 original columns plus 29 "_indexed" columns = 112.
len(numdf.columns)

112

In [30]:
# Ten most frequent EngineVersion values in the raw data.
(rawdf
 .select('EngineVersion')
 .groupBy('EngineVersion')
 .count()
 .orderBy(F.desc("count"))
 .show(10))

+-------------+-------+
|EngineVersion|  count|
+-------------+-------+
|  1.1.15200.1|3845067|
|  1.1.15100.1|3675915|
|  1.1.15000.2| 265218|
|  1.1.14901.4| 212408|
|  1.1.14600.4| 160585|
|  1.1.14800.3| 136476|
|  1.1.15300.6| 120295|
|  1.1.14104.0|  93926|
|  1.1.13504.0|  70645|
|  1.1.15300.5|  68716|
+-------------+-------+
only showing top 10 rows



In [31]:
# Ten most frequent EngineVersion indices; the counts should line up with the
# raw frequencies shown in the previous cell.
(numdf
 .select('EngineVersion_indexed')
 .groupBy('EngineVersion_indexed')
 .count()
 .orderBy(F.desc("count"))
 .show(10))

+---------------------+-------+
|EngineVersion_indexed|  count|
+---------------------+-------+
|                  0.0|3845067|
|                  1.0|3675915|
|                  2.0| 265218|
|                  3.0| 212408|
|                  4.0| 160585|
|                  5.0| 136476|
|                  6.0| 120295|
|                  7.0|  93926|
|                  8.0|  70645|
|                  9.0|  68716|
+---------------------+-------+
only showing top 10 rows



In [32]:
# Summary statistics of three indexed columns; the max index should be the
# column's distinct-value count minus one.
numdf.describe('EngineVersion_indexed', 'AppVersion_indexed', 'AvSigVersion_indexed').show()

+-------+---------------------+------------------+--------------------+
|summary|EngineVersion_indexed|AppVersion_indexed|AvSigVersion_indexed|
+-------+---------------------+------------------+--------------------+
|  count|              8921483|           8921483|             8921483|
|   mean|   1.4386187812048736| 3.139872597414578|  418.11989946066143|
| stddev|   3.0645009246419974|6.3083521882857125|   698.0236865590472|
|    min|                  0.0|               0.0|                 0.0|
|    max|                 69.0|             109.0|              8530.0|
+-------+---------------------+------------------+--------------------+



#### Save it as a parquet table and done

In [33]:
import pyarrow as pa
import pyarrow.parquet as pq

In [34]:
# Persist the indexed frame as snappy-compressed Parquet, replacing any
# previous output at this path. The format is named explicitly so the result
# does not depend on the session's default data source setting.
(numdf.write
 .format("parquet")
 .option("compression", "snappy")
 .mode("overwrite")
 .save("hdfs://master:54310/data/spark/msmalware/train_num_df.parquet.snappy"))