# Kaggle Competition : Microsoft Malware Data
> This is Sungryong Hong's notebook.  

> I have a stand-alone Spark (2.3.2) / Hadoop (2.8.3) cluster with 48 logical cores and 150GB of memory. 

> I have put the data files on my HDFS. Check the contents with `hfs -cat /data/spark/msmalware/test.csv | head`.  

>`hfs` is an alias for `hdfs dfs`. 


## 1. Import Basic Packages

In [1]:
# Basic Libraries 
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
import gc

# Let pandas render up to 500 rows, so the transposed wide tables below are shown in full
pd.set_option('display.max_rows', 500)

# Plot settings: serif Times New Roman, 16pt text, STIX math glyphs
plt.rc('font', family='serif', serif='Times New Roman')
plt.rcParams['font.size'] = 16
plt.rcParams['mathtext.fontset'] = 'stix'

#### Spark-Shell Session 

In [2]:
# Basic PySpark Libraries

# Old Style : SparkContext 
#from pyspark import SparkContext   
#from pyspark.sql import SQLContext


# New Style : Spark Session  
#Shell-Mode: Spark Session Name is `spark`

# `spark` is the SparkSession that the pyspark shell provides (see the note above).
# SQLContext is imported explicitly so this cell does not rely on the shell having
# already put that name into the namespace.
from pyspark.sql import SQLContext

sc = spark.sparkContext
sqlsc = SQLContext(sc)

# Directory on HDFS used for RDD/DataFrame checkpoints
sc.setCheckpointDir("hdfs://master:54310/tmp/spark/checkpoints")

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

In [3]:
# Session-level settings, applied in order:
#   - enable Arrow to speed up Python-side data exchange
#   - set spark.debug.maxToStringFields to 50
for conf_key, conf_value in (
    ("spark.sql.execution.arrow.enabled", "true"),
    ('spark.debug.maxToStringFields', 50),
):
    spark.conf.set(conf_key, conf_value)

#### Spark-Submit Session

> An example of Spark Session for non-shell (submit) mode; 

>`spark = SparkSession.builder.appName("largeScaleGstat")\
    .config("spark.driver.maxResultSize","8g")\
    .config("spark.sql.execution.arrow.enabled","true")\
    .config("spark.executor.memoryOverhead","42GB")\
    .getOrCreate()`


## 2. Read the encoded parquet table

In [4]:
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
# Load the encoded training table from HDFS
train_num_path = 'hdfs://master:54310/data/spark/msmalware/train_num_df.parquet.snappy'
numdf = sqlsc.read.parquet(train_num_path)

### 2.1 Explore the dataframe

In [6]:
# List every column name of the loaded table.
# print(...) with a single argument prints the same under Python 2 and Python 3.
print(numdf.columns)

['MachineIdentifier', 'ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'IsBeta', 'RtpStateBitfield', 'IsSxsPassiveMode', 'DefaultBrowsersIdentifier', 'AVProductStatesIdentifier', 'AVProductsInstalled', 'AVProductsEnabled', 'HasTpm', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'Platform', 'Processor', 'OsVer', 'OsBuild', 'OsSuite', 'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'IsProtected', 'AutoSampleOptIn', 'PuaMode', 'SMode', 'IeVerIdentifier', 'SmartScreen', 'Firewall', 'UacLuaenable', 'Census_MDC2FormFactor', 'Census_DeviceFamily', 'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorCoreCount', 'Census_ProcessorManufacturerIdentifier', 'Census_ProcessorModelIdentifier', 'Census_ProcessorClass', 'Census_PrimaryDiskTotalCapacity', 'Census_PrimaryDiskTypeName', 'Census_SystemVolumeTotalCapacity', 'Census_HasOpticalDiskDrive', 'Census_TotalPhysicalRAM', 'Census_ChassisType

In [7]:
# Total number of columns in the loaded table
len(numdf.columns)

148

#### Because there are too many columns, apply `transpose()` to display the contents vertically

In [8]:
# Pull two rows to the driver and flip them so each column becomes a displayed row
numdf.limit(2).toPandas().T

Unnamed: 0,0,1
MachineIdentifier,aade60f69e46ec08d0c36d3fd333a873,aade6203179179e823eb82de772692e8
ProductName,win8defender,win8defender
EngineVersion,1.1.15200.1,1.1.15100.1
AppVersion,4.18.1807.18075,4.18.1806.18062
AvSigVersion,1.275.215.0,1.273.1490.0
IsBeta,0,0
RtpStateBitfield,7,7
IsSxsPassiveMode,0,0
DefaultBrowsersIdentifier,,
AVProductStatesIdentifier,53447,53447


#### Select numerical features

In [9]:
# Bucket the columns of numdf into three (not necessarily disjoint) groups:
#   listIndexedFeatures : column name contains 'indexed'
#   listImputedFeatures : column name contains 'imputed'
#   listIntegerFeatures : column dtype string is exactly 'int'
# NOTE(review): selecting by dtype alone also picks up the int column
# 'HasDetections' (it shows up in the printed feature list further down).
column_types = numdf.dtypes  # list of (column name, dtype string) pairs

listIndexedFeatures = [name for name, kind in column_types if 'indexed' in name]
listImputedFeatures = [name for name, kind in column_types if 'imputed' in name]
listIntegerFeatures = [name for name, kind in column_types if kind == 'int']

# Same ordering as before: integer columns first, then indexed, then imputed
listFeatures = listIntegerFeatures + listIndexedFeatures + listImputedFeatures

In [10]:
# Number of selected columns (integer + indexed + imputed)
len(listFeatures)

82

In [11]:
# Preview the first eight rows of the selected columns, one column per displayed row
numdf.select(listFeatures).limit(8).toPandas().T

Unnamed: 0,0,1,2,3,4,5,6,7
IsBeta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IsSxsPassiveMode,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HasTpm,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CountryIdentifier,84.0,179.0,41.0,43.0,149.0,44.0,89.0,50.0
LocaleEnglishNameIdentifier,44.0,75.0,217.0,42.0,56.0,218.0,75.0,51.0
OsBuild,16299.0,16299.0,17134.0,17134.0,9600.0,17134.0,10240.0,17134.0
OsSuite,768.0,768.0,256.0,768.0,768.0,768.0,256.0,256.0
AutoSampleOptIn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Census_HasOpticalDiskDrive,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
Census_OSBuildNumber,16299.0,16299.0,17134.0,17134.0,10586.0,17134.0,10240.0,17134.0


#### Check that all features have zero `null` values.

In [12]:
# One aggregate per selected column: count the rows where the value is NaN or NULL
null_count_exprs = [
    F.count(F.when(F.isnan(name) | F.col(name).isNull(), name)).alias(name)
    for name in listFeatures
]

# The aggregation yields a single row; transpose it so each feature is a row
nullstatdf = numdf.select(null_count_exprs).toPandas().T
nullstatdf.columns = ['null_counts']

# Largest counts first, so any feature with missing values appears at the top
nullstatdf.sort_values(by='null_counts', ascending=False)

Unnamed: 0,null_counts
IsBeta,0
Census_GenuineStateName_indexed,0
Census_OSInstallLanguageIdentifier_indexed,0
Census_OSInstallTypeName_indexed,0
Census_OSSkuName_indexed,0
Census_OSEdition_indexed,0
Census_OSBranch_indexed,0
Census_OSArchitecture_indexed,0
Census_OSVersion_indexed,0
Census_InternalBatteryType_indexed,0


## 3. Vectorize the Features

#### `OneHotEncode` the indexed features

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator

In [14]:
# Each indexed column gets a one-hot output column named "<input>_onehot"
onehot_output_names = [name + "_onehot" for name in listIndexedFeatures]
encoder = OneHotEncoderEstimator(inputCols=listIndexedFeatures,
                                 outputCols=onehot_output_names)

In [15]:
# Names of the one-hot output columns, as configured on the encoder
listOneHotFeatures = encoder.getOutputCols()
# print(...) with a single argument prints the same under Python 2 and Python 3.
print(listOneHotFeatures)

['ProductName_indexed_onehot', 'EngineVersion_indexed_onehot', 'AppVersion_indexed_onehot', 'AvSigVersion_indexed_onehot', 'RtpStateBitfield_indexed_onehot', 'DefaultBrowsersIdentifier_indexed_onehot', 'AVProductStatesIdentifier_indexed_onehot', 'AVProductsInstalled_indexed_onehot', 'AVProductsEnabled_indexed_onehot', 'CityIdentifier_indexed_onehot', 'OrganizationIdentifier_indexed_onehot', 'GeoNameIdentifier_indexed_onehot', 'Platform_indexed_onehot', 'Processor_indexed_onehot', 'OsVer_indexed_onehot', 'OsPlatformSubRelease_indexed_onehot', 'OsBuildLab_indexed_onehot', 'SkuEdition_indexed_onehot', 'IsProtected_indexed_onehot', 'PuaMode_indexed_onehot', 'SMode_indexed_onehot', 'IeVerIdentifier_indexed_onehot', 'SmartScreen_indexed_onehot', 'Firewall_indexed_onehot', 'UacLuaenable_indexed_onehot', 'Census_MDC2FormFactor_indexed_onehot', 'Census_DeviceFamily_indexed_onehot', 'Census_OEMNameIdentifier_indexed_onehot', 'Census_OEMModelIdentifier_indexed_onehot', 'Census_ProcessorManufactur

In [16]:
%%time
numdf = encoder.fit(numdf).transform(numdf)

CPU times: user 20.9 ms, sys: 6.31 ms, total: 27.2 ms
Wall time: 1.54 s


In [17]:
# Spot-check two indexed columns next to their one-hot encoded counterparts
sample_columns = [
    'Census_OSSkuName_indexed',
    'Census_ProcessorClass_indexed',
    'Census_OSSkuName_indexed_onehot',
    'Census_ProcessorClass_indexed_onehot',
]
numdf.select(*sample_columns).show(5)

+------------------------+-----------------------------+-------------------------------+------------------------------------+
|Census_OSSkuName_indexed|Census_ProcessorClass_indexed|Census_OSSkuName_indexed_onehot|Census_ProcessorClass_indexed_onehot|
+------------------------+-----------------------------+-------------------------------+------------------------------------+
|                     0.0|                          3.0|                 (30,[0],[1.0])|                           (3,[],[])|
|                     0.0|                          3.0|                 (30,[0],[1.0])|                           (3,[],[])|
|                     1.0|                          3.0|                 (30,[1],[1.0])|                           (3,[],[])|
|                     3.0|                          3.0|                 (30,[3],[1.0])|                           (3,[],[])|
|                     0.0|                          3.0|                 (30,[0],[1.0])|                           (3,

#### `VectorAssembler` the vectorizedFeatures

In [18]:
from pyspark.ml.feature import VectorAssembler

In [19]:
# Columns to carry forward: integer columns, then the one-hot vectors (in place of
# the raw indexed columns), then the imputed columns
vectorizedFeatures = []
vectorizedFeatures.extend(listIntegerFeatures)
vectorizedFeatures.extend(listOneHotFeatures)
vectorizedFeatures.extend(listImputedFeatures)

In [20]:
# Number of columns carried forward (integer + one-hot + imputed)
len(vectorizedFeatures)

82

In [21]:
# Show the carried-forward column names.
# print(...) with a single argument prints the same under Python 2 and Python 3.
print(vectorizedFeatures)

['IsBeta', 'IsSxsPassiveMode', 'HasTpm', 'CountryIdentifier', 'LocaleEnglishNameIdentifier', 'OsBuild', 'OsSuite', 'AutoSampleOptIn', 'Census_HasOpticalDiskDrive', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'Census_OSUILocaleIdentifier', 'Census_IsPortableOperatingSystem', 'Census_IsSecureBootEnabled', 'Census_IsTouchEnabled', 'Census_IsPenCapable', 'HasDetections', 'ProductName_indexed_onehot', 'EngineVersion_indexed_onehot', 'AppVersion_indexed_onehot', 'AvSigVersion_indexed_onehot', 'RtpStateBitfield_indexed_onehot', 'DefaultBrowsersIdentifier_indexed_onehot', 'AVProductStatesIdentifier_indexed_onehot', 'AVProductsInstalled_indexed_onehot', 'AVProductsEnabled_indexed_onehot', 'CityIdentifier_indexed_onehot', 'OrganizationIdentifier_indexed_onehot', 'GeoNameIdentifier_indexed_onehot', 'Platform_indexed_onehot', 'Processor_indexed_onehot', 'OsVer_indexed_onehot', 'OsPlatformSubRelease_indexed_onehot', 'OsBuildLab_indexed_onehot', 'SkuEdition_indexed_onehot', 'IsProtected_indexe

In [22]:
# Assemble the model input vector.
# vectorizedFeatures contains the int column 'HasDetections', which is the
# prediction target; feeding it into "features" would leak the label into the
# inputs. It is therefore excluded from the assembler inputs only -- the column
# itself stays in vectorizedFeatures, so it can still be selected alongside
# "features" downstream.
labelCol = 'HasDetections'
assemblerInputCols = [name for name in vectorizedFeatures if name != labelCol]
vecAssem = VectorAssembler(inputCols=assemblerInputCols, outputCol="features")

In [23]:
# Keep the machine id, every carried-forward column, and the assembled vector
mldata_columns = ['MachineIdentifier'] + vectorizedFeatures + ['features']
assembled_df = vecAssem.transform(numdf)
mldata = assembled_df.select(mldata_columns)

In [24]:
# Mark mldata for caching; as the last expression in the cell, the returned
# DataFrame's repr (its schema) is displayed.
mldata.cache()

DataFrame[MachineIdentifier: string, IsBeta: int, IsSxsPassiveMode: int, HasTpm: int, CountryIdentifier: int, LocaleEnglishNameIdentifier: int, OsBuild: int, OsSuite: int, AutoSampleOptIn: int, Census_HasOpticalDiskDrive: int, Census_OSBuildNumber: int, Census_OSBuildRevision: int, Census_OSUILocaleIdentifier: int, Census_IsPortableOperatingSystem: int, Census_IsSecureBootEnabled: int, Census_IsTouchEnabled: int, Census_IsPenCapable: int, HasDetections: int, ProductName_indexed_onehot: vector, EngineVersion_indexed_onehot: vector, AppVersion_indexed_onehot: vector, AvSigVersion_indexed_onehot: vector, RtpStateBitfield_indexed_onehot: vector, DefaultBrowsersIdentifier_indexed_onehot: vector, AVProductStatesIdentifier_indexed_onehot: vector, AVProductsInstalled_indexed_onehot: vector, AVProductsEnabled_indexed_onehot: vector, CityIdentifier_indexed_onehot: vector, OrganizationIdentifier_indexed_onehot: vector, GeoNameIdentifier_indexed_onehot: vector, Platform_indexed_onehot: vector, Pro

#### Save it as a parquet table and done

In [25]:
import pyarrow as pa
import pyarrow.parquet as pq

In [26]:
# Persist the assembled table to HDFS as snappy-compressed parquet, replacing any
# previous output. The format is set explicitly so the result does not depend on
# the session's default data source (spark.sql.sources.default).
mldata.write.format("parquet")\
     .option("compression","snappy").mode("overwrite")\
     .save("hdfs://master:54310/data/spark/msmalware/mldata.parquet.snappy")