In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType)
from pyspark.sql.functions import *
from pyspark import SparkConf
from pyspark import SparkContext
import multiprocessing
from pyspark.ml import Pipeline
import sys

print('Inicio del Script')

# Core / parallelism configuration: derive Spark's partition counts from the
# number of CPUs on the local machine.
cores = multiprocessing.cpu_count()
p = 3  # multiplier applied to the core count to get the partition count
particiones = cores * p
conf = SparkConf()
conf.set("spark.sql.shuffle.partitions", particiones)
conf.set("spark.default.parallelism", particiones)
# getOrCreate reuses a context that is already running, so re-executing this
# cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)


Inicio del Script


In [2]:
spark = (
    SparkSession.builder
    .appName("Microsoft_Kaggle")
    .getOrCreate()
)

# Read the raw CSV files and keep only the columns used in this notebook.
print('Lectura del DF crudo')
data = (
    spark.read
    .csv('../data/df_cat/*.csv', header=True, inferSchema=True)
    .select(
        'MachineIdentifier',
        'Census_ChassisTypeName',
        'Census_InternalBatteryType',
        'Census_OSBranch',
        'Census_OSEdition',
        'Census_OSSkuName',
        'Census_FlightRing',
        'OsVer',
        'SmartScreen',
        'Census_MDC2FormFactor',
    )
)

# Persist the DataFrame to improve performance of the repeated scans below.
data.persist()
print('Numero de casos totales = {}'.format(data.count()))

Lectura del DF crudo
Numero de casos totales = 16774736


In [3]:
# Frequency table: number of rows per Census_ChassisTypeName value.
frequency_census = (
    data
    .groupBy('Census_ChassisTypeName')
    .count()
    .withColumnRenamed('count', 'Census_ChassisTypeName_freq')
)


In [4]:
# Number of rows in the frequency table, i.e. how many Census_ChassisTypeName groups exist.
frequency_census.count()

58

In [5]:
# Print up to 58 rows without truncating long cell values.
frequency_census.show(n=58, truncate=False)

+----------------------+---------------------------+
|Census_ChassisTypeName|Census_ChassisTypeName_freq|
+----------------------+---------------------------+
|127                   |10                         |
|81                    |2                          |
|36                    |5                          |
|AllinOne              |380502                     |
|Tablet                |24924                      |
|0                     |226                        |
|30                    |297                        |
|PeripheralChassis     |1                          |
|Unknown               |19057                      |
|EmbeddedPC            |1                          |
|SpaceSaving           |55390                      |
|49                    |1                          |
|HandHeld              |82628                      |
|35                    |59                         |
|StickPC               |309                        |
|64                    |1                     

In [6]:
def transformaciones_ChassisTypeName(x):
    """Normalise a raw ``Census_ChassisTypeName`` value.

    Args:
        x: Raw cell value; may be ``None``.

    Returns:
        ``'Numerico'`` when ``int(x)`` succeeds, ``'UNKNOWN'`` when ``x`` is
        ``'Unknown'`` or ``'Other'``, and ``x`` unchanged otherwise (so
        ``None`` stays ``None``).
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a numeric code"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        if x in ('Unknown', 'Other'):
            return 'UNKNOWN'
        return x
    return 'Numerico'
    
# Spark UDF wrapper around the chassis-name normaliser; returns a string column.
udf_ChassisTypeName = udf(
    lambda z: transformaciones_ChassisTypeName(z),
    returnType=StringType(),
)

In [12]:
# Overwrite the chassis column with its normalised value.
data = data.withColumn(
    'Census_ChassisTypeName',
    udf_ChassisTypeName('Census_ChassisTypeName'),
)

In [13]:
# List the distinct chassis values currently in `data` (up to 100 rows).
(
    data
    .select('Census_ChassisTypeName')
    .distinct()
    .show(n=100)
)

+----------------------+
|Census_ChassisTypeName|
+----------------------+
|              AllinOne|
|                Tablet|
|     PeripheralChassis|
|              Numerico|
|            EmbeddedPC|
|           SpaceSaving|
|              HandHeld|
|               StickPC|
|           Convertible|
|            SubChassis|
|              Portable|
|                MiniPC|
|               UNKNOWN|
|               Desktop|
|     LowProfileDesktop|
|          SealedCasePC|
|      ExpansionChassis|
|            Detachable|
|      RackMountChassis|
|           SubNotebook|
|   BusExpansionChassis|
|                  null|
|              Notebook|
|                Laptop|
|                 Tower|
|        BladeEnclosure|
|            IoTGateway|
|              LunchBox|
|    MultisystemChassis|
|        DockingStation|
|            CompactPCI|
|             MiniTower|
|     MainServerChassis|
|                 Blade|
|              PizzaBox|
+----------------------+



In [10]:
# Per-column fill values, keyed by column name.
# NOTE(review): presumably consumed by a later null-imputation step — not visible here.
imputaciones = dict(Census_ChassisTypeName='UNKNOWN')

In [14]:
# Frequency table: number of rows per Census_OSBranch value
# (reuses the `frequency_census` name from the chassis analysis).
frequency_census = (
    data
    .groupBy('Census_OSBranch')
    .count()
    .withColumnRenamed('count', 'Census_OSBranch_freq')
)

In [15]:
# TODO: think about how to group these values
frequency_census.show(n=40, truncate=False)

+-------------------------+--------------------+
|Census_OSBranch          |Census_OSBranch_freq|
+-------------------------+--------------------+
|rs5_release_sigma        |67                  |
|win7sp1_ldr              |18                  |
|rs5_release_sign         |1                   |
|th2_release_sec          |491437              |
|rs5_release              |305932              |
|rs_shell                 |2                   |
|rs1_release_sec          |1                   |
|rs4_release              |7988302             |
|rs_prerelease_flt        |2956                |
|rs_xbox                  |2                   |
|win7sp1_gdr              |2                   |
|rs3_release_svc_escrow   |1695788             |
|win8_gdr                 |3                   |
|rs1_release_inmarket     |3                   |
|rs1_release_srvmedia     |12                  |
|rs_onecore_base_cobalt   |1                   |
|th1_st1                  |379561              |
|winblue_ltsb       

In [16]:
# Frequency table: number of rows per Census_OSEdition value.
df_cat_freq_Census_OSEdition = (
    data
    .groupBy('Census_OSEdition')
    .count()
    .withColumnRenamed('count', 'Census_OSEdition_freq')
)

In [17]:
# Print up to 40 rows without truncating long cell values.
df_cat_freq_Census_OSEdition.show(n=40, truncate=False)

+---------------------------+---------------------+
|Census_OSEdition           |Census_OSEdition_freq|
+---------------------------+---------------------+
|Home                       |6                    |
|Window 10 Enterprise       |1                    |
|CoreN                      |9478                 |
|ProfessionalEducationN     |434                  |
|ServerDatacenter           |38                   |
|ServerDatacenterACor       |1                    |
|Ultimate                   |17                   |
|EducationN                 |1853                 |
|ProfessionalCountrySpecific|8                    |
|00426-OEM-8992662-00006    |1                    |
|ServerDatacenterEval       |1402                 |
|CoreCountrySpecific        |343596               |
|CloudN                     |12                   |
|Enterprise 2015 LTSB       |1                    |
|EnterpriseN                |672                  |
|Core                       |6243156              |
|EnterpriseS

In [21]:
def transformaciones_OSEdition(x):
    """Collapse raw ``Census_OSEdition`` spellings into a smaller set of labels.

    Args:
        x: Raw cell value; may be ``None``.

    Returns:
        ``None`` for the ``'#'`` placeholder, the group label for any value
        listed in the table below, and ``x`` unchanged for everything else.
    """
    # (raw spellings, label they collapse to); first matching row wins.
    grupos = (
        (('#',), None),
        (('00426-OEM-8992662-00006',), 'Ultimate'),
        (('HomePremium', 'HomeBasic'), 'Home'),
        (('Window 10 Enterprise', 'Enterprise 2015 LTSB'), 'Enterprise'),
        (('ServerDatacenterACor', 'ServerDatacenterEval'), 'ServerDatacenter'),
        (('ProfessionalSingleLanguage', 'PRO', 'Pro', 'professional',
          'ProfessionalCountrySpecific'), 'Professional'),
        (('ProfessionalEducationN', 'EducationN', 'ProfessionalEducation'), 'Education'),
        (('CloudN',), 'Cloud'),
    )
    for variantes, etiqueta in grupos:
        if x in variantes:
            return etiqueta
    return x
    
# Spark UDF wrapper around the OS-edition grouping; returns a string column.
udf_OSEdition = udf(
    lambda z: transformaciones_OSEdition(z),
    returnType=StringType(),
)

In [22]:
# Apply the edition grouping into a separate DataFrame `a`;
# `data` itself is not reassigned by this cell.
a = data.withColumn(
    'Census_OSEdition',
    udf_OSEdition('Census_OSEdition'),
)

In [24]:
# Row counts per grouped edition label (up to 100 rows, untruncated).
(
    a
    .groupBy('Census_OSEdition')
    .count()
    .show(n=100, truncate=False)
)

+------------------------+-------+
|Census_OSEdition        |count  |
+------------------------+-------+
|Home                    |11     |
|ServerDatacenter        |1441   |
|CoreN                   |9478   |
|Ultimate                |18     |
|CoreCountrySpecific     |343596 |
|EnterpriseN             |672    |
|Core                    |6243156|
|EnterpriseS             |40020  |
|ProfessionalN           |60807  |
|EnterpriseSN            |1482   |
|ProfessionalWorkstationN|21     |
|ServerSolution          |1303   |
|Education               |192433 |
|Enterprise              |69887  |
|ProfessionalWorkstation |303    |
|Cloud                   |12391  |
|EnterpriseG             |1      |
|null                    |6      |
|CoreSingleLanguage      |3542427|
|ServerStandardEval      |4538   |
|Professional            |6232539|
|ServerRdsh              |28     |
|ServerStandard          |18178  |
+------------------------+-------+



In [20]:
# Register the fill value for Census_OSEdition in the imputation mapping.
imputaciones.update(Census_OSEdition='Unknown')

In [23]:
# Frequency table: number of rows per Census_OSSkuName value.
frequency_Census_OSSkuName = (
    data
    .groupBy('Census_OSSkuName')
    .count()
    .withColumnRenamed('count', 'Census_OSSkuName_freq')
)

In [25]:
# Print up to 32 rows without truncating long cell values.
frequency_Census_OSSkuName.show(n=32, truncate=False)

+----------------------------+---------------------+
|Census_OSSkuName            |Census_OSSkuName_freq|
+----------------------------+---------------------+
|ULTIMATE                    |18                   |
|CORE_COUNTRYSPECIFIC        |343155               |
|EDUCATION_N                 |1847                 |
|PRO_CHINA                   |7                    |
|UNLICENSED                  |30                   |
|ENTERPRISEG                 |2                    |
|PROFESSIONAL_N              |61215                |
|DATACENTER_SERVER           |41                   |
|ENTERPRISE_S_N              |1484                 |
|ENTERPRISE                  |69737                |
|PRO_WORKSTATION_N           |20                   |
|CORE_SINGLELANGUAGE         |3541887              |
|HOME_BASIC                  |1                    |
|SB_SOLUTION_SERVER          |1304                 |
|DATACENTER_EVALUATION_SERVER|1398                 |
|PROFESSIONAL                |6345213         

In [26]:
# Frequency table: number of rows per Census_FlightRing value.
frequency_Census_Census_FlightRing = (
    data
    .groupBy('Census_FlightRing')
    .count()
    .withColumnRenamed('count', 'Census_FlightRing_freq')
)

In [28]:
# Display the Census_FlightRing frequency table with show()'s default settings.
frequency_Census_Census_FlightRing.show()

+-----------------+----------------------+
|Census_FlightRing|Census_FlightRing_freq|
+-----------------+----------------------+
|              OSG|                    11|
|          Unknown|                444681|
|           Canary|                     5|
|           Retail|              15696448|
|          Invalid|                     2|
|         Disabled|                  7101|
|              WIF|                 20435|
|              WIS|                 20663|
|               RP|                 26769|
|         CBCanary|                     1|
|          NOT_SET|                558620|
+-----------------+----------------------+



In [32]:
# Frequency table: number of rows per SmartScreen value.
df_cat_freq_SmartScreen = (
    data
    .groupBy('SmartScreen')
    .count()
    .withColumnRenamed('count', 'SmartScreen_freq')
)

In [34]:
# Print up to 28 rows without truncating long cell values.
df_cat_freq_SmartScreen.show(n=28, truncate=False)

+-------------+----------------+
|SmartScreen  |SmartScreen_freq|
+-------------+----------------+
|&#x01;       |602             |
|OFF          |11              |
|Block        |43774           |
|0            |6               |
|00000000     |1               |
|&#x02;       |820             |
|requireadmin |20              |
|ON           |1               |
|Promt        |2               |
|RequiredAdmin|1               |
|Enabled      |1               |
|of           |1               |
|BLOCK        |1               |
|Prompt       |63422           |
|on           |280             |
|&#x03;       |1               |
|Off          |347924          |
|RequireAdmin |7729743         |
|off          |3133            |
|null         |6675413         |
|ExistsNotSet |1646629         |
|requireAdmin |1               |
|warn         |2               |
|On           |1536            |
|Warn         |261408          |
|prompt       |1               |
|Deny         |1               |
|Promprt  

In [None]:
# Replace the literal '&#x01;' SmartScreen value with null and keep every
# other value as-is. The replacement value belongs inside when(cond, value);
# the original closed when() early and had an unbalanced parenthesis.
data = data.withColumn(
    'SmartScreen',
    when(col('SmartScreen') == '&#x01;', None).otherwise(col('SmartScreen')),
)

In [52]:
def transformaciones_SmartScreen(x):
    """Unify the spelling and casing variants of a raw ``SmartScreen`` value.

    Args:
        x: Raw cell value; may be ``None``.

    Returns:
        ``None`` for the placeholder values in the first row of the table
        below, the unified label for any other listed variant, and ``x``
        unchanged for everything else.
    """
    # (raw spellings, label they collapse to); first matching row wins.
    grupos = (
        (('&#x01;', '0', '00000000', '&#x02;', '&#x03;'), None),
        (('Block',), 'BLOCK'),
        (('requireadmin', 'RequireAdmin', 'requireAdmin'), 'RequiredAdmin'),
        (('Promt', 'prompt', 'Promprt'), 'Prompt'),
        (('of', 'Off', 'off'), 'OFF'),
        (('on', 'On', 'Enabled'), 'ON'),
        (('warn',), 'Warn'),
    )
    for variantes, etiqueta in grupos:
        if x in variantes:
            return etiqueta
    return x
    
# Spark UDF wrapper around the SmartScreen normaliser; returns a string column.
udf_SmartScreen = udf(
    lambda z: transformaciones_SmartScreen(z),
    returnType=StringType(),
)

In [55]:
# Overwrite the SmartScreen column with its normalised value.
data = data.withColumn(
    'SmartScreen',
    udf_SmartScreen('SmartScreen'),
)

In [56]:
# List the distinct SmartScreen values currently present in `data`.
data.select('SmartScreen').distinct().show()

+-------------+
|  SmartScreen|
+-------------+
|          OFF|
|           ON|
|RequiredAdmin|
|        BLOCK|
|       Prompt|
|         null|
| ExistsNotSet|
|         Warn|
|         Deny|
+-------------+



In [58]:
# Preview the first 50 rows of the working DataFrame.
data.show(n=50)

+--------------------+----------------------+--------------------------+--------------------+--------------------+--------------------+-----------------+--------+-------------+---------------------+
|   MachineIdentifier|Census_ChassisTypeName|Census_InternalBatteryType|     Census_OSBranch|    Census_OSEdition|    Census_OSSkuName|Census_FlightRing|   OsVer|  SmartScreen|Census_MDC2FormFactor|
+--------------------+----------------------+--------------------------+--------------------+--------------------+--------------------+-----------------+--------+-------------+---------------------+
|5344f3df517376797...|              Notebook|                      null|         rs4_release|  CoreSingleLanguage| CORE_SINGLELANGUAGE|           Retail|10.0.0.0|         null|             Notebook|
|5344f5da09650da47...|              Notebook|                      li-i|         th2_release|                Core|                CORE|           Retail|10.0.0.0|RequiredAdmin|             Notebook|
|5344

In [None]:
# Register the fill value for SmartScreen in the imputation mapping.
imputaciones.update(SmartScreen='Unknown')

In [59]:
# Frequency table: number of rows per Census_MDC2FormFactor value.
# NOTE: despite the `osver` in the variable name, this table is not about OsVer.
df_cat_freq_osver = (
    data
    .groupBy('Census_MDC2FormFactor')
    .count()
    .withColumnRenamed('count', 'Census_MDC2FormFactor_freq')
)

In [61]:
# Display the Census_MDC2FormFactor frequency table with show()'s default settings.
df_cat_freq_osver.show()

+---------------------+--------------------------+
|Census_MDC2FormFactor|Census_MDC2FormFactor_freq|
+---------------------+--------------------------+
|          SmallTablet|                     52491|
|          ServerOther|                        53|
|          Convertible|                    744141|
|                Other|                         1|
|              Desktop|                   3813472|
|          LargeTablet|                    121325|
|             IoTOther|                         2|
|          LargeServer|                      1453|
|           Detachable|                    544171|
|             AllInOne|                    547632|
|             Notebook|                  10652855|
|              PCOther|                    275408|
|         MediumServer|                      6169|
|          SmallServer|                     15563|
+---------------------+--------------------------+



In [62]:
# Flag (1/0) whether the form factor is one of the catch-all "Other" categories.
# A null form factor matches nothing and therefore gets 0.
data = data.withColumn(
    'Census_MDC2FormFactor_other',
    when(
        col('Census_MDC2FormFactor').isin('Other', 'ServerOther', 'IoTOther', 'PCOther'),
        1,
    ).otherwise(0),
)