# Con MongoDB

In [None]:
import pandas as pd
import numpy as np
import pymongo

In [None]:
# Start the MongoDB client against the server listening on localhost:27017.
myclient = pymongo.MongoClient(host="mongodb://localhost:27017/")

In [None]:
# Select the database we want to work with, in this case "microsoft".
db = myclient["microsoft"]

In [None]:
# Get handles to the collections of interest in the database,
# i.e. the previously loaded train and test sets.
train, test = db["train"], db["test"]

In [None]:
# Print a single document from the train collection as a quick sanity check.
print(train.find_one())

In [None]:
# Print a single document from the test collection as a quick sanity check.
print(test.find_one())

In [None]:
# If there is RAM to spare, these collections can be imported
# into a pandas DataFrame.
# NOTE(review): train.find() is called without a filter, so every document of
# the collection would be pulled into memory - confirm it fits before enabling.

# df_train = pd.DataFrame.from_records(train.find())

# Con PySpark

Mejor para DFs con un gran volumen de datos

Librerías a utilizar

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType)
from pyspark.sql.functions import *
from pyspark import SparkConf
from pyspark import SparkContext
import multiprocessing

Configuración básica de particiones y memoria a utilizar

In [2]:
# Basic Spark configuration: the partition counts are derived from the number
# of local CPU cores, and the driver gets a fixed amount of memory.
cores = multiprocessing.cpu_count()
p = 10  # partitions per core, used for both shuffle and default parallelism
conf = SparkConf()
conf.set("spark.driver.cores", str(cores))
conf.set("spark.driver.memory", "10g")
conf.set("spark.sql.shuffle.partitions", str(p * cores))
conf.set("spark.default.parallelism", str(p * cores))
# getOrCreate() instead of the bare constructor: re-running this cell in a
# live kernel would otherwise fail because only one SparkContext may be
# active per process. Note that when a context already exists it is returned
# as-is, so restart the kernel for changed settings to take effect.
sc = SparkContext.getOrCreate(conf=conf)

Iniciamos la SparkSession

In [3]:
# Create (or fetch the already running) SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName('separador_cat_num')
    .getOrCreate()
)

Schemas para train y para test

In [4]:
# Columns present in both the train and the test schema, in declaration order.
FEATURE_COLUMNS = [
    'MachineIdentifier',
    'ProductName',
    'EngineVersion',
    'AppVersion',
    'AvSigVersion',
    'IsBeta',
    'RtpStateBitfield',
    'IsSxsPassiveMode',
    'DefaultBrowsersIdentifier',
    'AVProductStatesIdentifier',
    'AVProductsInstalled',
    'AVProductsEnabled',
    'HasTpm',
    'CountryIdentifier',
    'CityIdentifier',
    'OrganizationIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'Platform',
    'Processor',
    'OsVer',
    'OsBuild',
    'OsSuite',
    'OsPlatformSubRelease',
    'OsBuildLab',
    'SkuEdition',
    'IsProtected',
    'AutoSampleOptIn',
    'PuaMode',
    'SMode',
    'IeVerIdentifier',
    'SmartScreen',
    'Firewall',
    'UacLuaenable',
    'Census_MDC2FormFactor',
    'Census_DeviceFamily',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorCoreCount',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_ProcessorClass',
    'Census_PrimaryDiskTotalCapacity',
    'Census_PrimaryDiskTypeName',
    'Census_SystemVolumeTotalCapacity',
    'Census_HasOpticalDiskDrive',
    'Census_TotalPhysicalRAM',
    'Census_ChassisTypeName',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_PowerPlatformRoleName',
    'Census_InternalBatteryType',
    'Census_InternalBatteryNumberOfCharges',
    'Census_OSVersion',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSBuildNumber',
    'Census_OSBuildRevision',
    'Census_OSEdition',
    'Census_OSSkuName',
    'Census_OSInstallTypeName',
    'Census_OSInstallLanguageIdentifier',
    'Census_OSUILocaleIdentifier',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_IsPortableOperatingSystem',
    'Census_GenuineStateName',
    'Census_ActivationChannel',
    'Census_IsFlightingInternal',
    'Census_IsFlightsDisabled',
    'Census_FlightRing',
    'Census_ThresholdOptIn',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Census_IsSecureBootEnabled',
    'Census_IsWIMBootEnabled',
    'Census_IsVirtualDevice',
    'Census_IsTouchEnabled',
    'Census_IsPenCapable',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Wdft_IsGamer',
    'Wdft_RegionIdentifier',
]

# Label column; it is declared only in the train schema.
TARGET_COLUMN = 'HasDetections'


def _all_string_schema(column_names):
    """Return a StructType declaring every given column as a nullable string.

    Args:
        column_names: iterable of column names, in the desired field order.

    Returns:
        A StructType with one ``StructField(name, StringType(), True)`` per name.
    """
    return StructType([StructField(name, StringType(), True)
                       for name in column_names])


# Both schemas come from the same column list so they cannot drift apart;
# the train schema only adds the label as its last field.
schema_train = _all_string_schema(FEATURE_COLUMNS + [TARGET_COLUMN])
schema_test = _all_string_schema(FEATURE_COLUMNS)

Cargamos los datos. Para empezar lo haremos con inferSchema=True

In [5]:
# Load both CSV files, treating the first row as the header and letting Spark
# infer the column types from the data.
df_train = spark.read.options(header=True, inferSchema=True).csv('data/train.csv')
df_test = spark.read.options(header=True, inferSchema=True).csv('data/test.csv')

In [6]:
# Show the first 4 train rows without truncating the cell contents.
df_train.show(n=4, truncate=False)

+--------------------------------+------------+-------------+---------------+------------+------+----------------+----------------+-------------------------+-------------------------+-------------------+-----------------+------+-----------------+--------------+----------------------+-----------------+---------------------------+---------+---------+--------+-------+-------+--------------------+----------------------------------------+----------+-----------+---------------+-------+-----+---------------+------------+--------+------------+---------------------+-------------------+------------------------+-------------------------+-------------------------+--------------------------------------+-------------------------------+---------------------+-------------------------------+--------------------------+--------------------------------+--------------------------+-----------------------+----------------------+-------------------------------------------------+-------------------------------

In [7]:
# Show the first 4 test rows without truncating the cell contents.
df_test.show(n=4, truncate=False)

+--------------------------------+------------+-------------+-----------+------------+------+----------------+----------------+-------------------------+-------------------------+-------------------+-----------------+------+-----------------+--------------+----------------------+-----------------+---------------------------+---------+---------+--------+-------+-------+--------------------+-----------------------------------------------------+----------+-----------+---------------+-------+-----+---------------+------------+--------+------------+---------------------+-------------------+------------------------+-------------------------+-------------------------+--------------------------------------+-------------------------------+---------------------+-------------------------------+--------------------------+--------------------------------+--------------------------+-----------------------+----------------------+-------------------------------------------------+----------------------

Creamos la variable "HasDetections" para el test

In [8]:
# Add an all-null 'HasDetections' column to the test set so it has the same
# columns as train. A bare lit(None) yields an untyped (NullType) column, so
# cast it to the type the train set uses for this column to keep both
# schemas aligned.
df_test = df_test.withColumn(
    'HasDetections',
    lit(None).cast(df_train.schema['HasDetections'].dataType),
)

In [9]:
# Show the first 4 test rows again, now including the added column.
df_test.show(n=4, truncate=False)

+--------------------------------+------------+-------------+-----------+------------+------+----------------+----------------+-------------------------+-------------------------+-------------------+-----------------+------+-----------------+--------------+----------------------+-----------------+---------------------------+---------+---------+--------+-------+-------+--------------------+-----------------------------------------------------+----------+-----------+---------------+-------+-----+---------------+------------+--------+------------+---------------------+-------------------+------------------------+-------------------------+-------------------------+--------------------------------------+-------------------------------+---------------------+-------------------------------+--------------------------+--------------------------------+--------------------------+-----------------------+----------------------+-------------------------------------------------+----------------------

In [10]:
# Row counts of train and test, printed on one line separated by a space.
print(*(frame.count() for frame in (df_train, df_test)))

8921483 7853253


Unimos el train y el test en un único DF

In [11]:
# Stack train and test into a single DataFrame. union() pairs columns purely
# by position; unionByName() pairs them by column name, so a difference in
# column order between the two frames cannot silently mix up the data.
full_df = df_train.unionByName(df_test)

In [12]:
# Mark the combined DataFrame for caching and run a count on it, printing the
# total number of rows.
print(full_df.persist().count())

16774736


Ahora comenzamos con el proceso de separar las variables categóricas y numéricas

In [16]:
# Start with two independent empty lists: one for categorical column names
# and one for numeric column names.
cat_cols, num_cols = [], []

In [21]:
# Split the columns by Spark dtype: 'string' columns are treated as
# categorical, everything else as numeric.
# The lists are rebuilt from scratch (not appended to) so re-running this cell
# does not duplicate entries, and the comprehension variables stay local so
# they do not overwrite `col` from the pyspark.sql.functions star import.
all_types = full_df.dtypes  # list of (column name, dtype string) pairs
cat_cols = [name for name, dtype in all_types if dtype == 'string']
num_cols = [name for name, dtype in all_types if dtype != 'string']

In [22]:
# Display the names of the columns classified as categorical (string dtype).
cat_cols

['MachineIdentifier',
 'ProductName',
 'EngineVersion',
 'AppVersion',
 'AvSigVersion',
 'Platform',
 'Processor',
 'OsVer',
 'OsPlatformSubRelease',
 'OsBuildLab',
 'SkuEdition',
 'PuaMode',
 'SmartScreen',
 'Census_MDC2FormFactor',
 'Census_DeviceFamily',
 'Census_ProcessorClass',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_PowerPlatformRoleName',
 'Census_InternalBatteryType',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSSkuName',
 'Census_OSInstallTypeName',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_GenuineStateName',
 'Census_ActivationChannel',
 'Census_FlightRing']

In [23]:
# Display the names of the columns classified as numeric (non-string dtype).
num_cols

['IsBeta',
 'RtpStateBitfield',
 'IsSxsPassiveMode',
 'DefaultBrowsersIdentifier',
 'AVProductStatesIdentifier',
 'AVProductsInstalled',
 'AVProductsEnabled',
 'HasTpm',
 'CountryIdentifier',
 'CityIdentifier',
 'OrganizationIdentifier',
 'GeoNameIdentifier',
 'LocaleEnglishNameIdentifier',
 'OsBuild',
 'OsSuite',
 'IsProtected',
 'AutoSampleOptIn',
 'SMode',
 'IeVerIdentifier',
 'Firewall',
 'UacLuaenable',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorCoreCount',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_PrimaryDiskTotalCapacity',
 'Census_SystemVolumeTotalCapacity',
 'Census_HasOpticalDiskDrive',
 'Census_TotalPhysicalRAM',
 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
 'Census_InternalPrimaryDisplayResolutionHorizontal',
 'Census_InternalPrimaryDisplayResolutionVertical',
 'Census_InternalBatteryNumberOfCharges',
 'Census_OSBuildNumber',
 'Census_OSBuildRevision',
 'Census_OSInstallLanguageIdentifi

In [None]:
# Disabled draft of a helper that reads a CSV with the explicit train or test
# schema, chosen by the `iftrain` flag.
# NOTE(review): as written the draft never returns `df_file` and does not pass
# header=True to the reader - complete it before enabling.
# def pyspark_read_from_csv(csv_path='', iftrain = True):
#     if iftrain:
#         df_schema = schema_train
#     else:
#         df_schema = schema_test
#     df_file = spark.read.csv(csv_path, schema=df_schema, sep=',')

In [None]:
# Disabled alternative: read train.csv without passing any header, schema or
# inferSchema argument to the CSV reader.
# df_train = spark.read.csv('data/train.csv')