In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType,DateType)
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark import SparkConf
from pyspark import SparkContext
import multiprocessing
from datetime import datetime
import numpy as np
import pandas as pd
import csv

In [23]:
# Load the version -> timestamp lookup tables.
# Later cells unwrap these with `[()]` / `.tolist()`, i.e. each file holds a
# pickled Python object inside a 0-d object array (presumably a dict).
# NumPy >= 1.16.3 refuses to load object arrays unless allow_pickle=True is
# passed explicitly, so without it these calls raise ValueError.
# NOTE(review): unpickling can execute arbitrary code -- only load these
# files when they come from a trusted source.
datedictAS = np.load('AvSigVersionTimestamps.npy', allow_pickle=True)
datedictOS = np.load('OSVersionTimestamps.npy', allow_pickle=True)

In [56]:
# Indexing with an empty tuple unwraps the object stored in each loaded
# array; record how many entries each lookup table contains.
rango_av, rango_os = len(datedictAS[()]), len(datedictOS[()])

In [57]:
# Build one single-row frame per lookup table: the column labels are the
# version strings and the row holds the matching timestamps.
#
# The index is required because the mapping's values are scalars. The previous
# index=range(1, n + 1) broadcast every timestamp over n identical rows
# (n x n cells), yet the following cells transpose the frame and keep only the
# first of those rows. A one-element index (same first label, 1) yields the
# same downstream result while allocating n cells instead of n * n.
df_os = pd.DataFrame(datedictOS.tolist(), index=[1])
df_av = pd.DataFrame(datedictAS.tolist(), index=[1])

In [58]:
# Flip each frame so the former column labels become the row index, then
# move that index into an ordinary column.
data_os = df_os.T.reset_index()
data_av = df_av.T.reset_index()

In [59]:
# Keep only the two leading columns of each frame.
data_os, data_av = (frame.iloc[:, :2] for frame in (data_os, data_av))

In [60]:
# Give both lookup frames descriptive headers: the version column first,
# its date column second.
for frame, labels in (
    (data_os, ['Census_OSVersion', 'DateCensus_OSVersion']),
    (data_av, ['AvSigVersion', 'DateAvSigVersion']),
):
    frame.columns = labels

In [63]:
# Preview the first rows of both lookup tables.
for preview in (data_os.head(), data_av.head()):
    print(preview)

   Census_OSVersion DateCensus_OSVersion
0    10.0.14393.351           2016-10-27
1   10.0.14393.2097           2018-02-22
2  10.0.10240.17918           2018-07-16
3   10.0.17744.1004           2018-09-07
4    10.0.16299.665           2018-09-11
   AvSigVersion    DateAvSigVersion
0   1.155.266.0 2013-07-18 09:08:00
1   1.167.387.0 2014-02-21 14:20:00
2  1.169.1625.0 2014-04-03 01:17:00
3  1.169.2478.0 2014-04-12 17:16:00
4    1.169.55.0 2014-03-18 01:09:00


In [64]:
# Write both lookup tables to CSV (without the pandas index) so they can be
# read back with Spark further down.
for frame, csv_path in ((data_os, "fechas_os.csv"), (data_av, "fechas_av.csv")):
    frame.to_csv(csv_path, index=False)

In [2]:
# Spark configuration for this notebook.
# `cores` is not used in this cell (the partition counts below are fixed at
# 8); it is kept because other cells may read it.
cores = multiprocessing.cpu_count()

# A single SparkConf is enough -- the original created it twice.
conf = SparkConf()
conf.set("spark.sql.shuffle.partitions", "8")
conf.set("spark.default.parallelism", "8")

# getOrCreate reuses an already-running context instead of raising when this
# cell is executed again in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Obtain (or reuse) the SparkSession under this notebook's application name.
session_builder = SparkSession.builder.appName("Microsoft_Kaggle")
spark = session_builder.getOrCreate()

In [4]:
# Read the CSV parts (header row, inferred schema) and keep a 5000-row
# sample. Persist it and trigger an action so the cache is populated; the
# row count is the cell's output.
df_num = (
    spark.read
    .csv("../data/df_cat_prepro_0/*.csv", header=True, inferSchema=True)
    .limit(5000)
)
df_num.persist().count()

5000

In [5]:
# Read the two version -> date lookup CSVs written earlier in the notebook.
df_fechas_av, df_fechas_os = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in ("fechas_av.csv", "fechas_os.csv")
)

In [6]:
# Convert both lookup date columns to Spark DateType, replacing the
# columns in place.
df_fechas_os = df_fechas_os.withColumn(
    'DateCensus_OSVersion',
    to_date(df_fechas_os['DateCensus_OSVersion']),
)
df_fechas_av = df_fechas_av.withColumn(
    'DateAvSigVersion',
    to_date(df_fechas_av['DateAvSigVersion']),
)

In [48]:
# df_fechas_av.show(8)

In [49]:
# df_fechas_os.show(8)

In [50]:
# df_num.select('AvSigVersion').distinct().count()

In [51]:
# df_num.select('Census_OSVersion').distinct().count()

In [52]:
# df_num.select('OsBuildLab').distinct().count()

El quinto elemento de la columna OsBuildLab (índice 4 al separar el valor por puntos) contiene la fecha de release, por lo que hay que extraerla para aportar información.

In [53]:
# df_num.select('OsBuildLab').distinct().show(776, truncate = False)

In [7]:
# Replace every literal '*' in OsBuildLab with '.', so that '.' is the only
# field separator when the value is split in the next step.
# The pattern is a raw string: '\*' in a normal string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+). The
# regex passed to Spark is unchanged.
df_num = df_num.withColumn(
    'OsBuildLab',
    regexp_replace('OsBuildLab', r'\*', '.').cast(StringType()),
)

In [8]:
# df_num.select('OsBuildLab').distinct().show(776, truncate = False)

In [9]:
# Extract the date token from OsBuildLab:
#   1. split on a literal '.' and take the fifth piece (index 4);
#   2. split that piece on '-' and keep what precedes the first '-'.
# The split pattern is a raw string: '\.' in a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python
# 3.12+). The regex passed to Spark is unchanged.
df_date_osbuild = df_num.withColumn(
    'OsBuildLab_4',
    split(df_num['OsBuildLab'], r'\.')[4].cast(StringType()),
)
df_date_osbuild = df_date_osbuild.withColumn(
    'OsBuildLab_date',
    split(df_date_osbuild['OsBuildLab_4'], '-')[0].cast(StringType()),
)

In [10]:
# df_date_osbuild.select('OsBuildLab_date').show()

In [11]:
# Parse the extracted token as a date using the two-digit-year pattern
# 'yyMMdd', overwriting the string column with a DateType column.
build_date = to_date(col('OsBuildLab_date'), 'yyMMdd')
df_date_osbuild = df_date_osbuild.withColumn('OsBuildLab_date', build_date)

In [56]:
# df_date_osbuild.select('OsBuildLab_date').show()

In [57]:
# a = df_date_osbuild.orderBy('AvSigVersion')

In [58]:
# a.select('AvSigVersion').distinct().count()

In [59]:
# AvSigVersion_dis = a.select('AvSigVersion').distinct().collect()

In [19]:
# AvSigVersion_dis

In [16]:
# Attach the AV signature date to every row (left join on AvSigVersion) and
# keep only the identifier, the platform and the two date columns.
dates_columns = ['MachineIdentifier', 'Platform', 'OsBuildLab_date', 'DateAvSigVersion']
df_dates = (
    df_date_osbuild
    .join(df_fechas_av, on=['AvSigVersion'], how='left')
    .select(*dates_columns)
)

In [17]:
# df_dates.show()

In [18]:
# Two per-platform windows: w1 orders rows by OS build date, w2 by AV
# signature date.
by_platform = Window.partitionBy('Platform')
w1 = by_platform.orderBy('OsBuildLab_date')
w2 = by_platform.orderBy('DateAvSigVersion')

In [19]:
# Previous OS build date within the same platform, following w1's ordering.
# The matching lag for DateAvSigVersion (over w2) is added in a later cell.
previous_build_date = lag('OsBuildLab_date').over(w1)
data_windows = df_dates.withColumn('OsBuildLab_date_lag', previous_build_date)

In [20]:
# Mark the frame for caching and run an action to populate the cache; the
# row count is the cell's output.
data_windows.persist().count()

5000

In [21]:
# data_windows.filter(col('OsBuildLab_date').isNotNull()).show()

In [22]:
# Days between each row's OS build date and the lagged build date.
build_gap_days = datediff(col('OsBuildLab_date'), col('OsBuildLab_date_lag'))
date_diff = data_windows.withColumn('OSBuild_diff', build_gap_days)

In [23]:
# date_diff.filter((col('OSBuild_diff') > 0) & (col('Platform') != 'windows10')).show(50)

In [24]:
# Previous AV signature date within the same platform, following w2's
# ordering.
previous_avsig_date = lag('DateAvSigVersion').over(w2)
date_diff_2 = date_diff.withColumn('DateAvSigVersion_lag', previous_avsig_date)

In [25]:
# Mark the frame for caching and run an action to populate the cache; the
# row count is the cell's output.
date_diff_2.persist().count()

5000

In [26]:
# Inspect the first six rows, including both lag columns.
date_diff_2.show(n=6)

+--------------------+---------+---------------+----------------+-------------------+------------+--------------------+
|   MachineIdentifier| Platform|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|
+--------------------+---------+---------------+----------------+-------------------+------------+--------------------+
|5363bedd7963ce3c8...|windows10|     2016-10-14|      2016-05-20|         2016-10-14|           0|                null|
|535d83787918a49eb...|windows10|     2016-12-20|      2016-06-27|         2016-12-20|           0|          2016-05-20|
|53524488234b61d2d...|windows10|     2016-05-27|      2016-07-11|         2016-05-27|           0|          2016-06-27|
|535fc198a964e5dc3...|windows10|     2018-05-02|      2016-07-20|         2018-05-02|           0|          2016-07-11|
|53642865f3ed2f6f3...|windows10|     2017-06-02|      2016-07-21|         2017-06-02|           0|          2016-07-20|
|5363115653644973e...|windows10|     201

In [27]:
# Days between each row's AV signature date and the lagged signature date.
avsig_gap_days = datediff(col('DateAvSigVersion'), col('DateAvSigVersion_lag'))
date_diff_3 = date_diff_2.withColumn('AvSigVersion_diff', avsig_gap_days)

In [28]:
# Spot-check rows whose platform is not 'windows10' and whose signature gap
# is positive.
positive_gap = col('AvSigVersion_diff') > 0
not_windows10 = col('Platform') != 'windows10'
date_diff_3.where(positive_gap & not_windows10).show(n=3)

+--------------------+-----------+---------------+----------------+-------------------+------------+--------------------+-----------------+
|   MachineIdentifier|   Platform|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|AvSigVersion_diff|
+--------------------+-----------+---------------+----------------+-------------------+------------+--------------------+-----------------+
|535e5ba128ba4e481...|windows2016|     2018-06-07|      2018-07-31|         2018-04-27|          41|          2018-07-27|                4|
|5363e313e1294dc45...|windows2016|     2018-06-07|      2018-08-01|         2018-06-07|           0|          2018-07-31|                1|
|534ceb5e33ce45976...|windows2016|     2017-04-27|      2018-08-02|               null|        null|          2018-08-01|                1|
+--------------------+-----------+---------------+----------------+-------------------+------------+--------------------+-----------------+
only showing top 3 r

In [29]:
# Per-platform maxima of both dates and both day gaps.
# `max` is the Spark aggregate brought in by the star import, not the
# Python builtin; the result columns are named 'max(<column>)'.
max_inputs = ('OsBuildLab_date', 'DateAvSigVersion', 'OSBuild_diff', 'AvSigVersion_diff')
df_max_date = date_diff_3.groupBy('Platform').agg(*[max(name) for name in max_inputs])

In [41]:
# Attach the per-platform maxima to every row (left join on Platform).
df_date_max_date = date_diff_3.join(df_max_date, on=['Platform'], how='left')

In [42]:
# Check that the per-platform maxima were joined onto the rows.
df_date_max_date.show(n=3)

+---------+--------------------+---------------+----------------+-------------------+------------+--------------------+-----------------+--------------------+---------------------+-----------------+----------------------+
| Platform|   MachineIdentifier|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|AvSigVersion_diff|max(OsBuildLab_date)|max(DateAvSigVersion)|max(OSBuild_diff)|max(AvSigVersion_diff)|
+---------+--------------------+---------------+----------------+-------------------+------------+--------------------+-----------------+--------------------+---------------------+-----------------+----------------------+
|windows10|5363bedd7963ce3c8...|     2016-10-14|      2016-05-20|         2016-10-14|           0|                null|             null|          2018-09-11|           2018-09-25|               73|                    60|
|windows10|535d83787918a49eb...|     2016-12-20|      2016-06-27|         2016-12-20|           0|          2016

In [43]:
# Derived features relative to the per-platform maxima:
#   *_difftotal -- days from the row's date to the platform's latest date
#   *_ratio     -- the row's day gap divided by the platform's largest gap
build_days_to_latest = datediff(col('max(OsBuildLab_date)'), col('OsBuildLab_date'))
avsig_days_to_latest = datediff(col('max(DateAvSigVersion)'), col('DateAvSigVersion'))
avsig_gap_ratio = col('AvSigVersion_diff') / col('max(AvSigVersion_diff)')
build_gap_ratio = col('OSBuild_diff') / col('max(OSBuild_diff)')

df_date_max_date = (
    df_date_max_date
    .withColumn('OsBuildLab_difftotal', build_days_to_latest)
    .withColumn('DateAvSigVersion_difftotal', avsig_days_to_latest)
    .withColumn('DateAvSigVersion_ratio', avsig_gap_ratio)
    .withColumn('OsBuildLab_ratio', build_gap_ratio)
)

In [44]:
# Inspect the newly added difftotal / ratio columns.
df_date_max_date.show(n=3)

+---------+--------------------+---------------+----------------+-------------------+------------+--------------------+-----------------+--------------------+---------------------+-----------------+----------------------+--------------------+--------------------------+----------------------+----------------+
| Platform|   MachineIdentifier|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|AvSigVersion_diff|max(OsBuildLab_date)|max(DateAvSigVersion)|max(OSBuild_diff)|max(AvSigVersion_diff)|OsBuildLab_difftotal|DateAvSigVersion_difftotal|DateAvSigVersion_ratio|OsBuildLab_ratio|
+---------+--------------------+---------------+----------------+-------------------+------------+--------------------+-----------------+--------------------+---------------------+-----------------+----------------------+--------------------+--------------------------+----------------------+----------------+
|windows10|5363bedd7963ce3c8...|     2016-10-14|      2016-05-20|     

In [45]:
# Day of week of the OS build date, as a string numbered 1 = Monday ..
# 7 = Sunday.
# The original used date_format(..., 'u'). That pattern letter gives this
# numbering on Spark 2.x but is rejected by Spark 3.x's datetime formatter.
# dayofweek() numbers days 1 = Sunday .. 7 = Saturday on every version, so it
# is shifted to the Monday-based numbering and cast back to string to keep
# the column's values and type unchanged. A null date still yields null.
df_date_max_date = df_date_max_date.withColumn(
    'OsBuildLab_dayOfWeek',
    (((dayofweek(col('OsBuildLab_date')) + 5) % 7) + 1).cast(StringType()),
)

In [49]:
# Show a few rows whose OsBuildLab_dayOfWeek equals 1.
is_day_one = col('OsBuildLab_dayOfWeek') == 1
df_date_max_date.where(is_day_one).show(n=3)

+---------+--------------------+---------------+----------------+-------------------+------------+--------------------+-----------------+--------------------+---------------------+-----------------+----------------------+--------------------+--------------------------+----------------------+------------------+--------------------+
| Platform|   MachineIdentifier|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|AvSigVersion_diff|max(OsBuildLab_date)|max(DateAvSigVersion)|max(OSBuild_diff)|max(AvSigVersion_diff)|OsBuildLab_difftotal|DateAvSigVersion_difftotal|DateAvSigVersion_ratio|  OsBuildLab_ratio|OsBuildLab_dayOfWeek|
+---------+--------------------+---------------+----------------+-------------------+------------+--------------------+-----------------+--------------------+---------------------+-----------------+----------------------+--------------------+--------------------------+----------------------+------------------+--------------------+
|