In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, IntegerType, LongType,DateType)
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark import SparkConf
from pyspark import SparkContext
import multiprocessing
from datetime import datetime
import numpy as np
import pandas as pd
import csv

In [23]:
# Load the version -> release-timestamp lookup tables.
# Later cells unwrap these arrays with `[()]` / `.tolist()`, so each file
# appears to hold a pickled Python object inside a 0-d object array.
# NumPy >= 1.16.3 defaults to allow_pickle=False and raises ValueError on
# such files, hence the explicit flag.
# NOTE(review): unpickling can execute arbitrary code -- only load these
# files from a trusted source.
datedictAS = np.load('AvSigVersionTimestamps.npy', allow_pickle=True)
datedictOS = np.load('OSVersionTimestamps.npy', allow_pickle=True)

In [56]:
# Number of entries in each lookup table. Indexing with `()` unwraps the
# object stored in the loaded array before measuring its length.
rango_av, rango_os = (
    len(loaded[()]) for loaded in (datedictAS, datedictOS)
)

In [57]:
# Build one pandas frame per lookup table, with a 1-based integer index
# whose length equals the number of entries in the table.
# NOTE(review): presumably each table is a dict of scalar values, in which
# case pandas creates one column per key -- confirm against the .npy files.
df_os = pd.DataFrame(
    data=datedictOS.tolist(),
    index=range(1, rango_os + 1),
)
df_av = pd.DataFrame(
    data=datedictAS.tolist(),
    index=range(1, rango_av + 1),
)

In [58]:
# Flip rows and columns, then move the resulting index into a regular
# leading column so it can be selected like any other column.
data_os = df_os.T.reset_index()
data_av = df_av.T.reset_index()

In [59]:
# Keep every row but only the first two columns (positions 0 and 1).
data_os = data_os.iloc[:, 0:2]
data_av = data_av.iloc[:, 0:2]

In [60]:
# Name the two remaining columns: the version string and its date.
data_os.columns = [
    'Census_OSVersion',
    'DateCensus_OSVersion',
]
data_av.columns = [
    'AvSigVersion',
    'DateAvSigVersion',
]

In [63]:
# Preview the first rows of both lookup tables, one after the other.
print(data_os.head(), data_av.head(), sep='\n')

   Census_OSVersion DateCensus_OSVersion
0    10.0.14393.351           2016-10-27
1   10.0.14393.2097           2018-02-22
2  10.0.10240.17918           2018-07-16
3   10.0.17744.1004           2018-09-07
4    10.0.16299.665           2018-09-11
   AvSigVersion    DateAvSigVersion
0   1.155.266.0 2013-07-18 09:08:00
1   1.167.387.0 2014-02-21 14:20:00
2  1.169.1625.0 2014-04-03 01:17:00
3  1.169.2478.0 2014-04-12 17:16:00
4    1.169.55.0 2014-03-18 01:09:00


In [64]:
# Persist both lookup tables as CSV (without the pandas index) so that
# Spark can read them back in a later cell.
data_os.to_csv(path_or_buf="fechas_os.csv", index=False)
data_av.to_csv(path_or_buf="fechas_av.csv", index=False)

In [2]:
# Obtain the Spark session for this notebook (reuses an existing one if
# it is already running).
spark = (
    SparkSession.builder
    .appName("Microsoft_Kaggle")
    .getOrCreate()
)

In [3]:
# Read every CSV part of the pre-processed dataset, letting Spark infer
# the column types from the data, and mark the frame for caching.
df_num = spark.read.csv(
    "../data/df_cat_prepro_0/*.csv",
    header=True,
    inferSchema=True,
).persist()
# The count is an action: it forces evaluation and shows the row total.
df_num.count()

16774736

In [4]:
# Load the two version -> date lookup CSVs written by the pandas cells.
df_fechas_av = spark.read.csv(
    "fechas_av.csv",
    header=True,
    inferSchema=True,
)
df_fechas_os = spark.read.csv(
    "fechas_os.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Normalise both date columns to Spark DateType (drops any time-of-day
# component, as the AvSig timestamps carry hours and minutes).
df_fechas_os = df_fechas_os.withColumn(
    'DateCensus_OSVersion',
    to_date(df_fechas_os['DateCensus_OSVersion']),
)
df_fechas_av = df_fechas_av.withColumn(
    'DateAvSigVersion',
    to_date(df_fechas_av['DateAvSigVersion']),
)

In [6]:
# Sanity check: first 8 rows of the AvSigVersion date lookup.
df_fechas_av.show(n=8)

+------------+----------------+
|AvSigVersion|DateAvSigVersion|
+------------+----------------+
| 1.155.266.0|      2013-07-18|
| 1.167.387.0|      2014-02-21|
|1.169.1625.0|      2014-04-03|
|1.169.2478.0|      2014-04-12|
|  1.169.55.0|      2014-03-18|
| 1.177.229.0|      2014-06-19|
|1.187.1121.0|      2014-11-01|
| 1.187.322.0|      2014-10-23|
+------------+----------------+
only showing top 8 rows



In [7]:
# Sanity check: first 8 rows of the Census_OSVersion date lookup.
df_fechas_os.show(n=8)

+----------------+--------------------+
|Census_OSVersion|DateCensus_OSVersion|
+----------------+--------------------+
|  10.0.14393.351|          2016-10-27|
| 10.0.14393.2097|          2018-02-22|
|10.0.10240.17918|          2018-07-16|
| 10.0.17744.1004|          2018-09-07|
|  10.0.16299.665|          2018-09-11|
|  10.0.15063.297|          2017-05-09|
|  10.0.15063.296|          2017-05-09|
|  10.0.16299.125|          2017-12-12|
+----------------+--------------------+
only showing top 8 rows



In [9]:
# Cardinality of AvSigVersion in the main dataset.
(
    df_num
    .select('AvSigVersion')
    .distinct()
    .count()
)

9623

In [10]:
# Cardinality of Census_OSVersion in the main dataset.
(
    df_num
    .select('Census_OSVersion')
    .distinct()
    .count()
)

579

In [11]:
# Cardinality of OsBuildLab in the main dataset.
(
    df_num
    .select('OsBuildLab')
    .distinct()
    .count()
)

776

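El quinto elemento (separado por puntos) de la columna OsBuildLab contiene la fecha de release en formato `yyMMdd`, seguida de la hora (p. ej. `180706-1551`), por lo que hay que extraerla para aportar información adicional.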

In [82]:
# List every distinct OsBuildLab value (776 of them) untruncated, to
# inspect the raw format before parsing it.
(
    df_num
    .select('OsBuildLab')
    .distinct()
    .show(n=776, truncate=False)
)

+---------------------------------------------------------+
|OsBuildLab                                               |
+---------------------------------------------------------+
|17713.1000.amd64fre.rs5_release.180706-1551              |
|17650.1001.x86fre.rs_prerelease.180414-2140              |
|17752.1.amd64fre.rs5_release.180829-1740                 |
|7601.23807.x86fre.win7sp1_ldr.170512-0600                |
|17692.1000.amd64fre.rs_prerelease.180609-1317            |
|7601.22656.amd64fre.win7sp1_ldr.140417-1532              |
|18260.1000.amd64fre.rsmaster.181010-1540                 |
|14393.82.amd64fre.rs1_release.160805-1735                |
|10240.17643.x86fre.th1_st1.170918-1824                   |
|7601.18939.x86fre.win7sp1_gdr.150722-0600                |
|14393.1944.x86fre.rs1_release.171129-2100                |
|17686.1003.amd64fre.rs_prerelease.180603-1447            |
|7600.16539.x86fre.win7_gdr.100226-1909                   |
|7601.24291.amd64fre.win7sp1_ldr_escrow.

In [6]:
# Replace every literal '*' in OsBuildLab with '.' so that all values use
# the same field separator before they are split on dots.
# The pattern is a raw string: '\*' in a normal literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
df_num = df_num.withColumn(
    'OsBuildLab',
    regexp_replace('OsBuildLab', r'\*', '.').cast(StringType()),
)

In [10]:
# Re-inspect the distinct OsBuildLab values after the '*' -> '.' cleanup.
(
    df_num
    .select('OsBuildLab')
    .distinct()
    .show(n=776, truncate=False)
)

+---------------------------------------------------------+
|OsBuildLab                                               |
+---------------------------------------------------------+
|17713.1000.amd64fre.rs5_release.180706-1551              |
|17650.1001.x86fre.rs_prerelease.180414-2140              |
|17752.1.amd64fre.rs5_release.180829-1740                 |
|7601.23807.x86fre.win7sp1_ldr.170512-0600                |
|17692.1000.amd64fre.rs_prerelease.180609-1317            |
|7601.22656.amd64fre.win7sp1_ldr.140417-1532              |
|18260.1000.amd64fre.rsmaster.181010-1540                 |
|14393.82.amd64fre.rs1_release.160805-1735                |
|10240.17643.x86fre.th1_st1.170918-1824                   |
|7601.18939.x86fre.win7sp1_gdr.150722-0600                |
|14393.1944.x86fre.rs1_release.171129-2100                |
|17686.1003.amd64fre.rs_prerelease.180603-1447            |
|7600.16539.x86fre.win7_gdr.100226-1909                   |
|7601.24291.amd64fre.win7sp1_ldr_escrow.

In [7]:
# Extract the build date from OsBuildLab.
# Step 1: take the fifth dot-separated field (index 4), e.g. '180706-1551'.
# The split pattern is a raw string: '\.' in a normal literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on 3.12+).
df_date_osbuild = df_num.withColumn(
    'OsBuildLab_4',
    split(df_num['OsBuildLab'], r'\.')[4].cast(StringType()),
)
# Step 2: keep only the part before the '-', i.e. the yyMMdd date digits.
df_date_osbuild = df_date_osbuild.withColumn(
    'OsBuildLab_date',
    split(df_date_osbuild['OsBuildLab_4'], '-')[0].cast(StringType()),
)

In [15]:
# Preview the extracted date digits (still strings at this point).
(
    df_date_osbuild
    .select('OsBuildLab_date')
    .show()
)

+---------------+
|OsBuildLab_date|
+---------------+
|         180410|
|         151029|
|         170317|
|         180410|
|         170317|
|         170928|
|         180410|
|         170928|
|         180718|
|         180410|
|         180410|
|         180410|
|         180410|
|         180410|
|         180410|
|         170317|
|         170602|
|         180410|
|         180410|
|         170928|
+---------------+
only showing top 20 rows



In [8]:
# Parse the six-digit yyMMdd string into a proper date
# (e.g. '180410' -> 2018-04-10).
df_date_osbuild = df_date_osbuild.withColumn(
    'OsBuildLab_date',
    to_date(df_date_osbuild['OsBuildLab_date'], 'yyMMdd'),
)

In [9]:
# Preview the parsed build dates.
(
    df_date_osbuild
    .select('OsBuildLab_date')
    .show()
)

+---------------+
|OsBuildLab_date|
+---------------+
|     2018-04-10|
|     2015-10-29|
|     2017-03-17|
|     2018-04-10|
|     2017-03-17|
|     2017-09-28|
|     2018-04-10|
|     2017-09-28|
|     2018-07-18|
|     2018-04-10|
|     2018-04-10|
|     2018-04-10|
|     2018-04-10|
|     2018-04-10|
|     2018-04-10|
|     2017-03-17|
|     2017-06-02|
|     2018-04-10|
|     2018-04-10|
|     2017-09-28|
+---------------+
only showing top 20 rows



In [22]:
# Attach the AvSigVersion release date to every machine. A left join keeps
# rows whose AvSigVersion has no entry in the lookup (date stays null).
# Only the identifier, the platform and the two dates are carried forward.
df_dates = (
    df_date_osbuild
    .join(df_fechas_av, on=['AvSigVersion'], how='left')
    .select(
        'MachineIdentifier',
        'Platform',
        'OsBuildLab_date',
        'DateAvSigVersion',
    )
)

In [23]:
# Preview the joined date columns (default: first 20 rows).
df_dates.show(n=20)

+--------------------+---------+---------------+----------------+
|   MachineIdentifier| Platform|OsBuildLab_date|DateAvSigVersion|
+--------------------+---------+---------------+----------------+
|5344f3df517376797...|windows10|     2018-04-10|      2018-08-19|
|5344f5da09650da47...|windows10|     2015-10-29|      2018-08-26|
|5344fab725abad6c3...|windows10|     2017-03-17|      2018-08-15|
|5344fbd47e4e539c0...|windows10|     2018-04-10|      2018-07-25|
|5344fdfae5951a8ac...|windows10|     2017-03-17|      2018-08-29|
|5345001b054c778cc...|windows10|     2017-09-28|      2018-09-14|
|534500a1e32cb2cc8...|windows10|     2018-04-10|      2018-09-14|
|534500f5ca39374f6...|windows10|     2017-09-28|      2018-08-12|
|534507022faa94a23...| windows8|     2018-07-18|      2018-08-21|
|53450a4a1e8973e91...|windows10|     2018-04-10|      2018-08-03|
|53450d78143513758...|windows10|     2018-04-10|      2018-09-25|
|53450ea8fa2ace94a...|windows10|     2018-04-10|      2018-08-02|
|534510007

In [24]:
# Per-platform windows, each ordered chronologically by one of the dates:
#   w1 -> by OS build date, w2 -> by AV signature date.
w1 = (
    Window
    .partitionBy('Platform')
    .orderBy('OsBuildLab_date')
)
w2 = (
    Window
    .partitionBy('Platform')
    .orderBy('DateAvSigVersion')
)

In [25]:
# For each row, fetch the OS build date of the previous row within the
# same platform (ordered by build date); the first row of each platform
# has no predecessor and gets null.
# The equivalent lag for DateAvSigVersion (window w2) is computed in a
# later cell, on `date_diff_2`.
data_windows = df_dates.withColumn(
    'OsBuildLab_date_lag',
    lag('OsBuildLab_date', 1).over(w1),
)

In [26]:
# Cache the windowed frame; the count is the action that forces evaluation
# and doubles as a check that no rows were lost.
data_windows.persist().count()

16774736

In [28]:
# Inspect rows that actually have a parsed OS build date.
(
    data_windows
    .where(data_windows['OsBuildLab_date'].isNotNull())
    .show()
)

+--------------------+---------+---------------+----------------+-------------------+
|   MachineIdentifier| Platform|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|
+--------------------+---------+---------------+----------------+-------------------+
|534b4791897c6acb7...|windows10|     2015-07-09|      2018-05-18|               null|
|534c4acaa3916629b...|windows10|     2015-07-09|      2018-07-26|         2015-07-09|
|534d7a9f10a5a5918...|windows10|     2015-07-09|      2018-07-29|         2015-07-09|
|534db34d290579c1d...|windows10|     2015-07-09|      2018-07-07|         2015-07-09|
|53505f12db06d8e75...|windows10|     2015-07-09|      2018-08-27|         2015-07-09|
|53530e0c6773dfcb6...|windows10|     2015-07-09|      2018-08-19|         2015-07-09|
|535370552792a60ce...|windows10|     2015-07-09|      2018-08-26|         2015-07-09|
|5356d730d58f61bca...|windows10|     2015-07-09|      2018-07-19|         2015-07-09|
|535b8e93827115a34...|windows10|     2015-07-09|      

In [30]:
# Days elapsed between a row's OS build date and the previous build date
# seen on the same platform (current minus lagged).
date_diff = data_windows.withColumn(
    'OSBuild_diff',
    datediff(
        data_windows['OsBuildLab_date'],
        data_windows['OsBuildLab_date_lag'],
    ),
)

In [36]:
# Show rows where the build date advanced, excluding the dominant
# 'windows10' platform so the other platforms are visible.
(
    date_diff
    .where(date_diff['OSBuild_diff'] > 0)
    .where(date_diff['Platform'] != 'windows10')
    .show(n=50)
)

+--------------------+-----------+---------------+----------------+-------------------+------------+
|   MachineIdentifier|   Platform|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|
+--------------------+-----------+---------------+----------------+-------------------+------------+
|3ff629201b93b433c...|windows2016|     2016-08-08|      2018-11-12|         2016-07-15|          24|
|a93c25732635813d5...|windows2016|     2016-09-15|      2018-09-05|         2016-08-08|          38|
|0935c285534e4bac6...|windows2016|     2016-10-04|      2016-05-20|         2016-09-15|          19|
|537a976f1eece31b5...|windows2016|     2016-11-02|      2018-07-27|         2016-10-04|          29|
|2e5c7d4a05f3bbe45...|windows2016|     2016-12-08|      2017-01-05|         2016-11-02|          36|
|5473542d3cad384f7...|windows2016|     2016-12-20|      2018-08-25|         2016-12-08|          12|
|3b791f851b5d7ae1b...|windows2016|     2017-01-12|      2017-03-14|         2016-12-20|    

In [37]:
# Same lag idea for the AV signature date: previous DateAvSigVersion
# within the platform, ordered by that date (window w2).
date_diff_2 = date_diff.withColumn(
    'DateAvSigVersion_lag',
    lag('DateAvSigVersion', 1).over(w2),
)

In [38]:
# Cache the frame with both lag columns; counting forces evaluation.
date_diff_2.persist().count()

16774736

In [39]:
# Quick look at the first 6 rows, now including DateAvSigVersion_lag.
date_diff_2.show(n=6)

+--------------------+---------+---------------+----------------+-------------------+------------+--------------------+
|   MachineIdentifier| Platform|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|
+--------------------+---------+---------------+----------------+-------------------+------------+--------------------+
|25baf4bda36d81995...|windows10|     2016-10-24|            null|         2016-10-24|           0|                null|
|fe1ba8ba89a72755c...|windows10|     2016-10-24|            null|         2016-10-24|           0|                null|
|afa888523818e2c23...|windows10|     2016-11-02|            null|         2016-11-02|           0|                null|
|82cdb7903574e73d8...|windows10|     2016-12-20|            null|         2016-12-20|           0|                null|
|e6a3855da11ea06d3...|windows10|     2016-12-20|            null|         2016-12-20|           0|                null|
|faa1173ccba128cdc...|windows10|     201

In [40]:
# Days elapsed between a row's AV signature date and the previous one on
# the same platform (current minus lagged).
date_diff_3 = date_diff_2.withColumn(
    'AvSigVersion_diff',
    datediff(
        date_diff_2['DateAvSigVersion'],
        date_diff_2['DateAvSigVersion_lag'],
    ),
)

In [42]:
# Show rows where the AV signature date advanced, again leaving out the
# 'windows10' platform.
(
    date_diff_3
    .where(date_diff_3['AvSigVersion_diff'] > 0)
    .where(date_diff_3['Platform'] != 'windows10')
    .show(n=10)
)

+--------------------+-----------+---------------+----------------+-------------------+------------+--------------------+-----------------+
|   MachineIdentifier|   Platform|OsBuildLab_date|DateAvSigVersion|OsBuildLab_date_lag|OSBuild_diff|DateAvSigVersion_lag|AvSigVersion_diff|
+--------------------+-----------+---------------+----------------+-------------------+------------+--------------------+-----------------+
|2e5c7d4a05f3bbe45...|windows2016|     2016-12-08|      2017-01-05|         2016-11-02|          36|          2016-05-20|              230|
|107b7d9c05505abd8...|windows2016|     2016-12-20|      2017-03-09|         2016-12-20|           0|          2017-01-05|               63|
|3b791f851b5d7ae1b...|windows2016|     2017-01-12|      2017-03-14|         2016-12-20|          23|          2017-03-09|                5|
|49c67aec2cec710ff...|windows2016|     2017-03-27|      2017-04-26|         2017-03-27|           0|          2017-03-14|               43|
|350d8cfdd42123bbd..