In [1]:
# The code was removed by Watson Studio for sharing.

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20191126154004-0000
KERNEL_ID = 3267b70d-3ac2-4aa0-a33c-d5f8ca1bfa77


# <center> Create Device Facts </center>

This notebook contains the code used to summarize a data set of trips into per-device facts (the Device Facts).

In [2]:
# Spark required imports
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql import Window
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import when

import pandas as pd

# Access to IBM Cloud Object Storage
import ibmos2spark

# Python Utilities
from datetime import datetime

In [3]:


import scipy
from scipy import stats
import numpy as np, statsmodels.stats.api as sms
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
import itertools
import seaborn as sns
import matplotlib.pyplot as plt



# Service Credentials

In [4]:
# The code was removed by Watson Studio for sharing.

Load the trips data set.

In [5]:
# Read the trips data set from Cloud Object Storage and show its schema.
name = "TripFacts5_Geo"
_trips_url = local_cos.url(name, local_bucket)
Trips_geo = spark.read.parquet(_trips_url)
print('Done')
Trips_geo.printSchema()

Done
root
 |-- DeviceId: long (nullable = true)
 |-- TripID: long (nullable = true)
 |-- Dist_km: double (nullable = true)
 |-- Duration_h: double (nullable = true)
 |-- DistanciaST: double (nullable = true)
 |-- DuracionST_h: double (nullable = true)
 |-- Dur_50_h: double (nullable = true)
 |-- Dur_70_h: double (nullable = true)
 |-- Dur_80_h: double (nullable = true)
 |-- Dur_90_h: double (nullable = true)
 |-- Dur_100_h: double (nullable = true)
 |-- Dist_50: double (nullable = true)
 |-- Dist_70: double (nullable = true)
 |-- Dist_80: double (nullable = true)
 |-- Dist_90: double (nullable = true)
 |-- Dist_100: double (nullable = true)
 |-- Latitude_ON: double (nullable = true)
 |-- Longitude_ON: double (nullable = true)
 |-- EC_Time_ON: timestamp (nullable = true)
 |-- Latitude_OFF: double (nullable = true)
 |-- Longitude_OFF: double (nullable = true)
 |-- EC_Time_OFF: timestamp (nullable = true)
 |-- DayTime: string (nullable = true)
 |-- DayType: string (nullable = true)
 |-- D

In [14]:
# Spot-check: trips of device 28574 dated 2019-07-25 or later, ordered by ignition-on time.
_spot_check_filter = (F.col('DeviceId') == 28574) & (F.col('Date') >= '2019-07-25')
(
    Trips_geo
    .select('DeviceId', 'TripID', 'Date', 'DayWeek', 'EC_Time_ON', 'EC_Time_OFF', 'Month')
    .where(_spot_check_filter)
    .orderBy('EC_Time_ON')
    .show(200)
)

+--------+--------+----------+-------+-------------------+-------------------+-----+
|DeviceId|  TripID|      Date|DayWeek|         EC_Time_ON|        EC_Time_OFF|Month|
+--------+--------+----------+-------+-------------------+-------------------+-----+
|   28574|28130155|2019-07-26|      5|2019-07-26 11:25:37|2019-07-26 11:53:06|    7|
|   28574|28143346|2019-07-26|      5|2019-07-26 13:26:32|2019-07-26 13:52:32|    7|
|   28574|28549192|2019-07-31|      3|2019-07-31 07:33:00|2019-07-31 07:45:26|    7|
|   28574|28594688|2019-07-31|      3|2019-07-31 15:04:59|2019-07-31 15:09:23|    7|
|   28574|29274826|2019-08-07|      3|2019-08-07 19:30:56|2019-08-07 19:44:18|    8|
|   28574|29682824|2019-08-12|      1|2019-08-12 13:35:00|2019-08-12 13:45:59|    8|
|   28574|29844431|2019-08-14|      3|2019-08-14 08:39:52|2019-08-14 08:55:13|    8|
|   28574|29867804|2019-08-14|      3|2019-08-14 12:28:47|2019-08-14 12:59:30|    8|
|   28574|29871554|2019-08-14|      3|2019-08-14 13:05:33|2019-08

### Data set with installation dates 

In [6]:


# Read the device installation dates from Cloud Object Storage and show the schema.
name = 'data_asset/Dispositivos_Prod_Ene-Sep-2019_install_date_7ffcc8d3'
_install_url = local_cos.url(name, local_bucket)
spark_dev_inst = spark.read.parquet(_install_url)
spark_dev_inst.printSchema()



root
 |-- DeviceId: integer (nullable = true)
 |-- installeddate: string (nullable = true)



Now, let’s filter out the devices that don’t have enough telemetry to be analyzed. We will only keep devices that have been installed for at least 90 days as of September 30, 2019 and that have at least 90 days of telemetry information.

In [7]:


# Per device: first and last trip date, and the number of days between them.
# Only devices whose telemetry spans at least 90 days are kept.
_telemetry_span = (
    Trips_geo
    .select('DeviceId', 'Date')
    .groupBy('DeviceId')
    .agg(F.min('Date').alias('min_date'), F.max('Date').alias('max_date'))
    .withColumn('days_passed', F.datediff(F.col('max_date'), F.col('min_date')))
    .where('days_passed>89')
)

# Days between the installation date and 2019-09-30.
_days_since_install = F.datediff(F.to_date(F.lit("2019-09-30")), F.col('installeddate'))

dev_df = (
    _telemetry_span
    .join(spark_dev_inst.select("DeviceId", 'installeddate'), ["DeviceId"])
    .dropDuplicates()
    .withColumn("circ_time", _days_since_install)
    # dias_circ takes days_passed when circ_time is smaller, otherwise circ_time.
    .withColumn(
        'dias_circ',
        F.when(F.col('circ_time') < F.col('days_passed'), F.col('days_passed'))
         .otherwise(F.col('circ_time')),
    )
)

# NOTE(review): days_passed > 89 is already enforced above, so this filter appears
# to drop only rows whose circ_time is null -- confirm that is the intended rule.
date_df = dev_df.where('dias_circ>89').toPandas()
l_dev = list(date_df['DeviceId'])

# Keep only the trips of the selected devices and remove exact duplicate rows.
Trips_f = Trips_geo.where(Trips_geo.DeviceId.isin(l_dev)).dropDuplicates()
print("************ Datos filtrados *************")

************ Datos filtrados *************



## Interparish, Intercantonal, Interprovince Trips

The Republic of Ecuador is divided into 24 provinces. Each province is divided into cantons, and cantons are divided into parishes. We have created the categories A, B, C and D for each trip:

*	Trip type A: when the ignition on is registered in a certain parish and the ignition off is registered in the same parish.
*	Trip type B: when the ignition on is registered in a certain parish but the ignition off is registered in another parish.
*	Trip type C: when the ignition on is registered in a certain canton but the ignition off is registered in another canton.
*	Trip type D: when the ignition on is registered in a certain province but the ignition off is registered in another province.


In [8]:


# Trip type from the three inter-area flags, checked from widest to narrowest:
#   D: parish * canton * province flags product > 0
#   C: parish * canton flags product > 0
#   B: parish flag > 0
#   A: none of the above
_parish_flag = F.col('ViajeInterparroquial')
_canton_flag = F.col('ViajeIntercantonal')
_province_flag = F.col('ViajeInterprovincial')

Trips_f = Trips_f.withColumn(
    'Tipo_Viaje',
    F.when((_parish_flag * _canton_flag * _province_flag) > 0, 'D')
     .when((_parish_flag * _canton_flag) > 0, 'C')
     .when(_parish_flag > 0, 'B')
     .otherwise('A'),
)



Collapse the province of ignition off into the following three categories.

   * **Pich_y_alrededores**: Pichincha, Imbabura, Carchi, Esmeraldas, Cotopaxi, Napo, Pastaza, Santo Domingo de los Tsachilas, Tungurahua y Bolívar
   * **Guayas_y_alrededores**: Guayas, Santa Elena, Manabí, Los Rios y El Oro
   * **Otras** : Loja, Zamora Chinchipe, Azuay, Cañar, Zona No Delimitada, Morona Santiago y Orellana

In [9]:
# Collapse the ignition-off province (OFF_N1) into three regional groups.
# Every value that is not listed below -- a null OFF_N1 included -- ends up in
# 'Pich_y_alrededores', so a spelling that fails to match is misclassified silently.
# For that reason each accented province is listed in every spelling we may meet:
# the literals the lookup has always used ('CaÃ±ar' is the mis-encoded form of
# 'Cañar'; 'Manabi' / 'Los Rios' are unaccented) plus the correctly encoded names.
# NOTE(review): check the distinct OFF_N1 values in the data to confirm which
# spellings actually occur.
_otras_provinces = [
    'Loja', 'Zamora Chinchipe', 'Azuay',
    'CaÃ±ar', 'Cañar',
    'Zona No Delimitada', 'Morona Santiago', 'Orellana',
]
_guayas_provinces = [
    'Guayas', 'Santa Elena',
    'Manabi', 'Manabí',
    'Los Rios', 'Los Ríos',
    'El Oro',
]

cond = (
    F.when(F.col('OFF_N1').isin(_otras_provinces), 'Otras')
     .when(F.col('OFF_N1').isin(_guayas_provinces), 'Guayas_y_alrededores')
     .otherwise('Pich_y_alrededores')
)
Trips_f = Trips_f.withColumn('Provincia_OFF', cond)

In [10]:
# Show the column names of the filtered trips frame.
Trips_f.columns

['DeviceId',
 'TripID',
 'Dist_km',
 'Duration_h',
 'DistanciaST',
 'DuracionST_h',
 'Dur_50_h',
 'Dur_70_h',
 'Dur_80_h',
 'Dur_90_h',
 'Dur_100_h',
 'Dist_50',
 'Dist_70',
 'Dist_80',
 'Dist_90',
 'Dist_100',
 'Latitude_ON',
 'Longitude_ON',
 'EC_Time_ON',
 'Latitude_OFF',
 'Longitude_OFF',
 'EC_Time_OFF',
 'DayTime',
 'DayType',
 'DayWeek',
 'Month',
 'Date',
 'ON_ID',
 'OFF_ID',
 'ON_N3',
 'ON_N2',
 'ON_N1',
 'OFF_N3',
 'OFF_N2',
 'OFF_N1',
 'ViajeInterparroquial',
 'ViajeIntercantonal',
 'ViajeInterprovincial',
 'Tipo_Viaje',
 'Provincia_OFF']

## Compute Device Facts

In [11]:
# Distance per time-of-day bucket: the trip's Dist_km when its DayTime label
# belongs to the bucket, 0 otherwise.
_daytime_buckets = [
    ('0_6', ['Madrugada 0-3', 'Madrugada 3-6']),
    ('6_9', ['Manana 6-9']),
    ('9_18', ['Manana 9-12', 'Tarde 12-15', 'Tarde 15-18']),
    ('18_21', ['Noche 18-21']),
    ('21_24', ['Noche 21-24']),
]

Devices_day = Trips_f
for _bucket, _labels in _daytime_buckets:
    Devices_day = Devices_day.withColumn(
        'DayTime_' + _bucket + '_dist',
        when(F.col("DayTime").isin(_labels), F.col("Dist_km")).otherwise(0),
    )

In [12]:
# Duration per time-of-day bucket: the trip's Duration_h when its DayTime label
# belongs to the bucket, 0 otherwise.
_daytime_buckets = [
    ('0_6', ['Madrugada 0-3', 'Madrugada 3-6']),
    ('6_9', ['Manana 6-9']),
    ('9_18', ['Manana 9-12', 'Tarde 12-15', 'Tarde 15-18']),
    ('18_21', ['Noche 18-21']),
    ('21_24', ['Noche 21-24']),
]

for _bucket, _labels in _daytime_buckets:
    Devices_day = Devices_day.withColumn(
        'DayTime_' + _bucket + '_dur',
        when(F.col("DayTime").isin(_labels), F.col("Duration_h")).otherwise(0),
    )

In [13]:
# Trip indicator per time-of-day bucket: 1 when the trip's DayTime label belongs
# to the bucket, 0 otherwise (summed later to obtain trip counts).
_daytime_buckets = [
    ('0_6', ['Madrugada 0-3', 'Madrugada 3-6']),
    ('6_9', ['Manana 6-9']),
    ('9_18', ['Manana 9-12', 'Tarde 12-15', 'Tarde 15-18']),
    ('18_21', ['Noche 18-21']),
    ('21_24', ['Noche 21-24']),
]

for _bucket, _labels in _daytime_buckets:
    Devices_day = Devices_day.withColumn(
        'N_Trips_DayTime_' + _bucket,
        when(F.col("DayTime").isin(_labels), 1).otherwise(0),
    )

In [14]:
# DayType features: distance, duration and a trip indicator split by Weekday / Weekend.
_day_types = ['Weekday', 'Weekend']
_measures = [('_dist', 'Dist_km'), ('_dur', 'Duration_h')]

# Measure columns first (all distances, then all durations) ...
for _suffix, _measure in _measures:
    for _day_type in _day_types:
        Devices_day = Devices_day.withColumn(
            'DT_' + _day_type + _suffix,
            F.when(F.col('DayType') == _day_type, F.col(_measure)).otherwise(0),
        )

# ... then the 0/1 trip indicators.
for _day_type in _day_types:
    Devices_day = Devices_day.withColumn(
        'N_Trips_DT_' + _day_type,
        F.when(F.col('DayType') == _day_type, 1).otherwise(0),
    )

In [15]:
# Trip-type features: distance, duration and a trip indicator for each Tipo_Viaje (A-D).
_trip_types = ['A', 'B', 'C', 'D']
_measures = [('_dist', 'Dist_km'), ('_dur', 'Duration_h')]

# Measure columns first (all distances, then all durations) ...
for _suffix, _measure in _measures:
    for _trip_type in _trip_types:
        Devices_day = Devices_day.withColumn(
            'Tipo_Viaje_' + _trip_type + _suffix,
            F.when(F.col('Tipo_Viaje') == _trip_type, F.col(_measure)).otherwise(0),
        )

# ... then the 0/1 trip indicators.
for _trip_type in _trip_types:
    Devices_day = Devices_day.withColumn(
        'N_Trips_Tipo_Viaje_' + _trip_type,
        F.when(F.col('Tipo_Viaje') == _trip_type, 1).otherwise(0),
    )

In [16]:
# Location features: distance, duration and a trip indicator for each Provincia_OFF group.
# Each pair is (column-name stem, Provincia_OFF value).
_regions = [
    ('Guayas_y_alr', 'Guayas_y_alrededores'),
    ('Pich_y_alr', 'Pich_y_alrededores'),
    ('Otras', 'Otras'),
]
_measures = [('_dist', 'Dist_km'), ('_dur', 'Duration_h')]

# Measure columns first (all distances, then all durations) ...
for _suffix, _measure in _measures:
    for _stem, _region in _regions:
        Devices_day = Devices_day.withColumn(
            _stem + _suffix,
            F.when(F.col('Provincia_OFF') == _region, F.col(_measure)).otherwise(0),
        )

# ... then the 0/1 trip indicators.
for _stem, _region in _regions:
    Devices_day = Devices_day.withColumn(
        'N_Trips_' + _stem,
        F.when(F.col('Provincia_OFF') == _region, 1).otherwise(0),
    )

In [17]:
# Print the column names of the trip-level frame with the derived feature columns.
print(Devices_day.columns)

['DeviceId', 'TripID', 'Dist_km', 'Duration_h', 'DistanciaST', 'DuracionST_h', 'Dur_50_h', 'Dur_70_h', 'Dur_80_h', 'Dur_90_h', 'Dur_100_h', 'Dist_50', 'Dist_70', 'Dist_80', 'Dist_90', 'Dist_100', 'Latitude_ON', 'Longitude_ON', 'EC_Time_ON', 'Latitude_OFF', 'Longitude_OFF', 'EC_Time_OFF', 'DayTime', 'DayType', 'DayWeek', 'Month', 'Date', 'ON_ID', 'OFF_ID', 'ON_N3', 'ON_N2', 'ON_N1', 'OFF_N3', 'OFF_N2', 'OFF_N1', 'ViajeInterparroquial', 'ViajeIntercantonal', 'ViajeInterprovincial', 'Tipo_Viaje', 'Provincia_OFF', 'DayTime_0_6_dist', 'DayTime_6_9_dist', 'DayTime_9_18_dist', 'DayTime_18_21_dist', 'DayTime_21_24_dist', 'DayTime_0_6_dur', 'DayTime_6_9_dur', 'DayTime_9_18_dur', 'DayTime_18_21_dur', 'DayTime_21_24_dur', 'N_Trips_DayTime_0_6', 'N_Trips_DayTime_6_9', 'N_Trips_DayTime_9_18', 'N_Trips_DayTime_18_21', 'N_Trips_DayTime_21_24', 'DT_Weekday_dist', 'DT_Weekend_dist', 'DT_Weekday_dur', 'DT_Weekend_dur', 'N_Trips_DT_Weekday', 'N_Trips_DT_Weekend', 'Tipo_Viaje_A_dist', 'Tipo_Viaje_B_dist',

## Day aggregations

In [18]:
# Aggregate the trip-level features to one row per device and day:
# every column in `cols` is summed and the number of trips is stored as Viajes_dia.
_segments = ['50', '70', '80', '90', '100']
_daytime = ['0_6', '6_9', '9_18', '18_21', '21_24']
_trip_types = ['A', 'B', 'C', 'D']
_regions = ['Guayas_y_alr', 'Pich_y_alr', 'Otras']

cols = (
    ['Dist_km', 'Duration_h']
    + ['Dur_' + s + '_h' for s in _segments]
    + ['Dist_' + s for s in _segments]
    + ['DayTime_' + d + '_dist' for d in _daytime]
    + ['DayTime_' + d + '_dur' for d in _daytime]
    + ['N_Trips_DayTime_' + d for d in _daytime]
    + ['DT_Weekday_dist', 'DT_Weekend_dist', 'DT_Weekday_dur', 'DT_Weekend_dur',
       'N_Trips_DT_Weekday', 'N_Trips_DT_Weekend']
    + ['Tipo_Viaje_' + t + '_dist' for t in _trip_types]
    + ['Tipo_Viaje_' + t + '_dur' for t in _trip_types]
    + ['N_Trips_Tipo_Viaje_' + t for t in _trip_types]
    + [r + '_dist' for r in _regions]
    + [r + '_dur' for r in _regions]
    + ['N_Trips_' + r for r in _regions]
)

# Each sum keeps the name of the column it aggregates.
exprs_Sum = [F.sum(column).alias(column) for column in cols]
Devices_day = (
    Devices_day
    .groupby("DeviceId", "Month", "Date")
    .agg(F.count("*").alias("Viajes_dia"), *exprs_Sum)
)
print(Devices_day.columns)

['DeviceId', 'Month', 'Date', 'Viajes_dia', 'Dist_km', 'Duration_h', 'Dur_50_h', 'Dur_70_h', 'Dur_80_h', 'Dur_90_h', 'Dur_100_h', 'Dist_50', 'Dist_70', 'Dist_80', 'Dist_90', 'Dist_100', 'DayTime_0_6_dist', 'DayTime_6_9_dist', 'DayTime_9_18_dist', 'DayTime_18_21_dist', 'DayTime_21_24_dist', 'DayTime_0_6_dur', 'DayTime_6_9_dur', 'DayTime_9_18_dur', 'DayTime_18_21_dur', 'DayTime_21_24_dur', 'N_Trips_DayTime_0_6', 'N_Trips_DayTime_6_9', 'N_Trips_DayTime_9_18', 'N_Trips_DayTime_18_21', 'N_Trips_DayTime_21_24', 'DT_Weekday_dist', 'DT_Weekend_dist', 'DT_Weekday_dur', 'DT_Weekend_dur', 'N_Trips_DT_Weekday', 'N_Trips_DT_Weekend', 'Tipo_Viaje_A_dist', 'Tipo_Viaje_B_dist', 'Tipo_Viaje_C_dist', 'Tipo_Viaje_D_dist', 'Tipo_Viaje_A_dur', 'Tipo_Viaje_B_dur', 'Tipo_Viaje_C_dur', 'Tipo_Viaje_D_dur', 'N_Trips_Tipo_Viaje_A', 'N_Trips_Tipo_Viaje_B', 'N_Trips_Tipo_Viaje_C', 'N_Trips_Tipo_Viaje_D', 'Guayas_y_alr_dist', 'Pich_y_alr_dist', 'Otras_dist', 'Guayas_y_alr_dur', 'Pich_y_alr_dur', 'Otras_dur', 'N_Tri

## Device aggregations

In [19]:
# Aggregate the per-day rows to one row per device: first/last date, number of
# distinct dates, and the sum of every column in `cols` (prefixed with "Sum_").
_segments = ['50', '70', '80', '90', '100']
_daytime = ['0_6', '6_9', '9_18', '18_21', '21_24']
_trip_types = ['A', 'B', 'C', 'D']
_regions = ['Guayas_y_alr', 'Pich_y_alr', 'Otras']

cols = (
    ['Viajes_dia', 'Dist_km', 'Duration_h']
    + ['Dur_' + s + '_h' for s in _segments]
    + ['Dist_' + s for s in _segments]
    + ['DayTime_' + d + '_dist' for d in _daytime]
    + ['DayTime_' + d + '_dur' for d in _daytime]
    + ['N_Trips_DayTime_' + d for d in _daytime]
    + ['DT_Weekday_dist', 'DT_Weekend_dist', 'DT_Weekday_dur', 'DT_Weekend_dur',
       'N_Trips_DT_Weekday', 'N_Trips_DT_Weekend']
    + ['Tipo_Viaje_' + t + '_dist' for t in _trip_types]
    + ['Tipo_Viaje_' + t + '_dur' for t in _trip_types]
    + ['N_Trips_Tipo_Viaje_' + t for t in _trip_types]
    + [r + '_dist' for r in _regions]
    + [r + '_dur' for r in _regions]
    + ['N_Trips_' + r for r in _regions]
)

exprs_Min = [F.min('Date').alias("Min_Date")]
exprs_Max = [F.max('Date').alias("Max_Date")]
exprs_CD = [F.countDistinct('Date').alias("Count_Dist_Date")]
exprs_Sum = [F.sum(column).alias("Sum_" + column) for column in cols]

_device_aggregates = exprs_Min + exprs_Max + exprs_CD + exprs_Sum
Devices_Month = Devices_day.groupBy("DeviceId").agg(*_device_aggregates)

print(Devices_Month.columns)

['DeviceId', 'Min_Date', 'Max_Date', 'Count_Dist_Date', 'Sum_Viajes_dia', 'Sum_Dist_km', 'Sum_Duration_h', 'Sum_Dur_50_h', 'Sum_Dur_70_h', 'Sum_Dur_80_h', 'Sum_Dur_90_h', 'Sum_Dur_100_h', 'Sum_Dist_50', 'Sum_Dist_70', 'Sum_Dist_80', 'Sum_Dist_90', 'Sum_Dist_100', 'Sum_DayTime_0_6_dist', 'Sum_DayTime_6_9_dist', 'Sum_DayTime_9_18_dist', 'Sum_DayTime_18_21_dist', 'Sum_DayTime_21_24_dist', 'Sum_DayTime_0_6_dur', 'Sum_DayTime_6_9_dur', 'Sum_DayTime_9_18_dur', 'Sum_DayTime_18_21_dur', 'Sum_DayTime_21_24_dur', 'Sum_N_Trips_DayTime_0_6', 'Sum_N_Trips_DayTime_6_9', 'Sum_N_Trips_DayTime_9_18', 'Sum_N_Trips_DayTime_18_21', 'Sum_N_Trips_DayTime_21_24', 'Sum_DT_Weekday_dist', 'Sum_DT_Weekend_dist', 'Sum_DT_Weekday_dur', 'Sum_DT_Weekend_dur', 'Sum_N_Trips_DT_Weekday', 'Sum_N_Trips_DT_Weekend', 'Sum_Tipo_Viaje_A_dist', 'Sum_Tipo_Viaje_B_dist', 'Sum_Tipo_Viaje_C_dist', 'Sum_Tipo_Viaje_D_dist', 'Sum_Tipo_Viaje_A_dur', 'Sum_Tipo_Viaje_B_dur', 'Sum_Tipo_Viaje_C_dur', 'Sum_Tipo_Viaje_D_dur', 'Sum_N_Trips_

### Add installation date 

In [20]:
# Attach the installation date and dias_circ of each device, then drop exact duplicate rows.
_install_info = dev_df.select('DeviceId', 'installeddate', 'dias_circ')
Devices_Month = Devices_Month.join(_install_info, on=['DeviceId']).dropDuplicates()

In [21]:
# Fraction of days (and of hours, using 24 h per day) in movement relative to dias_circ.
_days_in_circulation = F.col('dias_circ')
Devices_Month = (
    Devices_Month
    .withColumn('PCT_Dias_Mov', F.col('Count_Dist_Date') / _days_in_circulation)
    .withColumn('PCT_Horas_Mov', F.col('Sum_Duration_h') / (_days_in_circulation * 24))
)

In [22]:
# Print the columns of the device/installation frame, then display those of the device facts.
print(dev_df.columns)
Devices_Month.columns

['DeviceId', 'min_date', 'max_date', 'days_passed', 'installeddate', 'circ_time', 'dias_circ']


['DeviceId',
 'Min_Date',
 'Max_Date',
 'Count_Dist_Date',
 'Sum_Viajes_dia',
 'Sum_Dist_km',
 'Sum_Duration_h',
 'Sum_Dur_50_h',
 'Sum_Dur_70_h',
 'Sum_Dur_80_h',
 'Sum_Dur_90_h',
 'Sum_Dur_100_h',
 'Sum_Dist_50',
 'Sum_Dist_70',
 'Sum_Dist_80',
 'Sum_Dist_90',
 'Sum_Dist_100',
 'Sum_DayTime_0_6_dist',
 'Sum_DayTime_6_9_dist',
 'Sum_DayTime_9_18_dist',
 'Sum_DayTime_18_21_dist',
 'Sum_DayTime_21_24_dist',
 'Sum_DayTime_0_6_dur',
 'Sum_DayTime_6_9_dur',
 'Sum_DayTime_9_18_dur',
 'Sum_DayTime_18_21_dur',
 'Sum_DayTime_21_24_dur',
 'Sum_N_Trips_DayTime_0_6',
 'Sum_N_Trips_DayTime_6_9',
 'Sum_N_Trips_DayTime_9_18',
 'Sum_N_Trips_DayTime_18_21',
 'Sum_N_Trips_DayTime_21_24',
 'Sum_DT_Weekday_dist',
 'Sum_DT_Weekend_dist',
 'Sum_DT_Weekday_dur',
 'Sum_DT_Weekend_dur',
 'Sum_N_Trips_DT_Weekday',
 'Sum_N_Trips_DT_Weekend',
 'Sum_Tipo_Viaje_A_dist',
 'Sum_Tipo_Viaje_B_dist',
 'Sum_Tipo_Viaje_C_dist',
 'Sum_Tipo_Viaje_D_dist',
 'Sum_Tipo_Viaje_A_dur',
 'Sum_Tipo_Viaje_B_dur',
 'Sum_Tipo_Viaje_C

We have computed the totals of each variable. Now we need to calculate percentages and percentiles, since we won’t use the totals in the modeling.

### Compute percentages variables

In [24]:
## % by segment type
# Each Sum_<x> total is divided by the device's overall total of the same unit and
# stored as PCT_<x>. The earlier per-field guard compared a column name (str)
# against a list with `!=`, which is always True, so it never skipped anything;
# the column lists are explicit, so the guard is removed.
_pct_groups = [
    ('Sum_Dist_km', ['Sum_Dist_50', 'Sum_Dist_70', 'Sum_Dist_80', 'Sum_Dist_90', 'Sum_Dist_100']),
    ('Sum_Duration_h', ['Sum_Dur_50_h', 'Sum_Dur_70_h', 'Sum_Dur_80_h', 'Sum_Dur_90_h', 'Sum_Dur_100_h']),
]

for _total, _fields in _pct_groups:
    for _field in _fields:
        # Strip only the leading "Sum": 'Sum_Dist_50' -> 'PCT_Dist_50'.
        Devices_Month = Devices_Month.withColumn(
            "PCT" + _field[len("Sum"):],
            F.col(_field) / F.col(_total),
        )

In [25]:
## % DayTime
# Each Sum_<x> total is divided by the device's overall distance, duration or trip
# count and stored as PCT_<x>. The earlier per-field guard compared a column name
# (str) against a list with `!=`, which is always True, so it never skipped
# anything; the column lists are explicit, so the guard is removed.
_daytime = ['0_6', '6_9', '9_18', '18_21', '21_24']
_pct_groups = [
    ('Sum_Dist_km', ['Sum_DayTime_' + d + '_dist' for d in _daytime]),
    ('Sum_Duration_h', ['Sum_DayTime_' + d + '_dur' for d in _daytime]),
    ('Sum_Viajes_dia', ['Sum_N_Trips_DayTime_' + d for d in _daytime]),
]

for _total, _fields in _pct_groups:
    for _field in _fields:
        # Strip only the leading "Sum": 'Sum_DayTime_0_6_dist' -> 'PCT_DayTime_0_6_dist'.
        Devices_Month = Devices_Month.withColumn(
            "PCT" + _field[len("Sum"):],
            F.col(_field) / F.col(_total),
        )

In [26]:
## % DayType
# Each Sum_<x> total is divided by the device's overall distance, duration or trip
# count and stored as PCT_<x>. The earlier per-field guard compared a column name
# (str) against a list with `!=`, which is always True, so it never skipped
# anything; the column lists are explicit, so the guard is removed.
_pct_groups = [
    ('Sum_Dist_km', ['Sum_DT_Weekday_dist', 'Sum_DT_Weekend_dist']),
    ('Sum_Duration_h', ['Sum_DT_Weekday_dur', 'Sum_DT_Weekend_dur']),
    ('Sum_Viajes_dia', ['Sum_N_Trips_DT_Weekday', 'Sum_N_Trips_DT_Weekend']),
]

for _total, _fields in _pct_groups:
    for _field in _fields:
        # Strip only the leading "Sum": 'Sum_DT_Weekday_dist' -> 'PCT_DT_Weekday_dist'.
        Devices_Month = Devices_Month.withColumn(
            "PCT" + _field[len("Sum"):],
            F.col(_field) / F.col(_total),
        )

In [27]:
## % Province group
# Each Sum_<x> total is divided by the device's overall distance, duration or trip
# count and stored as PCT_<x>. The earlier per-field guard compared a column name
# (str) against a list with `!=`, which is always True, so it never skipped
# anything; the column lists are explicit, so the guard is removed.
_regions = ['Guayas_y_alr', 'Pich_y_alr', 'Otras']
_pct_groups = [
    ('Sum_Dist_km', ['Sum_' + r + '_dist' for r in _regions]),
    ('Sum_Duration_h', ['Sum_' + r + '_dur' for r in _regions]),
    ('Sum_Viajes_dia', ['Sum_N_Trips_' + r for r in _regions]),
]

for _total, _fields in _pct_groups:
    for _field in _fields:
        # Strip only the leading "Sum": 'Sum_Otras_dist' -> 'PCT_Otras_dist'.
        Devices_Month = Devices_Month.withColumn(
            "PCT" + _field[len("Sum"):],
            F.col(_field) / F.col(_total),
        )

In [28]:
## % Trip type
# Each Sum_<x> total is divided by the device's overall distance, duration or trip
# count and stored as PCT_<x>. The earlier per-field guard compared a column name
# (str) against a list with `!=`, which is always True, so it never skipped
# anything; the column lists are explicit, so the guard is removed.
_trip_types = ['A', 'B', 'C', 'D']
_pct_groups = [
    ('Sum_Dist_km', ['Sum_Tipo_Viaje_' + t + '_dist' for t in _trip_types]),
    ('Sum_Duration_h', ['Sum_Tipo_Viaje_' + t + '_dur' for t in _trip_types]),
    ('Sum_Viajes_dia', ['Sum_N_Trips_Tipo_Viaje_' + t for t in _trip_types]),
]

for _total, _fields in _pct_groups:
    for _field in _fields:
        # Strip only the leading "Sum": 'Sum_Tipo_Viaje_A_dist' -> 'PCT_Tipo_Viaje_A_dist'.
        Devices_Month = Devices_Month.withColumn(
            "PCT" + _field[len("Sum"):],
            F.col(_field) / F.col(_total),
        )

## Find percentiles

We will use the pandas library to calculate the percentiles.

In [29]:
# Convert the device-level Spark frame to a pandas DataFrame and preview its first rows.
Devices_Month_pandas = Devices_Month.toPandas()
print("***********Done***********")
Devices_Month_pandas.head()

***********Done***********


Unnamed: 0,DeviceId,Min_Date,Max_Date,Count_Dist_Date,Sum_Viajes_dia,Sum_Dist_km,Sum_Duration_h,Sum_Dur_50_h,Sum_Dur_70_h,Sum_Dur_80_h,...,PCT_Tipo_Viaje_C_dist,PCT_Tipo_Viaje_D_dist,PCT_Tipo_Viaje_A_dur,PCT_Tipo_Viaje_B_dur,PCT_Tipo_Viaje_C_dur,PCT_Tipo_Viaje_D_dur,PCT_N_Trips_Tipo_Viaje_A,PCT_N_Trips_Tipo_Viaje_B,PCT_N_Trips_Tipo_Viaje_C,PCT_N_Trips_Tipo_Viaje_D
0,28078,2018-12-31,2019-09-30,254,981,6844.2,301.871389,247.434167,1.384167,0.0,...,0.198752,0.059452,0.434671,0.340338,0.194396,0.030595,0.675841,0.153925,0.163099,0.007136
1,29824,2019-02-02,2019-09-26,17,26,381.2,17.964167,14.995278,0.254444,0.092778,...,0.590766,0.005247,0.55889,0.0,0.431291,0.009819,0.653846,0.0,0.307692,0.038462
2,29841,2019-01-02,2019-09-30,200,507,2881.0,136.312222,108.500833,0.0,3.783611,...,0.111003,0.353141,0.658464,0.012003,0.044868,0.284665,0.800789,0.00789,0.013807,0.177515
3,30025,2019-01-02,2019-08-09,169,352,5707.4,243.858611,164.458611,47.831667,0.168611,...,0.09139,0.0,0.932562,0.0,0.067438,0.0,0.934659,0.0,0.065341,0.0
4,30562,2019-01-01,2019-09-30,232,573,2385.6,128.651667,114.4875,0.0,0.0,...,0.029804,0.022133,0.872539,0.109186,0.010355,0.00792,0.961606,0.034904,0.001745,0.001745


In [30]:
# Percentile position of each device for every total listed in `cols`:
# rank(pct=True) gives the rank as a fraction of all devices, rounded to 2 decimals.
cols = ['Count_Dist_Date', 'Sum_Viajes_dia', 'Sum_Dist_km', 'Sum_Duration_h',
        'Sum_Dist_50', 'Sum_Dist_70', 'Sum_Dist_80', 'Sum_Dist_90', 'Sum_Dist_100',
        'Sum_Dur_50_h', 'Sum_Dur_70_h','Sum_Dur_80_h', 'Sum_Dur_90_h', 'Sum_Dur_100_h',
        'Sum_DayTime_0_6_dist', 'Sum_DayTime_6_9_dist', 'Sum_DayTime_9_18_dist', 'Sum_DayTime_18_21_dist', 'Sum_DayTime_21_24_dist',
        'Sum_DayTime_0_6_dur', 'Sum_DayTime_6_9_dur', 'Sum_DayTime_9_18_dur', 'Sum_DayTime_18_21_dur', 'Sum_DayTime_21_24_dur',
        'Sum_N_Trips_DayTime_0_6', 'Sum_N_Trips_DayTime_6_9', 'Sum_N_Trips_DayTime_9_18', 'Sum_N_Trips_DayTime_18_21', 'Sum_N_Trips_DayTime_21_24',
        'Sum_DT_Weekday_dist', 'Sum_DT_Weekend_dist', 'Sum_DT_Weekday_dur', 'Sum_DT_Weekend_dur', 'Sum_N_Trips_DT_Weekday', 'Sum_N_Trips_DT_Weekend',
        'Sum_Tipo_Viaje_A_dist', 'Sum_Tipo_Viaje_B_dist', 'Sum_Tipo_Viaje_C_dist', 'Sum_Tipo_Viaje_D_dist',
        'Sum_Tipo_Viaje_A_dur', 'Sum_Tipo_Viaje_B_dur', 'Sum_Tipo_Viaje_C_dur', 'Sum_Tipo_Viaje_D_dur',
        'Sum_N_Trips_Tipo_Viaje_A', 'Sum_N_Trips_Tipo_Viaje_B', 'Sum_N_Trips_Tipo_Viaje_C', 'Sum_N_Trips_Tipo_Viaje_D',
        'Sum_Guayas_y_alr_dist', 'Sum_Pich_y_alr_dist', 'Sum_Otras_dist',
        'Sum_Guayas_y_alr_dur', 'Sum_Pich_y_alr_dur', 'Sum_Otras_dur',
        'Sum_N_Trips_Guayas_y_alr', 'Sum_N_Trips_Pich_y_alr', 'Sum_N_Trips_Otras'
        ]

# One ranked Series per total, named PRCNTIL_POS_<total without the "Sum_" prefix>.
_percentile_columns = [
    Devices_Month_pandas[column]
    .rank(pct=True)
    .round(2)
    .rename("PRCNTIL_POS_" + column.replace("Sum_", ""))
    for column in cols
]

# Build df_test as a NEW frame with a single concat: the previous
# `df_test = Devices_Month_pandas` was only an alias, so adding columns to df_test
# also modified Devices_Month_pandas, and inserting the columns one by one
# fragments the frame. Re-running this cell now always starts from the same input.
df_test = pd.concat([Devices_Month_pandas] + _percentile_columns, axis=1)
    

In [31]:
# Show the column names of the pandas frame after the percentile columns were added.
df_test.columns.values

array(['DeviceId', 'Min_Date', 'Max_Date', 'Count_Dist_Date',
       'Sum_Viajes_dia', 'Sum_Dist_km', 'Sum_Duration_h', 'Sum_Dur_50_h',
       'Sum_Dur_70_h', 'Sum_Dur_80_h', 'Sum_Dur_90_h', 'Sum_Dur_100_h',
       'Sum_Dist_50', 'Sum_Dist_70', 'Sum_Dist_80', 'Sum_Dist_90',
       'Sum_Dist_100', 'Sum_DayTime_0_6_dist', 'Sum_DayTime_6_9_dist',
       'Sum_DayTime_9_18_dist', 'Sum_DayTime_18_21_dist',
       'Sum_DayTime_21_24_dist', 'Sum_DayTime_0_6_dur',
       'Sum_DayTime_6_9_dur', 'Sum_DayTime_9_18_dur',
       'Sum_DayTime_18_21_dur', 'Sum_DayTime_21_24_dur',
       'Sum_N_Trips_DayTime_0_6', 'Sum_N_Trips_DayTime_6_9',
       'Sum_N_Trips_DayTime_9_18', 'Sum_N_Trips_DayTime_18_21',
       'Sum_N_Trips_DayTime_21_24', 'Sum_DT_Weekday_dist',
       'Sum_DT_Weekend_dist', 'Sum_DT_Weekday_dur', 'Sum_DT_Weekend_dur',
       'Sum_N_Trips_DT_Weekday', 'Sum_N_Trips_DT_Weekend',
       'Sum_Tipo_Viaje_A_dist', 'Sum_Tipo_Viaje_B_dist',
       'Sum_Tipo_Viaje_C_dist', 'Sum_Tipo_Viaje_D_dist'

In [33]:
# Sanity check: print the 1st..99th percentiles of total distance, then the first
# ten devices with their total distance and percentile position.
_distance_percentiles = np.percentile(df_test['Sum_Dist_km'], range(1, 100))
print(_distance_percentiles)

_distance_preview = df_test[['Sum_Dist_km', 'PRCNTIL_POS_Dist_km']].head(10)
print(_distance_preview)

[  317.         453.5        548.55       622.40035    704.75
   764.699846   815.8        865.8        914.         963.1
  1016.3       1063.2       1103.55      1148.3       1185.15
  1232.1       1273.4       1321.        1358.75      1408.5
  1445.65      1488.3       1529.1       1562.4       1606.5
  1650.5       1690.4       1725.2       1770.45      1809.2
  1855.45      1889.        1926.7       1974.5       2014.35
  2047.        2084.7       2121.        2161.5       2203.4
  2245.1       2290.        2329.85      2369.2       2410.55
  2455.5       2501.8       2543.2       2580.7431    2631.
  2671.95      2714.5       2763.3       2817.5       2862.2
  2910.8       2967.85      3013.8       3065.65      3129.1
  3188.95      3237.9       3306.05      3372.72      3443.75
  3506.8       3571.899975  3638.        3697.85      3767.6
  3825.        3898.5       3962.65      4043.6       4128.55
  4220.9       4306.1       4383.7       4491.9       4582.29975
  4689.7       

In [34]:
# Convert the pandas frame back into a Spark DataFrame.
# The notebook reads all of its inputs through the `spark` session, so the same
# entry point is used here instead of the legacy `sqlContext` handle.
Devices_Month = spark.createDataFrame(df_test)
print("Done")

Done


In [35]:
# Print the column names of the final Spark frame and how many columns it has.
print(Devices_Month.columns)
print(len(Devices_Month.columns))

['DeviceId', 'Min_Date', 'Max_Date', 'Count_Dist_Date', 'Sum_Viajes_dia', 'Sum_Dist_km', 'Sum_Duration_h', 'Sum_Dur_50_h', 'Sum_Dur_70_h', 'Sum_Dur_80_h', 'Sum_Dur_90_h', 'Sum_Dur_100_h', 'Sum_Dist_50', 'Sum_Dist_70', 'Sum_Dist_80', 'Sum_Dist_90', 'Sum_Dist_100', 'Sum_DayTime_0_6_dist', 'Sum_DayTime_6_9_dist', 'Sum_DayTime_9_18_dist', 'Sum_DayTime_18_21_dist', 'Sum_DayTime_21_24_dist', 'Sum_DayTime_0_6_dur', 'Sum_DayTime_6_9_dur', 'Sum_DayTime_9_18_dur', 'Sum_DayTime_18_21_dur', 'Sum_DayTime_21_24_dur', 'Sum_N_Trips_DayTime_0_6', 'Sum_N_Trips_DayTime_6_9', 'Sum_N_Trips_DayTime_9_18', 'Sum_N_Trips_DayTime_18_21', 'Sum_N_Trips_DayTime_21_24', 'Sum_DT_Weekday_dist', 'Sum_DT_Weekend_dist', 'Sum_DT_Weekday_dur', 'Sum_DT_Weekend_dur', 'Sum_N_Trips_DT_Weekday', 'Sum_N_Trips_DT_Weekend', 'Sum_Tipo_Viaje_A_dist', 'Sum_Tipo_Viaje_B_dist', 'Sum_Tipo_Viaje_C_dist', 'Sum_Tipo_Viaje_D_dist', 'Sum_Tipo_Viaje_A_dur', 'Sum_Tipo_Viaje_B_dur', 'Sum_Tipo_Viaje_C_dur', 'Sum_Tipo_Viaje_D_dur', 'Sum_N_Trips_

### Save the final data set

In [None]:
#Dev_Facts_pandas = Devices_Month.toPandas()
#print("************* Done *************")

In [36]:
# Write the device facts to Cloud Object Storage as parquet, overwriting any existing output.
name = "Device_Facts_Final_18_nov_2019"
_facts_url = local_cos.url(name, local_bucket)
Devices_Month.write.mode('overwrite').parquet(_facts_url)

In [37]:


# Save the pandas version of the device facts as a CSV project asset (with header, without the index).
name = "Device_Facts_Final_18_nov_2019.csv"
_csv_text = df_test.to_csv(index=False, header=True)
project.save_data(name, _csv_text, overwrite=True)
print("************* Done *************")


************* Done *************
