In [2]:
import numpy as np
import pandas as pd

# this is the URL to the CSV file containing the training data
data_url = "https://databricksdemostore.blob.core.windows.net/data/connected-car/training-formatted.csv"

In [3]:
data = pd.read_csv(data_url)
data

Unnamed: 0,Survival_In_Days,Province,Region,Trip_Length_Mean,Trip_Length_Sigma,Trips_Per_Day_Mean,Trips_Per_Day_Sigma,Battery_Rated_Cycles,Manufacture_Month,Manufacture_Year,...,Sensor_Reading_52,Sensor_Reading_53,Sensor_Reading_54,Sensor_Reading_55,Sensor_Reading_56,Sensor_Reading_57,Sensor_Reading_58,Sensor_Reading_59,Sensor_Reading_60,Sensor_Reading_61
0,1283,Bretagne,West,18.103250,6.034416,4.733162,1.183291,275,M8,Y2010,...,16.418910,17.441310,24.718290,11.812310,19.437210,15.079740,16.982440,18.893610,13.590000,14.510940
1,1427,Occitanie,South,14.637070,4.879023,4.325950,1.081487,250,M8,Y2014,...,14.703280,16.154500,27.789550,22.292230,29.158610,21.739530,23.830780,19.480210,10.264120,18.009700
2,1436,Auvergne_Rhone_Alpes,South,14.505640,4.835215,4.418737,1.104684,250,M9,Y2018,...,22.389700,21.834420,28.743260,26.313940,15.589060,15.317560,19.613730,28.397800,19.807990,15.425770
3,894,Martinique,West,20.850520,6.950172,4.284968,1.071242,200,M10,Y2003,...,2.794836,13.993500,15.524580,6.298875,11.355190,14.396860,2.890394,6.362495,10.916070,10.004320
4,1539,Reunion,South,11.579590,3.859862,4.561532,1.140383,200,M10,Y2007,...,26.631860,26.116980,18.011900,25.257760,25.320780,26.894640,18.863220,25.744930,24.027720,23.657220
5,1872,Marseille,South,14.070980,4.690325,4.697100,1.174275,300,M11,Y2011,...,11.889280,7.358676,10.700270,8.218617,13.397930,2.973648,11.031080,3.532511,12.841720,8.153067
6,151,Ile_de_France,MidWest,13.388510,4.462836,4.539887,1.134972,300,M12,Y2015,...,-13.361990,12.716510,-25.999620,-0.855164,-19.726040,5.154581,-9.921854,-0.260530,-20.260940,6.349902
7,1975,Normandie,MidWest,16.718670,5.572891,4.641222,1.160305,275,M12,Y2000,...,21.916700,14.228060,11.378330,-0.157791,13.303480,7.164655,10.716000,4.709601,19.316740,-0.762613
8,1957,Paris,MidWest,12.280450,4.093483,4.417785,1.104446,275,M1,Y2005,...,-10.007310,5.053398,-12.770300,4.404034,-8.639040,3.357793,-12.641930,-0.040223,-11.433320,4.471225
9,1150,Corse,South,19.615720,6.538573,4.318250,1.079563,250,M2,Y2009,...,17.566970,0.334354,11.381730,-9.591447,9.592852,-2.911261,8.209939,-2.191684,12.873960,-3.348420


In [4]:
data.columns

Index(['Survival_In_Days', 'Province', 'Region', 'Trip_Length_Mean',
       'Trip_Length_Sigma', 'Trips_Per_Day_Mean', 'Trips_Per_Day_Sigma',
       'Battery_Rated_Cycles', 'Manufacture_Month', 'Manufacture_Year',
       'Alternator_Efficiency', 'Car_Has_EcoStart',
       'Twelve_hourly_temperature_history_for_last_31_days_before_death_last_recording_first',
       'Sensor_Reading_1', 'Sensor_Reading_2', 'Sensor_Reading_3',
       'Sensor_Reading_4', 'Sensor_Reading_5', 'Sensor_Reading_6',
       'Sensor_Reading_7', 'Sensor_Reading_8', 'Sensor_Reading_9',
       'Sensor_Reading_10', 'Sensor_Reading_11', 'Sensor_Reading_12',
       'Sensor_Reading_13', 'Sensor_Reading_14', 'Sensor_Reading_15',
       'Sensor_Reading_16', 'Sensor_Reading_17', 'Sensor_Reading_18',
       'Sensor_Reading_19', 'Sensor_Reading_20', 'Sensor_Reading_21',
       'Sensor_Reading_22', 'Sensor_Reading_23', 'Sensor_Reading_24',
       'Sensor_Reading_25', 'Sensor_Reading_26', 'Sensor_Reading_27',
       'Sensor_Rea

In [5]:
data.Manufacture_Year.unique()

array(['Y2010', 'Y2014', 'Y2018', 'Y2003', 'Y2007', 'Y2011', 'Y2015',
       'Y2000', 'Y2005', 'Y2009', 'Y2013', 'Y2017', 'Y2002', 'Y2006',
       'Y2004', 'Y2008', 'Y2012', 'Y2016', 'Y2001'], dtype=object)

In [6]:
# assumtions: survival in days is a measure of how many days passed until the battery died
# assumptions: trip length is in TIME (minutes), and trip length sigma is the standard deviation
# assumptions: trips per day mean is measured in count and trips per day sigma is the standard deviation
# assumptions: a battery rated cycle defines how many complete charge/discharge cycles the batter is measured to survive
# assumptions: the Life span of a battery depends on usage—usually 6 to 48 months, yet only 30% of all batteries actually reach the 48-month mark.
reduced_df = data[['Survival_In_Days', 'Trip_Length_Mean', 'Trip_Length_Sigma', 'Trips_Per_Day_Mean', 'Trips_Per_Day_Sigma','Battery_Rated_Cycles']]
reduced_df

Unnamed: 0,Survival_In_Days,Trip_Length_Mean,Trip_Length_Sigma,Trips_Per_Day_Mean,Trips_Per_Day_Sigma,Battery_Rated_Cycles
0,1283,18.103250,6.034416,4.733162,1.183291,275
1,1427,14.637070,4.879023,4.325950,1.081487,250
2,1436,14.505640,4.835215,4.418737,1.104684,250
3,894,20.850520,6.950172,4.284968,1.071242,200
4,1539,11.579590,3.859862,4.561532,1.140383,200
5,1872,14.070980,4.690325,4.697100,1.174275,300
6,151,13.388510,4.462836,4.539887,1.134972,300
7,1975,16.718670,5.572891,4.641222,1.160305,275
8,1957,12.280450,4.093483,4.417785,1.104446,275
9,1150,19.615720,6.538573,4.318250,1.079563,250


In [7]:
reduced_df.describe()

Unnamed: 0,Survival_In_Days,Trip_Length_Mean,Trip_Length_Sigma,Trips_Per_Day_Mean,Trips_Per_Day_Sigma,Battery_Rated_Cycles
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1442.1644,16.378385,5.459462,4.499357,1.124839,256.2475
std,497.226191,3.119748,1.039916,0.249737,0.062434,36.976925
min,2.0,8.826437,2.942146,4.000052,1.000013,200.0
25%,1171.0,14.13258,4.71086,4.312267,1.078066,237.5
50%,1440.0,16.229325,5.409774,4.500469,1.125117,250.0
75%,1726.0,18.42681,6.142269,4.685923,1.17148,281.25
max,3485.0,27.34972,9.116573,4.999959,1.24999,300.0


In [8]:
max_life_months = 48
max_life_days = max_life_months * 365
max_life_days

17520

In [9]:
# Our batteries have a max of of about 9.5 months of surival
3485/365

9.547945205479452

In [10]:
# Our batteries are rated for 200 to 300 cycles
# Users take 4 to 5 trips per day
# These trips are between 8 and 27 minutes long
# Generally battery rated cycles means the number of charge/recharge cycles before a battery starts to reduce visibly its performance.
# Cycles used per day = Battery_Rated_Cycles / Survival_in_days
# Cycles used per trip
250 / 3485

0.07173601147776183

In [11]:
250/1440

0.1736111111111111

In [12]:
250/1442

0.17337031900138697

In [13]:
250/1726

0.14484356894553882

In [57]:
# Cycles per minute
engineered_df = reduced_df.copy()
engineered_df['Cycles_Per_Min_Mean'] = reduced_df['Battery_Rated_Cycles'] / (reduced_df['Survival_In_Days'] * reduced_df['Trip_Length_Mean'] * reduced_df['Trips_Per_Day_Mean'])
engineered_df

Unnamed: 0,Survival_In_Days,Trip_Length_Mean,Trip_Length_Sigma,Trips_Per_Day_Mean,Trips_Per_Day_Sigma,Battery_Rated_Cycles,Cycles_Per_Min_Mean
0,1283,18.103250,6.034416,4.733162,1.183291,275,0.002501
1,1427,14.637070,4.879023,4.325950,1.081487,250,0.002767
2,1436,14.505640,4.835215,4.418737,1.104684,250,0.002716
3,894,20.850520,6.950172,4.284968,1.071242,200,0.002504
4,1539,11.579590,3.859862,4.561532,1.140383,200,0.002460
5,1872,14.070980,4.690325,4.697100,1.174275,300,0.002425
6,151,13.388510,4.462836,4.539887,1.134972,300,0.032686
7,1975,16.718670,5.572891,4.641222,1.160305,275,0.001794
8,1957,12.280450,4.093483,4.417785,1.104446,275,0.002590
9,1150,19.615720,6.538573,4.318250,1.079563,250,0.002566


In [58]:
# how many examples with survival in days between 50% (1440) and 75% (1726)
reference_batteries = engineered_df[ (engineered_df['Survival_In_Days'] > 1439) & (engineered_df['Survival_In_Days'] < 1727) ]
reference_batteries

Unnamed: 0,Survival_In_Days,Trip_Length_Mean,Trip_Length_Sigma,Trips_Per_Day_Mean,Trips_Per_Day_Sigma,Battery_Rated_Cycles,Cycles_Per_Min_Mean
4,1539,11.579590,3.859862,4.561532,1.140383,200,0.002460
18,1671,12.264820,4.088274,4.933418,1.233355,250,0.002473
19,1517,10.623340,3.541114,4.496125,1.124031,200,0.002760
21,1525,16.301340,5.433779,4.013108,1.003277,300,0.003007
22,1679,16.955670,5.651891,4.539671,1.134918,300,0.002321
25,1441,17.208020,5.736008,4.302850,1.075713,250,0.002343
27,1583,17.165870,5.721958,4.172488,1.043122,200,0.001764
31,1688,12.016690,4.005563,4.528741,1.132185,275,0.002994
35,1482,14.879160,4.959722,4.384995,1.096249,200,0.002068
36,1468,17.791710,5.930570,4.661558,1.165390,200,0.001643


In [59]:
reference_batteries.describe()

Unnamed: 0,Survival_In_Days,Trip_Length_Mean,Trip_Length_Sigma,Trips_Per_Day_Mean,Trips_Per_Day_Sigma,Battery_Rated_Cycles,Cycles_Per_Min_Mean
count,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0
mean,1573.443468,16.006708,5.335569,4.495374,1.123844,266.919696,0.002398
std,82.339755,2.51289,0.83763,0.248596,0.062149,30.888785,0.000305
min,1440.0,9.320985,3.106995,4.000909,1.000227,200.0,0.001355
25%,1501.0,14.370765,4.790254,4.307823,1.076955,250.0,0.002233
50%,1571.0,15.88912,5.296373,4.493789,1.123447,275.0,0.002439
75%,1645.5,17.642465,5.880821,4.678719,1.16968,300.0,0.002602
max,1726.0,26.98219,8.994061,4.999739,1.249935,300.0,0.003525


In [17]:
len(reference_batteries)

2503

### Create time-series with one report per day

In [60]:
from datetime import date, timedelta
start_date = date(2013,1,1)
one_day = timedelta(days=1)
start_date + one_day

datetime.date(2013, 1, 2)

In [61]:
import random 
from datetime import date, timedelta

num_ref_batteries = 1

rows = []

curr_date = date(2013,1,1)
one_day = timedelta(days=1)

for i in range(num_ref_batteries):
    print("Preparing battery ", i) 
    lifetime_cycles_used = 0
    battery_rated_cycles = reference_batteries.iloc[i]['Battery_Rated_Cycles']
    for day in range(np.int64(reference_batteries.iloc[i]['Survival_In_Days'])):
        number_of_trips = int(round(reference_batteries.iloc[i]['Trips_Per_Day_Mean']))
        daily_trip_duration = 0
        daily_cycles_used = 0
        for trip in range(number_of_trips):
            trip_duration = random.uniform(reference_batteries.iloc[i]['Trip_Length_Mean']-reference_batteries.iloc[i]['Trip_Length_Sigma'], reference_batteries.iloc[i]['Trip_Length_Mean']+reference_batteries.iloc[i]['Trip_Length_Sigma']) 
            cycles_used_on_trip = trip_duration * reference_batteries.iloc[i]['Cycles_Per_Min_Mean']
            daily_trip_duration = daily_trip_duration + trip_duration
            daily_cycles_used = daily_cycles_used + cycles_used_on_trip
        lifetime_cycles_used = lifetime_cycles_used + daily_cycles_used
        rows.append([curr_date, i, day, number_of_trips, daily_trip_duration, daily_cycles_used, lifetime_cycles_used, battery_rated_cycles])
        curr_date = curr_date + one_day
            

time_series_df = pd.DataFrame(rows, columns=['Date','Battery_ID','Battery_Age_Days','Number_Of_Trips','Daily_Trip_Duration','Daily_Cycles_Used', 'Lifetime_Cycles_Used', 'Battery_Rated_Cycles'])
time_series_df.to_csv('./daily-battery-time-series.csv')
print('Time series created and saved')

Preparing battery  0
Time series created and saved


In [62]:
time_series_df

Unnamed: 0,Date,Battery_ID,Battery_Age_Days,Number_Of_Trips,Daily_Trip_Duration,Daily_Cycles_Used,Lifetime_Cycles_Used,Battery_Rated_Cycles
0,2013-01-01,0,0,5,67.845608,0.166920,0.166920,200.0
1,2013-01-02,0,1,5,53.450798,0.131505,0.298425,200.0
2,2013-01-03,0,2,5,58.841433,0.144767,0.443193,200.0
3,2013-01-04,0,3,5,60.638403,0.149188,0.592381,200.0
4,2013-01-05,0,4,5,62.646910,0.154130,0.746511,200.0
5,2013-01-06,0,5,5,56.939079,0.140087,0.886598,200.0
6,2013-01-07,0,6,5,58.644465,0.144283,1.030881,200.0
7,2013-01-08,0,7,5,57.124115,0.140542,1.171423,200.0
8,2013-01-09,0,8,5,49.177812,0.120992,1.292415,200.0
9,2013-01-10,0,9,5,58.959360,0.145058,1.437473,200.0


In [63]:
time_series_df['Daily_Cycles_Used'].sum()

218.67700814849968

In [64]:
reference_batteries.iloc[0]

Survival_In_Days        1539.000000
Trip_Length_Mean          11.579590
Trip_Length_Sigma          3.859862
Trips_Per_Day_Mean         4.561532
Trips_Per_Day_Sigma        1.140383
Battery_Rated_Cycles     200.000000
Cycles_Per_Min_Mean        0.002460
Name: 4, dtype: float64

In [66]:
time_series_df.describe()

Unnamed: 0,Battery_ID,Battery_Age_Days,Number_Of_Trips,Daily_Trip_Duration,Daily_Cycles_Used,Lifetime_Cycles_Used,Battery_Rated_Cycles
count,1539.0,1539.0,1539.0,1539.0,1539.0,1539.0,1539.0
mean,0.0,769.0,5.0,57.753331,0.14209,109.518414,200.0
std,0.0,444.415346,0.0,4.935313,0.012142,63.164398,0.0
min,0.0,0.0,5.0,42.94109,0.105648,0.16692,200.0
25%,0.0,384.5,5.0,54.331187,0.133671,55.035844,200.0
50%,0.0,769.0,5.0,57.636466,0.141803,109.390133,200.0
75%,0.0,1153.5,5.0,61.012603,0.150109,164.153188,200.0
max,0.0,1538.0,5.0,75.294341,0.185246,218.677008,200.0
