This notebook is used for feature engineering.

# Preliminaries

In [2]:
%%capture
!pip install vincenty    # calculates distances for gps

In [3]:
# Make Google Drive available to this Colab session under /gdrive
from google.colab import drive

DRIVE_MOUNT_POINT = '/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [4]:
# Go to the project folder (all relative paths below are resolved from here)
import os
os.chdir(r'/gdrive/My Drive/Colab Notebooks/GITHUB/X_FPB')

# makedirs (not mkdir) also creates a missing parent "data" folder, and
# exist_ok=True keeps this cell safe to re-run.
os.makedirs("data/prepared", exist_ok=True)


In [5]:
import sys

# Ensure the current working directory is importable (added at most once).
cwd_entry = "."
if cwd_entry not in sys.path:
  sys.path.append(cwd_entry)

In [6]:
import pandas as pd
import numpy as np
from scipy import stats
import pickle
import time
import copy

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

#import catboost

# Matplotlib defaults: fivethirtyeight style sheet, font size 12, 10x10 figures.
plt.style.use('fivethirtyeight')
matplotlib.rcParams.update({
    'font.size': 12,
    'figure.figsize': (10, 10),
})

# pandas display limits, raised so wide frames are shown in full.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Called last, so seaborn's settings are applied on top of the ones above.
sns.set(color_codes=True)

  import pandas.util.testing as tm


# Dataset Preparation

**Prerequisites**:<br>
* The initial datasets must first be downloaded from <https://challengedata.ens.fr/participants/challenges/21/> <br>
and copied into the `data/` subfolder.<br>
* The core of the Python code is in the `helpers/data_prep.py` module.

In [7]:
# The core of the Python code is in the helpers/data_prep.py module.
import helpers.data_prep as data_prep

In [8]:
# Development aid (disabled): uncomment to re-import helpers.data_prep after editing it.
# import importlib
# importlib.reload(data_prep)  # Reimport after change

<module 'helpers.data_prep' from '/gdrive/My Drive/Colab Notebooks/GITHUB/X_FPB/helpers/data_prep.py'>

In [None]:
# Disabled scratch cell (kept for reference): loads the datasets and runs only data_prep.add_waypoints.
# x_train, y_train, x_test, x_train_ad, x_test_ad = data_prep.load_all(data_dir='data/')
# x_train = pd.concat([x_train, x_train_ad], axis=1)
# x_test = pd.concat([x_test, x_test_ad], axis=1)
# data_prep.add_waypoints(x_train)
# data_prep.add_waypoints(x_test)

In [None]:
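# Disabled scratch cell (kept for reference): loads the datasets and runs only data_prep.add_gps_tracking_info on x_train.
# x_train, y_train, x_test, x_train_ad, x_test_ad = data_prep.load_all(data_dir='data/')
# x_train = pd.concat([x_train, x_train_ad], axis=1)
# data_prep.add_gps_tracking_info(x_train)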

In [9]:
#@title Execute data prepartation
%%time
import warnings
warnings.filterwarnings('ignore')

x_train, y_train, x_test, x_train_1hot, x_test_1hot = data_prep.data_prep_all()

CPU times: user 7min, sys: 4.11 s, total: 7min 4s
Wall time: 7min 10s


In [10]:
# Preview the first two rows of the prepared training frame
x_train.head(n=2)

Unnamed: 0,alert reason category,alert reason,intervention on public roads,floor,location of the event,longitude intervention,latitude intervention,emergency vehicle,rescue center,date key sélection,time key sélection,status preceding selection,delta status preceding selection-selection,departed from its rescue center,longitude before departure,latitude before departure,delta position gps previous departure-departure,OSRM estimated distance,OSRM estimated duration,OSRM estimated distance from last observed GPS position,OSRM estimated duration from last observed GPS position,time elapsed between selection and last observed GPS position,updated OSRM estimated duration,selection_weekday,selection_month,selection_day,selection_hour,selection_is_holiday,OSRM_estimated_speed,departure2intervention_bearing,mid_point_lat,mid_point_lon,waypoint1_name,waypoint1_lon,waypoint1_lat,waypoint2_name,waypoint2_lon,waypoint2_lat,waypoint3_name,waypoint3_lon,waypoint3_lat,waypoint4_name,waypoint4_lon,waypoint4_lat,paris2intervention_bearing,paris2intervention_km,paris2departure_bearing,paris2departure_km,paris2mid_point_bearing,paris2mid_point_km,paris2waypoint1_bearing,paris2waypoint1_km,paris2waypoint2_bearing,paris2waypoint2_km,GPS_Tracks_records,GPS_Tracks_duration_hr,GPS_Tracks_distance_km,GPS_Tracks_mean_kmh,GPS_Tracks_std_kmh,GPS_Tracks_80pc_kmh,vehicule_type,vehicule_ownrer,speed_mean_kmh,intervention_place
0,3,2162,0,0,148,2.284796,48.879669,4511,2447,20180708,190243,Rentré,2027,1,2.288053,48.884698,0.0,952.5,105.8,663.2,88.8,394.01,482.81,6,7,8,19,False,9.002836,203.070219,48.882184,2.286424,Boulevard de l'Yser,2.287674,48.884954,Boulevard Gouvion-Saint-Cyr,2.284755,48.879677,Boulevard de l'Yser,2.28624,48.880195,Boulevard Gouvion-Saint-Cyr,2.284755,48.879677,297.513299,5.570931,303.685128,5.648822,300.62108,5.601765,303.769415,5.687728,297.507185,5.574011,5,0.076389,1.024271,13.417797,10.365761,19.955243,VSAV,BSPP,22.3244,226
1,3,2124,0,1,136,2.247464,48.818191,4327,2464,20180104,90259,Rentré,28233,1,2.268519,48.823958,0.0,2238.5,243.2,0.0,0.0,0.0,0.0,3,1,4,9,False,9.204359,247.420464,48.821075,2.257991,Missing,2.268451,48.823945,Avenue de Verdun,2.247447,48.818154,Missing,0.0,0.0,Avenue de Verdun,0.0,0.0,240.914608,8.79542,239.377252,7.135055,240.226026,7.964531,239.387686,7.140089,240.895118,8.798512,0,0.0,0.0,0.0,0.0,0.0,PSE,Missing,27.706635,164


# Save the prepared datasets to the folder data/prepared/.

In [11]:
# Save the prepared frames to disk.
# Each frame is paired with its base file name in one place so the two
# lists handed to save_to_disk cannot drift out of sync.
prepared_frames = [
    ('x_train', x_train),
    ('x_test', x_test),
    ('x_train_1hot', x_train_1hot),
    ('x_test_1hot', x_test_1hot),
]
dfs = [frame for _, frame in prepared_frames]
# Every file name gets the '.csv.zip' suffix; the actual on-disk format is
# decided by data_prep.save_to_disk.
file_names = [base_name + '.csv.zip' for base_name, _ in prepared_frames]

new_dir = 'data/prepared/'
# exist_ok=True makes the directory creation idempotent across re-runs.
os.makedirs(new_dir, exist_ok=True)

data_prep.save_to_disk(
    dfs=dfs,
    file_names=file_names,
    DATA_DIR=new_dir)