# Road quality assessment based on vehicle speed

Common sense tells us that when dirt road conditions are bad, an average driver will slow down in order to avoid damage to their vehicle. In this notebook, I'll try to validate this hypothesis using available car speed data recorded along dirt roads.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import os


# Folder	Car				Driver		Scenario	Distance
#----------------------------------------------------------------
#PVS 1		VW Saveiro		Driver 1	Scenario 1	13.81 km
#PVS 2		VW Saveiro		Driver 1	Scenario 2	11.62 km
#PVS 3		VW Saveiro		Driver 1	Scenario 3	10.72 km
#PVS 4		Fiat Bravo		Driver 2	Scenario 1	13.81 km
#PVS 5		Fiat Bravo		Driver 2	Scenario 2	11.63 km
#PVS 6		Fiat Bravo		Driver 2	Scenario 3	10.73 km
#PVS 7		Fiat Palio		Driver 3	Scenario 1	13.78 km
#PVS 8		Fiat Palio		Driver 3	Scenario 2	11.63 km
#PVS 9		Fiat Palio		Driver 3	Scenario 3	10.74 km

### CONSTANTS

# Common prefix of the experiment folders; the helpers below append ' <number>/<file>' to it
root_dir = '/kaggle/input/pvs-passive-vehicular-sensors-datasets/PVS'

# Dataset file names; the trailing number is the position used to pick a file from this list
filenames = [
    'video_environment_dataset_left.mp4',   # 0
    'video_environment_dataset_right.mp4',  # 1
    'video_environment.mp4',                # 2
    'dataset_gps.csv',                      # 3 - GPS data, including latitude, longitude, altitude, speed, accuracy, etc.
    'dataset_labels.csv',                   # 4 - Data classes for each sample data in the dataset (for both sides).
    'dataset_gps_mpu_left.csv',             # 5 - Inertial sensor data on the left side of the vehicle, combined with GPS data.
    'dataset_settings_left.csv',            # 6
    'dataset_mpu_right.csv',                # 7
    'video_dataset_left.mp4',               # 8
    'map.html',                             # 9
    'dataset_mpu_left.csv',                 # 10
    'dataset_settings_right.csv',           # 11
    'dataset_gps_mpu_right.csv',            # 12 - Inertial sensor data on the right side of the vehicle, combined with GPS data.
    'video_dataset_right.mp4',              # 13
]

# List of all columns from file 5 or 12, grouped here by column-name prefix
all_columns = [
    'timestamp',
    # acc_*
    'acc_x_dashboard', 'acc_y_dashboard', 'acc_z_dashboard',
    'acc_x_above_suspension', 'acc_y_above_suspension', 'acc_z_above_suspension',
    'acc_x_below_suspension', 'acc_y_below_suspension', 'acc_z_below_suspension',
    # gyro_*
    'gyro_x_dashboard', 'gyro_y_dashboard', 'gyro_z_dashboard',
    'gyro_x_above_suspension', 'gyro_y_above_suspension', 'gyro_z_above_suspension',
    'gyro_x_below_suspension', 'gyro_y_below_suspension', 'gyro_z_below_suspension',
    # mag_* (the list has no below-suspension entries for this prefix)
    'mag_x_dashboard', 'mag_y_dashboard', 'mag_z_dashboard',
    'mag_x_above_suspension', 'mag_y_above_suspension', 'mag_z_above_suspension',
    # temp_*
    'temp_dashboard', 'temp_above_suspension', 'temp_below_suspension',
    # GPS-related columns
    'timestamp_gps', 'latitude', 'longitude', 'speed',
]

# Driver/car of each experiment folder (entry i describes folder i + 1): three runs per driver
drivers = [
    name
    for name in ('D1 VW Saveiro', 'D2 Fiat Bravo', 'D3 Fiat Palio')
    for _ in range(3)
]

# Scenario of each experiment folder: every driver goes through scenarios 1 to 3 in order
scenarios = ['Scenario 1', 'Scenario 2', 'Scenario 3'] * 3

# Data tags (from dataset_labels.csv)
road_classes = ['dirt_road', 'cobblestone_road', 'asphalt_road']
quality_left_classes = ['good_road_left', 'regular_road_left', 'bad_road_left']
quality_right_classes = ['good_road_right', 'regular_road_right', 'bad_road_right']
other_classes = [
    'paved_road',
    'unpaved_road',
    'no_speed_bump',
    'speed_bump_asphalt',
    'speed_bump_cobblestone',
]

# Helper functions
def list_files(): # For unknown directory content
    """Print the full path of every file found anywhere under '/kaggle/input'."""
    for folder, _subfolders, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(folder, name))

def list_files2(fn, filenames): # For known directory content (requires number of folders and filenames)
    """Print the expected path of every dataset file in experiment folders 1..fn.

    fn: number of experiment folders (folders are numbered starting at 1).
    filenames: names of the files expected inside each folder.
    """
    # range() excludes its stop value, so fn + 1 is needed to reach the last folder
    for k in range(1, fn + 1):
        for f in filenames:
            print('{} {}/{}'.format(root_dir, k, f))
            
def load_file(k, f, filenames): # Load dataset f of experiment k
    """Read one CSV file of one experiment folder into a DataFrame.

    k: experiment folder number (appended to root_dir after a space).
    f: position of the wanted file name inside `filenames`.
    filenames: list of dataset file names.
    """
    csv_path = '{} {}/{}'.format(root_dir, k, filenames[f])
    return pd.read_csv(csv_path)
        
def save_csv(dataframe, filename): # For export a dataframe to csv file
    """Write `dataframe` to `filename` as CSV, leaving out the index column."""
    dataframe.to_csv(path_or_buf=filename, index=False)
    
def one_hot_to_label(df_in, classes, df_out, class_name): # Convert encoding type
    """Collapse a group of one-hot columns into a single label column.

    df_in: DataFrame holding one column per entry of `classes`, with 1 marking the active class.
    classes: column names of the one-hot group; the matching name becomes the label.
    df_out: DataFrame that receives the new column (it is modified and also returned).
    class_name: name of the label column written to `df_out`.

    When several columns are 1 in the same row, the first one in `classes` wins.
    Rows where none of the columns equals 1 get the string '0'.
    """
    conditions = [df_in[name] == 1 for name in classes]
    # The fallback is spelled out as a string: np.select's implicit default is the
    # integer 0, which has no common dtype with string choices and raises TypeError
    # on recent NumPy releases. '0' is what older releases produced after promotion.
    df_out[class_name] = np.select(conditions, classes, default='0')
    return df_out
    
# Signal that this setup cell (constants and helper functions) ran to the end
print("Ready.")

# Generate a single dataset containing all the relevant information for the analysis

Combining all nine observations and selecting columns with gps, speed, acceleration and label tags, we build a single dataset.

In [None]:
# Columns to use from datasets from file 5 or 12 (both should have same values on these columns)
use_columns = ['timestamp', 'latitude', 'longitude', 'speed']  # speed is in m/s

# Columns to drop from datasets: everything that is not kept
drop_columns = [name for name in all_columns if name not in use_columns]

# Columns with accelerometer values: x, y and z axis at each of the three sensor positions
acc_columns = [
    'acc_{}_{}'.format(axis, position)
    for position in ('dashboard', 'above_suspension', 'below_suspension')
    for axis in 'xyz'
]

# If only vertical
# acc_columns = ['acc_z_dashboard','acc_z_above_suspension', 'acc_z_below_suspension']

# Number of accelerometer columns, used to average their absolute values
acc_axis = len(acc_columns)


In [None]:

# Build one dataframe containing the data of all nine experiments.
# Per-experiment frames are collected in a list and concatenated once at the end,
# instead of re-concatenating the growing result on every iteration.
frames = []

# Numeric score for each quality label (higher is better)
quality_right_scores = {'good_road_right': 2, 'regular_road_right': 1, 'bad_road_right': 0}
quality_left_scores = {'good_road_left': 2, 'regular_road_left': 1, 'bad_road_left': 0}

for k in range(1, 10):
    # Load labels dataset
    dataset_labels = load_file(k, 4, filenames)
    labels_only = pd.DataFrame(columns=['road', 'quality_right', 'quality_left'])

    # Convert from one-hot encoding to single label encoding
    labels_only = one_hot_to_label(dataset_labels, road_classes, labels_only, 'road')
    labels_only = one_hot_to_label(dataset_labels, quality_right_classes, labels_only, 'quality_right')
    labels_only = one_hot_to_label(dataset_labels, quality_left_classes, labels_only, 'quality_left')

    # Convert road quality labels to numeric values.
    # map() yields a numeric column directly; labels missing from the mapping become NaN
    # and are then skipped by the mean below.
    labels_only['quality_right'] = labels_only['quality_right'].map(quality_right_scores)
    labels_only['quality_left'] = labels_only['quality_left'].map(quality_left_scores)

    # Average road quality of both sides
    labels_only['quality'] = labels_only[['quality_right', 'quality_left']].mean(axis=1)

    # Drop the per-side quality columns
    labels_only = labels_only.drop(columns=['quality_right', 'quality_left'])

    # Add columns for driver and scenario (a scalar is broadcast to every row)
    labels_only['driver'] = drivers[k - 1]
    labels_only['scenario'] = scenarios[k - 1]

    # Load the combined GPS + inertial datasets of both sides.
    # GPS columns are taken from the left file only, as they should match the right side.
    dataset_gps_left = load_file(k, 5, filenames)
    dataset_gps_right = load_file(k, 12, filenames)

    # Mean absolute accelerometer reading per side (over the acc_axis columns) ...
    sum_left = dataset_gps_left[acc_columns].abs().sum(axis=1).div(acc_axis)
    sum_right = dataset_gps_right[acc_columns].abs().sum(axis=1).div(acc_axis)

    # ... averaged over both sides into a single 'acceleration' column
    dataset_gps = dataset_gps_left.drop(columns=drop_columns)
    dataset_gps['acceleration'] = pd.concat([sum_left, sum_right], axis=1).mean(axis=1)

    # Add labels and quality labels (aligned on the row index)
    temp = pd.concat([dataset_gps, labels_only], axis=1)
    frames.append(temp)

# Single concatenation; ignore_index removes the repeated per-experiment indexes
all_data = pd.concat(frames, axis=0, ignore_index=True)
# Print results
all_data.head()

In [None]:
# Export resulting dataset
#save_csv(all_data, 'road_data.csv')  

In [None]:
# Show element counts: one value_counts() table per categorical column
count_sections = (
    ('Road types:', 'road'),
    ('\nDrivers:', 'driver'),
    ('\nScenarios:', 'scenario'),
)
for heading, column in count_sections:
    print(heading)
    print(all_data[column].value_counts())


In [None]:
# Display some basic column-wise statistics of the combined dataset
# (last expression of the cell, so the notebook renders the resulting table)
all_data.describe()

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Road types
Road types are classified according to the road surface as 'Asphalt', 'Cobblestone' and 'Dirt'. The pie chart below shows the proportion of data that correspond to each class.


In [None]:
# Road types proportion
grouped = all_data['road'].value_counts()  # number of samples per road type

fig, ax = plt.subplots(1, 1)
fig.set_size_inches(10, 10)
fig.set_facecolor((1.0, 1.0, 1.0))  # white background

ax.pie(grouped, labels=grouped.index, autopct='%1.1f%%')
ax.axis('equal')  # keep the pie circular
ax.set_title('Road types')
plt.show()

# Road quality of different road types

Data is labeled according to the road condition. For the left and right sides of the car, road quality is labeled as 'Good', 'Regular' or 'Bad'. Values are converted to a numeric type and the average of both sides is used. The pie charts below show the proportion of data that corresponds to each label.

In [None]:
# Road quality proportions for each road type
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(25, 10)
fig.suptitle('Road quality proportions')
fig.set_facecolor((1.0, 1.0, 1.0))

# Number of samples per averaged quality value, for each road type
dirt_grouped = all_data.loc[all_data['road'] == 'dirt_road', 'quality'].value_counts()
asphalt_grouped = all_data.loc[all_data['road'] == 'asphalt_road', 'quality'].value_counts()
cobblestone_grouped = all_data.loc[all_data['road'] == 'cobblestone_road', 'quality'].value_counts()

# Readable names for the averaged quality values (keyed by their string form)
map_values = {'0.0':'Bad', '0.5':'Regular bad', '1.0':'Regular', '1.5':'Regular good', '2.0':'Good'}

panels = zip(
    ax,
    (dirt_grouped, asphalt_grouped, cobblestone_grouped),
    ('Dirt road', 'Asphalt road', 'Cobblestone road'),
)
for panel_axes, counts, panel_title in panels:
    slice_names = [map_values[str(value)] for value in counts.index]
    panel_axes.pie(counts, labels=slice_names, autopct='%1.1f%%')
    panel_axes.set_title(panel_title)

plt.show()

### We can see that the 'Dirt road' class has the highest proportion of bad quality. This means that the labels are a general indicator of road condition and are not relative to each road type.

# Speed vs road quality
Let's check whether the acquired speed data has some correlation with road quality for different road types.

In [None]:
# Discard data when car is stopped (speed of 1 or less)
is_moving = all_data['speed'] > 1
moving_data = all_data.loc[is_moving]

# Percentage of discarded data
discarded_pct = (1 - len(moving_data) / len(all_data)) * 100
print('Discarded {:.2f}% of data'.format(discarded_pct))

# Data distribution for different road types: box plot on top, violin plot below
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(15, 20)

for panel_axes, draw in zip(ax, (sns.boxplot, sns.violinplot)):
    draw(x="quality", y="speed", hue="road", data=moving_data, ax=panel_axes)
    panel_axes.set_title('Speed vs road quality')
    panel_axes.grid()

plt.show()

### We can see that the recorded speed is higher for asphalt roads and that, for cobblestone and dirt roads, speed is higher when the quality is better, except for data labeled as good quality. This could mean that there are sections traveled at low speed where the road was labeled as good.

# Acceleration vs road quality
Same as previous analysis but using acceleration values.

In [None]:
# Data distribution for different road types: box plot on top, violin plot below
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(15, 20)
fig.suptitle("Acceleration vs road quality")

for panel_axes, draw in zip(ax, (sns.boxplot, sns.violinplot)):
    draw(x="quality", y="acceleration", hue="road", data=moving_data, ax=panel_axes)
    panel_axes.grid()

plt.show()

### Here we see that for cobblestone and dirt road, acceleration values have a higher amplitude than asphalt roads, which makes sense. The next plot shows acceleration data distribution for each road type, which reaffirms this observation.

In [None]:
# Overlaid acceleration histograms, one per road type, drawn on the same axes
plt.figure(figsize=(15, 10))
plt.title('Acceleration distribution on different roads')
for road_type in ('asphalt_road', 'dirt_road', 'cobblestone_road'):
    on_this_road = moving_data['road'] == road_type
    moving_data.loc[on_this_road, 'acceleration'].hist(bins=20)
# Legend entries follow the drawing order above
plt.legend(['Asphalt', 'Dirt', 'Cobblestone'])
plt.xlabel('Acceleration')
plt.ylabel('Frequency')
plt.show()

# From now on, let's focus on dirt road only


In [None]:
# Keep only dirt-road samples. An explicit copy makes dirt_data an independent
# dataframe, so columns added to it later do not write into a slice of moving_data.
dirt_data = moving_data.loc[moving_data['road'] == 'dirt_road'].copy()
dirt_data.describe()

# Trajectory identification

The goal is to plot speed variations of every driver along a continuous path or trajectory, so first we need to isolate and identify data corresponding to each one.


In [None]:
# Plot gps data

# Samples recorded on the other two road types
not_dirt_data = moving_data.loc[moving_data["road"].isin(["cobblestone_road", "asphalt_road"])]

# Plot road paths coloring scenarios of dirt road.
plt.figure(figsize=(10, 10))
# Plot other road types first, so the dirt road paths are drawn on top
plt.scatter(not_dirt_data['longitude'], not_dirt_data['latitude'], c="gray", label='Other roads')
# Add the dirt road paths, one color per scenario
colors = {'Scenario 1': 'red', 'Scenario 2': 'blue', 'Scenario 3': 'green'}
for scenario, color in colors.items():
    d = dirt_data.loc[dirt_data['scenario'] == scenario]
    plt.scatter(d['longitude'], d['latitude'], c=color, label=scenario)

plt.legend()
plt.show()

In [None]:
# Label data based on timestamp discontinuities

# dirt_data is a filtered subset of another dataframe; silence pandas' chained-assignment
# warning for the column assignment below (the setting also applies to later cells)
pd.options.mode.chained_assignment = None

thres = 120 # 2 minutes  # NOTE(review): assumes timestamps are in seconds - confirm

# Vectorized replacement for a row-by-row loop: a new path starts wherever the gap to
# the previous row exceeds the threshold. The first row is compared against 0.
timestamps = dirt_data["timestamp"].to_numpy()
starts_new_path = np.diff(timestamps, prepend=0) > thres
path_ids = np.cumsum(starts_new_path)  # running count of path starts = path number

dirt_data["label"] = ["path_{}".format(path_id) for path_id in path_ids]

# Number of the last path found (0 when there is no data)
last_label = int(path_ids[-1]) if len(path_ids) else 0

#print(dirt_data.head())
print("Total labels = {}".format(last_label))


### Here we have isolated 18 different trajectories. We know that there are three drivers, and each one traveled the same route at least once, so we identify each one based on the total number of rows or data observations.
### The best approach could involve some geospatial data clustering, but for now, let's keep this simple.

In [None]:
# Determine trajectories of interest
grouped = dirt_data.groupby("label")  # one group per trajectory label
# (label, rows) pairs, largest trajectory first
sgrouped = sorted(grouped, key=lambda pair: len(pair[1]), reverse=True)

# Print number of rows of each group as a two-column table
table_lines = ["Label\t\tSize"]
table_lines.extend("{}\t\t{}".format(label, len(rows)) for label, rows in sgrouped)
print("\n".join(table_lines))

### For determining traveled distance from gps data, we need the Haversine equation:

In [None]:
def haversine(lat1, lon1, lat2, lon2): # Vectorized Haversine equation
    """Great-circle distance between (lat1, lon1) and (lat2, lon2), given in degrees.

    Inputs may be scalars or equal-length sequences; they are stacked and converted
    to radians together. The factor 12742 scales the result (presumably the Earth's
    diameter in km, which gives distances in km).
    """
    #from https://stackoverflow.com/questions/40452759/pandas-latitude-longitude-to-distance-between-successive-rows
    lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])
    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0
    a = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2
    return 12742 * np.arcsin(np.sqrt(a))

### Finally, we select data from the first 6 labeled groups and combine them in a new dataframe to continue with the analysis

In [None]:
# Function to extract a single trajectory subset from the dirt road dataframe, with speed and distance data
def get_trj_data(label_name, new_label):
    """Return the rows of one labelled path, enriched for plotting.

    label_name: value of the 'label' column in dirt_data that selects the path.
    new_label: value stored in the added 'trj_label' column.

    The returned dataframe is an independent copy with:
      - 'speed' converted from m/s to km/h,
      - 'speed_norm': speed divided by the mean speed of this path,
      - 'dist': cumulative distance from the first point of the path (starts at 0),
      - 'trj_label': `new_label` on every row.
    """
    # Work on a copy so dirt_data itself is never modified
    df = dirt_data.loc[dirt_data["label"] == label_name].copy()

    df['speed'] = df['speed'] * 3600 / 1000 # Convert to km/h

    # Add column with normalized speed
    df['speed_norm'] = df['speed'].div(df['speed'].mean())

    # Distance between each point and the previous one. Both columns are taken in
    # full, so the shifted and unshifted values always line up row by row.
    latitude = df['latitude']
    longitude = df['longitude']
    steps = haversine(latitude.shift(), longitude.shift(), latitude, longitude)
    steps = pd.Series(np.asarray(steps, dtype=float), index=df.index)
    if len(steps):
        # The first point has no predecessor: its step is 0 rather than NaN
        steps.iloc[0] = 0.0

    # Add column with distance from start point (cumulative sum of the steps)
    df['dist'] = steps.cumsum()

    # Add new label
    df['trj_label'] = new_label

    return df

# Paths (as labelled earlier in the notebook) that make up each trajectory
trajectory_paths = (
    ('trj_1', ['path_10', 'path_16', 'path_4']),   # first trajectory
    ('trj_2', ['path_5', 'path_11', 'path_17']),   # second trajectory
)

# Extract every path and stack them into a single dataframe
trajectory_frames = [
    get_trj_data(path_label, trajectory_label)
    for trajectory_label, path_labels in trajectory_paths
    for path_label in path_labels
]
trj_data = pd.concat(trajectory_frames, axis=0)

# Replace the repeated per-path indexes with a fresh one
trj_data.reset_index(drop=True, inplace=True)
trj_data.head()

## Plots of the normalized speed vs linear position for the selected subsets

In [None]:
def trj_speed(trj_name):
    """Plot normalized speed against travelled distance for one trajectory.

    trj_name: value of the 'trj_label' column in trj_data that selects the trajectory.

    One line is drawn per driver. The y-axis label reports the mean speed over all
    rows of the trajectory.
    """
    subset = trj_data.loc[trj_data['trj_label'] == trj_name]
    plt.figure(figsize=(15,10))
    # The group key is the driver name, so it can be used directly as legend label
    for driver, g in subset.groupby('driver'):
        plt.plot(g["dist"], g["speed_norm"], '.-', label = driver)

    plt.grid()
    # Average over the whole trajectory, not over whichever driver was drawn last
    plt.ylabel("Norm. speed [avg={:.2f} km/h]".format(subset["speed"].mean()))
    plt.xlabel("Distance traveled [km]")
    plt.legend()

# Draw one figure for each of the two trajectory labels
trj_speed('trj_1')
trj_speed('trj_2')

# Partial conclusions

### As we can see, speed variations are similar between the three different drivers. Now, let's see if we can find a correlation between these speed variations and road conditions.
### For this, we use a scatter plot where each data point is colored according to its labeled quality.

In [None]:
# Rows of the first trajectory, split by averaged road quality
temp = trj_data[trj_data['trj_label'] == 'trj_1']
grouped = temp.groupby('quality')

plt.figure(figsize=(15, 10))
# One scatter series per quality value; the value itself is the legend entry
for quality, points in grouped:
    plt.scatter(points['dist'], points['speed_norm'], label=quality)
plt.title('Norm. speed vs position and road quality')
plt.xlabel('Position [km]')
plt.ylabel('Normalized speed')
plt.grid()
plt.legend()

### There are a few low-speed data points labeled as 'good', but most of the low-speed data points correspond to data labeled as bad quality, which makes sense. Let's see how the speed correlates with the total acceleration indicator.

In [None]:
# Rows of the first trajectory
temp = trj_data[trj_data['trj_label'] == 'trj_1']

# Acceleration data smoothed with rolling window size=30 (shorter windows allowed at the start)
acc = temp['acceleration'].rolling(window=30, min_periods=1).mean()

plt.figure(figsize=(15, 10))
# Speed along the trajectory, colored by the smoothed acceleration
plt.scatter(temp['dist'], temp['speed'], c=acc, cmap='brg')
cbar = plt.colorbar()
cbar.set_label('Acceleration', rotation=90)
plt.title('Velocity through trajectory and acceleration')
plt.xlabel('Distance [km]')
plt.ylabel('Speed [km/h]')
plt.grid()

### Here we can see that low-velocity data points have small acceleration values. We can say that the faster the car goes, the higher the values recorded by the accelerometers, but we cannot use this variable as an indicator of the road condition.
### It would be more appropriate to use the frequency components of the acceleration data instead of the instantaneous values; that is, a vibration analysis might shed some light on measuring the road quality.

# Car vibration analysis

Work in progress...