In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from haversine import haversine_vector, Unit, haversine
import osmnx as ox
import pickle
import os
from tqdm import tqdm
import folium
from rtree import index
import seaborn as sns
from collections import defaultdict
import warnings
import numpy as np
warnings.filterwarnings('ignore')

#### Reading the Dataset

In [None]:
# Load the bus-stop table; later cells filter it on latitude_current / longitude_current.
stops_df = pd.read_csv(r"../Dataset/bus_stop.csv")

In [None]:
# Load the raw GPS pings; later cells use the DEVICE_ID, IST_DATE, LAT and LONGITUDE columns.
gps_df = pd.read_csv(r"../Dataset/GPS.csv")

In [None]:
# Row count of the raw ping table, followed by a preview of its first rows.
print(len(gps_df))
gps_df.head()

In [None]:
# Drop duplicate pings: rows that repeat the same DEVICE_ID, IST_DATE, LAT and
# LONGITUDE are collapsed to a single row (the first occurrence is kept).
gps_df = gps_df.drop_duplicates(subset=['DEVICE_ID', 'IST_DATE', 'LAT', 'LONGITUDE'])

In [None]:
# Keep only stops and pings inside the Bangalore study area. The same box is
# passed to osmnx further down when the road graph is downloaded, so a ping
# outside it can never be matched to a road segment.
LAT_MIN, LAT_MAX = 12.8265, 13.2694
LON_MIN, LON_MAX = 77.3740, 77.8313

stops_df = stops_df[
    (stops_df['latitude_current'] > LAT_MIN) & (stops_df['latitude_current'] < LAT_MAX)
    & (stops_df['longitude_current'] > LON_MIN) & (stops_df['longitude_current'] < LON_MAX)
]

# The pings previously used 12 as their southern bound, which kept pings south
# of the area that both the stops filter and the road graph cover.
gps_df = gps_df[
    (gps_df['LAT'] > LAT_MIN) & (gps_df['LAT'] < LAT_MAX)
    & (gps_df['LONGITUDE'] > LON_MIN) & (gps_df['LONGITUDE'] < LON_MAX)
]
print(len(gps_df))
gps_df.head()

In [None]:
# Parse the ping timestamp once and derive the calendar fields (year, weekday,
# date, time of day) that the following cells inspect and filter on.
timestamps = pd.to_datetime(gps_df['IST_DATE'])
gps_df = gps_df.assign(
    IST_DATE=timestamps,
    YEAR=timestamps.dt.year,
    DAY=timestamps.dt.dayofweek,
    DATE=timestamps.dt.date,
    TIME=timestamps.dt.time,
)

In [None]:
# Number of pings recorded on each calendar date.
gps_df['DATE'].value_counts()

In [None]:
# Keep only the pings whose timestamp falls in 2019.
gps_df = gps_df[gps_df['YEAR'] == 2019]

In [None]:
# Sort by DEVICE_ID and then by IST_DATE so that each device's pings are
# contiguous and in time order; the per-device shift() below relies on this.
gps_df = gps_df.sort_values(by=['DEVICE_ID', 'IST_DATE'])
gps_df.head()

In [None]:
# For every ping, look up the position and timestamp of the same device's
# previous ping. The first ping of each device has no predecessor and gets NaN/NaT.
pings_by_device = gps_df.groupby(['DEVICE_ID'])
for source_col in ('LAT', 'LONGITUDE', 'IST_DATE'):
    gps_df[f'PREV_{source_col}'] = pings_by_device[source_col].shift(1)
gps_df.head()

In [None]:
# Drop the na values. dropna() without a subset removes a row when ANY column is
# missing; this removes each device's first ping, whose PREV_* values are NaN.
# NOTE(review): rows with gaps in unrelated columns are removed as well — confirm that is intended.
gps_df = gps_df.dropna()

### Estimating the speed

In [None]:
# Great-circle distance in metres between each ping and the same device's
# previous ping, computed row-wise with the vectorised haversine formula.
gps_df['DISTANCE'] = haversine_vector(gps_df[['LAT', 'LONGITUDE']], gps_df[['PREV_LAT', 'PREV_LONGITUDE']], Unit.METERS)

In [None]:
# Seconds elapsed since the same device's previous ping. Both columns are
# already datetimes (IST_DATE was parsed earlier and PREV_IST_DATE is a shifted
# copy of it), so they can be subtracted directly.
elapsed = gps_df['IST_DATE'] - gps_df['PREV_IST_DATE']
gps_df['TIME_DIFF'] = elapsed.dt.total_seconds()

In [None]:
# Row count and preview after the distance / time-difference columns were added.
print(len(gps_df))
gps_df.head()

In [None]:
# Scatter of the distance covered between consecutive pings against the time
# that elapsed between them.
fig, ax = plt.subplots()
ax.scatter(gps_df['DISTANCE'], gps_df['TIME_DIFF'])
ax.set_xlabel('Distance')
ax.set_ylabel('Time Difference')
plt.show()

In [None]:
# Speed in km/h: DISTANCE is in metres and TIME_DIFF in seconds, so the ratio is
# m/s and the factor 18/5 converts it to km/h. The 0.0001 s added to the
# denominator avoids a division by zero when two pings share a timestamp.
gps_df['SPEED'] = gps_df['DISTANCE'] / (gps_df['TIME_DIFF'] + 0.0001) * 18 / 5

# Plot the scatter plot speed with respect index
plt.scatter(gps_df.index, gps_df['SPEED'])
plt.ylim(0, 200)
plt.xlabel('Index')
plt.ylabel('Speed')
plt.show()

In [None]:
# Number of pings whose speed lies strictly between 0 and 1 km/h.
crawling = (gps_df['SPEED'] > 0) & (gps_df['SPEED'] < 1)
crawling.sum()

In [None]:
# Smallest estimated speed in the table.
gps_df['SPEED'].min()

In [None]:
# Snapshot of the table so the filtering cell below can be re-run from a known state.
gps_df_copy = gps_df.copy()

In [None]:
# Restart from the snapshot, then keep only pings that arrive less than 70 s
# after the device's previous ping and imply a speed below 100 km/h.
gps_df = gps_df_copy.copy()
gps_df = gps_df.loc[(gps_df["TIME_DIFF"] < 70) & (gps_df["SPEED"] < 100), :]

### Estimating the consecutive time at same location for each GPS ping

In [None]:
# reset the index
gps_df = gps_df.reset_index(drop=True)

# A ping counts as "stopped" when its estimated speed is at most 1 km/h.
is_stopped = gps_df['SPEED'] <= 1

# Number the runs of consecutive stopped pings per device: the counter advances
# on every moving ping, so all stopped pings that follow it share one run id.
run_id = (~is_stopped).astype(int).groupby(gps_df['DEVICE_ID']).cumsum()

# STOP_TIME is the running sum of TIME_DIFF inside a run. Moving pings
# contribute 0 and open a new run, so their STOP_TIME stays 0.
#
# This replaces a row-by-row loop that looked up gps_df0.loc[0] on its first
# iteration and raised KeyError whenever row 0 was not itself a stopped ping;
# the loop also rebound the name `index`, which the notebook imports from rtree.
stopped_seconds = gps_df['TIME_DIFF'].where(is_stopped, 0.0)
gps_df['STOP_TIME'] = stopped_seconds.groupby([gps_df['DEVICE_ID'], run_id]).cumsum()

In [None]:
# Checkpoint 1: persist the cleaned pings (with DISTANCE, TIME_DIFF, SPEED and
# STOP_TIME) so the later stages can restart from here.
gps_df.to_csv(r"../Dataset/gps_point1.csv", index=False)

#### Check Point 1

In [None]:
# Reload checkpoint 1. Columns come back as read_csv infers them, so date
# columns are plain strings again from this point on.
gps_df = pd.read_csv(r"../Dataset/gps_point1.csv")

In [None]:
# Snapshot used as the starting point by the stop-detection cells below.
gps_df_copy = gps_df.copy()

GPS pings with a STOP_TIME greater than 120 seconds are assumed to be stops.

In [None]:
# Collect the distinct (LAT, LONGITUDE) coordinates at which some ping has
# accumulated more than `stop_time` seconds of consecutive standstill.
stop_time = 120
gps_df = gps_df_copy.copy()
# Only the two key columns are selected, so count() has nothing to count; the
# groupby is used purely to obtain one row per distinct coordinate pair.
st = gps_df.loc[gps_df['STOP_TIME'] > stop_time, ['LAT', 'LONGITUDE']].groupby(['LAT', 'LONGITUDE']).count()
st = st.reset_index()
len(st)

In [None]:
# Remove pings that belong to a long stop, in two steps.
gps_df = gps_df_copy.copy()

# 1) Pings whose own accumulated stop time reached the `stop_time` threshold.
gps_df = gps_df[gps_df['STOP_TIME'] < stop_time]

# 2) Slow pings (< 10 km/h) located at a coordinate where a long stop was seen.
#    The coordinate is matched as a (LAT, LONGITUDE) pair: testing the two
#    columns independently would also drop pings that merely share a latitude
#    with one long-stop location and a longitude with a different one.
stop_locations = pd.MultiIndex.from_frame(st[['LAT', 'LONGITUDE']])
ping_locations = pd.MultiIndex.from_frame(gps_df[['LAT', 'LONGITUDE']])
at_stop_location = ping_locations.isin(stop_locations)
gps_df = gps_df[~((gps_df['SPEED'] < 10) & at_stop_location)]

len(gps_df)

In [None]:
# Snapshot of the table after the long-stop pings were removed.
gps_df_copy = gps_df.copy()

In [None]:
# Keep only the devices whose pings add up to more than 10000 m of travel.
gps_df = gps_df_copy.copy()
distance_per_device = gps_df.groupby(['DEVICE_ID'])['DISTANCE'].transform('sum')
gps_df = gps_df[distance_per_device > 10000]
len(gps_df)

In [None]:
# Average number of pings per device.
gps_df.DEVICE_ID.value_counts().mean()

In [None]:
# Pings of the devices that reported more than 2000 pings (the threshold in the
# code is 2000; an earlier comment said 1000).
pings_per_device = gps_df.groupby(['DEVICE_ID'])['DEVICE_ID'].transform('size')
gps_df2 = gps_df[pings_per_device > 2000]
len(gps_df2)

In [None]:
# Pings of one example device (ID 150218177) and how many there are.
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177, :]
len(gps_df_bus1)

In [None]:
# Render one folium map per device (first five device IDs), drawing every ping
# as a small circle marker.
os.makedirs('./maps', exist_ok=True)  # Map.save() does not create missing folders
for device_id in gps_df['DEVICE_ID'].unique()[:5]:
    gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == device_id, :]
    m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
    # The loop variable is not called `index`: that name is the rtree module
    # imported at the top of the notebook.
    for ping_id, row in gps_df_bus1.iterrows():
        folium.CircleMarker([row['LAT'], row['LONGITUDE']], radius=0.01, color='blue', fill=True).add_to(m)
    m.save(f'./maps/bus_{device_id}.html')

### Filtering the GPS pings based on the Speed

In [None]:
# Filter the gps data with respect to speed less than 120
# NOTE(review): the table was already restricted to SPEED < 100 before
# checkpoint 1, so this filter looks like a no-op — confirm.
gps_df = gps_df[gps_df['SPEED'] < 120]
print(gps_df.SPEED.mean())

In [None]:
# Snapshot of gps_df_bus1.
# NOTE(review): at this point gps_df_bus1 holds whatever the preceding map loop
# assigned last, which need not be device 150218177 — confirm this is intended.
gps_df_bus1_copy = gps_df_bus1.copy()

In [None]:
# Re-select the example device from the speed-filtered table and show its first 100 pings.
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177, :]
gps_df_bus1.head(100)

### Bangalore Graph

In [None]:
# Download the drivable road network for the study area.
# The bounds are passed by keyword. Positionally, the four-argument form of
# graph_from_bbox is (north, south, east, west), so the previous call handed the
# southern latitude in as `north` and the western longitude in as `east`.
# NOTE(review): osmnx 2.x takes a single `bbox` tuple instead — adjust when upgrading.
G = ox.graph_from_bbox(north=13.2694, south=12.8265, east=77.8313, west=77.3740, network_type='drive')

# edges from G
edges = ox.graph_to_gdfs(G, nodes=False, edges=True)

In [None]:
# Copy the edge table returned by graph_to_gdfs into a plain pandas DataFrame.
edges_df = pd.DataFrame(edges)

In [None]:
# Turn the edge index (kept via drop=False) into ordinary columns so `u` and `v`
# can be selected below.
edges_df = edges_df.reset_index(drop=False)

# Keep edges longer than 20 and shorter than 200 (units of the `length` column).
long_enough = edges_df['length'] > 20
short_enough = edges_df['length'] < 200
edges_df = edges_df[long_enough & short_enough]

# Restrict the table to the attributes used in the rest of the notebook.
kept_columns = ['osmid', 'highway', 'length', 'geometry', 'reversed', 'oneway', 'u', 'v']
edges_df = edges_df[kept_columns]

# Endpoint coordinates taken from the geometry: coords[0] is the first vertex,
# coords[-1] the last; within a vertex, position 0 is stored as longitude and
# position 1 as latitude.
endpoint_specs = {
    'lat_u': (0, 1),
    'long_u': (0, 0),
    'lat_v': (-1, 1),
    'long_v': (-1, 0),
}
for column, (vertex_pos, axis_pos) in endpoint_specs.items():
    edges_df[column] = edges_df['geometry'].apply(
        lambda line, vertex_pos=vertex_pos, axis_pos=axis_pos: line.coords[vertex_pos][axis_pos]
    )

In [None]:
# Preview of the prepared edge table.
edges_df.head(5)

### Mapping the GPS pings to the nearest segment

In [None]:
# Re-import: earlier loops reuse the name `index` as a loop variable.
from rtree import index

# Spatial index over the edges. Each edge is registered under its edges_df index
# label with the rectangle spanned by its two endpoints, given in
# (lat, lon, lat, lon) order.
idx = index.Index()

edge_endpoints = zip(
    edges_df.index,
    edges_df['lat_u'], edges_df['long_u'],
    edges_df['lat_v'], edges_df['long_v'],
)
for edge_label, lat_a, lon_a, lat_b, lon_b in tqdm(edge_endpoints, total=len(edges_df)):
    lat_low, lat_high = sorted((lat_a, lat_b))
    lon_low, lon_high = sorted((lon_a, lon_b))
    idx.insert(edge_label, (lat_low, lon_low, lat_high, lon_high))

In [None]:
def find_stop_segment(gps_lat, gps_lon):
    """Return the id of a road segment the GPS ping lies on, or None.

    Candidate segments are those whose endpoint rectangle in the R-tree `idx`
    contains the ping. A candidate is accepted when the detour through the ping
    (start -> ping -> end) is at most 2% longer than the straight start -> end
    distance. The first accepted candidate is returned; the candidates are not
    ranked against each other.

    Args:
        gps_lat: latitude of the ping.
        gps_lon: longitude of the ping.

    Returns:
        The edges_df index label of the accepted segment, or None when no
        candidate passes the test.
    """
    ping = (gps_lat, gps_lon)
    for segment_id in idx.intersection((gps_lat, gps_lon, gps_lat, gps_lon)):
        start_lat, start_lon, end_lat, end_lon = edges_df.loc[segment_id, ['lat_u', 'long_u', 'lat_v', 'long_v']]

        # The ping must fall inside the rectangle spanned by the two endpoints.
        lat_low, lat_high = sorted((start_lat, end_lat))
        lon_low, lon_high = sorted((start_lon, end_lon))
        if not (lat_low <= gps_lat <= lat_high and lon_low <= gps_lon <= lon_high):
            continue

        start, end = (start_lat, start_lon), (end_lat, end_lon)
        via_ping = haversine(ping, start, Unit.METERS) + haversine(ping, end, Unit.METERS)
        direct = haversine(start, end, Unit.METERS)
        if via_ping <= 1.02 * direct:
            return segment_id
    return None  # GPS ping does not belong to any stop segment

In [None]:
# Match every ping to a road segment (None when no segment qualifies); the
# results are collected in ping order and attached as a column in the next cell.
print("Mapping to nearest segment...")
length = len(gps_df)
ping_coordinates = zip(gps_df['LAT'], gps_df['LONGITUDE'])
segment_list = [
    find_stop_segment(ping_lat, ping_lon)
    for ping_lat, ping_lon in tqdm(ping_coordinates, total=length)
]

In [None]:
# Attach the matched segment ids; segment_list is in the same row order as gps_df.
gps_df['segment_id'] = segment_list

In [None]:
# Attach the matched edge's attributes to every ping before checkpointing. The
# exploratory cells after the checkpoint read length / osmid / lat_u / ... from
# the ping table itself, and no other cell adds those columns. segment_id holds
# edges_df index labels, so the join is made against the edges_df index;
# unmatched pings (segment_id missing) get NaN attributes.
edge_attr_cols = ['osmid', 'lat_u', 'long_u', 'lat_v', 'long_v', 'length', 'oneway', 'highway']
gps_df = (
    gps_df
    .drop(columns=edge_attr_cols, errors='ignore')  # keeps the cell re-runnable
    .join(edges_df[edge_attr_cols], on='segment_id')
)

# Save the gps_df to csv
gps_df.to_csv(r"../Dataset/gps_point2.csv", index=False)

In [None]:
# Save the edges_df to csv. The segment_id values assigned to the pings are
# edges_df index labels, and after the length filter those labels are no longer
# 0..n-1. Writing with index=False would discard them, so the index is written
# out as an explicit `segment_id` column.
edges_df.rename_axis('segment_id').to_csv(r"../Dataset/edges.csv")

In [None]:
# Number of pings that were matched to a segment.
gps_df.segment_id.notna().sum()

In [None]:
# Smallest number of pings found on any matched segment.
gps_df.segment_id.value_counts().min()

In [None]:
# Map every matched ping (blue dots) together with the road segments they were
# matched to (red lines with a marker on each endpoint).
gps_df_1 = gps_df.loc[gps_df.segment_id.notna(), :]
segments = gps_df_1.segment_id.unique().tolist()

segments_df = edges_df.loc[edges_df.index.isin(segments), ['osmid', 'lat_u', 'long_u', 'lat_v', 'long_v']]

m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
# `total=` gives tqdm a real progress bar; a bare iterator has no length.
for ping_lat, ping_lon in tqdm(zip(gps_df_1['LAT'], gps_df_1['LONGITUDE']), total=len(gps_df_1)):
    folium.CircleMarker([ping_lat, ping_lon], radius=0.01, color='blue', fill=True).add_to(m)
for seg in tqdm(segments_df.itertuples(index=False), total=len(segments_df)):
    start, end = [seg.lat_u, seg.long_u], [seg.lat_v, seg.long_v]
    folium.PolyLine([start, end], color='red').add_to(m)
    folium.Marker(start, icon=folium.Icon(color='red')).add_to(m)
    folium.Marker(end, icon=folium.Icon(color='red')).add_to(m)
# This map covers all buses. It used to be saved as 'bus1.html', the same name
# the single-bus map in the following mapping cell is saved under, so it was
# overwritten there.
m.save('all_buses_matched.html')

In [None]:
# Number of pings currently held in gps_df_bus1.
len(gps_df_bus1)

In [None]:
# Re-select the example bus from the current gps_df: the gps_df_bus1 slice taken
# earlier in the notebook predates the segment_id column and does not have it.
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177, :]
gps_df_bus2 = gps_df_bus1.loc[gps_df_bus1.segment_id.notna(), :]
segments = gps_df_bus2.segment_id.unique().tolist()

segments_df = edges_df.loc[edges_df.index.isin(segments), ['osmid', 'lat_u', 'long_u', 'lat_v', 'long_v']]

# Matched pings of this bus as blue dots, its segments as red lines with a
# marker on each endpoint.
m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
for ping_lat, ping_lon in zip(gps_df_bus2['LAT'], gps_df_bus2['LONGITUDE']):
    folium.CircleMarker([ping_lat, ping_lon], radius=0.01, color='blue', fill=True).add_to(m)
for seg in segments_df.itertuples(index=False):
    start, end = [seg.lat_u, seg.long_u], [seg.lat_v, seg.long_v]
    folium.PolyLine([start, end], color='red').add_to(m)
    folium.Marker(start, icon=folium.Icon(color='red')).add_to(m)
    folium.Marker(end, icon=folium.Icon(color='red')).add_to(m)
m.save('bus1.html')

## Exploratory Data Analysis

##### Check Point 2

In [None]:
# Reload the map-matched pings from the checkpoint. The checkpoint cell writes
# to ../Dataset/, so the same location is read here (this cell used to look in
# ./Dataset/). When the file is missing, the in-memory gps_df is left untouched.
GPS_CHECKPOINT_PATH = '../Dataset/gps_point2.csv'
if os.path.exists(GPS_CHECKPOINT_PATH):
    gps_df = pd.read_csv(GPS_CHECKPOINT_PATH)
else:
    print("File not found")

In [None]:
# Reload the edge table from the checkpoint. It is written to ../Dataset/, so
# the same location is read here (this cell used to look in ./Dataset/). When
# the file is missing, the in-memory edges_df is left untouched.
EDGES_CHECKPOINT_PATH = '../Dataset/edges.csv'
if os.path.exists(EDGES_CHECKPOINT_PATH):
    edges_df = pd.read_csv(EDGES_CHECKPOINT_PATH)
else:
    print("File not found")

In [None]:
# Snapshot of the pings as loaded for the exploratory analysis.
gps_df_copy = gps_df.copy()

In [None]:
# Parse IST_DATE into datetimes; the hour of day is extracted from it further down.
gps_df['IST_DATE'] = pd.to_datetime(gps_df['IST_DATE'])

In [None]:
# Apply the speed limit filter: keep only pings slower than 120 km/h.
gps_df = gps_df[gps_df['SPEED'] < 120]

In [None]:
# Remove the pings that were not matched to any segment (segment_id is null).
gps_df = gps_df[gps_df['segment_id'].notna()]
len(gps_df)

In [None]:
# Number of pings whose `length` value exceeds 5000.
(gps_df['length'] > 5000).sum()

In [None]:
# Keep only the pings on segments that collected more than 200 pings.
pings_per_segment = gps_df['segment_id'].value_counts()
segments = pings_per_segment.index[(pings_per_segment > 200).to_numpy()].tolist()
gps_df1 = gps_df[gps_df['segment_id'].isin(segments)]
print("Length of gps_df", len(gps_df1))
print("No of segments", len(segments))

In [None]:
# One row per distinct combination of segment id and edge attributes.
# NOTE(review): the edge attributes (osmid, lat_u, ..., highway) are read from
# the ping table here; confirm the pings carry those columns at this point.
segments_df1 = gps_df1[
    ['segment_id', 'osmid', 'lat_u', 'long_u', 'lat_v', 'long_v', 'length', 'oneway', 'highway']].drop_duplicates()

In [None]:
# Map the busy segments, coloured by length band.
m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)

# segments_df1 (built in the previous cell) carries the `length` column this
# loop colours by. The older `segments_df` that was used here only has
# osmid / lat_u / long_u / lat_v / long_v, so row['length'] raised KeyError.
segments = segments_df1.drop_duplicates(subset=['lat_u', 'long_u', 'lat_v', 'long_v'])

# (exclusive upper length bound, colour); anything longer is drawn in green.
length_bands = [(20, 'red'), (1000, 'blue'), (4000, 'brown')]

for seg in tqdm(segments.itertuples(index=False), total=len(segments)):
    band_colour = next((colour for upper, colour in length_bands if seg.length < upper), 'green')
    folium.PolyLine([[seg.lat_u, seg.long_u], [seg.lat_v, seg.long_v]], color=band_colour).add_to(m)
m.save('segments_4.html')

In [None]:
# gps_df1 is a filtered selection of gps_df. Take an explicit copy before adding
# a column so the assignment cannot be lost on, or leak into, the parent frame
# (the resulting warning is silenced globally in this notebook).
gps_df1 = gps_df1.copy()
gps_df1['HOUR'] = gps_df1['IST_DATE'].dt.hour

# Number of pings per (segment, hour of day).
segments_df = gps_df1.groupby(['segment_id', 'HOUR']).size().reset_index(name='counts')

In [None]:
# Restrict to the hours 5..21 (17 hourly slots).
segments_df = segments_df[segments_df['HOUR'].between(5, 21)]

# Keep the segments that have a row for at least 17 of those slots, i.e. pings
# in every hour of the window.
slots_per_segment = segments_df.groupby(['segment_id'])['HOUR'].transform('size')
segments_df = segments_df[slots_per_segment >= 17]

print(len(segments_df))
print(segments_df['segment_id'].nunique())

In [None]:
# Attach the per-segment attributes held in segments_df1 (length, endpoints,
# ...) to every (segment, hour) row; the left join keeps all rows of segments_df.
segments_df = segments_df.merge(segments_df1, on='segment_id', how='left')

In [None]:
# Mean and standard deviation of SPEED per (segment, hour). Passing a list of
# functions to agg yields two-level column labels (('SPEED', 'mean'),
# ('SPEED', 'std')); they are flattened when the columns are renamed in the next cell.
segments_df2 = gps_df1.groupby(['segment_id', 'HOUR']).agg({'SPEED': ['mean', 'std']}).reset_index()

# Filter the dataframe based on HOUR from 5 to 21
segments_df2 = segments_df2[(segments_df2['HOUR'] >= 5) & (segments_df2['HOUR'] <= 21)]

# Consider the segments which are having pings across all the hours
# (17 hourly slots between 5 and 21 inclusive).
segments_df2 = segments_df2.groupby(['segment_id']).filter(lambda x: len(x) >= 17)


In [None]:
# reset the index
segments_df2 = segments_df2.reset_index(drop=True)

# Replace the column labels with flat names, in the existing column order.
segments_df2.columns = ['segment_id', 'HOUR', 'AVG_SPEED', 'STD_DEV_SPEED']

segments_df2.head()

In [None]:
# Build one row per segment: the average speed of each hour becomes a column
# named by the hour (5, 6, ..., 21) and the standard deviation of each hour a
# column named v<hour> (v5, v6, ...).
segments_df3 = segments_df2.pivot(index='segment_id', columns='HOUR', values='AVG_SPEED').reset_index()
segments_df4 = segments_df2.pivot(index='segment_id', columns='HOUR', values='STD_DEV_SPEED').reset_index()
segments_df4.columns = ['segment_id'] + [f'v{i}' for i in segments_df4.columns[1:]]

# Merge the segments_df3 with segments_df4
segments_df3 = segments_df3.merge(segments_df4, on='segment_id', how='left')

# Merge the segments_df3 with segments_df1 to add the segment attributes
segments_df3 = segments_df3.merge(segments_df1, on='segment_id', how='left')

len(segments_df3)


In [None]:
# Column labels of the merged per-segment table.
segments_df3.columns

In [None]:
# Persist the per-segment hourly speed table (written as segments_df4.csv).
segments_df3.to_csv(r"./Dataset/segments_df4.csv", index=False)

#### Check Point 3

In [None]:
# Reload the per-segment hourly speed table. The export cell above writes
# segments_df4.csv; this cell used to read segments_df3.csv, a file the
# notebook never writes, so a fresh run either failed or analysed stale data.
segments_df3 = pd.read_csv(r"./Dataset/segments_df4.csv")

In [None]:
# After a CSV round trip the hour columns are labelled '5' ... '21'. Map those
# labels back to the integers 5 ... 21; labels that are absent are skipped.
col_list = [str(i) for i in range(5, 22)]
col_dict = {str(i): i for i in range(5, 22)}
segments_df3 = segments_df3.rename(columns=col_dict)

In [None]:
# Box plot of the per-segment average speed for every hour, with the hourly
# mean overlaid in red.
col_list = [i for i in range(5, 22)]
ax = sns.boxplot(data=segments_df3[col_list], width=0.75, showfliers=False)
sns.stripplot(data=segments_df3[col_list], jitter=False, color='blue', alpha=0.1, ax=ax)

# The boxes sit at x = 0, 1, 2, ...; the means are drawn at those positions and
# joined by one line that passes through every marker. (The line used to be
# assembled from segments shifted by half a box, so it missed the markers.)
mean_list = [segments_df3[col].mean() for col in col_list]
box_positions = list(range(len(col_list)))
ax.plot(box_positions, mean_list, color='red', linestyle='-', linewidth=2)
ax.scatter(box_positions, mean_list, color='red', zorder=5)

# Set labels and title
ax.set_xticks(box_positions)
ax.set_xticklabels(col_list)
ax.set_xlabel('Hour')
ax.set_ylabel('Average Speed (kmph)')
ax.set_title('Total Average Speed')

# Show the plot
plt.show()

In [None]:
# Bar plot of how many segments fall into each `oneway` value.
# (A second plot for `highway` is kept below, commented out.)
# fig, ax = plt.subplots(1, 1, figsize=(15, 5))
sns.countplot(x='oneway', data=segments_df3)
# sns.countplot(x='highway', data=segments_df3, ax=ax[1],rotation=90)
plt.show()

In [None]:
# Distribution of segment lengths below 1000: a density-normalised histogram
# with a KDE curve. histplot replaces distplot, which seaborn has deprecated.
short_segment_lengths = segments_df3.loc[segments_df3['length'] < 1000, 'length']
sns.histplot(short_segment_lengths, kde=True, bins=100, color='orange', stat='density')
plt.xlabel('length')
plt.ylabel('Density')
plt.title('Distribution of segment lengths')
plt.show()

In [None]:
# segments_df3.highway.value_counts()

In [None]:
# # Replace the values in highway column using dictionary
# word_dict = {'primary_link':'primary','secondary_link':'secondary','trunk_link':'trunk',"['primary_link', 'primary']":'primary',
#              "['secondary_link','secondary']":'secondary',"['trunk_link','trunk']":'trunk','tertiary_link':'tertiary',"['tertiary_link','tertiary']":'tertiary','motorway_link':'motorway', 'unclassified': 'primary', "['trunk', 'trunk_link']": 'trunk', "['trunk', 'primary']" : 'primary', 'trunk':'primary',''}
# segments_df3 = segments_df3.replace({"highway": word_dict})
# segments_df3.highway.value_counts()

### Descriptive Analysis

In [None]:
# Summary statistics of the numeric columns of the per-segment table.
segments_df3.describe()

## Modelling

Modelling using different models

1) Clustering using KMeans
2) Clustering using DBSCAN
3) Clustering using Spectral Clustering
4) Clustering using Agglomerative Clustering
5) Clustering using GMM

In [None]:
# Clustering the segments based on speeds
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

col_list = [i for i in range(5, 22)]
# col_list.append('length')
# col_list.append('oneway')
wcss = []
silhouette_scores = []

# Standardise the 17 hourly average-speed columns before clustering.
X = np.array(segments_df3[col_list])
scaler = StandardScaler()
X = scaler.fit_transform(X)
labels_dict = defaultdict(list)

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    labels_dict[i] = kmeans.fit_predict(X)
    wcss.append(kmeans.inertia_)
    # The silhouette score needs at least two clusters.
    if i > 1:
        silhouette_scores.append(silhouette_score(X, labels_dict[i]))
    # Keep the k=3 centres; the PCA scatter plots further down project them.
    if i == 3:
        cluster_centers_kmeans = kmeans.cluster_centers_


# Plot the graph to visualize the Elbow Method to find the optimal number of cluster and add dots at the points
plt.plot(range(1, 11), wcss, color='blue', marker='o', markerfacecolor='red', markersize=3)
# plt.scatter(range(1, 11), wcss, color='red')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.xlim(1, 11)
plt.ylabel('Within-cluster scatter, W')
plt.show()

# Silhouette Score (k = 2 .. 10). The figure now has axis labels and is shown
# explicitly, like the elbow plot above.
plt.plot(range(2, 11), silhouette_scores, color='blue', marker='o', markerfacecolor='red', markersize=3)
plt.title('The Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# One folium map per k, with every segment drawn in the colour of its cluster.
# labels_dict holds runs for k = 1 .. 10, so cluster ids go up to 9; the palette
# used to stop at id 3 and the lookup raised KeyError from k = 5 onwards.
colors_dict = {0: 'red', 1: 'blue', 2: 'green', 3: 'orange', 4: 'purple',
               5: 'brown', 6: 'pink', 7: 'gray', 8: 'olive', 9: 'cyan'}
for key, labels in labels_dict.items():
    m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
    segments_df3['cluster'] = labels
    segment_rows = zip(segments_df3['cluster'], segments_df3['lat_u'], segments_df3['long_u'],
                       segments_df3['lat_v'], segments_df3['long_v'])
    for cluster_id, lat_u, long_u, lat_v, long_v in tqdm(segment_rows, total=len(segments_df3)):
        color = colors_dict[cluster_id]
        folium.PolyLine([[lat_u, long_u], [lat_v, long_v]], color=color).add_to(m)
    m.save(f'{key}.html')

### Box plots for every cluster based on results from the KMeans clustering

In [None]:
# Box plots of the hourly average speed, one figure per k-means cluster (k = 3).
segments_df3['cluster'] = labels_dict[3]
# Iterate the k=3 labels explicitly. `labels` is otherwise whatever the map loop
# in the previous cell left behind (the k=10 run), which produced seven empty
# figures for cluster ids that do not exist in the k=3 assignment.
labels = labels_dict[3]
col_list = [i for i in range(5, 22)]
box_positions = list(range(len(col_list)))
mean_dict = defaultdict(list)

sns.set(style="whitegrid")
colors = sns.color_palette("pastel")

for l in sorted(set(labels)):
    df = segments_df3.loc[segments_df3['cluster'] == l, col_list]
    plt.figure().patch.set_facecolor('white')
    ax = sns.boxplot(data=df, width=0.75, showfliers=False, palette=colors)
    sns.stripplot(data=df, jitter=False, color='blue', alpha=0.1, ax=ax)

    # Hourly means, drawn at the box positions and joined by a single line that
    # passes through every marker.
    mean_list = [df[col].mean() for col in col_list]
    ax.plot(box_positions, mean_list, color='red', linestyle='-', linewidth=2)
    ax.scatter(box_positions, mean_list, color='red', zorder=5)
    mean_dict[l] = mean_list

    # Set labels and title
    ax.set_xticks(box_positions)
    ax.set_xticklabels(col_list)
    ax.set_xlabel('Hour')
    ax.set_ylabel('Average Speed (kmph)')
    ax.set_title(f'Total Average Speed across cluster_{l+1}')

    # Show the plot
    plt.show()

In [None]:
# Plotting the mean values of each cluster across the hours.
# The curves are plotted against the actual hours (5 .. 21); without explicit
# x values the axis labelled 'Hour' showed list positions 0 .. 16.
hours = list(range(5, 22))
for l in sorted(mean_dict):
    plt.plot(hours, mean_dict[l], label=f'cluster_{l+1}')
plt.legend()
plt.xlabel('Hour')
plt.ylabel('Average Speed (kmph)')
plt.title('Average Speed across clusters (KMeans before reduction)')
plt.show()

### Hierarchical Clustering

In [None]:
# Number of segments per cluster for the labels currently stored in segments_df3['cluster'].
segments_df3.cluster.value_counts()

In [None]:
# Clustering using Hierarchical (agglomerative) Clustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

col_list = [i for i in range(5, 22)]
silhouette_scores = []
# scaler = StandardScaler()
# X = scaler.fit_transform(segments_df3[col_list])
# The hourly speeds are clustered unscaled here.
X = np.array(segments_df3[col_list])
labels_dict_heir = defaultdict(list)
metrics = ['euclidean', 'l1']

for metric in metrics:
    for i in range(2, 4):
        # Use the metric of the current iteration; it used to be hard-coded to
        # 'l1', so the 'euclidean_*' entries were l1 results under another name.
        model = AgglomerativeClustering(n_clusters=i, metric=metric, linkage='average')
        run_key = f'{metric}_{i}'
        labels_dict_heir[run_key] = model.fit_predict(X)
        # Score the labels just produced, not the `labels` variable left over
        # from an earlier cell.
        silhouette_scores.append(silhouette_score(X, labels_dict_heir[run_key]))



# Plot the graph to visualize the Silhouette Score to find the optimal number of cluster
# plt.plot(range(2,11),silhouette_scores)
# plt.title('The Silhouette Score')
# plt.xlabel('Number of clusters')
# plt.ylabel('Silhouette Score')
# plt.show()

### Folium maps for different number of cluster based on results from the Agglomerative Clustering

In [None]:
# Create a folium map per clustering run and draw every segment in the colour
# of its cluster.
os.makedirs('./Maps', exist_ok=True)  # Map.save() does not create missing folders
colors_dict = {0: 'red', 1: 'blue', 2: 'green', 3: 'orange'}
for key, labels in labels_dict_heir.items():
    m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
    segments_df3['cluster'] = labels
    segment_rows = zip(segments_df3['cluster'], segments_df3['lat_u'], segments_df3['long_u'],
                       segments_df3['lat_v'], segments_df3['long_v'])
    for cluster_id, lat_u, long_u, lat_v, long_v in tqdm(segment_rows, total=len(segments_df3)):
        color = colors_dict[cluster_id]
        folium.PolyLine([[lat_u, long_u], [lat_v, long_v]], color=color).add_to(m)
    m.save(f'./Maps/{key}.html')


### Box plots for every cluster based on results from the Agglomerative Clustering

In [None]:
# Box plots of the hourly average speed, one figure per cluster of the
# agglomerative run with the second metric and 3 clusters.
segments_df3['cluster'] = labels_dict_heir[f'{metrics[1]}_{3}']
labels = labels_dict_heir[f'{metrics[1]}_{3}']
col_list = [i for i in range(5, 22)]
box_positions = list(range(len(col_list)))
mean_dict = defaultdict(list)

sns.set(style="whitegrid")
colors = sns.color_palette("pastel")

for l in sorted(set(labels)):
    df = segments_df3.loc[segments_df3['cluster'] == l, col_list]
    plt.figure().patch.set_facecolor('white')
    ax = sns.boxplot(data=df, width=0.75, showfliers=False, palette=colors)
    sns.stripplot(data=df, jitter=False, color='blue', alpha=0.1, ax=ax)

    # Hourly means, drawn at the box positions and joined by a single line that
    # passes through every marker (the line used to be offset by half a box).
    mean_list = [df[col].mean() for col in col_list]
    ax.plot(box_positions, mean_list, color='red', linestyle='-', linewidth=2)
    ax.scatter(box_positions, mean_list, color='red', zorder=5)
    mean_dict[l] = mean_list

    # Set labels and title
    ax.set_xticks(box_positions)
    ax.set_xticklabels(col_list)
    ax.set_xlabel('Hour')
    ax.set_ylabel('Average Speed (kmph)')
    ax.set_title(f'Total Average Speed across cluster_{l+1}')

    # Show the plot
    plt.show()

In [None]:
# Plotting the mean values of each cluster across the hours.
# The curves are plotted against the actual hours (5 .. 21); without explicit
# x values the axis labelled 'Hour' showed list positions 0 .. 16.
hours = list(range(5, 22))
for l in sorted(mean_dict):
    plt.plot(hours, mean_dict[l], label=f'cluster_{l + 1}')
plt.legend()
plt.xlabel('Hour')
plt.ylabel('Average Speed (kmph)')
plt.title('Average Speed across clusters (Hierarchical before reduction)')
plt.show()

In [None]:
# Number of segments per cluster for the labels currently stored in segments_df3['cluster'].
segments_df3.cluster.value_counts()

In [None]:
# Show the segments whose average speed exceeds 50 in every hourly column (5 .. 21).
col_list = [i for i in range(5, 22)]
segments_df3[(segments_df3[col_list] > 50).all(axis=1)]

### Gaussian Mixture Model

In [None]:
# Clustering using GMM
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

col_list = [i for i in range(5, 22)]
silhouette_scores = []
scaler = StandardScaler()
X = scaler.fit_transform(segments_df3[col_list])
labels_dict_gmm = defaultdict(list)

for i in range(2, 11):
    # Fixed seed so the mixture fit (and every plot derived from it) is reproducible.
    model = GaussianMixture(n_components=i, covariance_type='spherical', random_state=0)
    labels_dict_gmm[i] = model.fit_predict(X)
    # Score the labels of this fit. The score used to be computed on the
    # `labels` variable left over from an earlier cell, so every k got the
    # same value.
    silhouette_scores.append(silhouette_score(X, labels_dict_gmm[i]))
    # Keep the 3-component means; the PCA cells further down project them.
    if i == 3:
        cluster_centers_gmm = model.means_

# Plot the graph to visualize the Silhouette Score to find the optimal number of cluster
plt.plot(range(2,11),silhouette_scores)
plt.title('The Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()
    

In [None]:
# Number of segments per cluster for the labels currently stored in segments_df3['cluster'].
# NOTE(review): the GMM labels are only written to that column in the cells
# below, so this presumably still counts an earlier clustering — confirm.
segments_df3.cluster.value_counts()

### Folium maps for different number of cluster based on results from the GMM

In [None]:
# Draw one folium map for each GMM run with at most 3 components, colouring
# every segment by its cluster; runs with more components are skipped.
colors_dict = {0: 'red', 1: 'blue', 2: 'green', 3: 'orange'}
for key, labels in labels_dict_gmm.items():
    if key <= 3:
        m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
        segments_df3['cluster'] = labels
        segment_rows = zip(segments_df3['cluster'], segments_df3['lat_u'], segments_df3['long_u'],
                           segments_df3['lat_v'], segments_df3['long_v'])
        for cluster_id, lat_u, long_u, lat_v, long_v in tqdm(segment_rows):
            color = colors_dict[cluster_id]
            folium.PolyLine([[lat_u, long_u], [lat_v, long_v]], color=color).add_to(m)
        m.save(f'GMM_{key}.html')

### Box plots for every cluster based on results from the GMM

In [None]:
# Box plots of the hourly average speed, one figure per cluster of the
# 3-component GMM.
segments_df3['cluster'] = labels_dict_gmm[3]
labels = labels_dict_gmm[3]
col_list = [i for i in range(5, 22)]
box_positions = list(range(len(col_list)))
mean_dict = defaultdict(list)

sns.set(style="whitegrid")
colors = sns.color_palette("pastel")

for l in sorted(set(labels)):
    df = segments_df3.loc[segments_df3['cluster'] == l, col_list]
    plt.figure().patch.set_facecolor('white')
    ax = sns.boxplot(data=df, width=0.75, showfliers=False, palette=colors)
    sns.stripplot(data=df, jitter=False, color='blue', alpha=0.1, ax=ax)

    # Hourly means, drawn at the box positions and joined by a single line that
    # passes through every marker (the line used to be offset by half a box).
    mean_list = [df[col].mean() for col in col_list]
    ax.plot(box_positions, mean_list, color='red', linestyle='-', linewidth=2)
    ax.scatter(box_positions, mean_list, color='red', zorder=5)
    mean_dict[l] = mean_list

    # Set labels and title
    ax.set_xticks(box_positions)
    ax.set_xticklabels(col_list)
    ax.set_xlabel('Hour')
    ax.set_ylabel('Average Speed (kmph)')
    ax.set_title(f'Total Average Speed across cluster_{l+1}')

    # Show the plot
    plt.show()

In [None]:
# Plotting the mean values of each cluster across the hours.
# The curves are plotted against the actual hours (5 .. 21); without explicit
# x values the axis labelled 'Hour' showed list positions 0 .. 16.
hours = list(range(5, 22))
for l in sorted(mean_dict):
    plt.plot(hours, mean_dict[l], label=f'cluster_{l + 1}')
plt.legend()
plt.xlabel('Hour')
plt.ylabel('Average Speed (kmph)')
plt.title('Average Speed across clusters (GMM before Reduction)')
plt.show()

### Preprocessing the data visualizations

1) Comparing the GPS data before and after removing the depot pings
2) Comparing the GPS data before and after removing pings with a speed greater than 120 km/h

In [None]:
# Single folium map with the pings of the five devices that have the most
# pings in the cleaned table.
m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
busiest_devices = gps_df['DEVICE_ID'].value_counts(ascending=False).head(5).index.tolist()
for device_id in busiest_devices:
    gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == device_id, :]
    for ping_lat, ping_lon in zip(gps_df_bus1['LAT'], gps_df_bus1['LONGITUDE']):
        folium.CircleMarker([ping_lat, ping_lon], radius=0.01, color='blue', fill=True).add_to(m)
m.save('After_depot_pings.html')

In [None]:
# Read the raw, unfiltered GPS pings again for the before/after comparison.
# Same file and location as the initial load at the top of the notebook
# (../Dataset/GPS.csv); this cell used to look in ./Dataset/ instead.
gps_df10 = pd.read_csv('../Dataset/GPS.csv')

In [None]:
# Same five devices as the "after" map, but drawn from the raw pings in
# gps_df10 so the two maps can be compared.
m = folium.Map(location=[12.9716, 77.5946], zoom_start=12)
busiest_devices = gps_df['DEVICE_ID'].value_counts(ascending=False).head(5).index.tolist()
for device_id in busiest_devices:
    gps_df_bus1 = gps_df10.loc[gps_df10['DEVICE_ID'] == device_id, :]
    for ping_lat, ping_lon in zip(gps_df_bus1['LAT'], gps_df_bus1['LONGITUDE']):
        folium.CircleMarker([ping_lat, ping_lon], radius=0.01, color='blue', fill=True).add_to(m)
m.save('Before_depot_pings.html')

Distribution of the number of segments by the time interval in which their average speed is lowest

In [None]:
# For every segment, find the hour column with the lowest average speed.
col_list = [i for i in range(5, 22)]
x_labels = [f'AvgSpeed{(i,i+1)}' for i in range(5, 22)]
segments_df3['min_column'] = segments_df3[col_list].idxmin(axis=1)

# Bar chart: number of segments whose slowest hour is each time interval.
slowest_hour_counts = segments_df3['min_column'].value_counts()
plt.figure(figsize=(15, 5))
plt.bar(slowest_hour_counts.index, slowest_hour_counts.values)
plt.xlabel('Time interval')
plt.xticks(ticks=range(5, 22), labels=x_labels, rotation=45, ha='right')
plt.xlim(4,22)
plt.ylabel('Count')
plt.title('Distribution of number of segements based on minimum speed across time intervals')
plt.show()

In [None]:
# Correlation matrix of the hourly average-speed columns.
col_list = [i for i in range(5, 22)]
col_dict = [f'AvgSpeed{(i,i+1)}' for i in range(5, 22)]
# col_list.append('length')
corr = segments_df3[col_list].corr()

# Plot the heatmap for correlation matrix. The interval labels are handed to
# heatmap itself, which centres them on the cells; setting ticks at 0, 1, 2, ...
# afterwards put every label on a cell edge instead.
plt.figure(figsize=(15, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', xticklabels=col_dict, yticklabels=col_dict)

plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.title('Correlation Matrix')
plt.show()


# Dimension reduction Analysis 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# Standardise the columns of segments_df3 listed in col_list, then fit a PCA
# that keeps every component.
col_list = list(range(5, 22))

scaler = StandardScaler()
X = scaler.fit_transform(segments_df3[col_list])

pca = PCA()
X_pca = pca.fit_transform(X)

# Share of variance carried by each component, and its running total
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

# Cumulative explained variance against the number of components kept
plt.figure(figsize=(10, 6))
plt.plot(
    np.arange(1, explained_variance_ratio.size + 1),
    cumulative_variance_ratio,
    color='b',
    linestyle='-',
    marker='o',
)
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Variance Explained')
plt.title('Cumulative Variance Explained by Principal Components')
plt.grid(True)
plt.show()

In [None]:
# Scatter plots of the PCA projection (2D, then 3D), coloured by labels_dict[3],
# with the KMeans cluster centres projected through the same fitted PCA.
# NOTE(review): pca.transform assumes cluster_centers_kmeans live in the same
# standardised feature space the PCA was fitted on — confirm where KMeans was fitted.
cluster_centers_kmeans_red = pca.transform(cluster_centers_kmeans)

# First two components
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_dict[3], cmap='viridis')
# sorted(...) fixes the order of the centre colours; a bare set has no guaranteed order
plt.scatter(cluster_centers_kmeans_red[:, 0], cluster_centers_kmeans_red[:, 1],
            c=sorted(set(labels_dict[3])), cmap='coolwarm', s=100)
plt.title('PCA Scatter Plot (Without Reduction Clustering using Kmeans)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# First three components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# The camera must be set on the axes created for this figure. Calling view_init
# before add_subplot raises NameError on a fresh kernel, or silently rotates an
# axes object left over from an earlier cell.
ax.view_init(elev=20, azim=30)

ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=labels_dict[3], cmap='viridis')
ax.scatter(cluster_centers_kmeans_red[:, 0], cluster_centers_kmeans_red[:, 1], cluster_centers_kmeans_red[:, 2],
           c=sorted(set(labels_dict[3])), cmap='coolwarm', s=100)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('PCA Scatter Plot (Without Reduction Clustering using Kmeans)')
plt.show()

In [None]:
# Scatter plots of the PCA projection (2D, then 3D), coloured by labels_dict_gmm[3],
# with the GMM cluster centres projected through the same fitted PCA.
# NOTE(review): pca.transform assumes cluster_centers_gmm live in the same
# standardised feature space the PCA was fitted on — confirm where the GMM was fitted.
cluster_centers_gmm_red = pca.transform(cluster_centers_gmm)

# First two components
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_dict_gmm[3], cmap='viridis')
# sorted(...) fixes the order of the centre colours; a bare set has no guaranteed order
plt.scatter(cluster_centers_gmm_red[:, 0], cluster_centers_gmm_red[:, 1],
            c=sorted(set(labels_dict_gmm[3])), cmap='coolwarm', s=100)
plt.title('PCA Scatter Plot (Without Reduction Clustering using GMM)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# First three components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# The camera must be set on the axes created for this figure. Calling view_init
# before add_subplot raises NameError on a fresh kernel, or silently rotates an
# axes object left over from an earlier cell.
ax.view_init(elev=20, azim=30)

ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=labels_dict_gmm[3], cmap='viridis')
ax.scatter(cluster_centers_gmm_red[:, 0], cluster_centers_gmm_red[:, 1], cluster_centers_gmm_red[:, 2],
           c=sorted(set(labels_dict_gmm[3])), cmap='coolwarm', s=100)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('PCA Scatter Plot (Without Reduction Clustering using GMM)')
plt.show()

In [None]:
# Non-linear dimension reduction using MDS

from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Non-metric MDS (metric=False) down to 3 dimensions, fitted on the pairwise
# Euclidean distance matrix of X.
col_list = list(range(5, 22))
mds = MDS(
    n_components=3,
    metric=False,
    dissimilarity='precomputed',
    random_state=0,
    max_iter=3000,
    eps=1e-4,
    normalized_stress=True,
)
X_mds = mds.fit_transform(euclidean_distances(X))


In [None]:
# Scatter plots of the MDS embedding (2D, then 3D), coloured by labels_dict[3].
# Cluster centres are not drawn: scikit-learn's MDS provides fit / fit_transform
# only, so new points cannot be projected into an already fitted embedding.

# First two MDS dimensions
plt.figure(figsize=(10, 6))
plt.scatter(X_mds[:, 0], X_mds[:, 1], c=labels_dict[3], cmap='viridis')
# Titles and axis labels name MDS; they previously said "PCA" / "Principal Component"
plt.title('MDS Scatter Plot (Without Reduction Clustering using Kmeans)')
plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.show()

# First three MDS dimensions
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# The camera must be set on the axes created for this figure. Calling view_init
# before add_subplot raises NameError on a fresh kernel, or silently rotates an
# axes object left over from an earlier cell.
ax.view_init(elev=20, azim=30)

ax.scatter(X_mds[:, 0], X_mds[:, 1], X_mds[:, 2], c=labels_dict[3], cmap='viridis')
ax.set_xlabel('MDS Dimension 1')
ax.set_ylabel('MDS Dimension 2')
ax.set_zlabel('MDS Dimension 3')
ax.set_title('MDS Scatter Plot (Without Reduction Clustering using Kmeans)')
plt.show()

In [None]:
# Scatter plots of the MDS embedding (2D, then 3D), coloured by labels_dict_gmm[3].
# Cluster centres are not drawn: scikit-learn's MDS provides fit / fit_transform
# only, so new points cannot be projected into an already fitted embedding.

# First two MDS dimensions
plt.figure(figsize=(10, 6))
plt.scatter(X_mds[:, 0], X_mds[:, 1], c=labels_dict_gmm[3], cmap='viridis')
# Titles and axis labels name MDS; they previously said "PCA" / "Principal Component"
plt.title('MDS Scatter Plot (Without Reduction Clustering using GMM)')
plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.show()

# First three MDS dimensions
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# The camera must be set on the axes created for this figure. Calling view_init
# before add_subplot raises NameError on a fresh kernel, or silently rotates an
# axes object left over from an earlier cell.
ax.view_init(elev=20, azim=30)

ax.scatter(X_mds[:, 0], X_mds[:, 1], X_mds[:, 2], c=labels_dict_gmm[3], cmap='viridis')
ax.set_xlabel('MDS Dimension 1')
ax.set_ylabel('MDS Dimension 2')
ax.set_zlabel('MDS Dimension 3')
ax.set_title('MDS Scatter Plot (Without Reduction Clustering using GMM)')
plt.show()

In [None]:
# Non-linear dimension reduction using Kernel PCA
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances

# Standardise the columns of segments_df3 listed in col_list and embed them in
# 3 dimensions with an RBF-kernel PCA.
col_list = [i for i in range(5, 22)]
scaler = StandardScaler()
X = scaler.fit_transform(segments_df3[col_list])

# random_state pins the result when KernelPCA selects the 'arpack' or 'randomized'
# eigen-solver, so reruns (and the clustering fitted on X_kpca afterwards) are
# reproducible, in line with the seeded MDS and KMeans steps of this notebook.
kpca = KernelPCA(n_components=3, kernel='rbf', random_state=0)
X_kpca = kpca.fit_transform(X)

In [None]:
# Scatter plots of the Kernel PCA embedding (2D, then 3D), coloured by labels_dict[3],
# with the KMeans cluster centres mapped through the same fitted Kernel PCA.
# NOTE(review): kpca.transform assumes cluster_centers_kmeans live in the same
# standardised feature space the Kernel PCA was fitted on — confirm.
cluster_centers_kmeans_red = kpca.transform(cluster_centers_kmeans)

# First two components
plt.figure(figsize=(10, 6))
plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=labels_dict[3], cmap='viridis')
# sorted(...) fixes the order of the centre colours; a bare set has no guaranteed order
plt.scatter(cluster_centers_kmeans_red[:, 0], cluster_centers_kmeans_red[:, 1],
            c=sorted(set(labels_dict[3])), cmap='coolwarm', s=100)
# Title names Kernel PCA so the figure is distinguishable from the linear PCA plot
plt.title('Kernel PCA Scatter Plot (Without Reduction Clustering using Kmeans)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# First three components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# The camera must be set on the axes created for this figure. Calling view_init
# before add_subplot raises NameError on a fresh kernel, or silently rotates an
# axes object left over from an earlier cell.
ax.view_init(elev=20, azim=30)

ax.scatter(X_kpca[:, 0], X_kpca[:, 1], X_kpca[:, 2], c=labels_dict[3], cmap='viridis')
ax.scatter(cluster_centers_kmeans_red[:, 0], cluster_centers_kmeans_red[:, 1], cluster_centers_kmeans_red[:, 2],
           c=sorted(set(labels_dict[3])), cmap='coolwarm', s=100)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('Kernel PCA Scatter Plot (Without Reduction Clustering using Kmeans)')
plt.show()

In [None]:
# Kmeans clustering after dimension reduction
from sklearn.cluster import KMeans

# Fit a 3-cluster KMeans on the Kernel PCA embedding; keep the labels under the
# '3_reduced' key and the centres (in the embedded space) for plotting.
col_list = list(range(5, 22))
kmeans = KMeans(random_state=0, n_clusters=3)
kmeans.fit(X_kpca)
labels_dict['3_reduced'] = kmeans.labels_
cluster_centers_kmeans_red = kmeans.cluster_centers_

In [None]:
# Scatter plots of the Kernel PCA embedding (2D, then 3D), coloured by the labels
# of the KMeans fitted on that embedding (labels_dict['3_reduced']), together with
# that model's cluster centres, which are already expressed in the embedded space.

# First two components
plt.figure(figsize=(10, 6))
plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=labels_dict['3_reduced'], cmap='viridis')
# sorted(...) fixes the order of the centre colours; a bare set has no guaranteed order
plt.scatter(cluster_centers_kmeans_red[:, 0], cluster_centers_kmeans_red[:, 1],
            c=sorted(set(labels_dict['3_reduced'])), cmap='coolwarm', s=100)
# These labels come from clustering AFTER the reduction, so the title says "With Reduction"
plt.title('Kernel PCA Scatter Plot (With Reduction Clustering using Kmeans)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# First three components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# The camera must be set on the axes created for this figure. Calling view_init
# before add_subplot raises NameError on a fresh kernel, or silently rotates an
# axes object left over from an earlier cell.
ax.view_init(elev=20, azim=30)

ax.scatter(X_kpca[:, 0], X_kpca[:, 1], X_kpca[:, 2], c=labels_dict['3_reduced'], cmap='viridis')
ax.scatter(cluster_centers_kmeans_red[:, 0], cluster_centers_kmeans_red[:, 1], cluster_centers_kmeans_red[:, 2],
           c=sorted(set(labels_dict['3_reduced'])), cmap='coolwarm', s=100)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('Kernel PCA Scatter Plot (With Reduction Clustering using Kmeans)')
plt.show()

#### Check Point 3

In [None]:
# Check Point 3: read segments_df4 from './Dataset/segments_df4.csv'.
segments_df4 = pd.read_csv(r"./Dataset/segments_df4.csv")

In [None]:
# Convert the column labels '5'..'21' of segments_df4 from strings to ints
# (read_csv returns header names as strings).
col_list = [str(i) for i in range(5, 22)]
col_dict = {str(i): i for i in range(5, 22)}
# Apply the mapping to segments_df4 itself. The earlier membership test inspected
# segments_df3.columns, so the rename was skipped whenever segments_df3 already had
# integer labels and raised NameError when segments_df3 was not defined.
# rename() ignores mapping keys that are not present, so no explicit check is needed.
segments_df4 = segments_df4.rename(columns=col_dict)

In [None]:
# Box plot (fliers hidden) and strip plot of every column in col_list, with each
# column's mean overlaid as a red dot and a red line linking successive means.
# Also fills mean_list / var_list, which the following line-plot cell reads.
# NOTE(review): this cell reads segments_df3 although Check Point 3 loads
# segments_df4 — confirm which frame is intended here.
col_list = [i for i in range(5, 22)]
# NOTE(review): var_col_list is not referenced again in this cell.
var_col_list = [f'v{i}' for i in range(5, 22)]
ax = sns.boxplot(data=segments_df3[col_list], width=0.75, showfliers=False)
sns.stripplot(data=segments_df3[col_list], jitter=False, color='blue', alpha=0.1)

mean_list = []
var_list = []
# Plot means and connect them
for i, col in enumerate(segments_df3[col_list].columns):
    mean_val = segments_df3[col].mean()
    # Mean of the matching 'v<col>' column (presumably a per-segment variance
    # for that hour — TODO confirm against where these columns are created).
    var_val = segments_df3[f'v{col}'].mean()
    mean_list.append(mean_val)
    var_list.append(var_val)
    # Red dot at x = i, the position of this column's box
    ax.scatter(i, mean_val, color='red', zorder=5)
    # Segment from the previous mean to this one. prev_mean is assigned at the end
    # of each iteration, so no segment is drawn for i == 0.
    # NOTE(review): interior segments span x = i-0.5 .. i+0.5 while the dots sit at
    # x = i, so the line is shifted half a slot from the markers — confirm intended.
    if i > 1 and i < len(col_list) - 1:
        ax.plot([i - 0.5, i + 0.5], [prev_mean, mean_val], color='red', linestyle='-', linewidth=2)
    elif i == 1:
        # First segment starts exactly at the first box (x = 0)
        ax.plot([i - 1, i + 0.5], [prev_mean, mean_val], color='red', linestyle='-', linewidth=2)
    elif i == len(col_list) - 1:
        # Last segment ends exactly at the last box
        ax.plot([i - 0.5, i], [prev_mean, mean_val], color='red', linestyle='-', linewidth=2)
    prev_mean = mean_val

# Set labels and title
ax.set_xticklabels(col_list)
ax.set_xlabel('Hour')
ax.set_ylabel('Average Speed (kmph)')
ax.set_title('Total Average Speed')

# Show the plot
plt.show()

In [None]:
# Line plot of mean_list (blue) and var_list (red) against col_list on one axis
plt.figure(figsize=(15, 5))
plt.plot(col_list, mean_list, color='b', linestyle='-', marker='o', label='Mean')
plt.plot(col_list, var_list, color='r', linestyle='-', marker='o', label='Variance')
plt.title('Total Average Speed')
plt.xlabel('Hour')
plt.ylabel('Average Speed (kmph)')
plt.legend()
plt.show()


In [None]:
# Number of rows of segments_df3 per value of 'oneway'
sns.countplot(x='oneway', data=segments_df3)
plt.show()

# Distribution of 'length', restricted to rows with length < 1000.
# .loc selects the filtered column in one step instead of chained indexing.
short_lengths = segments_df3.loc[segments_df3['length'] < 1000, 'length']
# histplot replaces the deprecated distplot; stat='density' keeps the histogram
# normalised so the y-axis still matches the 'Density' label.
sns.histplot(short_lengths, kde=True, bins=100, color='orange', stat='density')
plt.xlabel('length')
plt.ylabel('Density')
plt.title('Distribution of segment lengths')
plt.show()