In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import folium 
from folium.plugins import HeatMap
from datetime import datetime
from folium.plugins import HeatMap
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics import silhouette_score
from datetime import datetime as dt

### Functions from code

In [1]:
def coord_location_array(df, lat_col, long_col):
    location_lst = df[[lat_col, long_col]]
    array = location_lst.to_numpy()
    return array

def kmeans_model(array, n_clust=8):
    kmeans = KMeans(n_clusters=n_clust)
    kmeans.fit(array)
    return kmeans

def calc_silhouette_score(nclust, array):
    kmeans = KMeans(n_clusters = nclust, 
                init = 'random', 
                n_init = 10, 
                max_iter = 100, 
                n_jobs = -1)
    kmeans.fit(array)
    sil_avg = silhouette_score(array, kmeans.labels_)
    return sil_avg

def silhouette_plot(ax, sil_score_lst, color, label, title):
    ax.plot(range(2,10), sil_score_lst, 'o--', c = color, label=label)
    ax.set_xlabel('K', fontsize=18)
    ax.set_ylabel('Silhouette Score', fontsize=18)
    ax.set_title(title, fontsize=20)
    ax.legend()

def kmeans_scatter_plot(ax, cluster_array, color, size, label, title):
    ax.scatter(cluster_array[:,0], cluster_array[:,1], s=size, 
                color=color, label=label)
    ax.set_xlabel('Latitude', fontsize=16)
    ax.set_ylabel('Longtitude', fontsize=16)
    ax.set_title(title, fontsize=20)
    ax.legend()

def cluster_center_df(cluster_array, col_lst):
    df = pd.DataFrame(cluster_array, columns=col_lst)
    name = ['cluster' + str(idx+1) for idx in range(0,len(df))]
    df['name'] = name
    return df

def hierarchical_cluster_model(dist_metric, link_method, array):
    distmetric = dist_metric
    linkmethod = link_method
    dist = pdist(array, metric=distmetric)
    clust = linkage(dist, method=linkmethod)
    return clust

def dendrogram_plot(cluster, ax, level_num, trunc_mode, color_thresh, title):
    dendrogram(cluster, ax, p=level_num, truncate_mode=trunc_mode, color_threshold=color_thresh)
    plt.xticks(fontsize-12)
    plt.title(title, fontsize=24)
    
def load_data(filepath):
        df = pd.read_csv(filepath)
        return df

def drop_cols_update_names(df, col_lst):
    df.drop(columns=col_lst, inplace=True)
    df.columns = df.columns.str.replace(' ', '_')
    return df

def drop_nans(df):
    return df.dropna(inplace=True)

def cols_to_datetime(df, col_lst):
    for col in col_lst:
        df[col] = pd.to_datetime(df[col])
    return df

def add_weekday_column(df, new_col, date_col):
    df[new_col] = df[date_col].dt.weekday
    return df

def add_hour_column(df, new_col, date_col):
    df[new_col] = df[date_col].dt.hour
    return df

def change_col_units(df, col, divisor):
    df[col] = df[col].apply(lambda x: x/divisor)
    return df

def filter_out_bad_data(df, col_lst, threshold_lst):
    for col, thresh in zip(col_lst, threshold_lst):
        if thresh < 20:
            df = df[df[col] > thresh]
        else:
            df = df[df[col] < thresh]
    df.reset_index(drop=True)
    return df

def clean_dataframe(df, drop_col_lst, date_col_lst, day_col, hour_col, date_col, 
                        unit_col_1, unit_col_2, div_1, div_2, filter_col_lst, thresh_lst):

    df = drop_cols_update_names(df, drop_col_lst)
    drop_nans(df)
    df = cols_to_datetime(df, date_col_lst)
    df = add_weekday_column(df, day_col, date_col)
    df = add_hour_column(df, hour_col, date_col) 
    df = change_col_units(df, unit_col_1, div_1)
    df = change_col_units(df, unit_col_2, div_2)
    df = filter_out_bad_data(df, filter_col_lst, thresh_lst)
    return df.reset_index(drop=True)

def data_snapshot(df, col, low_end, high_end):
    df = df[df[col] > low_end]
    df = df[df[col] <= high_end]
    return df.reset_index(drop=True)

def save_dataframe_to_csv(df, filepath):
    return df.to_csv(filepath)

def histogram_of_column(df, col, ax, color, title, x_label, x_tick_loc, y_tick_loc, x_low, x_high):
    df[col].hist(color=color, grid=False, bins=30)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(x_label, fontsize=16)
    ax.set_ylabel('Frequency', fontsize=16)
    ax.tick_params(axis='x', which='minor', length=7, width=1)
    ax.tick_params(axis='x', which='major', length=10, width=1, labelsize='medium')
    ax.tick_params(axis='y', which='minor', length=5, width=1)
    ax.tick_params(axis='y', which='major', length=7, width=1, labelsize='medium')
    ax.xaxis.set_minor_locator(MultipleLocator(x_tick_loc))
    ax.yaxis.set_minor_locator(MultipleLocator(y_tick_loc))   
    ax.set_xlim(x_low, x_high)

def bar_chart(df, col, ax, color, label_lst, title, x_label):
    series = df[col].value_counts().sort_index()

    series.plot(kind='bar', color=color)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(x_label, fontsize=16)
    ax.set_ylabel('Frequency', fontsize=16)
    if label_lst == None:
        plt.xticks(rotation=45, fontsize=12)
    else:
        ax.set_xticklabels(labels=label_lst, rotation=45, fontsize=12)

def location_heat_map(df, lat_col, long_col, map_name):
    heat_map = folium.FeatureGroup(name = 'heat_map')
    heat_map.add_child( HeatMap( list(zip(df[lat_col].values,
                                        df[long_col].values)),
                                  name=map_name, max_val=float(60),
                                  min_opacity=0.2, radius=5.5, 
                                  blur=3.5, max_zoom=1))
    return heat_map

def image_file(filepath):
    return plt.savefig(filepath, transparent=False, bbox_inches='tight', format='svg', dpi=1200)

def map_html_file(map_name, filepath):
    return map_name.save(filepath)

### Cleaned Full Dataframe

In [3]:
clean = pd.read_csv('../data/full_clean_scooter.csv')
clean.drop(columns=['Unnamed: 0', 'Trip_ID', 'Start_Time', 'End_Time', 'Accuracy', 'End_Centroid_Latitude',
            'End_Centroid_Longitude'], inplace=True)
clean.head()

Unnamed: 0,Trip_Distance,Trip_Duration,Start_Centroid_Latitude,Start_Centroid_Longitude,Day_of_Week,Time_of_Day
0,0.569919,5.983333,41.894101,-87.763112,0,19
1,0.069608,7.066667,41.938666,-87.711211,5,15
2,1.072094,19.016667,41.938666,-87.711211,5,18
3,0.688005,41.983333,41.938666,-87.711211,6,20
4,1.850218,9.533333,41.927261,-87.765502,4,15


In [4]:
hours={0:'Midnight', 1:'1am', 2:'2am',3:'3am', 4:'4am',5:'5am', 6:'6am', 7:'7am', 8:'8am', 9:'9am', 
       10:'10am', 11:'11am', 12:'12pm',13:'1pm', 14:'2pm', 15:'3pm', 16:'4pm', 17:'5pm', 18:'6pm', 
       19:'7pm', 20:'8pm', 21:'9pm', 22:'10pm', 23:'11pm'}
weekdays={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday',4:'Friday', 5:'Saturday', 6:'Sunday'}

In [5]:
clean.replace({"Day_of_Week": weekdays, 'Time_of_Day': hours}, inplace=True)
clean.head()

Unnamed: 0,Trip_Distance,Trip_Duration,Start_Centroid_Latitude,Start_Centroid_Longitude,Day_of_Week,Time_of_Day
0,0.569919,5.983333,41.894101,-87.763112,Monday,7pm
1,0.069608,7.066667,41.938666,-87.711211,Saturday,3pm
2,1.072094,19.016667,41.938666,-87.711211,Saturday,6pm
3,0.688005,41.983333,41.938666,-87.711211,Sunday,8pm
4,1.850218,9.533333,41.927261,-87.765502,Friday,3pm


### use gower distance to be able to do kmeans with categorical + numerical data

In [6]:
import gower

In [7]:
gower.gower_matrix(clean)

KeyboardInterrupt: 