# INTRO

This kernel is forked from [BigQuery Machine Learning Tutorial](https://www.kaggle.com/rtatman/bigquery-machine-learning-tutorial).

Let's use BigQuery ML to cluster each city's intersections based on their location (latitude and longitude).

In [None]:
from google.cloud import bigquery
from google.cloud.bigquery import magics
from kaggle.gcp import KaggleKernelCredentials

import matplotlib.pyplot as plt
import seaborn as sns

# Google Cloud project used for the BigQuery client and the %%bigquery magics.
# Replace this value with YOUR OWN project id
# (alternative value previously noted here: 'kaggle-competitions-project').
PROJECT_ID = 'kaggle-bq-geotag'

# BigQuery client bound to the project; the `bqml_example` dataset is created
# if it does not exist yet (exists_ok=True makes re-running the cell safe).
client = bigquery.Client(project=PROJECT_ID, location="US")
dataset = client.create_dataset('bqml_example', exists_ok=True)

# Point the BigQuery cell magics at the same project, using Kaggle credentials.
magics.context.credentials = KaggleKernelCredentials()
magics.context.project = PROJECT_ID

# Reference to the competition's training table ...
table = client.get_table("kaggle-competition-datasets.geotab_intersection_congestion.train")

# ... and a preview of its first five rows as a DataFrame.
client.list_rows(table, max_results=5).to_dataframe()

In [None]:
# Load the BigQuery IPython extension so the `%%bigquery` cells in this notebook can run.
%load_ext google.cloud.bigquery

In [None]:
%%bigquery city_df
# Distinct city names appearing in either the test or the train table;
# the result is stored in the DataFrame `city_df`.
SELECT test_rows.city
  FROM `kaggle-competition-datasets.geotab_intersection_congestion.test` AS test_rows
UNION DISTINCT
SELECT train_rows.city
  FROM `kaggle-competition-datasets.geotab_intersection_congestion.train` AS train_rows

In [None]:
# Plain Python list of the city names returned by the query above.
cities = city_df['city'].tolist()
cities

# Cluster Intersections per City

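Set `model_changed` to `True` to create one k-means model per city. Models that already exist are left untouched, because the statement uses `CREATE MODEL IF NOT EXISTS`.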

In [None]:
# Switch for the (slow) model-creation step below; set to False to skip it.
model_changed = True

if model_changed:
    for c in cities:
        # One k-means model (10 clusters) per city, trained on the latitude /
        # longitude of that city's rows from the train and test tables combined.
        # IF NOT EXISTS: a model that already exists is kept, not retrained.
        # NOTE(review): models are named after the raw city value, while the EDA
        # query refers to `model_cluster_boston` (lower case) -- confirm the
        # names actually match.
        sql="""CREATE MODEL IF NOT EXISTS `bqml_example.model_cluster_"""+c+"""`
        OPTIONS(model_type='kmeans',
                NUM_CLUSTERS = 10) AS
        SELECT
            latitude,
            longitude
        FROM
          `kaggle-competition-datasets.geotab_intersection_congestion.train` t
        WHERE city = '"""+c+"""'
        UNION ALL
        SELECT
            latitude,
            longitude
        FROM
          `kaggle-competition-datasets.geotab_intersection_congestion.test` t
        WHERE city = '"""+c+"""'"""

        # client.query() only submits the job. result() blocks until it has
        # finished and raises if it failed, so "Done" is printed only once the
        # model really exists and later cells can rely on it.
        client.query(sql).result()

        print('Done with',c)

# Basic EDA using the clusters

The query below uses Boston as an example.

In [None]:
%%bigquery eda_df
# Boston example: run every train and test row through the Boston k-means
# model, then summarise the distance to the closest centroid per
# (source, city, cluster). The result is stored in the DataFrame `eda_df`.
WITH scored AS (
    SELECT
        # distance to the closest centroid = smallest DISTANCE in the row's
        # NEAREST_CENTROIDS_DISTANCE array
        (SELECT MIN(nc.DISTANCE)
           FROM UNNEST(NEAREST_CENTROIDS_DISTANCE) AS nc) AS dist_to_cluster_center,
        # cluster label of the form "<city>_<centroid id>"
        CONCAT(p.city, "_", CAST(p.CENTROID_ID AS STRING)) AS city_cluster,
        p.* EXCEPT (nearest_centroids_distance, CENTROID_ID)
    FROM ML.PREDICT(
        MODEL `bqml_example.model_cluster_boston`,
        (
            # Train rows. Street names, Path, the p40/p60 percentiles and all
            # TimeFromFirstStop_* columns are left out of the selection.
            SELECT
                src.RowId,
                src.city,
                src.IntersectionId,
                src.Latitude,
                src.Longitude,
                src.EntryHeading,
                src.ExitHeading,
                src.Hour,
                src.Weekend,
                src.Month,
                src.TotalTimeStopped_p20,
                src.TotalTimeStopped_p50,
                src.TotalTimeStopped_p80,
                src.DistanceToFirstStop_p20,
                src.DistanceToFirstStop_p50,
                src.DistanceToFirstStop_p80,
                'TRAIN' AS source
            FROM `kaggle-competition-datasets.geotab_intersection_congestion.train` AS src
            WHERE src.city = 'Boston'

            UNION ALL

            # Test rows: same column list, with the TotalTimeStopped_* and
            # DistanceToFirstStop_* columns filled with NULL so both branches line up.
            SELECT
                src.RowId,
                src.city,
                src.IntersectionId,
                src.Latitude,
                src.Longitude,
                src.EntryHeading,
                src.ExitHeading,
                src.Hour,
                src.Weekend,
                src.Month,
                NULL AS TotalTimeStopped_p20,
                NULL AS TotalTimeStopped_p50,
                NULL AS TotalTimeStopped_p80,
                NULL AS DistanceToFirstStop_p20,
                NULL AS DistanceToFirstStop_p50,
                NULL AS DistanceToFirstStop_p80,
                'TEST' AS source
            FROM `kaggle-competition-datasets.geotab_intersection_congestion.test` AS src
            WHERE src.city = 'Boston'
        )
    ) AS p
)
SELECT
    s.source,
    s.city,
    s.city_cluster,
    COUNT(*) AS cnt,
    AVG(s.dist_to_cluster_center) AS avg_dist_to_cluster_center,
    STDDEV(s.dist_to_cluster_center) AS stddev_dist_to_cluster_center,
    MIN(s.dist_to_cluster_center) AS min_dist_to_cluster_center,
    MAX(s.dist_to_cluster_center) AS max_dist_to_cluster_center,
    # window over the grouped rows of each city:
    # mean of (group average distance * (group row count - 1))
    AVG(AVG(s.dist_to_cluster_center) * (COUNT(*) - 1))
        OVER (PARTITION BY s.city) AS avg_dist_to_cluster_center_over_city
FROM scored AS s
GROUP BY
    s.source,
    s.city,
    s.city_cluster;

In [None]:
eda_df.sort_values(by=['source','city_cluster']).head(100)

In [None]:
# Number of rows per cluster, split by source (TRAIN / TEST).
# swarmplot returns the Axes it drew on, which lets us title the figure so it
# can be understood on its own when the notebook is skimmed.
ax = sns.swarmplot(data=eda_df, x='city_cluster', y='cnt', hue='source')
ax.set_title('Row count per cluster (Boston), by source')

# Cluster labels are long ("<city>_<id>"), so rotate them; the trailing ';'
# keeps the tick-object repr out of the cell output.
plt.xticks(rotation=45);

In [None]:
# Average distance to the closest centroid per cluster, split by source
# (TRAIN / TEST). swarmplot returns the Axes it drew on, which lets us title
# the figure so it can be understood on its own when the notebook is skimmed.
ax = sns.swarmplot(data=eda_df, x='city_cluster', y='avg_dist_to_cluster_center', hue='source')
ax.set_title('Average distance to cluster center (Boston), by source')

# Cluster labels are long ("<city>_<id>"), so rotate them; the trailing ';'
# keeps the tick-object repr out of the cell output.
plt.xticks(rotation=45);