# Intro

This kernel relies on BigQuery (BQ) as much as possible: all features are generated in BQ, and the prediction model is also built in BQ.

## Summary
Features:
- city, hour, weekend, month
- 10 clusters of Intersections per city
- Distance to the nearest cluster
- 8 directions of turn (left, right, centered, uturn, centered-left, ...)
- Direction, entry heading and exit heading are used directly as features and are also translated into degrees (centered = 90deg; North = 90deg). Each degree feature is then split into two features, sin(feature_x_deg) and cos(feature_x_deg).

Model:
- Linear Regression


## Next TODOs
- reasonable Train-Valid-Split strategy

## Credits
Some of the ideas are inspired by the following kernels. Please visit and give them upvotes if you like them.
- This kernel is forked from [BigQuery Machine Learning Tutorial](https://www.kaggle.com/rtatman/bigquery-machine-learning-tutorial).
- The direction features are like the Flow feature in https://www.kaggle.com/jpmiller/intersection-level-eda

In [None]:
# Google Cloud project that runs the BigQuery jobs and owns the `bqml_example` dataset.
# Replace the value below with YOUR OWN project id before running.
PROJECT_ID = 'kaggle-bq-geotag' #
#PROJECT_ID='kaggle-competitions-project'

from google.cloud import bigquery
# Client used for every query this notebook issues from Python.
client = bigquery.Client(project=PROJECT_ID, location="US")
# Dataset that holds the models and tables created below;
# exists_ok=True means an already existing dataset is not an error.
dataset = client.create_dataset('bqml_example', exists_ok=True)

from google.cloud.bigquery import magics
from kaggle.gcp import KaggleKernelCredentials
# Point the %%bigquery cell magic at the Kaggle credentials and the same project.
magics.context.credentials = KaggleKernelCredentials()
magics.context.project = PROJECT_ID

import seaborn as sns
import matplotlib.pyplot as plt

# create a reference to our table
table = client.get_table("kaggle-competition-datasets.geotab_intersection_congestion.train")

# look at five rows from our dataset
client.list_rows(table, max_results=5).to_dataframe()

In [None]:
%load_ext google.cloud.bigquery

In [None]:
# Target columns; one regression model is trained per entry.
# The order matters: the position of a name becomes the "_<i>" suffix of the
# TargetId written to the submission file.
mod_names = [
    'TotalTimeStopped_p20',
    'TotalTimeStopped_p50',
    'TotalTimeStopped_p80',
    'DistanceToFirstStop_p20',
    'DistanceToFirstStop_p50',
    'DistanceToFirstStop_p80',
]

# Cluster Intersections per City

In [None]:
%%bigquery city_df
# Distinct city names that occur in the test table or the train table.
SELECT t.city
  FROM `kaggle-competition-datasets.geotab_intersection_congestion.test` t
UNION DISTINCT
SELECT t.city
  FROM `kaggle-competition-datasets.geotab_intersection_congestion.train` t


In [None]:
# Plain Python list of the city names returned by the query above;
# the loops below build one clustering model and one feature query per entry.
cities = list(city_df['city'])

In [None]:
# Set to True to drop the per-city k-means models so that they are retrained below.
model_changed = False

if model_changed:
    for c in cities:
        # IF EXISTS: a model that was never created must not abort the reset.
        sql="""DROP MODEL IF EXISTS `bqml_example.model_cluster_"""+c+"""`"""
        # client.query() only submits the job; .result() waits for it and raises on
        # failure, so the drop is finished before the model is recreated.
        client.query(sql).result()

        print('Dropped',c)


# Submit one k-means training job per city (intersection coordinates from train
# and test together). The jobs are submitted first and awaited afterwards so
# that the cities still train concurrently.
cluster_jobs = []
for c in cities:
    sql="""CREATE MODEL IF NOT EXISTS `bqml_example.model_cluster_"""+c+"""`
    OPTIONS(model_type='kmeans',
            NUM_CLUSTERS = 10) AS
    SELECT
        latitude,
        longitude
    FROM
      `kaggle-competition-datasets.geotab_intersection_congestion.train` t
    WHERE city = '"""+c+"""'
    UNION ALL
    SELECT
        latitude,
        longitude
    FROM
      `kaggle-competition-datasets.geotab_intersection_congestion.test` t
    WHERE city = '"""+c+"""'"""

    cluster_jobs.append((c, client.query(sql)))

for c, job in cluster_jobs:
    # Block until training has really finished (and surface any job error)
    # before reporting success; later cells run ML.PREDICT on these models.
    job.result()

    print('Done with',c)

## Check out the clusters
Using Boston as an example

In [None]:
%%bigquery eda_df
# Per-cluster statistics for Boston, computed separately for TRAIN and TEST rows.
# city_cluster assigns every row to its k-means centroid and keeps the distance
# to the closest centroid; the outer query aggregates per (source, city, cluster).
# NOTE(review): the model creation loop names models `model_cluster_<city>` using
# the city value verbatim (the filter below uses 'Boston'), while this query
# references `model_cluster_boston` - confirm the lowercase name resolves.
WITH city_cluster AS (
    SELECT (SELECT MIN(d.DISTANCE) FROM UNNEST(NEAREST_CENTROIDS_DISTANCE) d) AS dist_to_cluster_center, 
           CONCAT(m.city,"_",CAST(m.CENTROID_ID AS STRING)) AS city_cluster,
           m.* EXCEPT (nearest_centroids_distance, CENTROID_ID) 
      FROM ML.PREDICT(MODEL `bqml_example.model_cluster_boston`, 
                   (SELECT t.RowId,
                   t.city,
                           t.IntersectionId,
                           t.Latitude,
                           t.Longitude,
                           #t.EntryStreetName,
                           #t.ExitStreetName,
                           t.EntryHeading,
                           t.ExitHeading,
                           t.Hour,
                           t.Weekend,
                           t.Month,
                           #t.Path,
                           t.TotalTimeStopped_p20,
                           #t.TotalTimeStopped_p40,
                           t.TotalTimeStopped_p50,
                           #t.TotalTimeStopped_p60,
                           t.TotalTimeStopped_p80,
                           #t.TimeFromFirstStop_p20,
                           #t.TimeFromFirstStop_p40,
                           #t.TimeFromFirstStop_p50,
                           #t.TimeFromFirstStop_p60,
                           #t.TimeFromFirstStop_p80,
                           t.DistanceToFirstStop_p20,
                           #t.DistanceToFirstStop_p40,
                           t.DistanceToFirstStop_p50,
                           #t.DistanceToFirstStop_p60,
                           t.DistanceToFirstStop_p80,
                           'TRAIN' AS source
                     FROM `kaggle-competition-datasets.geotab_intersection_congestion.train` t
                    WHERE city = 'Boston' 
                    #  AND rowid in(2209678,2209692)
                    UNION ALL
                    # Test rows carry no targets; NULL placeholders keep both
                    # branches of the UNION ALL column-compatible.
                    SELECT t.RowId,
                    t.city,
                           t.IntersectionId,
                           t.Latitude,
                           t.Longitude,
                           #t.EntryStreetName,
                           #t.ExitStreetName,
                           t.EntryHeading,
                           t.ExitHeading,
                           t.Hour,
                           t.Weekend,
                           t.Month,
                           #t.Path,
                           null as TotalTimeStopped_p20,
                           #null as TotalTimeStopped_p40,
                           null as TotalTimeStopped_p50,
                           #null as TotalTimeStopped_p60,
                           null as TotalTimeStopped_p80,
                           #null as TimeFromFirstStop_p20,
                           #null as TimeFromFirstStop_p40,
                           #null as TimeFromFirstStop_p50,
                           #null as TimeFromFirstStop_p60,
                           #null as TimeFromFirstStop_p80,
                           null as DistanceToFirstStop_p20,
                           #null as DistanceToFirstStop_p40,
                           null as DistanceToFirstStop_p50,
                           #null as DistanceToFirstStop_p60,
                           null as DistanceToFirstStop_p80,
                           'TEST' AS source
                     FROM `kaggle-competition-datasets.geotab_intersection_congestion.test` t
                    WHERE city = 'Boston' 
                    #  AND rowid in(2209678,2209692)
                    )) m
)
SELECT cc.source,
       cc.city,
       cc.city_cluster, 
       count(1) cnt, 
       avg(cc.dist_to_cluster_center) avg_dist_to_cluster_center, 
       stddev(cc.dist_to_cluster_center) stddev_dist_to_cluster_center, 
       min(cc.dist_to_cluster_center) min_dist_to_cluster_center, 
       max(cc.dist_to_cluster_center) max_dist_to_cluster_center,
       # Window over the grouped rows: mean, across all (source, cluster) groups
       # of the city, of (group average distance * (group row count - 1)).
       avg(avg(cc.dist_to_cluster_center)*(count(1)-1)) over(partition by cc.city) avg_dist_to_cluster_center_over_city
  FROM city_cluster cc
 GROUP BY cc.source, 
          cc.city, 
          cc.city_cluster;

In [None]:
# Show up to 100 rows of the cluster statistics, ordered by source and cluster.
eda_df.sort_values(by=['source','city_cluster']).head(100)

In [None]:
# Row count per cluster, coloured by source (TRAIN vs. TEST).
sns.swarmplot(x='city_cluster',y='cnt', data=eda_df ,hue='source')

# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45)

In [None]:
# Average distance to the cluster centre per cluster, coloured by source (TRAIN vs. TEST).
sns.swarmplot(x='city_cluster',y='avg_dist_to_cluster_center', data=eda_df ,hue='source')

# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45)

## Create city_cluster table

In [None]:
def feature_sql(model_name, rowid_split, incl_rowid, tab):
    """Build the feature query for every city in the global ``cities`` list.

    For each city a sub-select reads the base columns from the requested
    competition table and derives the turn ``direction`` from entry and exit
    heading. That sub-select is fed through the city's k-means model with
    ML.PREDICT to obtain ``city_cluster`` and ``dist_to_cluster_center``.
    The per-city queries are combined with UNION ALL.

    Args:
        model_name: 'ALL' selects the six target columns; any other value is
            used as a single target column aliased to ``label``. Ignored when
            ``tab`` is 'test' (no target columns are selected).
        rowid_split: SQL fragment appended after ``AND rowid `` to restrict
            the rows, e.g. '=rowid' to keep every row.
        incl_rowid: whether ``t.RowId`` is part of the select list.
        tab: table name inside the competition dataset ('train' or 'test').

    Returns:
        The SQL text; an empty string when ``cities`` is empty.
    """
    rowid = "t.RowId," if incl_rowid else ""

    if tab == 'test':
        label = ""
    elif model_name == 'ALL':
        label = """t.TotalTimeStopped_p20,
                t.TotalTimeStopped_p50,
                t.TotalTimeStopped_p80,
                t.DistanceToFirstStop_p20,
                t.DistanceToFirstStop_p50,
                t.DistanceToFirstStop_p80,"""
    else:
        label = """t."""+model_name+""" as label,"""

    # One ML.PREDICT query per city; joined with UNION ALL at the end.
    per_city = []

    for city in cities:
        # direction codes: C = same heading, U = opposite heading, L / R = 90 degree
        # turn, CL / CR = 45 degree turn, UL / UR = 135 degree turn.
        features = """SELECT """+label+"""
                             """+rowid+"""
                             t.city,
                             t.EntryHeading,
                             t.ExitHeading,
                             t.Hour,
                             t.Weekend,
                             t.Month,
                             t.Latitude,
                             t.Longitude,
                             case 
                                 when t.entryheading = t.exitheading THEN
                                  "C"
                                 when ("N" in (t.entryheading, t.exitheading) and "S" in (t.entryheading, t.exitheading)) 
                                      OR 
                                      ("E" in (t.entryheading, t.exitheading) and "W" in (t.entryheading, t.exitheading)) 
                                      OR 
                                      ("NE" in (t.entryheading, t.exitheading) and "SW" in (t.entryheading, t.exitheading))  
                                      OR 
                                      ("SE" in (t.entryheading, t.exitheading) and "NW" in (t.entryheading, t.exitheading)) 
                                 THEN
                                  "U" 
                                 when (t.entryheading="N" and t.exitheading = "W") 
                                      OR(t.entryheading="NW" and t.exitheading = "SW") 
                                      OR(t.entryheading="W" and t.exitheading = "S") 
                                      OR(t.entryheading="SW" and t.exitheading = "SE") 
                                      OR(t.entryheading="S" and t.exitheading = "E") 
                                      OR(t.entryheading="SE" and t.exitheading = "NE") 
                                      OR(t.entryheading="E" and t.exitheading = "N") 
                                      OR(t.entryheading="NE" and t.exitheading = "NW") 
                                 THEN
                                  "L" 
                                 when (t.entryheading="N" and t.exitheading = "E") 
                                      OR(t.entryheading="NW" and t.exitheading = "NE") 
                                      OR(t.entryheading="W" and t.exitheading = "N") 
                                      OR(t.entryheading="SW" and t.exitheading = "NW") 
                                      OR(t.entryheading="S" and t.exitheading = "W") 
                                      OR(t.entryheading="SE" and t.exitheading = "SW") 
                                      OR(t.entryheading="E" and t.exitheading = "S") 
                                      OR(t.entryheading="NE" and t.exitheading = "SE") 
                                 THEN
                                  "R" 
                                 when (t.entryheading="N" and t.exitheading = "NW") 
                                      OR(t.entryheading="NW" and t.exitheading = "W") 
                                      OR(t.entryheading="W" and t.exitheading = "SW") 
                                      OR(t.entryheading="SW" and t.exitheading = "S") 
                                      OR(t.entryheading="S" and t.exitheading = "SE") 
                                      OR(t.entryheading="SE" and t.exitheading = "E") 
                                      OR(t.entryheading="E" and t.exitheading = "NE") 
                                      OR(t.entryheading="NE" and t.exitheading = "N") 
                                 THEN
                                  "CL" 
                                 when (t.entryheading="N" and t.exitheading = "NE") 
                                      OR(t.entryheading="NW" and t.exitheading = "N") 
                                      OR(t.entryheading="W" and t.exitheading = "NW") 
                                      OR(t.entryheading="SW" and t.exitheading = "W") 
                                      OR(t.entryheading="S" and t.exitheading = "SW") 
                                      OR(t.entryheading="SE" and t.exitheading = "S") 
                                      OR(t.entryheading="E" and t.exitheading = "SE") 
                                      OR(t.entryheading="NE" and t.exitheading = "E") 
                                 THEN
                                  "CR" 
                                 when (t.entryheading="N" and t.exitheading = "SW") 
                                      OR(t.entryheading="NW" and t.exitheading = "S") 
                                      OR(t.entryheading="W" and t.exitheading = "SE") 
                                      OR(t.entryheading="SW" and t.exitheading = "E") 
                                      OR(t.entryheading="S" and t.exitheading = "NE") 
                                      OR(t.entryheading="SE" and t.exitheading = "N") 
                                      OR(t.entryheading="E" and t.exitheading = "NW") 
                                      OR(t.entryheading="NE" and t.exitheading = "W") 
                                 THEN
                                  "UL" 
                                 when (t.entryheading="N" and t.exitheading = "SE") 
                                      OR(t.entryheading="NW" and t.exitheading = "E") 
                                      OR(t.entryheading="W" and t.exitheading = "NE") 
                                      OR(t.entryheading="SW" and t.exitheading = "N") 
                                      OR(t.entryheading="S" and t.exitheading = "NW") 
                                      OR(t.entryheading="SE" and t.exitheading = "W") 
                                      OR(t.entryheading="E" and t.exitheading = "SW") 
                                      OR(t.entryheading="NE" and t.exitheading = "S") 
                                 THEN
                                  "UR" 
                               else null end direction
                       FROM `kaggle-competition-datasets.geotab_intersection_congestion."""+tab+"""` t
                      WHERE city = '"""+city+"""' 
                       AND rowid """+rowid_split

        # Latitude/Longitude are only needed as model input and are dropped
        # from the output in favour of cluster id and distance.
        per_city.append("""
               SELECT (SELECT MIN(d.DISTANCE) FROM UNNEST(NEAREST_CENTROIDS_DISTANCE) d) AS dist_to_cluster_center, 
                      CONCAT(m.city,"_",CAST(m.CENTROID_ID AS STRING)) AS city_cluster,
                      m.* EXCEPT (nearest_centroids_distance, CENTROID_ID,Latitude,Longitude) 
                 FROM ML.PREDICT(MODEL `bqml_example.model_cluster_"""+city+"""`, 
                              ("""+features+""")) m
               """)

    return "UNION ALL".join(per_city)

In [None]:
# Flip to True to drop the feature tables so that they are rebuilt below
# (for example after the clustering models were retrained).
model_changed = False

if model_changed:
    for tab in ('train', 'test'):
        sql = "DROP TABLE IF EXISTS `bqml_example.city_cluster_" + tab + "`"
        job_result = client.query(sql).result()


# Materialise the per-row features (all rows, RowId included) once per table;
# an already existing table is left untouched.
for tab in ('train', 'test'):
    sql = ("CREATE TABLE IF NOT EXISTS `bqml_example.city_cluster_" + tab + "` as "
           + feature_sql('ALL', '=rowid', True, tab))
    job_result = client.query(sql).result()
    
    

In [None]:
%%bigquery

# Preview 20 feature rows joined back to the train table, with one target as label
# and the remaining target columns and the row id removed.
SELECT t.TotalTimeStopped_p20 as label, cc.* except (rowid, TotalTimeStopped_p20,TotalTimeStopped_p50,TotalTimeStopped_p80,DistanceToFirstStop_p20, DistanceToFirstStop_p50, DistanceToFirstStop_p80) 
  FROM `bqml_example.city_cluster_train` cc,
       `kaggle-competition-datasets.geotab_intersection_congestion.train` t
 WHERE cc.rowid = t.rowid
 LIMIT 20;

# Model

In [None]:
%%bigquery
# Maps a turn direction code or a compass heading to an angle in degrees.
# Both code families share one scale in 45 degree steps: "C" (centered) and
# "N" are 90, "R" and "E" are 0. Any other input matches no branch, so the
# CASE yields NULL.
CREATE OR REPLACE FUNCTION `bqml_example.direction2degree`(dir string) AS (
 case dir
   # turn directions
   when "C" then
    90
   when 'CL' then
    135
   when "L" then
    180
   when 'UL' then
    225
   when "U" then
    270
   when 'UR' then
    315
   when "R" then
    0
   when 'CR' then
    45
    
   # compass headings
   when "N" then
    90
   when 'NW' then
    135
   when "W" then
    180
   when 'SW' then
    225
   when "S" then
    270
   when 'SE' then
    315
   when "E" then
    0
   when 'NE' then
    45
 end
);

## Train

In [None]:
%%time
# Set to True to drop the existing regression models so that the loop below
# retrains them; with False, CREATE MODEL IF NOT EXISTS keeps existing models.
model_changed = True

if model_changed:
    for mn in mod_names:
        sql="DROP MODEL IF EXISTS `bqml_example.model_"+mn+"`"
        # .result() waits for the drop to finish before training starts.
        client.query(sql).result()

        print('Drop',mn)

# Train one linear regression per target column in mod_names. Features: the
# categorical columns as-is, sin/cos of direction and headings (degrees converted
# to radians via ACOS(-1)/180) and the distance to the cluster centre.
# Only rows with rowid < 2600000 are used for training.
for mn in mod_names:
    
    sql="""
    CREATE MODEL IF NOT EXISTS `bqml_example.model_"""+mn+"""`
    OPTIONS(MODEL_TYPE='linear_reg', 
            L2_REG=0.1,
            LS_INIT_LEARN_RATE=0.4) AS 
    SELECT  t."""+mn+""" as label,
            cc.city_cluster,
            cc.city,
            cc.hour,
            cc.weekend,
            cc.month,
            cc.direction,
            cc.entryheading,
            cc.exitheading,
            round(sin(bqml_example.direction2degree(cc.direction)*ACOS(-1)/180),6) direction_sin,
            round(cos(bqml_example.direction2degree(cc.direction)*ACOS(-1)/180),6) direction_cos,
            round(sin(bqml_example.direction2degree(cc.entryheading)*ACOS(-1)/180),6) entryheading_sin,
            round(cos(bqml_example.direction2degree(cc.entryheading)*ACOS(-1)/180),6) entryheading_cos,
            round(sin(bqml_example.direction2degree(cc.exitheading)*ACOS(-1)/180),6) exitheading_sin,
            round(cos(bqml_example.direction2degree(cc.exitheading)*ACOS(-1)/180),6) exitheading_cos,
            round(cc.dist_to_cluster_center,8) dist_to_cluster_center
      FROM `bqml_example.city_cluster_train` cc,
           `kaggle-competition-datasets.geotab_intersection_congestion.train` t
     WHERE t.rowid = cc.rowid
       AND cc.rowid < 2600000;
    """

    # Block until training has finished (raises if the job failed).
    client.query(sql).result()

    print('Done with',mn)

## Get training statistics


In [None]:
%%time
%%bigquery
# Per-iteration training information of the TotalTimeStopped_p20 model.
SELECT
  *
FROM
  ML.TRAINING_INFO(MODEL `bqml_example.model_TotalTimeStopped_p20`)
ORDER BY iteration 

In [None]:
%%time
%%bigquery
# Information about the input features of the TotalTimeStopped_p20 model.
SELECT
  *
FROM
  ML.FEATURE_INFO(MODEL `bqml_example.model_TotalTimeStopped_p20`)

In [None]:
%%bigquery
# Weights of the TotalTimeStopped_p20 model, requested with standardize = true.
SELECT
  *
FROM
  ML.WEIGHTS(MODEL  `bqml_example.model_TotalTimeStopped_p20`,
    STRUCT(true AS standardize))

## Evaluate your model


In [None]:
# Evaluate the TotalTimeStopped_p20 model on train rows with rowid > 2600000,
# using the same feature expressions as in training.
# NOTE(review): training filters on rowid < 2600000 and this query on
# rowid > 2600000, so a row with rowid = 2600000 would be in neither set.
sql="""SELECT
          *
        FROM ML.EVALUATE(MODEL `bqml_example.model_TotalTimeStopped_p20`, (
        SELECT  t.TotalTimeStopped_p20 as label, 
                cc.city_cluster,
                cc.city,
                cc.hour,
                cc.weekend,
                cc.month,
                cc.direction,
                cc.entryheading,
                cc.exitheading,
                round(sin(bqml_example.direction2degree(cc.direction)*ACOS(-1)/180),6) direction_sin,
                round(cos(bqml_example.direction2degree(cc.direction)*ACOS(-1)/180),6) direction_cos,
                round(sin(bqml_example.direction2degree(cc.entryheading)*ACOS(-1)/180),6) entryheading_sin,
                round(cos(bqml_example.direction2degree(cc.entryheading)*ACOS(-1)/180),6) entryheading_cos,
                round(sin(bqml_example.direction2degree(cc.exitheading)*ACOS(-1)/180),6) exitheading_sin,
                round(cos(bqml_example.direction2degree(cc.exitheading)*ACOS(-1)/180),6) exitheading_cos,
                round(cc.dist_to_cluster_center,8) dist_to_cluster_center
          FROM `bqml_example.city_cluster_train` cc,
               `kaggle-competition-datasets.geotab_intersection_congestion.train` t
         WHERE t.rowid = cc.rowid
         AND t.rowid > 2600000))"""

# Run the evaluation and show the metrics as a DataFrame.
client.query(sql).to_dataframe()

# Predict outcomes


In [None]:
def pred(mn, debug=False):
    """Score the test feature table with the model trained for target ``mn``.

    Args:
        mn: target column name; selects the model `bqml_example.model_<mn>`
            and names the prediction column of the result.
        debug: when True, only 10 input rows are scored.

    Returns:
        DataFrame with the columns ``RowId`` and ``<mn>``, ordered by RowId.
    """
    limit_clause = 'LIMIT 10' if debug else ''

    # Same feature expressions as used for training, built from the test table.
    query = """
    SELECT
      RowId,
      predicted_label as """+mn+"""
    FROM
      ML.PREDICT(MODEL `bqml_example.model_"""+mn+"""`,
        (
        SELECT  cc.RowId, 
                cc.city_cluster,
                cc.city,
                cc.hour,
                cc.weekend,
                cc.month,
                cc.direction,
                cc.entryheading,
                cc.exitheading,
                round(sin(bqml_example.direction2degree(cc.direction)*ACOS(-1)/180),6) direction_sin,
                round(cos(bqml_example.direction2degree(cc.direction)*ACOS(-1)/180),6) direction_cos,
                round(sin(bqml_example.direction2degree(cc.entryheading)*ACOS(-1)/180),6) entryheading_sin,
                round(cos(bqml_example.direction2degree(cc.entryheading)*ACOS(-1)/180),6) entryheading_cos,
                round(sin(bqml_example.direction2degree(cc.exitheading)*ACOS(-1)/180),6) exitheading_sin,
                round(cos(bqml_example.direction2degree(cc.exitheading)*ACOS(-1)/180),6) exitheading_cos,
                round(cc.dist_to_cluster_center,8) dist_to_cluster_center
          FROM `bqml_example.city_cluster_test` cc,
               `kaggle-competition-datasets.geotab_intersection_congestion.test` t
         WHERE t.rowid = cc.rowid
          """+limit_clause+"""))
        ORDER BY RowId ASC"""

    job = client.query(query)
    return job.to_dataframe()
    
import pandas as pd

# Build the long-format submission: one row per (test RowId, target).
# The per-target frames are collected and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every iteration.
frames = []
for i, mn in enumerate(mod_names):
    print('Start', i)
    df_temp = pred(mn)
    # TargetId is "<RowId>_<i>", where i is the position of the target in mod_names.
    df_temp['RowId'] = df_temp['RowId'].apply(str) + '_'+str(i)
    df_temp.rename(columns={'RowId': 'TargetId', mn: 'Target'}, inplace=True)
    frames.append(df_temp)

    print('Done with',mn)

# pd.concat raises on an empty list; keep df as None when nothing was predicted.
df = pd.concat(frames) if frames else None

In [None]:
# Sanity check: print the submission's shape and show its first 100 rows.
print(df.shape)
df.head(100)

## Output as CSV


In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
df.to_csv('submission.csv', index=False)