In [1]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
import os

In [2]:
# Load the BigQuery API key so that the BigQuery client used below is
# authorized. The `apiconfig` helper module is intentionally not included
# in the repo, to keep the API key hidden. To run this notebook, set up
# your own project and permissions in Google Cloud and provide an
# equivalent helper.

from apiconfig.config import set_bigquery_api_key
set_bigquery_api_key()

Loading BigQuery API Key...
API Key Loaded


In [3]:
# Session-level query against the public Google Analytics sample dataset.
# The wildcard table `ga_sessions_*` combined with the _TABLE_SUFFIX filter
# limits the query to the tables suffixed 20170101 through 20170131.
# totalTransactionRevenue is divided by 1,000,000 in SQL -- presumably because
# the GA export stores revenue scaled by 10^6 (confirm against the BigQuery
# Export schema).
sql_query = '''

SELECT 
fullVisitorId,
visitId,
date,
visitNumber,
totals.visits,
totals.hits,
totals.pageviews,
totals.timeOnSite,
totals.bounces,
totals.transactions,
totals.totalTransactionRevenue / 1000000 AS totalTransactionRevenue,
device.deviceCategory,
geoNetwork.country,
trafficSource.source,
channelGrouping
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20170101' AND '20170131'

'''

In [4]:
# Create the BigQuery client. The project can be overridden through the
# GOOGLE_CLOUD_PROJECT environment variable so that other users can run the
# notebook against their own Google Cloud project. The original project id
# is kept as the fallback, so behavior is unchanged when the variable is
# unset or empty.
project_id = os.environ.get('GOOGLE_CLOUD_PROJECT') or 'ecommerce-analytics-364919'
client = bigquery.Client(location='US', project=project_id)

# Submit the query and load the result set into a pandas DataFrame.
query_job = client.query(sql_query)
df = query_job.to_dataframe()

In [5]:
# Preview the first rows to sanity-check the columns returned by the query.
df.head()


Unnamed: 0,fullVisitorId,visitId,date,visitNumber,visits,hits,pageviews,timeOnSite,bounces,transactions,totalTransactionRevenue,deviceCategory,country,source,channelGrouping
0,6380625739879480331,1485515046,20170127,1,1,1,1,,1,,,desktop,Italy,(direct),Organic Search
1,5171703479284554898,1485514123,20170127,1,1,1,1,,1,,,desktop,Croatia,(direct),Organic Search
2,5306338675227486844,1485523388,20170127,1,1,1,1,,1,,,desktop,Iceland,siliconvalley.about.com,Referral
3,5679671765152105885,1485549414,20170127,1,1,1,1,,1,,,desktop,Ukraine,(direct),Organic Search
4,9477190930349489962,1485554139,20170127,1,1,1,1,,1,,,desktop,Germany,(direct),Direct


In [6]:
# Summarize dtypes and non-null counts to see which columns are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64694 entries, 0 to 64693
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   fullVisitorId            64694 non-null  object 
 1   visitId                  64694 non-null  Int64  
 2   date                     64694 non-null  object 
 3   visitNumber              64694 non-null  Int64  
 4   visits                   64694 non-null  Int64  
 5   hits                     64694 non-null  Int64  
 6   pageviews                64691 non-null  Int64  
 7   timeOnSite               33286 non-null  Int64  
 8   bounces                  31326 non-null  Int64  
 9   transactions             697 non-null    Int64  
 10  totalTransactionRevenue  697 non-null    float64
 11  deviceCategory           64694 non-null  object 
 12  country                  64694 non-null  object 
 13  source                   64694 non-null  object 
 14  channelGrouping       

In [7]:
# Save the query result as a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('overall_data.csv', index=False)