<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/Weather_KMSO_and_MPG_Precip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Wrangle notebook: combine KMSO and MPG station daily precipitation into one wide table and push it to BigQuery.

# Load Tools

In [0]:
import pandas as pd

# Query BigQuery

## KMSO

In [0]:
# SQL for the KMSO record: selects the day and precip_sum_in columns from the
# NOAA GSOD table. There is no WHERE clause, so every row is returned.
kmso_query = """
SELECT
  day,
  precip_sum_in
FROM
  `mpg-data-warehouse.weather_noaa.noaa_gsod_rename`
"""

In [0]:
# Run the KMSO query in BigQuery (standard SQL) and load the result
# into a DataFrame.
df_kmso = pd.read_gbq(
    query=kmso_query,
    project_id='mpg-data-warehouse',
    dialect='standard',
)

In [0]:
# Distinct calendar years present in the KMSO record.
df_kmso['day'].dt.year.unique()

array([2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010])

In [0]:
# Rename the precipitation column to 'kmso' so it stays distinguishable once
# it sits next to the MPG station columns after the join.
# The rename is done by label rather than by assigning a positional list:
# a change in the query's column order can then never mislabel the data, and
# re-running this cell is a harmless no-op.
df_kmso = df_kmso.rename(columns={'precip_sum_in': 'kmso'})

In [0]:
# Preview the first two rows to confirm the renamed columns.
df_kmso.head(n=2)

Unnamed: 0,day,kmso
0,2019-12-31,0.01
1,2019-12-30,0.04


In [0]:
# Show row count, per-column non-null counts and dtypes for the KMSO frame.
df_kmso.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3652 entries, 0 to 3651
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   day     3652 non-null   datetime64[ns]
 1   kmso    3652 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 57.2 KB


## MPG

In [0]:
# SQL for the MPG stations: selects day, station and precip_sum_in from the
# daily table, giving one row per (day, station) pair in long format.
# There is no WHERE clause, so every row is returned.
mpg_query = """
SELECT
  day,
  station,
  precip_sum_in
FROM
  `mpg-data-warehouse.weather_mpg.daily`
"""

In [0]:
# Run the MPG query in BigQuery (standard SQL) and load the result
# into a DataFrame.
df_mpg = pd.read_gbq(
    query=mpg_query,
    project_id='mpg-data-warehouse',
    dialect='standard',
)

In [0]:
# Show row count, per-column non-null counts and dtypes for the raw
# (long-format) MPG frame.
df_mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15918 entries, 0 to 15917
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   day            15918 non-null  object 
 1   station        15918 non-null  object 
 2   precip_sum_in  15909 non-null  float64
dtypes: float64(1), object(2)
memory usage: 373.2+ KB


In [0]:
# Preview the last two rows of the long-format MPG frame.
df_mpg.tail(n=2)

Unnamed: 0,day,station,precip_sum_in
15916,2016-03-14,baldy summit,0.0
15917,2016-03-15,baldy summit,0.0


# Restructure

## Long to Wide

### MPG

In [0]:
# Reshape long -> wide: one row per day, one column per station, with each
# cell holding that station's precip_sum_in for the day.
df_mpg = df_mpg.pivot(
    index='day',
    columns='station',
    values='precip_sum_in',
)

In [0]:
# Label the column axis with the measurement the station columns hold.
df_mpg = df_mpg.rename_axis(columns='precip_sum_in')

In [0]:
# Move 'day' out of the index and back into an ordinary column.
df_mpg = df_mpg.reset_index(drop=False)

In [0]:
# Convert 'day' to a datetime dtype so it matches the dtype of the KMSO
# 'day' column used as the join key.
df_mpg['day'] = pd.to_datetime(df_mpg['day'])

In [0]:
# Make the column names comply with BigQuery naming conventions by replacing
# spaces with underscores (e.g. 'baldy summit' -> 'baldy_summit').
# The names are derived from the pivoted station labels instead of being
# assigned from a hard-coded positional list: a new station no longer causes
# a length-mismatch error, and a change in station sort order can no longer
# attach the wrong name to a column.
# NOTE(review): assumes station labels differ from the desired names only by
# spaces -- confirm against the distinct values of `station`.
df_mpg.columns = [str(label).replace(' ', '_') for label in df_mpg.columns]

In [0]:
# Preview the first two rows of the wide MPG frame.
df_mpg.head(n=2)

Unnamed: 0,day,baldy_draw,baldy_summit,indian_ridge,orchard_house,sanfoin_bench,south_baldy
0,2012-06-19,0.0,0.0,,0.0,0.0,0.0
1,2012-06-20,0.0,0.1,,0.0,0.0,0.0


In [0]:
# Show row count, per-column non-null counts and dtypes for the wide MPG
# frame (one column per station).
df_mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2653 entries, 0 to 2652
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   day            2653 non-null   datetime64[ns]
 1   baldy_draw     2653 non-null   float64       
 2   baldy_summit   2653 non-null   float64       
 3   indian_ridge   2644 non-null   float64       
 4   orchard_house  2653 non-null   float64       
 5   sanfoin_bench  2653 non-null   float64       
 6   south_baldy    2653 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 145.2 KB


## Join DataFrames

In [0]:
# Join operands: MPG stations on the left, KMSO on the right.
left, right = df_mpg, df_kmso

In [0]:
# Inner join on the calendar day: only days present in BOTH sources are kept,
# so days recorded by just one of them are dropped.
# `how` is spelled out so the row loss is visible to the reader.
# `validate` makes the merge raise a MergeError if either side has duplicate
# days, which would otherwise silently multiply rows in the result.
df = pd.merge(left, right, on='day', how='inner', validate='one_to_one')

# Push to BigQuery

In [0]:
# Name of the destination table inside the BigQuery dataset.
table_name = "mpg_kmso_precip"

In [0]:
# Upload the joined frame to weather_views.<table_name> in the
# mpg-data-warehouse project.
# if_exists='fail' is the default, written out here for visibility: the
# upload raises if the table already exists, so re-running this cell after a
# successful push will error rather than overwrite the table.
df.to_gbq(
    destination_table=f'weather_views.{table_name}',
    project_id='mpg-data-warehouse',
    if_exists='fail',
)

1it [00:03,  3.40s/it]
