In [2]:
import os
import time

import pandas as pd
import pyowm
from sqlalchemy import create_engine
from sqlalchemy import types
from sqlalchemy.sql import text

from config import pyowm_api_key
from graph_gen import pollution_epa
from timefunc import utc_to_pst_24

In [8]:
# Establish connection to SQL.
# The connection URL embeds credentials, so it can be supplied through the
# WEATHER_DB_URL environment variable instead of being edited into the notebook.
# The original local-development URL is kept as the fallback so existing setups
# keep working unchanged.
engine = create_engine(
    os.environ.get("WEATHER_DB_URL", "mysql://root:password@localhost/weather_data")
)

### Consolidating scraping functions to update weather values:

The scraping code written by each group member was consolidated into the single `get_weather_data` function below.

In [9]:
def get_weather_data(df, delay=1.1):
    '''Fetch current weather and air-quality readings for every city in ``df``.

    For each row, OpenWeatherMap (through ``pyowm``) is queried for the weather
    and UV index at the city's coordinates, and ``pollution_epa`` is queried for
    air-quality data (Breezometer, per the original notes).

    Parameters
    ----------
    df : mapping of columns (e.g. a pandas DataFrame)
        Must provide equally long ``'city'``, ``'lat'`` and ``'lng'`` columns.
    delay : float, optional
        Seconds to pause after each city. Defaults to 1.1; the original note
        cites a limit of 60 API calls per minute.

    Returns
    -------
    tuple of 10 lists
        ``(temperature, clouds, pressure, rain, date, wind, uv_index, aqi,
        category, dominant_pollutant)``. Every list has exactly one entry per
        city, in input order. Temperature is requested in Fahrenheit and
        ``date`` is the ISO-formatted reference time of the observation.
        When a lookup fails, the string ``"NaN"`` is stored for every field of
        that provider, so the lists always stay aligned.
    '''

    temperature = []
    clouds = []
    pressure = []
    rain = []
    date = []
    wind = []
    uv_index = []

    aqi = []
    category = []
    dominant_pollutant = []

    # Column groups, in the same order as the per-city reading tuples built below.
    weather_columns = (temperature, clouds, pressure, rain, date, wind, uv_index)
    air_columns = (aqi, category, dominant_pollutant)

    # Initialize connection to pyowm
    owm = pyowm.OWM(pyowm_api_key)

    for city, lat, lng in zip(df['city'], df['lat'], df['lng']):
        print(f"Gathering data for {city}...")

        ## Make API calls to OpenWeatherMap: ##
        # The whole reading is assembled before anything is appended, so a failure
        # halfway through can never leave some lists one entry longer than others.
        try:
            # Get weather and uv index data at coordinates specified.
            weather = owm.weather_at_coords(lat, lng)
            uvi = owm.uvindex_around_coords(lat, lng)
            weather_data = weather.get_weather()

            # Rain may be empty (no rain reported) or hold several entries; exactly
            # one value per city is kept (the first reported) so ``rain`` stays
            # aligned with the other lists.
            precip = weather_data.get_rain()
            rain_amount = next(iter(precip.values())) if precip else 0

            weather_reading = (
                weather_data.get_temperature('fahrenheit')['temp'],
                weather_data.get_clouds(),
                weather_data.get_pressure()['press'],
                rain_amount,
                weather_data.get_reference_time(timeformat='iso'),
                weather_data.get_wind()['speed'],
                uvi.get_value(),
            )
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C / kernel
            # interrupt still stops this long-running loop.
            print(f"Error adding data for {city}. Appending NaN for all OpenWeatherMap categories.")
            weather_reading = ("NaN",) * len(weather_columns)

        for column, value in zip(weather_columns, weather_reading):
            column.append(value)

        ## Make API calls to Breezometer for air quality data:
        try:
            index = pollution_epa(lat, lng)['data']['indexes']['usa_epa']
            air_reading = (
                index['aqi'],
                index['category'],
                index['dominant_pollutant'],
            )
        except Exception:
            # Covers a ``None`` response (TypeError), a response missing the
            # expected keys (KeyError) and errors raised by the request itself.
            print(f"Error adding data for {city}. Appending NaN for all Breezometer categories.")
            air_reading = ("NaN",) * len(air_columns)

        for column, value in zip(air_columns, air_reading):
            column.append(value)

        # Pause between cities to stay under the API rate limit.
        if delay:
            time.sleep(delay)

    return temperature,clouds,pressure,rain,date,wind,uv_index,aqi,category,dominant_pollutant

In [10]:
def _placeholders_to_missing(values):
    '''Return ``values`` as a list with each literal "NaN" string replaced by a float NaN.'''
    return [float("nan") if isinstance(v, str) and v == "NaN" else v for v in values]


def update_df(df):
    '''Return a copy of ``df`` extended with freshly scraped weather columns.

    Calls ``get_weather_data(df)`` and adds the columns ``temperature``,
    ``cloud``, ``pressure``, ``rain``, ``date``, ``wind_speed``, ``uv_index``,
    ``aqi``, ``category``, ``dominant_pollutant`` and ``date_scraped``.

    Parameters
    ----------
    df : pandas.DataFrame
        City table; it is passed unchanged to ``get_weather_data`` and is not
        modified by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows with no missing value in any
        column (``dropna()``), so cities whose lookups failed are left out.
    '''

    (temperature, clouds, pressure, rain, date, wind, uv_index,
     aqi, category, dominant_pollutant) = get_weather_data(df)

    # Work on a copy so the caller's frame is left untouched and re-running the
    # cell always starts from the same input.
    updated = df.copy()

    scraped_columns = {
        'temperature': temperature,
        'cloud': clouds,
        'pressure': pressure,
        'rain': rain,
        'date': date,
        'wind_speed': wind,
        'uv_index': uv_index,
        'aqi': aqi,
        'category': category,
        'dominant_pollutant': dominant_pollutant,
    }
    for column, values in scraped_columns.items():
        # Literal "NaN" strings are not treated as missing by dropna(); convert
        # any such placeholders to real missing values first.
        cleaned = _placeholders_to_missing(values)
        updated[column] = pd.to_datetime(cleaned) if column == 'date' else cleaned

    # make column for date scraped, but time is converted to PST.
    # One timestamp is shared by the whole batch: the first available observation
    # date, converted once by utc_to_pst_24. Positional access (.iloc) avoids
    # depending on the frame having an index label 0.
    valid_dates = updated['date'].dropna()
    scraped_at = utc_to_pst_24(valid_dates.iloc[0]) if not valid_dates.empty else pd.NaT
    updated['date_scraped'] = [scraped_at] * len(updated)

    return updated.dropna()

In [11]:
def format_date(df):
    '''Parse the ``date`` column of ``df`` into pandas datetimes. Used for csvs.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    '''
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    return df

In [12]:
def push_to_sql(df,table_name):

    '''Insert every row of ``df`` into the SQL table ``table_name``.

    All rows are sent in one transaction through the module-level ``engine``:
    either every row is committed, or an error rolls the whole batch back.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to insert. The column names are used both as the SQL column list
        and as the bind-parameter names.
    table_name : str
        Target table. It is interpolated directly into the SQL text (table
        names cannot be bound as parameters), so it must come from trusted
        code and never from user input.

    Returns
    -------
    None
        A summary line with the number of inserted rows is printed.
    '''

    # One dict per row, keyed by column name. 'records' does not depend on the
    # index, so frames with duplicate index labels work too.
    rows = df.to_dict(orient='records')

    if not rows:
        # Nothing to insert; skip the database round trip entirely.
        print(f"{len(rows)} rows inserted into {table_name}.")
        return

    columns = ','.join(df.columns)
    col_values = ','.join([f':{col}' for col in df.columns])

    insert_statement = text(f"""
            INSERT INTO {table_name} 
            ({columns})
            VALUES
            ({col_values})""")

    # engine.begin() opens a transaction that commits on success and rolls back
    # on error. Passing the list of dicts runs a single executemany.
    with engine.begin() as con:
        con.execute(insert_statement, rows)

    print(f"{len(rows)} rows inserted into {table_name}.")

In [13]:
# Query our cities table: every row and column of california_cities.
# Its 'city', 'lat' and 'lng' columns are what get_weather_data() iterates over.
query = '''
select * from california_cities
'''

# Run the query through the SQLAlchemy engine and preview the first rows.
df = pd.read_sql_query(query, engine)
df.head()

Unnamed: 0,city,type,county,state,lat,lng
0,Adelanto,City,San Bernardino,california,34.5828,-117.409
1,Agoura Hills,City,Los Angeles,california,34.1533,-118.762
2,Alameda,City,Alameda,california,37.7652,-122.242
3,Albany,City,Alameda,california,37.8869,-122.298
4,Alhambra,City,Los Angeles,california,34.0953,-118.127


In [19]:
# Scrape fresh weather and air-quality readings for every city in the table.
# This is the slow cell: get_weather_data() pauses after each city for rate limiting.
recent_data = update_df(df)

Gathering data for Adelanto...
Gathering data for Agoura Hills...
Gathering data for Alameda...
Gathering data for Albany...
Gathering data for Alhambra...
Gathering data for Aliso Viejo...
Gathering data for Alturas...
Gathering data for Amador City...
Gathering data for American Canyon...
Gathering data for Anaheim...
Gathering data for Anderson...
Gathering data for Angels Camp...
Gathering data for Antioch...
Gathering data for Apple Valley...
Gathering data for Arcadia...
Gathering data for Arcata...
Gathering data for Arroyo Grande...
Gathering data for Artesia...
Gathering data for Arvin...
Gathering data for Atascadero...
Gathering data for Atherton...
Gathering data for Atwater...
Gathering data for Auburn...
Gathering data for Avalon...
Gathering data for Avenal...
Gathering data for Azusa...
Gathering data for Bakersfield...
Gathering data for Baldwin Park...
Gathering data for Banning...
Gathering data for Barstow...
Gathering data for Beaumont...
Gathering data for Bell...

Gathering data for Mill Valley...
Gathering data for Millbrae...
Gathering data for Milpitas...
Gathering data for Mission Viejo...
Gathering data for Modesto...
Gathering data for Monrovia...
Gathering data for Montague...
Gathering data for Montclair...
Gathering data for Monte Sereno...
Gathering data for Montebello...
Gathering data for Monterey...
Gathering data for Monterey Park...
Gathering data for Moorpark...
Gathering data for Moraga...
Gathering data for Moreno Valley...
Gathering data for Morgan Hill...
Gathering data for Morro Bay...
Gathering data for Mount Shasta...
Gathering data for Mountain View...
Gathering data for Murrieta...
Gathering data for Napa...
Gathering data for National City...
Gathering data for Needles...
Gathering data for Nevada City...
Gathering data for Newark...
Gathering data for Newman...
Gathering data for Newport Beach...
Gathering data for Norco...
Gathering data for Norwalk...
Gathering data for Novato...
Gathering data for Oakdale...
Gatheri

In [20]:
# Check to make sure data was updated: the preview should show the weather,
# air-quality and date_scraped columns next to the original city columns.
recent_data.head()

Unnamed: 0,city,type,county,state,lat,lng,temperature,cloud,pressure,rain,date,wind_speed,uv_index,aqi,category,dominant_pollutant,date_scraped
0,Adelanto,City,San Bernardino,california,34.5828,-117.409,43.09,90,1016,2.03,2018-12-06 19:27:00,4.1,2.28,24,Good air quality,pm25,2018-12-06 11:27:00
1,Agoura Hills,City,Los Angeles,california,34.1533,-118.762,52.57,75,1012,1.22,2018-12-06 18:57:00,11.3,2.32,34,Good air quality,pm25,2018-12-06 11:27:00
2,Alameda,City,Alameda,california,37.7652,-122.242,57.16,20,1015,0.0,2018-12-06 19:15:00,4.6,1.72,53,Moderate air quality,pm25,2018-12-06 11:27:00
3,Albany,City,Alameda,california,37.8869,-122.298,57.16,1,1015,0.0,2018-12-06 19:15:00,2.97,1.71,59,Moderate air quality,pm25,2018-12-06 11:27:00
4,Alhambra,City,Los Angeles,california,34.0953,-118.127,53.4,90,1013,5.17,2018-12-06 18:58:00,1.32,2.33,54,Moderate air quality,pm25,2018-12-06 11:27:00


In [21]:
# Insert the freshly scraped rows into the california_weather table.
push_to_sql(recent_data,'california_weather')

479 rows inserted into california_weather.


# Appendix

In [22]:
# Read back the rows whose date_scraped matches the value shown in the preview of
# recent_data, to confirm the insert reached california_weather.
pd.read_sql_query('''select * from california_weather where date_scraped = "2018-12-06 11:27:00"''',engine)

Unnamed: 0,city,type,county,state,lat,lng,uv_index,aqi,category,dominant_pollutant,date,temperature,cloud,pressure,wind_speed,rain,date_scraped
0,Adelanto,City,San Bernardino,california,34.5828,-117.409,2.28,24.0,Good air quality,pm25,2018-12-06 19:27:00,43.09,90,1016.00,4.10,2.0300,2018-12-06 11:27:00
1,Agoura Hills,City,Los Angeles,california,34.1533,-118.762,2.32,34.0,Good air quality,pm25,2018-12-06 18:57:00,52.57,75,1012.00,11.30,1.2200,2018-12-06 11:27:00
2,Alameda,City,Alameda,california,37.7652,-122.242,1.72,53.0,Moderate air quality,pm25,2018-12-06 19:15:00,57.16,20,1015.00,4.60,0.0000,2018-12-06 11:27:00
3,Albany,City,Alameda,california,37.8869,-122.298,1.71,59.0,Moderate air quality,pm25,2018-12-06 19:15:00,57.16,1,1015.00,2.97,0.0000,2018-12-06 11:27:00
4,Alhambra,City,Los Angeles,california,34.0953,-118.127,2.33,54.0,Moderate air quality,pm25,2018-12-06 18:58:00,53.40,90,1013.00,1.32,5.1700,2018-12-06 11:27:00
5,Aliso Viejo,City,Orange,california,33.5685,-117.726,2.48,33.0,Good air quality,pm25,2018-12-06 19:35:00,55.06,90,1013.00,4.10,5.8900,2018-12-06 11:27:00
6,Alturas,City,Modoc,california,41.4871,-120.542,1.41,79.0,Moderate air quality,pm25,2018-12-06 18:55:00,33.98,1,1017.00,2.10,0.0000,2018-12-06 11:27:00
7,Amador City,City,Amador,california,38.4194,-120.824,1.66,43.0,Good air quality,pm25,2018-12-06 19:15:00,54.95,1,1014.00,3.10,0.0000,2018-12-06 11:27:00
8,American Canyon,City,Napa,california,38.1749,-122.261,1.68,32.0,Good air quality,pm25,2018-12-06 19:20:00,57.24,1,1015.00,1.32,0.0000,2018-12-06 11:27:00
9,Anaheim,City,Orange,california,33.8366,-117.914,2.38,43.0,Good air quality,pm25,2018-12-06 18:58:00,52.45,90,1013.00,5.70,1.3100,2018-12-06 11:27:00


In [4]:
# Specify table schema for california_weather.
# dtype={ 
#     'city':  sqlalchemy.types.VARCHAR(length=255),
#     'type': sqlalchemy.types.VARCHAR(length=255),
#     'county': sqlalchemy.types.VARCHAR(length=255),
#     'state': sqlalchemy.types.VARCHAR(length=255),
#     'lat': sqlalchemy.types.Float(precision=8, asdecimal=True),
#     'lng': sqlalchemy.types.Float(precision=8, asdecimal=True),
#     'uv_index': sqlalchemy.types.Float(precision=2, asdecimal=True),
#     'aqi': sqlalchemy.types.Float(precision=2, asdecimal=True),
#     'category': sqlalchemy.types.VARCHAR(length=255),
#     'dominant_pollutant': sqlalchemy.types.VARCHAR(length=255),
#     'date': sqlalchemy.DateTime(),
#     'temperature': sqlalchemy.types.Float(precision=2, asdecimal=True),
#     'cloud': sqlalchemy.types.INTEGER(),
#     'pressure': sqlalchemy.types.Float(precision=2, asdecimal=True),
#     'wind_speed': sqlalchemy.types.Float(precision=2, asdecimal=True),
#     'rain': sqlalchemy.types.Float(precision=2, asdecimal=True)
# }


# df = format_date(pd.read_csv('california_cities.csv').dropna())

#### Creates pre-loaded table with dtypes
# df.to_sql(name='california_weather', if_exists='replace',con=engine, index=False,dtype=dtype)

In [15]:
# Read in initial, consolidated dataset
# df = pd.read_csv('appendix/california_cities.csv')
# df.head()
# date = df['date'][0]
# # string = 

# df.to_csv("test{0}.csv".format(date))

FileNotFoundError: [Errno 2] No such file or directory: 'test12/1/2018 6:06.csv'

In [12]:
# def df_geodata(filename=None,query=None):    
#     """
#     Returns dataframe with pollution data from city list dataframe. Makes API calls via 'pollution_epa'
#     to scrape latest data.
#     """
    
#     if filename and query:
#         return "You must pass either a csv filename or a SQL query. You cannot pass two arguments."
#     elif filename == None and query == None:
#         return "Error: No arguments were passed to this function."
#     elif filename:
#         df = pd.read_csv(filename)
#     else:
#         df = pd.read_sql_query(query).set_index('index')
    
#     aqi = []
#     category = []
#     dominant_pollutant = []
#     date = []


#     for lat,lng in zip(df['lat'],df['lng']):
#         # Get pollution data.
#         data = pollution_epa(lat,lng)
        
#         if data != None:
#             index = data['data']['indexes']['usa_epa']

#             air_quality = index['aqi']
#             categories = index['category']
#             dom_pollutant = index['dominant_pollutant']
#             datetime = data['data']['datetime']

#             aqi.append(air_quality)
#             category.append(categories)
#             dominant_pollutant.append(dom_pollutant)
#             date.append(datetime)
            
#         else:
#             aqi.append("NaN")
#             category.append("NaN")
#             dominant_pollutant.append("NaN")
#             date.append("NaN")          
    
#     # Update columns with latest data
#     df['aqi'] = aqi
#     df['category'] = category
#     df['dominant_pollutant'] = dominant_pollutant
#     df['datetime'] = date
    
#     return df

In [13]:
# def generate_map(df):
#     '''
#     Generates HTML for map from pollution data.
#     '''
#     data = [
#     go.Scattermapbox(
#         lat=round(df['lat'],3),
#         lon=round(df['lng'],3),
#         mode='markers',
#         marker=dict(
#             size=df['aqi']/10,
#             color= df['aqi'],
#             colorscale = 'Jet',
#         ),
#         text= marker_text(df)
#         )
#     ]

#     layout = go.Layout(
#         autosize=True,
#         hovermode='closest',
#         mapbox=dict(
#             accesstoken=mapbox_api_key,
#             bearing=0,
#             center=dict(
#                 lat=36,
#                 lon=-119
#             ),
#             style='dark',
#             pitch=0,
#             zoom=4
#         ),
#     )

#     fig = dict(data=data, layout=layout)

#     map_html = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')
    
#     return map_html

In [14]:
# def df_geodata(filename=None,query=None):    
#     """
#     Returns dataframe with pollution data from city list dataframe. Makes API calls via 'pollution_epa'
#     to scrape latest data.
#     """
    
#     if filename and query:
#         return "You must pass either a csv filename or a SQL query. You cannot pass two arguments."
#     elif filename == None and query == None:
#         return "Error: No arguments were passed to this function."
#     elif filename:
#         df = pd.read_csv(filename)
#     else:
#         df = pd.read_sql_query(query).set_index('index')
    
#     aqi = []
#     category = []
#     dominant_pollutant = []
#     date = []


#     for lat,lng in zip(df['lat'],df['lng']):
#         # Get pollution data.
#         data = pollution_epa(lat,lng)
        
#         if data != None:
#             index = data['data']['indexes']['usa_epa']

#             air_quality = index['aqi']
#             categories = index['category']
#             dom_pollutant = index['dominant_pollutant']
#             datetime = data['data']['datetime']

#             aqi.append(air_quality)
#             category.append(categories)
#             dominant_pollutant.append(dom_pollutant)
#             date.append(datetime)
            
#         else:
#             aqi.append("NaN")
#             category.append("NaN")
#             dominant_pollutant.append("NaN")
#             date.append("NaN")          
    
#     # Update columns with latest data
#     df['aqi'] = aqi
#     df['category'] = category
#     df['dominant_pollutant'] = dominant_pollutant
#     df['datetime'] = date
    
#     return df

In [15]:
# generate_map(df2)
# df = df_geodata('california_cities.csv')
# df.to_sql(name='california_pollution',con=engine,if_exists='replace')