In [1]:
import os
import time

import pandas as pd
import pyowm
from sqlalchemy import create_engine

from config import pyowm_api_key
from graph_gen import pollution_epa

In [2]:
# Establish connection to SQL.
# The connection URL (which embeds the database user and password) can be
# supplied through the WEATHER_DB_URL environment variable so that real
# credentials do not have to be written into the notebook. When the variable
# is not set, the original local-development URL is used unchanged.
engine = create_engine(
    os.environ.get("WEATHER_DB_URL", "mysql://root:password@localhost/weather_data")
)

In [3]:
# Read in initial, consolidated dataset
# df = pd.read_csv('california_cities.csv').set_index('index')

In [4]:
# Load into SQL.
# df.to_sql(name='california_weather',con=engine,if_exists='replace')

In [5]:
# Select every row of the california_weather table; running this query in the
# next cell doubles as a check that the data is present in the database.
query = '''
select * from california_weather
'''

In [6]:
# Run the query and use the stored 'index' column (the csv's index) as the
# DataFrame index, then preview the first rows.
# NOTE: no column renaming happens in this cell; names come back exactly as
# they are stored in the table.
df = pd.read_sql_query(query, engine).set_index('index')
df.head()

Unnamed: 0_level_0,city,type,county,state,lat,lng,uv_index,aqi,category,dominant_pollutant,date,temperature,cloud,pressure,wind_speed,rain
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Adelanto,City,San Bernardino,california,34.58277,-117.409215,2.98,41,Good air quality,o3,2018-12-01 06:06:27,13.95,75,1015.0,5.1,0.0
1,Agoura Hills,City,Los Angeles,california,34.15334,-118.761676,3.0,48,Good air quality,pm25,2018-12-01 06:06:27,17.23,40,1016.0,3.6,0.0
2,Alameda,City,Alameda,california,37.765206,-122.241636,2.0,61,Moderate air quality,pm25,2018-12-01 06:06:27,14.29,1,1020.0,2.1,1.02
3,Albany,City,Alameda,california,37.88687,-122.297747,1.97,61,Moderate air quality,pm25,2018-12-01 06:06:27,14.6,1,1020.0,2.1,1.02
4,Alhambra,City,Los Angeles,california,34.095287,-118.127015,3.0,66,Moderate air quality,pm25,2018-12-01 06:06:27,17.37,1,1015.0,2.1,0.0


### Consolidating scraping functions to update weather values:

Code from all group members was fused into this function.

In [7]:
def get_weather_data(df, delay=1.1):
    '''Fetch current weather and air-quality readings for every city in ``df``.

    For each row, OpenWeatherMap (through ``pyowm``) is asked for the weather
    observation and the UV index at the city's coordinates, and
    ``pollution_epa`` is asked for the air-quality data. Progress and failures
    are reported with ``print``.

    Parameters
    ----------
    df : DataFrame-like
        Must provide ``'city'``, ``'lat'`` and ``'lng'`` columns; they are
        iterated together, one city per row.
    delay : float, optional
        Seconds to pause after each city because the API is limited to
        60 calls/min. Defaults to 1.1; pass 0 to skip the pause.

    Returns
    -------
    tuple of 10 lists
        ``(temperature, clouds, pressure, rain, date, wind, uv_index, aqi,
        category, dominant_pollutant)``. Every list holds exactly one entry
        per city, in input order. A reading that could not be obtained is
        ``float('nan')`` (a real missing value rather than the text "NaN"),
        so numeric columns built from these lists stay numeric.
    '''
    missing = float("nan")

    temperature = []
    clouds = []
    pressure = []
    rain = []
    date = []
    wind = []
    uv_index = []

    aqi = []
    category = []
    dominant_pollutant = []

    # Grouping the lists lets each city append one complete row at a time,
    # which guarantees that all lists stay the same length.
    owm_columns = (temperature, clouds, pressure, rain, date, wind, uv_index)
    air_columns = (aqi, category, dominant_pollutant)
    no_owm = (missing,) * len(owm_columns)
    no_air = (missing,) * len(air_columns)

    # Initialize connection to pyowm
    owm = pyowm.OWM(pyowm_api_key)

    for city, lat, lng in zip(df['city'], df['lat'], df['lng']):
        print(f"Gathering data for {city}...")

        ## Make API calls to OpenWeatherMap: ##
        # The requests AND the parsing of their results are guarded, so one
        # failed or malformed response costs a single row instead of the run.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        try:
            weather = owm.weather_at_coords(lat, lng)
            uvi = owm.uvindex_around_coords(lat, lng)
            weather_data = weather.get_weather()

            # Rain comes back as a dict that is empty when there is no rain.
            # Exactly one value is recorded per city: the first one reported.
            # (Appending every value would make `rain` longer than the other
            # lists whenever more than one period is present.)
            precip = weather_data.get_rain()
            rain_amount = next(iter(precip.values())) if precip else 0

            # NOTE(review): the rows already stored in the table look like
            # Celsius values while this requests Fahrenheit - confirm the unit.
            owm_row = (
                weather_data.get_temperature('fahrenheit')['temp'],
                weather_data.get_clouds(),
                weather_data.get_pressure()['press'],
                rain_amount,
                weather_data.get_reference_time(timeformat='iso'),
                weather_data.get_wind()['speed'],
                uvi.get_value(),
            )
        except Exception as exc:
            print(f"Error adding data for {city}. Appending NaN for all OpenWeatherMap categories. ({exc!r})")
            owm_row = no_owm

        for column, value in zip(owm_columns, owm_row):
            column.append(value)

        ## Make API calls to Breezometer for air quality data:
        try:
            data = pollution_epa(lat, lng)
            if data:
                index = data['data']['indexes']['usa_epa']
                air_row = (index['aqi'], index['category'], index['dominant_pollutant'])
            else:
                # An empty/None response means no air-quality data for this city.
                air_row = no_air
        except Exception as exc:
            print(f"Error adding air quality data for {city}. Appending NaN for all Breezometer categories. ({exc!r})")
            air_row = no_air

        for column, value in zip(air_columns, air_row):
            column.append(value)

        # Pause between cities. Limited to 60 API calls/min.
        # NOTE(review): two OpenWeatherMap requests are made per city; confirm
        # that the default pause is sufficient for the plan in use.
        if delay and delay > 0:
            time.sleep(delay)

    return temperature,clouds,pressure,rain,date,wind,uv_index,aqi,category,dominant_pollutant

In [8]:
def update_df(df):
    '''Overwrite the weather and air-quality columns of ``df`` with fresh data.

    ``get_weather_data(df)`` is called once and each list it returns is written
    into the matching column; the ``date`` values are converted with
    ``pd.to_datetime`` first. ``df`` is modified in place and the same object
    is returned. Nothing is written back to SQL here.
    '''
    (temps, cloud_cover, pressures, rainfall, timestamps,
     wind_speeds, uv_values, aqi_values, aqi_labels, pollutants) = get_weather_data(df)

    # (column name, new values) pairs, listed in the order they are assigned.
    assignments = (
        ('temperature', temps),
        ('cloud', cloud_cover),
        ('pressure', pressures),
        ('rain', rainfall),
        ('date', timestamps),
        ('wind_speed', wind_speeds),
        ('uv_index', uv_values),
        ('aqi', aqi_values),
        ('category', aqi_labels),
        ('dominant_pollutant', pollutants),
    )

    for column, values in assignments:
        # Only the timestamps need converting before they are stored.
        df[column] = pd.to_datetime(values) if column == 'date' else values

    return df

In [None]:
# Refresh the weather and air-quality columns for every city. This cell is
# slow: get_weather_data pauses 1.1 seconds after each city it processes.
# update_df writes the new columns into `df` itself and returns that same
# object, so `recent_data` and `df` refer to the same DataFrame afterwards.
recent_data = update_df(df)

Gathering data for Adelanto...
Gathering data for Agoura Hills...
Gathering data for Alameda...
Gathering data for Albany...
Gathering data for Alhambra...
Gathering data for Aliso Viejo...
Gathering data for Alturas...
Gathering data for Amador City...
Gathering data for American Canyon...
Gathering data for Anaheim...
Gathering data for Anderson...
Gathering data for Angels Camp...
Gathering data for Antioch...
Gathering data for Apple Valley...
Gathering data for Arcadia...
Gathering data for Arcata...
Gathering data for Arroyo Grande...
Gathering data for Artesia...
Gathering data for Arvin...
Gathering data for Atascadero...
Gathering data for Atherton...
Gathering data for Atwater...
Gathering data for Auburn...
Gathering data for Avalon...
Gathering data for Avenal...
Gathering data for Azusa...
Gathering data for Bakersfield...
Gathering data for Baldwin Park...
Gathering data for Banning...
Gathering data for Barstow...
Gathering data for Beaumont...
Gathering data for Bell...

Gathering data for Mill Valley...
Gathering data for Millbrae...
Gathering data for Milpitas...
Gathering data for Mission Viejo...
Gathering data for Modesto...
Gathering data for Monrovia...
Gathering data for Montague...
Gathering data for Montclair...
Gathering data for Monte Sereno...
Gathering data for Montebello...
Gathering data for Monterey...
Gathering data for Monterey Park...
Gathering data for Moorpark...
Gathering data for Moraga...
Gathering data for Moreno Valley...
Gathering data for Morgan Hill...
Gathering data for Morro Bay...
Gathering data for Mount Shasta...
Gathering data for Mountain View...
Gathering data for Murrieta...
Gathering data for Napa...
Gathering data for National City...
Gathering data for Needles...
Gathering data for Nevada City...
Gathering data for Newark...
Gathering data for Newman...
Gathering data for Newport Beach...
Gathering data for Norco...
Gathering data for Norwalk...
Gathering data for Novato...
Gathering data for Oakdale...
Gatheri

In [None]:
# Preview the first rows of the refreshed data.
recent_data.head()

# Appendix

In [None]:
# pd.read_sql_query('''describe california_weather''',engine)

In [None]:
# def df_geodata(filename=None,query=None):    
#     """
#     Returns dataframe with pollution data from city list dataframe. Makes API calls via 'pollution_epa'
#     to scrape latest data.
#     """
    
#     if filename and query:
#         return "You must pass either a csv filename or a SQL query. You cannot pass two arguments."
#     elif filename == None and query == None:
#         return "Error: No arguments were passed to this function."
#     elif filename:
#         df = pd.read_csv(filename)
#     else:
#         df = pd.read_sql_query(query).set_index('index')
    
#     aqi = []
#     category = []
#     dominant_pollutant = []
#     date = []


#     for lat,lng in zip(df['lat'],df['lng']):
#         # Get pollution data.
#         data = pollution_epa(lat,lng)
        
#         if data != None:
#             index = data['data']['indexes']['usa_epa']

#             air_quality = index['aqi']
#             categories = index['category']
#             dom_pollutant = index['dominant_pollutant']
#             datetime = data['data']['datetime']

#             aqi.append(air_quality)
#             category.append(categories)
#             dominant_pollutant.append(dom_pollutant)
#             date.append(datetime)
            
#         else:
#             aqi.append("NaN")
#             category.append("NaN")
#             dominant_pollutant.append("NaN")
#             date.append("NaN")          
    
#     # Update columns with latest data
#     df['aqi'] = aqi
#     df['category'] = category
#     df['dominant_pollutant'] = dominant_pollutant
#     df['datetime'] = date
    
#     return df

In [None]:
# def generate_map(df):
#     '''
#     Generates HTML for map from pollution data.
#     '''
#     data = [
#     go.Scattermapbox(
#         lat=round(df['lat'],3),
#         lon=round(df['lng'],3),
#         mode='markers',
#         marker=dict(
#             size=df['aqi']/10,
#             color= df['aqi'],
#             colorscale = 'Jet',
#         ),
#         text= marker_text(df)
#         )
#     ]

#     layout = go.Layout(
#         autosize=True,
#         hovermode='closest',
#         mapbox=dict(
#             accesstoken=mapbox_api_key,
#             bearing=0,
#             center=dict(
#                 lat=36,
#                 lon=-119
#             ),
#             style='dark',
#             pitch=0,
#             zoom=4
#         ),
#     )

#     fig = dict(data=data, layout=layout)

#     map_html = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')
    
#     return map_html

In [None]:
# def df_geodata(filename=None,query=None):    
#     """
#     Returns dataframe with pollution data from city list dataframe. Makes API calls via 'pollution_epa'
#     to scrape latest data.
#     """
    
#     if filename and query:
#         return "You must pass either a csv filename or a SQL query. You cannot pass two arguments."
#     elif filename == None and query == None:
#         return "Error: No arguments were passed to this function."
#     elif filename:
#         df = pd.read_csv(filename)
#     else:
#         df = pd.read_sql_query(query).set_index('index')
    
#     aqi = []
#     category = []
#     dominant_pollutant = []
#     date = []


#     for lat,lng in zip(df['lat'],df['lng']):
#         # Get pollution data.
#         data = pollution_epa(lat,lng)
        
#         if data != None:
#             index = data['data']['indexes']['usa_epa']

#             air_quality = index['aqi']
#             categories = index['category']
#             dom_pollutant = index['dominant_pollutant']
#             datetime = data['data']['datetime']

#             aqi.append(air_quality)
#             category.append(categories)
#             dominant_pollutant.append(dom_pollutant)
#             date.append(datetime)
            
#         else:
#             aqi.append("NaN")
#             category.append("NaN")
#             dominant_pollutant.append("NaN")
#             date.append("NaN")          
    
#     # Update columns with latest data
#     df['aqi'] = aqi
#     df['category'] = category
#     df['dominant_pollutant'] = dominant_pollutant
#     df['datetime'] = date
    
#     return df

In [None]:
# generate_map(df2)
# df = df_geodata('california_cities.csv')
# df.to_sql(name='california_pollution',con=engine,if_exists='replace')