[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ds5110/stinky/blob/master/weather_datasets.ipynb)

## PURPOSE:
The purpose of this notebook is to download the master weather data and the merged data directly to the user's computer, for the user's own personal copy or analysis. The downloaded files will be '.csv' files that can be opened in programs like Microsoft Excel.

Running this notebook will not affect the analyses for any other notebook.

This notebook will reflect any updates made to the GitHub repo's 'data' and 'weather_data_raw' directories.

Running this entire notebook should automatically download both files as a csv to your computer. Your browser may prompt you to allow multiple downloads from this notebook - please select allow.

If you encounter download issues, you may download each file individually by running the code cells below. Please ensure that this whole notebook has been run at least once before downloading the files individually.

In [None]:
#@title 
import pandas as pd
import shutil
import numpy as np 
import seaborn as sns 
import glob
import os 
from datetime import *
import seaborn as sns
import copy
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import plotly.express as px
from google.colab import files

In [None]:
#@title 
#above string is used to hide code block
#PLEASE SEE FOLLOWING TEXT BLOCK FOR EXPLANATION OF THIS CODE CELL!!!

def add_dates_times_columns(df):
  '''
  Purpose: Renames the 'Time' column to 'DateTime' and adds 'date_only'
    and 'time_only' columns derived from it.
  This DIRECTLY changes the dataframe, it does NOT return a copy.
  @param df : a dataframe with a 'Time' column of strings formatted as
    "%Y-%m-%d %H:%M:%S"; this will be a file consisting of data
    from each pod pooled together
  @return : None, df is modified in place
  '''
  # change name of Time column to DateTime for clarity that it is a datetime
  df.rename(columns={'Time' : 'DateTime'}, inplace=True)

  # parse the strings once, with the explicit format, and derive the
  # date-only and time-only values from that single parsed series
  parsed = pd.to_datetime(df['DateTime'], format="%Y-%m-%d %H:%M:%S")
  df['DateTime'] = parsed
  df.insert(1, 'date_only', parsed.dt.date)
  df.insert(2, 'time_only', parsed.dt.time)

  # resetting index to make it look neat
  df.reset_index(drop=True, inplace=True)

def cardinal_direction(df):
  '''
  Purpose: Adds a 'Cardinal Direction' column computed from the numeric
    'Wind Direction' column.
  This DIRECTLY changes the dataframe and also returns it.
  Values that match none of the ranges below (0, negative, above 360,
  or missing) get the string '0'.
  @param df : a dataframe with a numeric 'Wind Direction' column
  @return : the same dataframe, with 'Cardinal Direction' placed at column
    position 15 (or last, when the dataframe has fewer than 15 columns)
  '''
  wind = df['Wind Direction']

  # NOTE(review): the diagonal directions are only assigned on the exact
  # values 45/135/225/315; every other angle maps to a main direction.
  # Kept as-is - confirm this binning is intended.
  condition_list = [
      (wind > 0) & (wind < 45), (wind == 45),
      (wind > 45) & (wind <= 90), (wind > 90) & (wind < 135), (wind == 135),
      (wind > 135) & (wind <= 180), (wind > 180) & (wind < 225), (wind == 225),
      (wind > 225) & (wind <= 270), (wind > 270) & (wind < 315), (wind == 315),
      (wind > 315) & (wind <= 360),
  ]
  choicelist = ['North', 'North-East', 'East', 'East', 'South-East', 'South',
                'South', 'South-West', 'West', 'West', 'North-West', 'North']

  # The default must be a string: mixing the string choices with
  # np.select's built-in integer default 0 cannot be promoted to a common
  # dtype on current NumPy. '0' is the value weather_df_process_for_plotting
  # later replaces with 'Not-measured'.
  direction_values = np.select(condition_list, choicelist, default='0')

  # drop any existing column so the insert below controls the position
  if 'Cardinal Direction' in df.columns:
    df.drop(['Cardinal Direction'], axis=1, inplace=True)

  position = min(15, len(df.columns))
  df.insert(position, 'Cardinal Direction', value=direction_values)
  return df

def clean_up(repo_dir='/content/stinky'):
  '''
  Purpose: get_weather_master clones the whole repository, which holds
    more than the weather data. This function removes the folder and
    files listed below from the clone in order to save memory/space in
    the local colab session.
  Removal is best-effort: an entry that is already missing is skipped.
  @param repo_dir : a string, the root folder of the cloned repository,
    defaults to '/content/stinky'
  @return : None
  '''
  shutil.rmtree(os.path.join(repo_dir, 'vessels_data'), ignore_errors=True)

  unneeded_files = [
      '2021 SMRO VESSEL ARRIVALS.csv',
      'Geo_dataset.ipynb',
      'README.md',
      'call_notes.md',
      'oil_vessel_dataset.ipynb',
      'text_analysis_caroline.ipynb',
      'SCF ODOR REPORT 1-1-20 to 6-3-21.xlsx',
      'stinky_dataset.ipynb',
      '2020 SMRO VESSEL ARRIVALS.csv',
  ]

  for file_name in unneeded_files:
    try:
      os.remove(os.path.join(repo_dir, file_name))
    except FileNotFoundError:
      # the repo no longer ships this file; nothing to clean up
      pass


def get_weather_master():
  shutil.rmtree('/content/stinky', ignore_errors=True)
  !git clone https://github.com/ds5110/stinky

  path = 'stinky/weather_data_raw/'
  lst = os.listdir(path)

  df= pd.DataFrame()
  df_new= pd.DataFrame()

  for i in lst:
    url = path+i
    if i[-4:]==".csv":
      if df.empty==True :
        df = pd.read_csv(url)
        df["Pod Name"] = i[:5]
      else:
        df_new= pd.read_csv(url)
        df_new["Pod Name"]= i[:5]
        df= df.append(df_new)

  add_dates_times_columns(df)
  df = cardinal_direction(df)

  df = df.sort_values(by=['Pod Name','DateTime'])
  df.reset_index(drop=True, inplace=True)
  df.rename(columns={'DateTime':'Date & time (hour rounded)', 
                     'date_only':'date', 'time_only':'time (hour rounded)'}
            , inplace=True)
  clean_up()

  return df

In [None]:
#@title 
#PLEASE SEE FOLLOWING TEXT BLOCK FOR EXPLANATION OF THIS CODE CELL!!!
def column_process_weather_df(df_weather_master):
  '''
  Purpose : A helper function to weather_df_interval_groupby_for_plot.
      Works on a copy: drops every column whose values are all NA, then
      drops the 'date' and 'time (hour rounded)' columns.
  @param df_weather_master : dataframe, the weather master dataframe
      i.e. the return value of get_weather_master()
  @return : dataframe, a processed copy of the weather master
  '''
  without_empty_columns = df_weather_master.copy().dropna(axis=1, how='all')

  for unused_column in ('date', 'time (hour rounded)'):
    del without_empty_columns[unused_column]

  return without_empty_columns


def determine_direction_mode(df_column):
  '''
  Purpose: A helper function for aggregation in weather_df_process_for_plotting
      This is used to get mode of cardinal directions for any time interval
  @param df_column : a pandas series containing strings of cardinal directions
  @return : the single most frequent value, or the string 'None' when
      there is no single mode (a tie, or no non-missing values at all)
  '''
  df_mode = df_column.mode()

  if len(df_mode.index) != 1:
    return 'None'

  # hand back the value itself, not the one-element series holding it
  return df_mode.iloc[0]


def convert_val_to_dt(day_int, month_int, year_int, time = False, hour_int = 0):
    '''
    Purpose: To create a datetime object from date values as integers
    This is a helper function to get_weather_info_helper
    @param day_int : an integer representing the numerical day
    @param month_int : an integer representing the numerical month i.e. January is 1
    @param year_int : the year represented as a 4 digit integer i.e. 2020
    @param time : A boolean, set to False by default. Set true if looking at a particular hour
    @param hour_int : an integer representing the hour in military time (0-23) where 23 = 11:00 PM;
        only used when time is true
    @return : a datetime object for the given day, at midnight or at hour_int o'clock
    '''
    # `== False` (not `not time`) is intentional: it keeps the result the
    # same for values such as None, which do not compare equal to False
    hour_of_day = 0 if time == False else hour_int
    return datetime(year_int, month_int, day_int, hour_of_day, 0, 0, 0)


def weather_df_process_for_plotting(df : pd.DataFrame, key : str, interval = 1):
  '''
  Purpose: A helper function to weather_df_interval_groupby_for_plot
      This groups the dataframe, one pod at a time, by key - hourly,
      daily, weekly, or monthly
      * monthly actually does not measure in months but in chunks of 30 days
      which is almost a month
  @param df : a dataframe with 'Date & time (hour rounded)' (datetime),
      'Cardinal Direction' and 'Pod Name' columns
      i.e. the return value of column_process_weather_df()
  @param key : a string of the grouping requested, case-insensitive,
      must be 'hours', 'days', 'weeks', or 'months'
  @param interval : a positive integer, the number of key units per group
  @return : dataframe indexed by 'DateTime interval'. Per pod and
      interval, 'Cardinal Direction' is the mode, 'Pod Name' is kept,
      and every other column is averaged
  @raises ValueError : if interval is not a positive integer or key is
      not one of the four options
  '''
  # the type is checked first so that a non-numeric interval gives the
  # ValueError below rather than a TypeError from the comparison
  if not isinstance(interval, int) or interval <= 0:
    raise ValueError("Interval must be a positive non-zero integer")

  key = key.lower()

  # pandas frequency string for each option ('h' is the current alias
  # for hours)
  dict_options = {
      'hours': str(interval) + 'h',
      'days': str(interval) + 'D',
      'weeks': str(interval * 7) + 'D',
      'months': str(interval * 30) + 'D',
  }

  if key not in dict_options:
    raise ValueError("Key must be a string: hours, days, weeks, or months")

  interval_key = dict_options[key]

  df_copy = df.copy()

  # replace the timestamp column with the start of the interval it falls in
  floored = df_copy['Date & time (hour rounded)'].dt.floor(interval_key)
  if 'DateTime interval' in df_copy.columns:
    df_copy['DateTime interval'] = floored
  else:
    df_copy.insert(loc=1, column='DateTime interval', value=floored)
  df_copy.pop(item='Date & time (hour rounded)')

  # '0' is what cardinal_direction leaves for readings with no direction
  df_copy['Cardinal Direction'] = df_copy['Cardinal Direction'].replace('0', 'Not-measured')
  df_copy['Pod Name'] = df_copy['Pod Name'].astype(str)

  dict_agg = {}
  for column in df_copy.columns:
    if column == 'DateTime interval':
      continue
    elif column == 'Cardinal Direction':
      dict_agg[column] = determine_direction_mode
    elif column == 'Pod Name':
      # each group below holds a single pod, so any row carries its name
      dict_agg[column] = 'first'
    else:
      dict_agg[column] = 'mean'

  # aggregate every pod on its own so pods are never averaged together
  df_list = []
  for pod in df_copy['Pod Name'].unique():
    df_iter = df_copy[df_copy['Pod Name'] == pod]
    df_list.append(df_iter.groupby(by=['DateTime interval']).agg(dict_agg))

  return pd.concat(df_list)

def weather_df_interval_groupby_for_plot(df : pd.DataFrame, key : str, interval = 1)->pd.DataFrame:
  '''
  Purpose: A helper function to display_weather_EDA
      Cleans the master weather dataframe with column_process_weather_df
      and then groups it with weather_df_process_for_plotting.
      Keeps the pods separate though.
  @param df : a dataframe, this should be the master weather dataframe.
      i.e. the return value of get_weather_master()
  @param key : a string value, the way to aggregate the data.
      Must be 'hours','days','weeks', or 'months'.
  @param interval : an integer, the quantity for aggregation.
      i.e. if you want to aggregate by intervals of 3 days then key = 'days', interval = 3
  @return : dataframe, the grouped weather data
  '''
  return weather_df_process_for_plotting(
      column_process_weather_df(df), key, interval=interval)

def weather_EDA(df_weather_processed, plot_type : str):
  '''
  Purpose: A helper function to display_weather_EDA
      For each weather measurement it draws two graphs side by side:
      row number vs. measurement and 'DateTime interval' vs. measurement,
      both coloured by pod
  @param df_weather_processed : a dataframe, the processed weather dataframe
      i.e. it should be the return value of weather_df_interval_groupby_for_plot
  @param plot_type : a string representing the kind of plot to graph
      'line' and 'scatter' are the only valid plot_type values
  @return: void, will show the graphs
  @raises ValueError : if plot_type is not 'line' or 'scatter'
  '''
  if plot_type not in ('line', 'scatter'):
    raise ValueError("plot_type must be 'line' or 'scatter'")

  sns.set()
  sns.set(rc={'figure.figsize':(20,10)})

  # both plot types take the same arguments, so pick the function once
  draw = sns.lineplot if plot_type == 'line' else sns.scatterplot

  by_pod = df_weather_processed.sort_values(by=['Pod Name'])
  by_pod_renumbered = by_pod.reset_index(drop=True)
  row_number = by_pod_renumbered.index

  measurements = ('Temp Avg', 'Hum Avg', 'Baro Avg', 'Windspeed', 'Gust',
                  'Interval Precip', 'Dew Point', 'Heat Index', 'Wind Chill')

  for measurement in measurements:
    fig, (left_ax, right_ax) = plt.subplots(1, 2)
    draw(data=by_pod_renumbered, x=row_number, y=measurement, hue='Pod Name', ax=left_ax)
    draw(data=by_pod, x='DateTime interval', y=measurement, hue='Pod Name', ax=right_ax)
    plt.xticks(rotation=90)
    fig.show()

def display_weather_EDA(df_weather_master, plot_type : str, key : str,interval_qty = 1):
  '''
  Purpose: Groups the master weather data and calls the function that
    creates the weather/data visualization
  @param df_weather_master : the master weather dataframe
  @param plot_type : a string representing the type of plot to visualize -
    only 'scatter' and 'line' are valid options for now
    **In the future allow for different plot types
  @param key : a string - must be 'hours', 'days', 'weeks', 'months'
  @param interval_qty : an integer, a multiplier for the key interval chosen, defaults to 1
  @return : void, displays graph
  '''
  weather_EDA(
      weather_df_interval_groupby_for_plot(df_weather_master, key, interval_qty),
      plot_type)

In [None]:
#@title 
# All the code in this block(besides the github path access) come from 
# the smell group. It was just packaged into functions
def get_smc(directory_path = 'stinky/data/'):
  '''
  Purpose: A helper function to get_smell_df_master
      This imports the smell my city data csv (smc.csv)
      and adds datetime columns.
  @param directory_path : a string, the folder holding smc.csv,
      defaults to 'stinky/data/'
  @return : a dataframe of the smell my city data
  @raises FileNotFoundError : if smc.csv is not in directory_path
  '''
  # read the file directly so a missing file fails with a clear error
  df_smc = pd.read_csv(os.path.join(directory_path, 'smc.csv'))

  # Changing the date and time column to datetime format.
  # Only the first 20 characters are parsed - presumably this drops a
  # trailing time zone suffix; verify against the csv.
  df_smc['date & time'] = df_smc['date & time'].str[0:20]
  df_smc['date & time'] = pd.to_datetime(df_smc['date & time'])

  # Creating separate columns for date and time
  df_smc['date'] = df_smc['date & time'].dt.date
  df_smc['time'] = df_smc['date & time'].dt.time

  # Creating a date and hour column (a string, minutes and seconds zeroed)
  df_smc['Date & time (hour rounded)'] = df_smc['date & time'].dt.strftime("%Y-%m-%d %H:00:00")

  # renaming columns
  df_smc.rename(columns={'skewed latitude':'Latitude', 'skewed longitude':'Longitude'}, inplace=True)

  return df_smc

def get_scf(directory_path = 'stinky/data/'):
  '''
  Purpose: A helper function to get_smell_df_master
      This imports the see click fix data csv (scf.csv)
      and adds datetime columns.
  @param directory_path : a string, the folder holding scf.csv,
      defaults to 'stinky/data/'
  @return : a dataframe of the see click fix data
  @raises FileNotFoundError : if scf.csv is not in directory_path
  '''
  # read the file directly so a missing file fails with a clear error
  df_scf = pd.read_csv(os.path.join(directory_path, 'scf.csv'))

  # Changing the date and time columns to datetime format
  df_scf['Created at local'] = pd.to_datetime(df_scf['Created at local'])
  df_scf['Closed at local'] = pd.to_datetime(df_scf['Closed at local'])

  # Creating separate columns for date and time
  df_scf['date'] = df_scf['Created at local'].dt.date
  df_scf['time'] = df_scf['Created at local'].dt.time

  # Creating a date and hour column (using the 'Created at local' column);
  # this is a string with minutes and seconds zeroed
  df_scf['Date & time (hour rounded)'] = df_scf['Created at local'].dt.strftime("%Y-%m-%d %H:00:00")

  # renaming columns
  df_scf.rename(columns={'Description':'smell description', 'Lat':'Latitude', 'Lng':'Longitude'}, inplace=True)
  return df_scf

def merge_and_process_smell(smc, scf):
  '''
  Purpose: A helper function to get_smell_df_master
      It stacks two dataframes (scf rows first, then smc rows). These
      parameters should be the return values of get_smc() and get_scf().
  @param smc : a dataframe, the return value of get_smc()
  @param scf : a dataframe, the return value of get_scf()
  @return : a dataframe holding both inputs. Its first four columns are
      'Date & time (hour rounded)', 'date', 'time (hour rounded)' and
      'Complaint total' (1 for every row)
  '''
  # DataFrame.append was removed in pandas 2.0; concat stacks the rows
  df_stinky = pd.concat([scf, smc], sort=False)

  # take out the columns that are moved to the front
  date_hour_col = df_stinky.pop('Date & time (hour rounded)')
  date_only_col = df_stinky.pop('date')

  # get a rounded hour column
  time_hour_col = pd.to_datetime(date_hour_col).dt.time

  # change column positions
  df_stinky.insert(0, column='Date & time (hour rounded)', value=date_hour_col)
  df_stinky.insert(1, column='date', value=date_only_col)
  df_stinky.insert(2, column='time (hour rounded)', value=time_hour_col)
  # every row is one complaint, so totals can be summed later
  df_stinky.insert(loc=3, column='Complaint total', value=1)

  return df_stinky

def get_smell_df_master():
  '''
  Purpose: get the smell master dataframe
  This dataframe (df) will be used in many later functions.
  @return : a dataframe of merged smell my city data and see click fix data
  '''
  return merge_and_process_smell(get_smc(), get_scf())

In [None]:
#PLEASE SEE FOLLOWING TEXT BLOCK FOR EXPLANATION OF THIS CODE CELL!!!
#@title 
def date_comp_helper(list_datetime, key):
  '''
  Purpose: Helper function to find_min_max_dates
    Picks one end of the date range that a set of dataframes share.
  @param list_datetime : a list of datetime values - for key 'min' the
    earliest date of each dataframe, for key 'max' the latest date of each
  @param key : a string, either 'min' or 'max'
  @return : for 'min' the LATEST value of the list (the shared range
    starts where the last dataframe starts); for 'max' the EARLIEST value
    (the shared range ends where the first dataframe ends)
  @raises ValueError : if key is not 'min' or 'max'
  '''
  if key not in ('min', 'max'):
    raise ValueError("key must be 'min' or 'max'")

  # convert explicitly so all entries are compared as datetimes
  candidates = pd.to_datetime(pd.Series(list_datetime))

  if key == 'min':
    return candidates.max()
  return candidates.min()

def find_min_max_dates(list_df : list):
  '''
  Purpose: Helper function to trim_dfs_by_datetime.
  @param list_df : a list of dataframes.
    Each dataframe must have a 'Date & time (hour rounded)' column
    with a datatype of datetime
  @return : two datetime objects, the minimum and maximum dates
    that fit within the date ranges of all the dataframes
  '''
  min_list = []
  max_list = []

  # the dataframes are only read here, so no copies are needed
  for df in list_df:
    series = df['Date & time (hour rounded)']
    min_list.append(series.min())
    max_list.append(series.max())

  min_datetime = date_comp_helper(min_list, 'min')
  max_datetime = date_comp_helper(max_list, 'max')

  return min_datetime, max_datetime


def remove_smell_columns_unused(df_smell_master_param):
    '''
    Purpose: Helper function to trim_dfs_by_datetime
    It removes every column except for date & time and Complaint total -
    essentially keeping only the columns we are interested in looking at -
    and then sums the complaints that share the same date & time.
    @param df_smell_master_param : the smell master df (dataframe)
    @return : a df with only the 'Date & time (hour rounded)' and
    'Complaint total' columns, one row per date & time. This is a copy.
    This (and most functions) do not affect the parameter passed in.
    '''
    wanted = ('Date & time (hour rounded)', 'Complaint total')
    kept = [name for name in df_smell_master_param.columns if name in wanted]

    trimmed = df_smell_master_param.copy()[kept]
    totals = trimmed.groupby(by=['Date & time (hour rounded)']).sum()

    return totals.reset_index()

def flatten_weather_hourly(df_weather_master_param):
    '''
    Purpose: Groups the weather data by datetime hours.
    The weather data contains multiple measurements for the same datetime hour
    because multiple pods are measuring the data. This function will get
    all of the measurements from each pod for that particular hour and
    average them together so that the resulting df will only have 1 set of
    measurements per hour. 'Cardinal Direction' uses the mode instead of
    the average.
    Be aware this does not remove any outliers in the data!!!
    @param df_weather_master_param : the weather master df
    @return : a df with the weather values aggregated/grouped by hour
    '''
    # column_process_weather_df already works on a copy
    df_copy = column_process_weather_df(df_weather_master_param)
    df_copy.pop(item='Pod Name')

    # Floor the key so readings inside the same clock hour are grouped
    # together. This is a no-op when the timestamps are already on the hour.
    df_copy['Date & time (hour rounded)'] = df_copy['Date & time (hour rounded)'].dt.floor('h')

    dict_agg = {}
    for column in df_copy.columns:
        if column == 'Date & time (hour rounded)':
            continue
        elif column == 'Cardinal Direction':
            dict_agg[column] = determine_direction_mode
        else:
            dict_agg[column] = 'mean'

    df_agg = df_copy.groupby(by=['Date & time (hour rounded)']).agg(dict_agg)
    return df_agg.reset_index()


def trim_dfs_by_datetime(df_smell_master, df_weather_master_param):
    '''
    Purpose: This function will trim each dataframe passed so that
    they match each others time ranges. The smell data was being collected
    much earlier than the weather data. Therefore, we have no weather event values
    that go as far back. We want to analyze weather and complaints together and we cannot
    analyze them together completely because they do not have the same datetime ranges.
    So we trim the data such that the dataframes returned contain the values
    for the matching time ranges.
    The inputs are not modified.
    @param df_smell_master : the master smell df
    @param df_weather_master_param : the weather master df
    @return : two df's, (weather, smell), in that order. The weather df
    has been passed through flatten_weather_hourly and the smell df through
    remove_smell_columns_unused
    '''
    date_col = 'Date & time (hour rounded)'

    def keep_between(frame, start, end):
        # rows whose timestamp lies inside [start, end]
        return frame[(frame[date_col] >= start) & (frame[date_col] <= end)]

    df_smell = df_smell_master.copy()
    df_weather = df_weather_master_param.copy()

    # The smell timestamps arrive as strings. Convert them BEFORE the first
    # range lookup so they are compared with the weather timestamps as
    # datetimes and not as text.
    df_smell[date_col] = pd.to_datetime(df_smell[date_col])

    min_datetime, max_datetime = find_min_max_dates([df_smell, df_weather])
    df_smell = keep_between(df_smell, min_datetime, max_datetime)
    df_weather = keep_between(df_weather, min_datetime, max_datetime)

    # Need this second check do not remove: the shared range is worked out
    # again from the frames that were just trimmed
    min_datetime, max_datetime = find_min_max_dates([df_smell, df_weather])
    df_smell = keep_between(df_smell, min_datetime, max_datetime)
    df_weather = keep_between(df_weather, min_datetime, max_datetime)

    # weather rows from before the first remaining complaint are dropped
    df_weather = df_weather[df_weather[date_col] >= df_smell[date_col].min()]

    df_smell = remove_smell_columns_unused(df_smell).reset_index(drop=True)
    df_weather = flatten_weather_hourly(df_weather)

    return df_weather, df_smell

def merge_trimmed_data(df_weather, df_smell):
  '''
  Purpose: This function will merge two dataframes together in a full outer join
  on 'Date & time (hour rounded)'.
  Rows with a duplicate date & time will be dropped (the first is kept).
  The idea is to call this function on the return value of trim_dfs_by_datetime.
  @param df_weather : the first item returned by trim_dfs_by_datetime
  @param df_smell : the second item returned by trim_dfs_by_datetime
  @return : a df representing the merged and trimmed data of the weather
  master df and smell master df, with a fresh 0..n-1 index
  '''
  df_merged = pd.merge(df_weather, df_smell, how='outer', on='Date & time (hour rounded)')
  df_merged = df_merged.drop_duplicates(subset=['Date & time (hour rounded)'])

  # drop=True renumbers the rows without passing the old index through a
  # temporary 'index' column, so a real column of that name is left alone
  return df_merged.reset_index(drop=True)

def groupby_interval_merged_trimmed_df(df, key : str, interval = 1):
  '''
  Purpose: This is meant to be called on the return value of merge_trimmed_data
  It will aggregate/group the data by the interval you want to look at.. i.e.
  if key = days and interval = 3 it will aggregate/group the data in
  intervals of 3 days (it will perform the correct aggregation type per column
  so complaint totals are summed up while weather features like temperature
  become averaged and 'Cardinal Direction' becomes the mode)
  * months are not calendar months but chunks of 30 days
  @param df : This should be the return value of merge_trimmed_data; it must
    have a 'Date & time (hour rounded)' column with a datatype of datetime
  @param key : a string, case-insensitive, must be 'hours', 'days',
    'weeks', or 'months'
  @param interval : a positive integer, the number of key units per group
  @return : a dataframe with one row per interval; the interval start is in
    the first column, 'DateTime interval'
  @raises ValueError : if interval is not a positive integer or key is
    not one of the four options
  '''
  # the type is checked first so that a non-numeric interval gives the
  # ValueError below rather than a TypeError from the comparison
  if not isinstance(interval, int) or interval <= 0:
    raise ValueError("Interval must be a positive non-zero integer")

  key = key.lower()

  # pandas frequency string for each option ('h' is the current alias
  # for hours)
  dict_options = {
      'hours': str(interval) + 'h',
      'days': str(interval) + 'D',
      'weeks': str(interval * 7) + 'D',
      'months': str(interval * 30) + 'D',
  }

  if key not in dict_options:
    raise ValueError("Key must be a string: hours, days, weeks, or months")

  interval_key = dict_options[key]

  df_copy = df.copy()

  # replace the timestamp column with the start of the interval it falls in
  floored = df_copy['Date & time (hour rounded)'].dt.floor(interval_key)
  if 'DateTime interval' in df_copy.columns:
    df_copy['DateTime interval'] = floored
  else:
    df_copy.insert(loc=1, column='DateTime interval', value=floored)
  df_copy.pop(item='Date & time (hour rounded)')

  dict_agg = {}
  for column in df_copy.columns:
    if column == 'DateTime interval':
      continue
    elif column == 'Cardinal Direction':
      dict_agg[column] = determine_direction_mode
    elif column == 'Complaint total':
      dict_agg[column] = 'sum'
    else:
      dict_agg[column] = 'mean'

  df_agg = df_copy.groupby(by=['DateTime interval']).agg(dict_agg)
  return df_agg.reset_index()

def create_season_dict():
  '''
  Purpose: Builds the lookup from month name to season name.
    Every season covers three months, starting with December for Winter.
  @return : a dictionary whose keys are the twelve month names
    (i.e. "January") and whose values are 'Winter', 'Spring', 'Summer'
    or 'Fall'
  '''
  months_by_season = (
      ('Winter', ("December", "January", "February")),
      ('Spring', ("March", "April", "May")),
      ('Summer', ("June", "July", "August")),
      ('Fall', ("September", "October", "November")),
  )

  return {month: season
          for season, months in months_by_season
          for month in months}


  
'''
Purpose: To combine the master weather and master smell data set into dataframe
param @ df_weather_master : the master weather dataframe
param @  df_smell_master : the smell master dataframe
param @ key : the type of interval to use, must be 'hours', 'days', 'weeks', or 'months'
parman @interval : the qty of interval to use, defaults to 1
Return: A dataframe that combined the smell master df and the weather master df

For example get_merged_trimmed_df(df_weather_master, df_smell_master, key = 'hours', interval = 12)
will return a dataframe that groups the intervals into 12 hour blocks.

New Column Explanations: 
*** 'Hour Group' - can be used to group analysis or visualization by hour blocks
*** 'Day_of_week' - can be used to group analysis or visualizaiton by weekday, i.e. Monday, Teusday, ..etc.
*** 'Month_of_year' - can be used to group analysis by categorical month i.e. January, February etc.
*** 'Season' - can be used to group analysis by categorical season 'Winter', 'Spring', 'Summer', 'Fall'

Different keys will provide different extra columns:
*** key : Hours -  will add the columns 'Hour Group', 'Day_of_week', 'Month_of_year', and 'Season'
*** key : days -  will add the columns 'Day_of_week', 'Month_of_year', and 'Season'
*** key : month - will add the columns 'Month_of_year', and 'Season'

'''
def get_merged_trimmed_df(df_weather_master_param, df_smell_master, key, interval=1):
    '''
    Purpose: Trim both master dataframes, merge them, group the result by the
    requested interval, then add calendar helper columns
    @param df_weather_master_param : the master weather dataframe
    @param df_smell_master : the smell master dataframe
    @param key : the interval type; the branches below test for 'hours', 'days',
      'weeks', and 'months'
    @param interval : the qty of interval to use, defaults to 1
    Return: the grouped dataframe with 'Complaint total' at column position 1,
      followed by whichever helper columns apply:
      'hours' (any interval)            -> 'Hour Group', 'Day_of_week', 'Month_of_year', 'Season'
      'days' with interval 1            -> 'Day_of_week', 'Month_of_year', 'Season'
      'weeks' / 'months' with interval 1 -> 'Month_of_year', 'Season'
      anything else                     -> no helper columns
    '''
    # the trimming helper takes the smell data first and hands back weather first
    weather_part, smell_part = trim_dfs_by_datetime(df_smell_master, df_weather_master_param)
    grouped = groupby_interval_merged_trimmed_df(
        merge_trimmed_data(weather_part, smell_part), key, interval)

    grouped['DateTime interval'] = pd.to_datetime(grouped['DateTime interval'])
    month_to_season = create_season_dict()

    # decide which calendar columns this key / interval combination gets
    hourly = key == 'hours'
    if hourly or (key == 'days' and interval == 1):
        calendar_labels = ('Day_of_week', 'Month_of_year', 'Season')
    elif interval == 1 and key in ('weeks', 'months'):
        calendar_labels = ('Month_of_year', 'Season')
    else:
        calendar_labels = ()

    if calendar_labels:
        stamps = grouped['DateTime interval']
        month_names = stamps.apply(lambda stamp: stamp.strftime('%B'))
        calendar_values = {
            'Month_of_year': month_names,
            'Season': month_names.apply(lambda month: month_to_season[month]),
        }
        if 'Day_of_week' in calendar_labels:
            calendar_values['Day_of_week'] = stamps.dt.day_name()
        # columns go in directly after 'DateTime interval', in label order
        for position, label in enumerate(calendar_labels, start=1):
            grouped.insert(loc=position, column=label, value=calendar_values[label])

    if hourly:
        # inserted at position 1, so it lands ahead of the calendar columns
        grouped.insert(loc=1, column='Hour Group',
                       value=grouped['DateTime interval'].dt.time)

    # finally move the complaint count to sit right after the timestamp
    grouped.insert(loc=1, column='Complaint total',
                   value=grouped.pop('Complaint total'))

    return grouped

The code block below will produce a sneak peek of what the master weather dataframe file will look like. This dataframe contains all the measurements of
every pod. Every row represents the measurement for a particular hour. This dataframe is sorted in terms of time and then by pod name.

In [None]:
# Fetch the master weather dataframe from get_weather_master(); the bare
# name on the last line makes the notebook render it as this cell's output.
df_weather_master = get_weather_master()
df_weather_master

Cloning into 'stinky'...
remote: Enumerating objects: 324, done.[K
remote: Counting objects: 100% (324/324), done.[K
remote: Compressing objects: 100% (294/294), done.[K
remote: Total 324 (delta 155), reused 67 (delta 21), pack-reused 0[K
Receiving objects: 100% (324/324), 2.84 MiB | 6.95 MiB/s, done.
Resolving deltas: 100% (155/155), done.


Unnamed: 0,Date & time (hour rounded),date,time (hour rounded),Temp Avg,Temp Low,Temp High,Hum Avg,Hum Low,Hum High,Baro Avg,Baro Low,Baro High,Windspeed,Gust,Wind Direction,Cardinal Direction,Interval Precip,Leaf Wetness (Minutes),Solar Radiation Avg,Inside Temp Avg,Inside Temp Low,Inside Temp High,Heat Index,Wind Chill,Dew Point,UV Index,Station Voltage,Station Voltage 2,t1ia,t1il,t1ih,t2ia,t2il,t2ih,wetbulb,s2ia,smia,lvl1,lvl2,lvl3,Pod Name
0,2020-09-02 12:00:00,2020-09-02,12:00:00,75.00,75.0,75.0,60.0,60.0,60.0,29.97,29.97,29.98,0.00000,0.0,157.0,South,0.00,,,66.51,65.0,69.0,75.00,75.0,60.2,,6.530,,,,,,,,65.7,,,,,,SMRO3
1,2020-09-02 13:00:00,2020-09-02,13:00:00,75.16,75.0,75.2,59.0,58.0,60.0,29.96,29.95,29.98,0.00000,0.0,157.0,South,0.00,,,73.92,69.0,75.0,75.16,75.2,59.8,,6.504,,,,,,,,65.3,,,,,,SMRO3
2,2020-09-02 14:00:00,2020-09-02,14:00:00,75.41,75.2,75.5,58.0,58.0,58.0,29.94,29.93,29.95,0.00000,0.0,135.0,South-East,0.00,,,74.98,74.0,75.0,75.41,75.4,59.6,,6.500,,,,,,,,65.3,,,,,,SMRO3
3,2020-09-02 15:00:00,2020-09-02,15:00:00,75.32,75.2,75.4,59.0,58.0,59.0,29.91,29.90,29.93,0.00000,0.0,135.0,South-East,0.00,,,74.95,74.0,75.0,75.32,75.3,59.9,,6.500,,,,,,,,65.4,,,,,,SMRO3
4,2020-09-02 16:00:00,2020-09-02,16:00:00,75.47,75.2,75.7,59.0,59.0,59.0,29.89,29.88,29.90,0.00000,0.0,135.0,South-East,0.00,,,74.90,74.0,75.0,75.47,75.5,60.1,,6.500,,,,,,,,65.6,,,,,,SMRO3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23522,2021-07-23 20:00:00,2021-07-23,20:00:00,64.32,63.7,64.7,75.0,70.0,81.0,30.14,30.13,30.15,2.03667,7.2,111.0,East,0.01,,,82.77,82.0,83.0,64.32,64.3,56.3,,6.866,,,,,,,,59.4,,,,,,SMRO7
23523,2021-07-23 21:00:00,2021-07-23,21:00:00,63.51,62.7,64.0,81.0,80.0,83.0,30.13,30.12,30.15,3.15167,9.3,283.0,West,0.00,,,82.28,82.0,83.0,63.51,63.5,57.5,,6.766,,,,,,,,59.8,,,,,,SMRO7
23524,2021-07-23 22:00:00,2021-07-23,22:00:00,62.94,62.7,63.1,81.0,79.0,83.0,30.15,30.14,30.15,3.17167,9.5,315.0,North-West,0.00,,,82.00,82.0,82.0,62.94,62.9,57.1,,6.685,,,,,,,,59.4,,,,,,SMRO7
23525,2021-07-23 23:00:00,2021-07-23,23:00:00,62.57,62.0,63.0,80.0,79.0,80.0,30.14,30.14,30.15,4.81667,11.0,317.0,North,0.00,,,82.00,82.0,82.0,62.57,62.6,56.2,,6.603,,,,,,,,58.7,,,,,,SMRO7


Please run the code block below manually if you only want to download the df_weather_master file on its own. By default, running this entire notebook will download this file.

In [None]:
# Write the dataframe to 'df_weather_master.csv' (index=False leaves the row
# index out of the file), then hand that file to google.colab's files.download.
df_weather_master.to_csv('df_weather_master.csv', index=False) 
files.download('df_weather_master.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The code block below will produce a sneak peek of what the merged dataframe will look like. This dataframe represents an aggregation of data from the pods - essentially averaging out all the values for any matching particular time frame. It also contains the number of complaints made at any particular time frame. Each measurement is done by the hour, but if you want to get aggregations in different intervals, please see the "OPTIONAL" section below.

In [None]:
# Merge the weather master with the smell master using key "hours" and the
# default interval of 1; the bare name on the last line displays the result.
merged_data = get_merged_trimmed_df(df_weather_master,get_smell_df_master(),"hours")
merged_data

Unnamed: 0,DateTime interval,Complaint total,Hour Group,Day_of_week,Month_of_year,Season,Temp Avg,Temp Low,Temp High,Hum Avg,Hum Low,Hum High,Baro Avg,Baro Low,Baro High,Windspeed,Gust,Wind Direction,Cardinal Direction,Interval Precip,Inside Temp Avg,Inside Temp Low,Inside Temp High,Heat Index,Wind Chill,Dew Point,Station Voltage,wetbulb
0,2020-09-02 19:00:00,3.0,19:00:00,Wednesday,September,Fall,75.590,75.20,75.70,61.0,61.0,61.0,29.820,29.810,29.830,0.000000,0.00,135.0,South-East,0.0,75.000,75.0,75.0,75.590,75.60,61.20,6.5000,65.60
1,2020-09-02 20:00:00,1.0,20:00:00,Wednesday,September,Fall,75.130,75.00,75.40,61.0,61.0,61.0,29.800,29.800,29.810,0.000000,0.00,135.0,South-East,0.0,74.430,74.0,75.0,75.130,75.10,60.70,6.5000,65.80
2,2020-09-02 21:00:00,0.0,21:00:00,Wednesday,September,Fall,74.720,74.50,75.00,61.0,61.0,62.0,29.790,29.790,29.800,0.000000,0.00,135.0,South-East,0.0,74.000,74.0,74.0,74.720,74.70,60.50,6.5000,65.50
3,2020-09-02 22:00:00,1.0,22:00:00,Wednesday,September,Fall,74.330,74.20,74.50,62.0,62.0,62.0,29.780,29.760,29.790,0.000000,0.00,135.0,South-East,0.0,74.000,74.0,74.0,74.330,74.30,60.40,6.5000,65.40
4,2020-09-02 23:00:00,0.0,23:00:00,Wednesday,September,Fall,74.110,74.00,74.20,62.0,62.0,62.0,29.760,29.760,29.770,0.000000,0.00,135.0,South-East,0.0,74.000,74.0,74.0,74.110,74.10,60.20,6.4970,65.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7736,2021-07-23 04:00:00,0.0,04:00:00,Friday,July,Summer,60.778,60.32,61.20,87.8,87.2,89.0,30.082,30.082,30.086,1.319002,4.14,256.4,,0.0,68.524,68.2,69.0,60.778,60.78,57.10,6.4690,58.50
7737,2021-07-23 05:00:00,0.0,05:00:00,Friday,July,Summer,60.214,59.86,60.48,89.2,88.0,89.6,30.084,30.082,30.090,1.253668,4.04,274.6,West,0.0,68.102,67.8,68.6,60.214,60.20,57.00,6.4588,58.24
7738,2021-07-23 06:00:00,0.0,06:00:00,Friday,July,Summer,60.084,59.78,60.44,89.0,88.4,89.6,30.094,30.088,30.104,1.076666,3.76,266.4,,0.0,67.872,67.2,68.2,60.084,60.08,56.88,6.4502,58.14
7739,2021-07-23 07:00:00,0.0,07:00:00,Friday,July,Summer,60.732,59.92,61.84,87.6,85.6,89.0,30.108,30.104,30.114,0.862002,3.76,278.4,North,0.0,67.864,67.2,69.0,60.732,60.74,57.00,6.4898,58.46


Please run the code block below manually if you only want to download the merged_data file on its own. By default, running this entire notebook will download this file.

In [None]:
# Write the merged dataframe to 'merged_data.csv' (index=False leaves the row
# index out of the file), then hand that file to google.colab's files.download.
merged_data.to_csv('merged_data.csv', index=False) 
files.download('merged_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###OPTIONAL

By default the code block below will not download this optional file

If you would like to aggregate the data in different time intervals for the merged file, please see the instructions below.

1.) In the code block below please change optional = 0 to optional = 1

2.) Please change key = 'hours' to the appropriate key. Key choices include:
  'days', 'weeks', 'months'

3.) If you would like to aggregate in a specific quantity of intervals, please change interval = 1 to interval = quantity

4.) Then run the code block below. 

An example is shown in the text cell titled 'OPTIONAL EXAMPLE'

In [None]:
# User-editable settings. The three assignments below are the lines the
# notebook instructions ask the user to change.
# optional: nothing below runs unless this is set to 1
optional = 0
# key: interval type passed to get_merged_trimmed_df
key = 'hours' 
# interval: how many `key` units make up one group
interval = 1
if (optional == 1):
  # Build the custom aggregation, show it in the cell output, then save it
  # as a CSV (without the row index) and pass it to files.download.
  user_choice_merged = get_merged_trimmed_df(df_weather_master, get_smell_df_master(), key = key, interval=interval)
  display(user_choice_merged)
  user_choice_merged.to_csv('user_choice_merged.csv', index = False)
  files.download('user_choice_merged.csv')

Unnamed: 0,DateTime interval,Complaint total,Hour Group,Day_of_week,Month_of_year,Season,Temp Avg,Temp Low,Temp High,Hum Avg,Hum Low,Hum High,Baro Avg,Baro Low,Baro High,Windspeed,Gust,Wind Direction,Cardinal Direction,Interval Precip,Inside Temp Avg,Inside Temp Low,Inside Temp High,Heat Index,Wind Chill,Dew Point,Station Voltage,wetbulb
0,2020-09-02 19:00:00,3.0,19:00:00,Wednesday,September,Fall,75.590,75.20,75.70,61.0,61.0,61.0,29.820,29.810,29.830,0.000000,0.00,135.0,South-East,0.0,75.000,75.0,75.0,75.590,75.60,61.20,6.5000,65.60
1,2020-09-02 20:00:00,1.0,20:00:00,Wednesday,September,Fall,75.130,75.00,75.40,61.0,61.0,61.0,29.800,29.800,29.810,0.000000,0.00,135.0,South-East,0.0,74.430,74.0,75.0,75.130,75.10,60.70,6.5000,65.80
2,2020-09-02 21:00:00,0.0,21:00:00,Wednesday,September,Fall,74.720,74.50,75.00,61.0,61.0,62.0,29.790,29.790,29.800,0.000000,0.00,135.0,South-East,0.0,74.000,74.0,74.0,74.720,74.70,60.50,6.5000,65.50
3,2020-09-02 22:00:00,1.0,22:00:00,Wednesday,September,Fall,74.330,74.20,74.50,62.0,62.0,62.0,29.780,29.760,29.790,0.000000,0.00,135.0,South-East,0.0,74.000,74.0,74.0,74.330,74.30,60.40,6.5000,65.40
4,2020-09-02 23:00:00,0.0,23:00:00,Wednesday,September,Fall,74.110,74.00,74.20,62.0,62.0,62.0,29.760,29.760,29.770,0.000000,0.00,135.0,South-East,0.0,74.000,74.0,74.0,74.110,74.10,60.20,6.4970,65.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7736,2021-07-23 04:00:00,0.0,04:00:00,Friday,July,Summer,60.778,60.32,61.20,87.8,87.2,89.0,30.082,30.082,30.086,1.319002,4.14,256.4,,0.0,68.524,68.2,69.0,60.778,60.78,57.10,6.4690,58.50
7737,2021-07-23 05:00:00,0.0,05:00:00,Friday,July,Summer,60.214,59.86,60.48,89.2,88.0,89.6,30.084,30.082,30.090,1.253668,4.04,274.6,West,0.0,68.102,67.8,68.6,60.214,60.20,57.00,6.4588,58.24
7738,2021-07-23 06:00:00,0.0,06:00:00,Friday,July,Summer,60.084,59.78,60.44,89.0,88.4,89.6,30.094,30.088,30.104,1.076666,3.76,266.4,,0.0,67.872,67.2,68.2,60.084,60.08,56.88,6.4502,58.14
7739,2021-07-23 07:00:00,0.0,07:00:00,Friday,July,Summer,60.732,59.92,61.84,87.6,85.6,89.0,30.108,30.104,30.114,0.862002,3.76,278.4,North,0.0,67.864,67.2,69.0,60.732,60.74,57.00,6.4898,58.46


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### OPTIONAL EXAMPLE
***Please note this is a text block and NOT a code block, so this cell block can NOT be run.

If I wanted to download the merged file which aggregated the data in intervals of 3 days, then I would need to make the following changes to the code block above:

```python
optional = 1 #change optional to = 1
key = 'days'  # change key = 'hours' to key = 'days'
interval = 3 # change interval = 1 to interval = 3
if (optional == 1):
  user_choice_merged = get_merged_trimmed_df(df_weather_master, get_smell_df_master(), key = key, interval=interval)
  display(user_choice_merged)
  user_choice_merged.to_csv('user_choice_merged.csv', index = False)
  files.download('user_choice_merged.csv')
```