Merge f128c0b into d1e260a

predicthq · Oct 4, 2020 · a647f9a · a647f9a
2 parents d1e260a + f128c0b
commit a647f9a
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 0 deletions.
diff --git a/examples/aei_from_events/README.md b/examples/aei_from_events/README.md
@@ -0,0 +1,15 @@
+# Aggregate Event Impact (AEI) from Events
+
+Event data queried from our events endpoint comes in the following form: one row corresponds to a single event. However, in the context of building forecasting models, the most natural way to incorporate information about events is to (first) include data about estimated attendance.
+
+This set of functions simplifies the transformation of record-level event data into features aggregated by event category.
+
+## Calculating Impact vs. Summing PHQ Attendance
+
+[`phq_attendance`](https://support.predicthq.com/what-is-phq-attendance) is the _overall_ estimated attendance for an event. An event with a `phq_attendance` of 10,000 shows that same value whether it takes place over the course of 1 day or 1 month. To properly model this information on a daily, calendarized basis, we must split the overall attendance figure into equal chunks across the days between the event's start and end dates.
+
+## AEI by Event Category and Log Transforms
+
+Determining the daily `impact` (the proportionate, summed `phq_attendance` across all events occurring on a specific day) allows a data scientist to feed a machine learning model a value representative of the estimated number of people attending (or affected by) events that day.
+
+Given the (typically) exponential distribution observed in predicted attendance, we recommend log-transforming each of the computed `impact` figures. While this typically muddies the interpretation of the value being provided to a forecasting algorithm, the transformation tends to improve forecasting performance, especially for linear regressors.
diff --git a/examples/aei_from_events/aei_from_events.py b/examples/aei_from_events/aei_from_events.py
@@ -0,0 +1,51 @@
+import pandas as pd
+
+# Every event category known to this module; exactly the union of the
+# three groupings defined below.
+all_cat = [
+    'festivals', 'school-holidays', 'airport-delays', 'sports',
+    'performing-arts', 'concerts', 'public-holidays', 'community',
+    'observances', 'severe-weather', 'expos', 'conferences',
+    'health-warnings', 'academic', 'disasters', 'politics', 'terror',
+]
+
+# Grouping not referenced by the functions shown in this file.
+nonattended = [
+    'public-holidays', 'observances', 'school-holidays', 'academic',
+]
+
+# Default grouping used by attended_aei().
+attended = [
+    'festivals', 'sports', 'performing-arts', 'concerts',
+    'community', 'expos', 'conferences',
+]
+
+# Grouping not referenced by the functions shown in this file.
+unscheduled = [
+    'airport-delays', 'health-warnings', 'disasters',
+    'politics', 'terror', 'severe-weather',
+]
+
+from datetime import datetime, timedelta
+
+def gen_cal(st, ed):
+    """Return every calendar date from ``st`` through ``ed``, inclusive.
+
+    Args:
+        st: Start timestamp string such as ``'2020-01-01T19:00:00Z'``. The
+            ``T`` separator and ``Z`` suffix are optional.
+        ed: End timestamp string in the same format.
+
+    Returns:
+        A list of ``'YYYY-MM-DD'`` strings, one per calendar day, in
+        ascending order. Empty when ``ed`` is earlier than ``st``.
+
+    Raises:
+        ValueError: If either string does not match
+            ``%Y-%m-%d %H:%M:%S`` once the ``T``/``Z`` markers are removed.
+    """
+    def _parse(stamp):
+        # Strip the ISO-8601 'T'/'Z' markers so one strptime format fits.
+        return datetime.strptime(stamp.replace('T', ' ').replace('Z', ''),
+                                 '%Y-%m-%d %H:%M:%S')
+
+    start_dt, end_dt = _parse(st), _parse(ed)
+    if end_dt < start_dt:
+        return []
+
+    # Count calendar dates, not elapsed 24-hour periods: using
+    # (end_dt - start_dt).days would drop the final date whenever the end
+    # time-of-day is earlier than the start time-of-day.
+    first_day = start_dt.date()
+    n_days = (end_dt.date() - first_day).days + 1
+    return [(first_day + timedelta(days=offset)).isoformat()
+            for offset in range(n_days)]
+
+def gen_aei_from_events(cal, events_pd):
+    """Build one single-row impact frame per calendar day.
+
+    Args:
+        cal: Iterable of date strings (e.g. the output of ``gen_cal``).
+        events_pd: DataFrame with ``start``, ``end``, ``category`` and
+            ``impact`` columns.
+
+    Returns:
+        A list of DataFrames, one per entry of ``cal``. Each holds a
+        ``date`` column plus one column per category selected for that
+        day, containing the summed ``impact``.
+    """
+    daily_frames = []
+    for day in cal:
+        # An event is selected for a day when its start/end values
+        # bracket that day's midnight stamp.
+        midnight = day + 'T00:00:00Z'
+        started = events_pd['start'] <= midnight
+        not_ended = events_pd['end'] >= midnight
+        active = events_pd[started & not_ended]
+
+        # Sum impact per category, then turn categories into columns.
+        per_category = active.groupby('category')['impact'].sum().reset_index()
+        wide = pd.pivot_table(per_category, columns='category',
+                              values='impact', fill_value=0)
+
+        date_col = pd.DataFrame([day], columns=['date'])
+        daily_frames.append(
+            pd.concat([date_col, wide.reset_index(drop=True)], axis=1))
+    return daily_frames
+
+def attended_aei(df, event_groupings = attended):
+    """Compute daily aggregate event impact for the selected categories.
+
+    Args:
+        df: Event-level DataFrame with ``category``, ``duration``,
+            ``phq_attendance``, ``start`` and ``end`` columns. ``start``
+            and ``end`` must be timestamp strings that ``gen_cal`` accepts.
+        event_groupings: Categories to keep; defaults to ``attended``.
+
+    Returns:
+        A DataFrame with one row per calendar day between the earliest
+        ``start`` and the latest ``end``: a ``date`` column plus one
+        column per category, with missing values filled with 0. When no
+        event matches ``event_groupings``, an empty frame with only a
+        ``date`` column is returned.
+    """
+    # Work on an explicit copy: adding columns to a filtered slice raises
+    # SettingWithCopyWarning, and the caller's frame must stay untouched.
+    att_df = df[df.category.isin(event_groupings)].copy()
+    if att_df.empty:
+        # Nothing to aggregate, and no start/end from which to build a
+        # calendar (min()/max() would be NaN and break gen_cal).
+        return pd.DataFrame(columns=['date'])
+
+    # Event length in whole days, never less than one.
+    # NOTE(review): assumes `duration` is expressed in seconds - confirm.
+    att_df['days_duration'] = (att_df.duration / 86400).round(0).clip(lower=1)
+    # Spread the total attendance evenly over the event's days.
+    att_df['impact'] = (att_df.phq_attendance / att_df.days_duration).round(0)
+
+    start_cal, end_cal = att_df.start.min(), att_df.end.max()
+    df_cal = gen_cal(start_cal, end_cal)
+
+    aei_pd = pd.concat(gen_aei_from_events(df_cal, att_df), axis = 0).fillna(0)
+    return aei_pd
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ pytz==2017.2
 requests>=2.7.0,<3.0
 responses>=0.10.8,<1.0
 schematics==2.0.0.dev2
+pandas>=0.20.0