Skip to content

Commit

Permalink
Merge pull request #750 from readthedocs/davidfischer/remove-old-aggregation-data
Browse files Browse the repository at this point in the history

Keep only 1 year of geo/region/keyword/placement data
  • Loading branch information
davidfischer committed Jun 12, 2023
2 parents 5589c80 + 3487e26 commit dc990fa
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 5 deletions.
28 changes: 28 additions & 0 deletions adserver/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,34 @@ def update_previous_day_reports(day=None):
)


@app.task()
def remove_old_report_data(days=366):
    """
    Purge aged rows from a fixed set of aggregated report tables.

    Rows whose ``date`` is strictly earlier than ``get_ad_day()`` minus
    ``days`` days are deleted from each of these models:

    - ``GeoImpression`` (geo breakdown data)
    - ``PlacementImpression`` (placement data)
    - ``KeywordImpression`` (keyword data)
    - ``UpliftImpression`` (uplift data)
    - ``RegionTopicImpression`` (regiontopic data)

    :param days: size of the retention window in days; rows dated on or
        after the resulting cutoff are kept.
    """
    current_day = get_ad_day()
    old_cutoff = current_day - datetime.timedelta(days=days)

    # Models are processed one at a time, in this order, each with its own
    # log line so the progress of the cleanup can be followed in the logs.
    for report_model in (
        GeoImpression,
        PlacementImpression,
        KeywordImpression,
        UpliftImpression,
        RegionTopicImpression,
    ):
        log.info("Deleting old %s before %s", report_model.__name__, old_cutoff)
        expired_rows = report_model.objects.filter(date__lt=old_cutoff)
        expired_rows.delete()


@app.task()
def remove_old_client_ids(days=90):
"""Remove old Client IDs which are used for short periods for fraud prevention."""
Expand Down
2 changes: 1 addition & 1 deletion adserver/templates/adserver/reports/advertiser-geo.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<section class="mb-5">
<h3>{% trans 'About this report' %}</h3>
<p>{% trans 'This report shows all your traffic that had country data we could determine from the IP address.' %}</p>
<em>{% trans 'All data for previous days is complete. This report can take up to a minute to load.' %}</em>
<em>{% trans 'All data for previous days is complete and 1 year of data is retained. This report can take up to a minute to load.' %}</em>
</section>
{% endblock explainer %}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<section class="mb-5">
<h3>{% trans 'About this report' %}</h3>
<p>{% trans 'This report shows all your traffic that had keyword and topic data.' %}</p>
<em>{% trans 'This report shows the <strong>top 20 keywords</strong> targeted by your flights and updates daily. All previous days data is complete.' %}</em>
<em>{% trans 'This report shows the <strong>top 20 keywords</strong> targeted by your flights and updates daily. All previous days data is complete and 1 year of data is retained.' %}</em>
</section>
{% endblock explainer %}

Expand Down
2 changes: 1 addition & 1 deletion adserver/templates/adserver/reports/publisher-geo.html
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ <h3>{% trans 'About this report' %}</h3>
so it will let you understand a bit more about how your traffic is monetizing.
{% endblocktrans %}
</p>
<em>{% trans 'All data for previous days is complete. This report can take up to a minute to load.' %}</em>
<em>{% trans 'All data for previous days is complete and 1 year of data is retained. This report can take up to a minute to load.' %}</em>
</section>
{% endblock explainer %}

Expand Down
2 changes: 1 addition & 1 deletion adserver/templates/adserver/reports/publisher-keyword.html
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ <h3>{% trans 'About this report' %}</h3>
This report may report more than 100% of your ad views, because some ads match multiple keywords for a publisher.
</p>
<p>
<em>{% trans 'This report updates daily. All previous days data is complete. This report can take some time to load for large publishers.' %}</em>
<em>{% trans 'This report updates daily. All previous days data is complete and 1 year of data is retained. This report can take some time to load for large publishers.' %}</em>
</p>
</section>
{% endblock explainer %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ <h3>About this report</h3>
</p>
<p>
<em>
This report shows the <strong>top {{ limit }} placements</strong> and updates periodically. All previous days data is complete.
This report shows the <strong>top {{ limit }} placements</strong> and updates periodically. All previous days data is complete and 1 year of data is retained.
</em>
</p>
</section>
Expand Down
58 changes: 58 additions & 0 deletions adserver/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from ..tasks import notify_of_completed_flights
from ..tasks import notify_of_publisher_changes
from ..tasks import remove_old_client_ids
from ..tasks import remove_old_report_data
from ..tasks import update_previous_day_reports
from .common import BaseAdModelsTestCase


Expand Down Expand Up @@ -626,3 +628,59 @@ def test_daily_update_placements(self):
self.assertEqual(pi2_ad2.offers, 2)
self.assertEqual(pi2_ad2.views, 2)
self.assertEqual(pi2_ad2.clicks, 0)

def test_remove_old_report_data(self):
    """Old aggregated report rows are removed while current rows survive."""
    # Lookup shared by every RegionTopicImpression query in this test;
    # only the date differs between the queries.
    region_topic_lookup = {
        "region": "us-ca",
        "topic": "backend-web",
        "advertisement": self.ad1,
    }

    # Create an offer dated outside the default retention window
    # (370 days ago versus the task's default of 366 days).
    old_date = timezone.now() - datetime.timedelta(days=370)
    get(
        Offer,
        advertisement=self.ad1,
        publisher=self.publisher,
        country="CA",
        viewed=True,
        view_time=6,
        clicked=True,
        keywords=["backend"],
        div_id="id_1",
        ad_type_slug=self.text_ad_type.slug,
        date=old_date,
    )

    # Aggregate both the current day and the very old day
    update_previous_day_reports(timezone.now())
    update_previous_day_reports(old_date)

    # The old offer must have produced exactly one aggregated row
    old_day = old_date.date()
    impression_old = RegionTopicImpression.objects.filter(
        date=old_day, **region_topic_lookup
    ).first()
    self.assertIsNotNone(impression_old)
    self.assertEqual(impression_old.offers, 1)
    self.assertEqual(impression_old.views, 1)
    self.assertEqual(impression_old.clicks, 1)

    # Run the cleanup with its default window; the old row must be gone
    remove_old_report_data()
    old_rows = RegionTopicImpression.objects.filter(
        date=old_date.date(), **region_topic_lookup
    )
    self.assertFalse(old_rows.exists())

    # The current day's aggregation must be untouched.
    # NOTE(review): the expected counts below presumably come from offers
    # created in the shared test fixture, which is not visible here.
    impression_new = RegionTopicImpression.objects.filter(
        date=timezone.now(), **region_topic_lookup
    ).first()
    self.assertIsNotNone(impression_new)
    self.assertEqual(impression_new.offers, 3)
    self.assertEqual(impression_new.views, 2)
    self.assertEqual(impression_new.clicks, 1)
5 changes: 5 additions & 0 deletions config/settings/production.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@
# Runs on Tuesday
"schedule": crontab(day_of_week=2, hour="6", minute="0"),
},
"every-week-remove-old-report-data": {
"task": "adserver.tasks.remove_old_report_data",
# Runs on Wednesday
"schedule": crontab(day_of_week=3, hour="5", minute="10"),
},
# Very fast indexes that can be run more frequently
"halfhourly-advertiser-index": {
"task": "adserver.tasks.daily_update_advertisers",
Expand Down

0 comments on commit dc990fa

Please sign in to comment.