Skip to content

Commit

Permalink
Merge pull request #846 from readthedocs/migrate-embedding
Browse files: browse the repository at this point in the history
Remove embedding code from public repo
  • Loading branch information
ericholscher committed Mar 21, 2024
2 parents 7c0e01e + b151f36 commit 99ac03d
Show file tree
Hide file tree
Showing 18 changed files with 249 additions and 198 deletions.
4 changes: 4 additions & 0 deletions .envs/local/django.sample
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ IPYTHONDIR=/app/.ipython
MEDIA_URL=http://localhost:5000/media/
#ADSERVER_ETHICALADS_BRANDING=True

# Bash (Docker)
# HISTFILE=/app/.bash_history

# Redis
# ------------------------------------------------------------------------------
REDIS_URL=redis://redis:6379/0
Expand Down Expand Up @@ -41,4 +44,5 @@ COLUMNS=80
# Analyzer
# ------------------------------------------------------------------------------
# See ``adserver.analyzer.backends`` for available backends
# ADSERVER_ANALYZER_BACKEND="adserver.analyzer.backends.TextacyAnalyzerBackend,adserver.analyzer.backends.SentenceTransformerAnalyzerBackend"
# ADSERVER_ANALYZER_BACKEND=
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ sftp-config.json
# Basics
*.py[cod]
__pycache__
.bash_history

# Logs
*.log
Expand Down Expand Up @@ -65,6 +66,9 @@ celerybeat-schedule
celerybeat-schedule.db
celerybeat.pid

# VSCode
.vscode


##########################################################################
# Ad Server specific ignores
Expand Down
21 changes: 21 additions & 0 deletions adserver/analyzer/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from django.contrib import admin
from simple_history.admin import SimpleHistoryAdmin

from .models import AnalyzedAdvertiserUrl
from .models import AnalyzedUrl


Expand All @@ -24,3 +25,23 @@ class AnalyzedUrlAdmin(SimpleHistoryAdmin):
search_fields = ("url", "keywords")

# Note: may need to use the estimated count paginator if this gets large


@admin.register(AnalyzedAdvertiserUrl)
class AnalyzedAdvertiserUrlAdmin(SimpleHistoryAdmin):

    """Admin changelist and edit-form setup for ``AnalyzedAdvertiserUrl`` rows."""

    # NOTE(review): this admin extends SimpleHistoryAdmin; the model definition
    # is not visible here -- confirm the model actually tracks history.

    # Columns shown on the changelist, in display order.
    list_display = ("url", "advertiser", "keywords", "last_analyzed_date")
    list_filter = ("last_analyzed_date", "advertiser")
    list_per_page = 500

    # ``advertiser`` is a relation: fetch it together with the changelist rows
    # and edit it through a raw ID input rather than a full select box.
    list_select_related = ("advertiser",)
    raw_id_fields = ("advertiser",)

    search_fields = ("url", "keywords")

    # Note: may need to use the estimated count paginator if this gets large
1 change: 0 additions & 1 deletion adserver/analyzer/backends/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Backends for analyzing URLs for keywords and topics."""
from .eatopics import EthicalAdsTopicsBackend # noqa
from .naive import NaiveKeywordAnalyzerBackend # noqa
from .st import SentenceTransformerAnalyzerBackend # noqa
from .textacynlp import TextacyAnalyzerBackend # noqa
52 changes: 0 additions & 52 deletions adserver/analyzer/backends/st.py

This file was deleted.

1 change: 1 addition & 0 deletions adserver/analyzer/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
"utm_label",
"utm_keyword",
"utm_content",
"utm_term",
"ref",
)
9 changes: 0 additions & 9 deletions adserver/analyzer/management/commands/runmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def handle_url(self, url):
self.stdout.write(_("Running against %s") % url)

keywords = []
embeddings = []
for backend in get_url_analyzer_backends():

backend_instance = backend(url)
Expand All @@ -48,16 +47,8 @@ def handle_url(self, url):
self.stdout.write(
_("Keywords from '%s': %s") % (backend.__name__, analyzed_keywords)
)
analyzed_embedding = backend_instance.embedding(response)
self.stdout.write(
_("Embeddings from '%s': %s") % (backend.__name__, analyzed_embedding)
)

if analyzed_keywords:
keywords.extend(analyzed_keywords)

if analyzed_embedding:
embeddings.extend(analyzed_embedding)

self.stdout.write(_("Keywords/topics: %s") % keywords)
self.stdout.write(_("Embeddings: %s") % embeddings)
128 changes: 128 additions & 0 deletions adserver/analyzer/migrations/0005_add_analyzedad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Generated by Django 4.2.11 on 2024-03-20 20:25
import django.db.models.deletion
import django_extensions.db.fields
import jsonfield.fields
import pgvector.django
from django.db import migrations
from django.db import models

import adserver.analyzer.validators


class Migration(migrations.Migration):

    """
    Add the ``AnalyzedAdvertiserUrl`` model and extend ``AnalyzedUrl``.

    Operations, in order:

    * create ``AnalyzedAdvertiserUrl`` (one row per URL/advertiser pair)
    * add nullable ``description`` and ``title`` text fields to ``AnalyzedUrl``
    * delete the ``HistoricalAnalyzedUrl`` model
    """

    # The ``adserver`` dependency is required because the new model has a
    # foreign key to ``adserver.advertiser``; the second entry is the previous
    # migration of this app.
    dependencies = [
        ("adserver", "0093_publisher_ignore_mobile_traffic"),
        ("adserver_analyzer", "0004_add_embeddings"),
    ]

    operations = [
        migrations.CreateModel(
            name="AnalyzedAdvertiserUrl",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Timestamps maintained automatically on insert / on every save.
                (
                    "created",
                    django_extensions.db.fields.CreationDateTimeField(
                        auto_now_add=True, verbose_name="created"
                    ),
                ),
                (
                    "modified",
                    django_extensions.db.fields.ModificationDateTimeField(
                        auto_now=True, verbose_name="modified"
                    ),
                ),
                # Indexed; unique only in combination with ``advertiser``
                # (see ``unique_together`` below).
                (
                    "url",
                    models.URLField(
                        db_index=True,
                        help_text="URL of the page being analyzed after certain query parameters are stripped away",
                        max_length=1024,
                    ),
                ),
                # Optional JSON payload, checked by ``KeywordsValidator``.
                (
                    "keywords",
                    jsonfield.fields.JSONField(
                        blank=True,
                        null=True,
                        validators=[adserver.analyzer.validators.KeywordsValidator()],
                        verbose_name="Keywords for this URL",
                    ),
                ),
                # NULL until set; indexed.
                (
                    "last_analyzed_date",
                    models.DateTimeField(
                        blank=True,
                        db_index=True,
                        default=None,
                        help_text="Last time the ad server analyzed this URL",
                        null=True,
                    ),
                ),
                (
                    "title",
                    models.TextField(
                        blank=True,
                        default=None,
                        null=True,
                        verbose_name="Title of the page",
                    ),
                ),
                (
                    "description",
                    models.TextField(
                        blank=True,
                        default=None,
                        null=True,
                        verbose_name="Description of the page",
                    ),
                ),
                # Optional pgvector column holding a 384-dimensional vector.
                (
                    "embedding",
                    pgvector.django.VectorField(
                        blank=True, default=None, dimensions=384, null=True
                    ),
                ),
                # Deleting an advertiser cascades to its analyzed URLs.
                (
                    "advertiser",
                    models.ForeignKey(
                        help_text="Advertiser with the URL",
                        on_delete=django.db.models.deletion.CASCADE,
                        to="adserver.advertiser",
                    ),
                ),
            ],
            options={
                # At most one row per (url, advertiser) pair.
                "unique_together": {("url", "advertiser")},
            },
        ),
        # Same ``description``/``title`` definitions as on the new model above,
        # added here to the existing ``AnalyzedUrl`` model.
        migrations.AddField(
            model_name="analyzedurl",
            name="description",
            field=models.TextField(
                blank=True,
                default=None,
                null=True,
                verbose_name="Description of the page",
            ),
        ),
        migrations.AddField(
            model_name="analyzedurl",
            name="title",
            field=models.TextField(
                blank=True, default=None, null=True, verbose_name="Title of the page"
            ),
        ),
        # NOTE(review): presumably the django-simple-history shadow model for
        # ``AnalyzedUrl`` -- confirm nothing still relies on it before applying.
        migrations.DeleteModel(
            name="HistoricalAnalyzedUrl",
        ),
    ]

0 comments on commit 99ac03d

Please sign in to comment.