-
Notifications
You must be signed in to change notification settings - Fork 0
190 lines (188 loc) · 7.72 KB
/
backup.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Back up the blog's database into this repository as diffable JSON, then
# (when something changed) rebuild and redeploy the Datasette instance.
name: Back up my blog database
# Triggers: every push to main, manual dispatch (with an optional
# force-deploy flag), and a schedule of every two hours at :26 past.
on:
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      force_deploy:
        description: 'Deploy even if no changes detected'
        required: false
        type: boolean
  schedule:
    # Minute 26 of every second hour, UTC.
    - cron: '26 */2 * * *'
jobs:
  # Exports the Heroku Postgres database to a local SQLite file, redacts
  # password hashes, drops bulky search_document columns, dumps the result
  # to newline-delimited JSON with sqlite-diffable, snapshots the Substack
  # feed, and commits any resulting changes back to this repository.
  backup:
    runs-on: ubuntu-latest
    outputs:
      # "1" when the commit step pushed new data; empty otherwise, because
      # that step exits before writing the output when there is no change.
      change_detected: ${{ steps.commit_and_push.outputs.change_detected }}
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - uses: actions/cache@v3
        name: Configure pip caching
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Install Python dependencies
        run: |
          pip install -r requirements.txt
      - name: Import Heroku DB into SQLite
        env:
          # Used implicitly by the heroku CLI for authentication.
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: |-
          # sed rewrites the postgres:// URL into the SQLAlchemy dialect
          # string that db-to-sqlite expects; only the listed tables are
          # copied into the local SQLite file.
          db-to-sqlite \
            $(heroku config:get DATABASE_URL -a simonwillisonblog | sed s/postgres:/postgresql+psycopg2:/) \
            simonwillisonblog.db \
            --table auth_permission \
            --table auth_user \
            --table blog_blogmark \
            --table blog_blogmark_tags \
            --table blog_entry \
            --table blog_entry_tags \
            --table blog_quotation \
            --table blog_quotation_tags \
            --table blog_tag \
            --table blog_series \
            --table django_content_type \
            --table redirects_redirect
          sqlite-utils tables simonwillisonblog.db --counts --columns
      - name: Redact passwords
        run: |-
          # Never publish password hashes, even salted ones.
          sqlite-utils simonwillisonblog.db "update auth_user set password = null"
      - name: Remove un-interesting search_document columns
        run: |-
          sqlite-utils transform simonwillisonblog.db blog_blogmark --drop search_document
          sqlite-utils transform simonwillisonblog.db blog_entry --drop search_document
          sqlite-utils transform simonwillisonblog.db blog_quotation --drop search_document
      - name: Convert to newline-delimited JSON
        run: |-
          # Clear the previous dump first so deleted rows/tables disappear;
          # "|| true" keeps this from failing when the directory is empty.
          rm simonwillisonblog/* || true
          sqlite-diffable dump simonwillisonblog.db simonwillisonblog --all
      - name: Backup Substack
        run: |-
          # Strip <lastBuildDate> so the snapshot only changes when the
          # actual feed content changes, not on every fetch.
          curl 'https://simon.datasette.site/substack' | \
            python -c "import sys, re; print(re.sub(r'<lastBuildDate>.*?</lastBuildDate>', '', sys.stdin.read(), flags=re.DOTALL))" \
            > simonw-substack-com.xml
      - name: Commit any changes
        id: commit_and_push
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add simonwillisonblog
          git add simonw-substack-com.xml
          timestamp=$(date -u)
          # git commit fails when there is nothing staged; in that case we
          # exit 0 here WITHOUT setting change_detected, so the deploy job's
          # condition sees an empty value.
          git commit -m "Latest data: ${timestamp}" || exit 0
          git push
          echo "change_detected=1" >> $GITHUB_OUTPUT
build_and_deploy:
runs-on: ubuntu-latest
needs: backup
if: >
${{ inputs.force_deploy || needs.backup.outputs.change_detected ||
contains(github.event.workflow_run.head_commit.modified, '.github/workflows/backup.yml') }}
steps:
- name: Check out repo
uses: actions/checkout@v3
with:
ref: main
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: "3.9"
- uses: actions/cache@v3
name: Configure pip caching
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install Python dependencies
run: |
pip install -r requirements.txt
- name: Restore previous database
continue-on-error: true
run: |-
wget -q https://datasette.simonwillison.net/simonwillisonblog.db
- name: Build database
run: |-
sqlite-diffable load simonwillisonblog.db simonwillisonblog --replace
sqlite-utils tables simonwillisonblog.db --counts
- name: Extract entities
env:
AWS_ACCESS_KEY_ID: ${{ secrets.COMPREHEND_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.COMPREHEND_SECRET_KEY }}
AWS_DEFAULT_REGION: us-west-2
run: |-
which sqlite-comprehend
sqlite-comprehend --version
sqlite-comprehend --help
sqlite-comprehend entities --help
sqlite-comprehend entities simonwillisonblog.db blog_entry title body --strip-tags
sqlite-utils tables simonwillisonblog.db --counts
- name: Import the OpenAI embeddings
run: |-
wget https://static.simonwillison.net/static/2023/blog-embeddings/blog.db
# Create table (if it does not exist yet)
sqlite-utils create-table simonwillisonblog.db blog_entry_embeddings \
id integer embedding blob --pk id --ignore
# Import/replace the embeddings
sqlite-utils simonwillisonblog.db --attach embeddings blog.db \
'replace into blog_entry_embeddings select cast(id as integer), embedding from embeddings.embeddings'
- name: Configure FTS
run: |-
set +e
sqlite-utils enable-fts simonwillisonblog.db blog_series title summary --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_tag tag --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_quotation quotation source --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_entry title body --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_blogmark link_title via_title commentary --create-triggers --tokenize porter 2>/dev/null
set -e
# Re-populate tables just in case they're missing anything
sqlite-utils populate-fts simonwillisonblog.db blog_series title summary
sqlite-utils populate-fts simonwillisonblog.db blog_tag tag
sqlite-utils populate-fts simonwillisonblog.db blog_quotation quotation source
sqlite-utils populate-fts simonwillisonblog.db blog_entry title body
sqlite-utils populate-fts simonwillisonblog.db blog_blogmark link_title via_title commentary
- name: Copy in latest TILs
run: |
wget https://s3.amazonaws.com/til.simonwillison.net/tils.db
echo "
attach database 'simonwillisonblog.db' as simonwillisonblog;
attach database 'tils.db' as tils;
drop table if exists simonwillisonblog.til;
create table simonwillisonblog.til as select * from tils.til;
" | sqlite3
- name: Set up Cloud Run
uses: google-github-actions/setup-gcloud@v0
with:
version: '318.0.0'
service_account_email: ${{ secrets.GCP_SA_EMAIL }}
service_account_key: ${{ secrets.GCP_SA_KEY }}
- name: Deploy to Cloud Run
run: |-
gcloud config set run/region us-central1
gcloud config set project datasette-222320
datasette publish cloudrun simonwillisonblog.db \
-m metadata.yml \
--branch 1.0a2 \
--extra-options "--setting sql_time_limit_ms 15000 --setting truncate_cells_html 10000" \
--service simonwillisonblog \
--install packaging \
--install datasette-block-robots \
--install datasette-graphql \
--install datasette-search-all \
--install "datasette-openai>=0.1a2" \
--install "datasette-cookies-for-magic-parameters>=0.1.2" \
--install datasette-json-html \
--install datasette-sqlite-regex \
--install "datasette-explain>=0.1a2" \
--install "datasette-faiss>=0.2.1" \
--install datasette-simple-html \
--install datasette-dateutil