-
Notifications
You must be signed in to change notification settings - Fork 0
190 lines (188 loc) · 7.72 KB
/
backup.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Back up the blog's database into this repository as diffable JSON, then
# (when something changed) rebuild and redeploy the Datasette instance.
name: Back up my blog database
# Triggers: every push to main, manual dispatch (with an optional
# force-deploy flag), and a schedule of every two hours at :26 past.
on:
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      force_deploy:
        description: 'Deploy even if no changes detected'
        required: false
        type: boolean
  schedule:
    # Minute 26 of every second hour, UTC.
    - cron: '26 */2 * * *'
jobs:
  # Exports the Heroku Postgres database to a local SQLite file, redacts
  # password hashes, drops bulky search_document columns, dumps the result
  # to newline-delimited JSON with sqlite-diffable, snapshots the Substack
  # feed, and commits any resulting changes back to this repository.
  backup:
    runs-on: ubuntu-latest
    outputs:
      # "1" when the commit step pushed new data; empty otherwise, because
      # that step exits before writing the output when there is no change.
      change_detected: ${{ steps.commit_and_push.outputs.change_detected }}
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - uses: actions/cache@v3
        name: Configure pip caching
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Install Python dependencies
        run: |
          pip install -r requirements.txt
      - name: Import Heroku DB into SQLite
        env:
          # Used implicitly by the heroku CLI for authentication.
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
        run: |-
          # sed rewrites the postgres:// URL into the SQLAlchemy dialect
          # string that db-to-sqlite expects; only the listed tables are
          # copied into the local SQLite file.
          db-to-sqlite \
            $(heroku config:get DATABASE_URL -a simonwillisonblog | sed s/postgres:/postgresql+psycopg2:/) \
            simonwillisonblog.db \
            --table auth_permission \
            --table auth_user \
            --table blog_blogmark \
            --table blog_blogmark_tags \
            --table blog_entry \
            --table blog_entry_tags \
            --table blog_quotation \
            --table blog_quotation_tags \
            --table blog_tag \
            --table blog_series \
            --table django_content_type \
            --table redirects_redirect
          sqlite-utils tables simonwillisonblog.db --counts --columns
      - name: Redact passwords
        run: |-
          # Never publish password hashes, even salted ones.
          sqlite-utils simonwillisonblog.db "update auth_user set password = null"
      - name: Remove un-interesting search_document columns
        run: |-
          sqlite-utils transform simonwillisonblog.db blog_blogmark --drop search_document
          sqlite-utils transform simonwillisonblog.db blog_entry --drop search_document
          sqlite-utils transform simonwillisonblog.db blog_quotation --drop search_document
      - name: Convert to newline-delimited JSON
        run: |-
          # Clear the previous dump first so deleted rows/tables disappear;
          # "|| true" keeps this from failing when the directory is empty.
          rm simonwillisonblog/* || true
          sqlite-diffable dump simonwillisonblog.db simonwillisonblog --all
      - name: Backup Substack
        run: |-
          # Strip <lastBuildDate> so the snapshot only changes when the
          # actual feed content changes, not on every fetch.
          curl 'https://simon.datasette.site/substack' | \
            python -c "import sys, re; print(re.sub(r'<lastBuildDate>.*?</lastBuildDate>', '', sys.stdin.read(), flags=re.DOTALL))" \
            > simonw-substack-com.xml
      - name: Commit any changes
        id: commit_and_push
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add simonwillisonblog
          git add simonw-substack-com.xml
          timestamp=$(date -u)
          # git commit fails when there is nothing staged; in that case we
          # exit 0 here WITHOUT setting change_detected, so the deploy job's
          # condition sees an empty value.
          git commit -m "Latest data: ${timestamp}" || exit 0
          git push
          echo "change_detected=1" >> $GITHUB_OUTPUT
build_and_deploy:
runs-on: ubuntu-latest
needs: backup
if: >
${{ inputs.force_deploy || needs.backup.outputs.change_detected ||
contains(github.event.workflow_run.head_commit.modified, '.github/workflows/backup.yml') }}
steps:
- name: Check out repo
uses: actions/checkout@v3
with:
ref: main
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: "3.9"
- uses: actions/cache@v3
name: Configure pip caching
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install Python dependencies
run: |
pip install -r requirements.txt
- name: Restore previous database
continue-on-error: true
run: |-
wget -q https://datasette.simonwillison.net/simonwillisonblog.db
- name: Build database
run: |-
sqlite-diffable load simonwillisonblog.db simonwillisonblog --replace
sqlite-utils tables simonwillisonblog.db --counts
- name: Extract entities
env:
AWS_ACCESS_KEY_ID: ${{ secrets.COMPREHEND_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.COMPREHEND_SECRET_KEY }}
AWS_DEFAULT_REGION: us-west-2
run: |-
which sqlite-comprehend
sqlite-comprehend --version
sqlite-comprehend --help
sqlite-comprehend entities --help
sqlite-comprehend entities simonwillisonblog.db blog_entry title body --strip-tags
sqlite-utils tables simonwillisonblog.db --counts
- name: Import the OpenAI embeddings
run: |-
wget https://static.simonwillison.net/static/2023/blog-embeddings/blog.db
# Create table (if it does not exist yet)
sqlite-utils create-table simonwillisonblog.db blog_entry_embeddings \
id integer embedding blob --pk id --ignore
# Import/replace the embeddings
sqlite-utils simonwillisonblog.db --attach embeddings blog.db \
'replace into blog_entry_embeddings select cast(id as integer), embedding from embeddings.embeddings'
- name: Configure FTS
run: |-
set +e
sqlite-utils enable-fts simonwillisonblog.db blog_series title summary --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_tag tag --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_quotation quotation source --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_entry title body --create-triggers --tokenize porter 2>/dev/null
sqlite-utils enable-fts simonwillisonblog.db blog_blogmark link_title via_title commentary --create-triggers --tokenize porter 2>/dev/null
set -e
# Re-populate tables just in case they're missing anything
sqlite-utils populate-fts simonwillisonblog.db blog_series title summary
sqlite-utils populate-fts simonwillisonblog.db blog_tag tag
sqlite-utils populate-fts simonwillisonblog.db blog_quotation quotation source
sqlite-utils populate-fts simonwillisonblog.db blog_entry title body
sqlite-utils populate-fts simonwillisonblog.db blog_blogmark link_title via_title commentary
- name: Copy in latest TILs
run: |
wget https://s3.amazonaws.com/til.simonwillison.net/tils.db
echo "
attach database 'simonwillisonblog.db' as simonwillisonblog;
attach database 'tils.db' as tils;
drop table if exists simonwillisonblog.til;
create table simonwillisonblog.til as select * from tils.til;
" | sqlite3
- name: Set up Cloud Run
uses: google-github-actions/setup-gcloud@v0
with:
version: '318.0.0'
service_account_email: ${{ secrets.GCP_SA_EMAIL }}
service_account_key: ${{ secrets.GCP_SA_KEY }}
- name: Deploy to Cloud Run
run: |-
gcloud config set run/region us-central1
gcloud config set project datasette-222320
datasette publish cloudrun simonwillisonblog.db \
-m metadata.yml \
--branch 1.0a2 \
--extra-options "--setting sql_time_limit_ms 15000 --setting truncate_cells_html 10000" \
--service simonwillisonblog \
--install packaging \
--install datasette-block-robots \
--install datasette-graphql \
--install datasette-search-all \
--install "datasette-openai>=0.1a2" \
--install "datasette-cookies-for-magic-parameters>=0.1.2" \
--install datasette-json-html \
--install datasette-sqlite-regex \
--install "datasette-explain>=0.1a2" \
--install "datasette-faiss>=0.2.1" \
--install datasette-simple-html \
--install datasette-dateutil