<a href="https://colab.research.google.com/github/softplus/robots.txt-wav/blob/main/Embedding_robots_txt_in_a_wave.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Embed a robots.txt file in a WAV file for the lolz
# John Mueller / 2025-01-25
#
# See https://johnmu.com/robots.txt
# or  https://johnmu.com/2025-side-effects
#

# MIT License
#
# Copyright (c) 2024,2025 John Mueller
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Notes
# - robots.txt contents are hard-coded here; you could also download one
# - WAV file needs to be reasonably sized for browsers to accept it,
#   you may be able to make do by re-compressing (I use Audacity).
# - WAV file is downloaded from the web for this example. Use what you want.
# - The output is saved as 'file_out.wav' in the Colab. Rename & upload.
# - Follow the remaining notes in my blog post.
#
# Is this a good idea? Is anything online really? YMMV yadda yadda yadda.
#


In [None]:
# imports
from IPython.display import Audio, display
import struct, binascii

# Source WAV that gets read, and the WAV that is written with the extra chunk.
input_file, output_file = 'file_in.wav', 'file_out.wav'

# robots.txt payload -- adjust the contents as desired, but keep them ASCII
# (the text is encoded as ASCII below). Note that the literal starts with a
# newline, so the payload begins with a blank line.
robots_txt = """
# Your comment on the first line

sitemap: https://example.com/sitemap.xml

user-agent: Googlebot
user-agent: Bingbot
user-agent: DuckDuckBot
user-agent: slurp
allow: /

user-agent: semrushbot
user-agent: dotbot
user-agent: voltron
user-agent: ahrefsbot
disallow: /

user-agent: *
disallow: /

# A closing comment
"""

# Bytes to embed: the encoded text followed by a NUL (null-terminated string).
inject_bytes = robots_txt.encode('ascii') + b'\x00'


In [None]:
# Download the source WAV file (or upload one yourself under the name stored
# in `input_file` and skip this cell).
#
# A simple, license-free file: https://commons.wikimedia.org/wiki/File:BAK.wav
#
# `{input_file}` is expanded by IPython to the value of the Python variable,
# and wget's -O option writes the download to that file name.
!wget 'https://upload.wikimedia.org/wikipedia/commons/c/cc/BAK.wav' -O {input_file}


--2025-01-24 20:14:01--  https://upload.wikimedia.org/wikipedia/commons/c/cc/BAK.wav
Resolving upload.wikimedia.org (upload.wikimedia.org)... 208.80.153.240, 2620:0:860:ed1a::2:b
Connecting to upload.wikimedia.org (upload.wikimedia.org)|208.80.153.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1474248 (1.4M) [audio/x-wav]
Saving to: ‘file_in.wav’


2025-01-24 20:14:01 (8.29 MB/s) - ‘file_in.wav’ saved [1474248/1474248]



In [None]:
# Test the input file, then split it into its RIFF chunks.
#
# Names this cell leaves behind for the cell that writes the output:
#   main_id    - 4-byte container id, checked to be b'RIFF'
#   main_size  - the little-endian 32-bit size field from the RIFF header
#   wav_format - 4-byte form type, checked to be b'WAVE'
#   chunks     - list of [chunk_id, chunk_size, chunk_data], in file order
print('Test of input file:')
display(Audio(input_file, autoplay=False))

# process file
chunks = []
with open(input_file, 'rb') as wav_file:
  # main header
  main_id = wav_file.read(4)
  assert main_id == b'RIFF', 'Not RIFF file'
  size_field = wav_file.read(4)
  # a short read would otherwise surface as an opaque struct.error
  assert len(size_field) == 4, 'Truncated RIFF header'
  main_size = struct.unpack('<I', size_field)[0]

  # wave chunk
  wav_format = wav_file.read(4)
  # repr() is needed: wav_format is bytes, and str + bytes raises TypeError,
  # which would hide the real problem when this check fails
  assert wav_format == b'WAVE', 'Found ' + repr(wav_format) + ' instead of WAVE'

  # now do all chunks
  while True:
    chunk_id = wav_file.read(4)
    if not chunk_id: break  # clean end of file
    size_field = wav_file.read(4)
    # fail with a clear message instead of struct.error on a cut-off header
    assert len(chunk_id) == 4 and len(size_field) == 4, 'Truncated chunk header at end of file'
    chunk_size = struct.unpack('<I', size_field)[0]
    chunk_data = wav_file.read(chunk_size)
    # odd-sized chunks are followed by one pad byte; skip it
    if (chunk_size % 2 != 0): wav_file.read(1)
    chunks.append([chunk_id, chunk_size, chunk_data])

print('Found:')
for chunk in chunks:
  print('Chunk: ', chunk[0], ' - size: ', chunk[1])


Test of input file:


Found:
Chunk:  b'fmt '  - size:  16
Chunk:  b'iXML'  - size:  1582
Chunk:  b'PAD '  - size:  2454
Chunk:  b'data'  - size:  1470152


In [None]:
# Inject the robots.txt payload as a new first chunk and write the result to
# `output_file`. The chunk uses the 'BOTS' type (anything unused).
#
# NOTE: this cell updates `chunks` and `main_size` in place. Re-run the
# parsing cell before running this one again, otherwise another 'BOTS' chunk
# is inserted and the size is increased a second time.
new_chunk = [b'BOTS', len(inject_bytes), inject_bytes]
chunks.insert(0, new_chunk)
# adjust total length: 8-byte chunk header + payload + the pad byte that the
# write loop below emits after an odd-sized chunk
main_size += 8 + len(inject_bytes) + (len(inject_bytes) % 2)

# save to file; the with-block closes (and therefore flushes) the file before
# it is read back for playback below
with open(output_file, 'wb') as fp:
  fp.write(main_id)
  fp.write(struct.pack('<I', main_size))
  fp.write(wav_format)
  for chunk in chunks:
    print('writing: ', chunk[0], ' - length: ', chunk[1])
    fp.write(chunk[0])
    fp.write(struct.pack('<I', chunk[1]))
    fp.write(chunk[2])
    if (chunk[1] % 2): fp.write(b'\x00')  # pad odd-sized chunks to even length

# done - You should still be able to play the audio file.
print("Test output:")
display(Audio(output_file, autoplay=False))

# Download "file_out.wav" from the left sidebar
# Check it in a text editor, if you're curious

writing:  b'BOTS'  - length:  316
writing:  b'fmt '  - length:  16
writing:  b'iXML'  - length:  1582
writing:  b'PAD '  - length:  2454
writing:  b'data'  - length:  1470152
Test output:
