In [3]:
import os
import shutil
import re

# ==========================================
# 1. SETUP & DOWNLOAD
# ==========================================
repo_url = "https://github.com/ras0k/whisper-rap-queb.git"
repo_name = "whisper-rap-queb"
target_subfolder = "top1000/genius_scrape"
full_path = os.path.join("/content", repo_name, target_subfolder)

# Clean up previous runs to ensure fresh download
if os.path.exists(repo_name):
    shutil.rmtree(repo_name)

print("Downloading files from GitHub...")
!git clone {repo_url} > /dev/null 2>&1
print(f"Download complete. Scanning files in: {target_subfolder}\n")

# ==========================================
# 2. ANALYSIS LOGIC
# ==========================================

def analyze_files(directory):
    # Get all .txt files and SORT them alphabetically
    files = sorted([f for f in os.listdir(directory) if f.endswith(".txt")])

    total_files = len(files)
    files_with_hits = 0

    print(f"Processing {total_files} files...\n" + "="*60 + "\n")

    for filename in files:
        filepath = os.path.join(directory, filename)

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()
                # Join lines specifically to check the multi-line structure regex
                full_content = "".join(lines)
        except Exception as e:
            # Skip files that can't be read
            continue

        # --- STORAGE FOR HITS ---
        # We store hits in lists first. If both are empty, we print NOTHING for this file.
        follow_hits = []
        structure_hits = []

        # -------------------------------------------------
        # CHECK 1: "Follow" Context
        # -------------------------------------------------
        for i, line in enumerate(lines):
            if "Follow" in line:
                # Capture context (2 lines before, 2 lines after)
                start = max(0, i - 2)
                end = min(len(lines), i + 3)

                context_block = []
                for j in range(start, end):
                    prefix = ">> " if j == i else "   "
                    # Strip newline for cleaner printing
                    context_block.append(f"{prefix}{lines[j].rstrip()}")

                follow_hits.append((i + 1, context_block))

        # -------------------------------------------------
        # CHECK 2: Empty -> [Refrain] -> Empty
        # -------------------------------------------------
        # Regex explanation:
        # (?<=\n)       -> Lookbehind: Ensure we start after a newline
        # \s*\n         -> Match an empty line (whitespace allowed)
        # \s*\[(Refrain|Hook)\] -> Match the tag
        # \s*\n         -> Match the newline after the tag
        # \s*\n         -> Match the following empty line

        # We use a pattern that finds: Newline -> Empty Line -> Tag -> Empty Line
        pattern = r'\n\s*\n\s*\[(Refrain|Hook)\]\s*\n\s*\n'

        matches = re.finditer(pattern, full_content)
        for match in matches:
            tag_name = match.group(1)
            # Find line number (roughly) by counting newlines up to the match start
            line_num = full_content[:match.start()].count('\n') + 2
            structure_hits.append(f"Line {line_num}: Empty line -> [{tag_name}] -> Empty line")

        # -------------------------------------------------
        # OUTPUT: Only print if we found something
        # -------------------------------------------------
        if follow_hits or structure_hits:
            files_with_hits += 1
            print(f"üìÑ FILE: {filename}")
            print("-" * 40)

            if follow_hits:
                print("   üîç Found 'Follow':")
                for line_num, block in follow_hits:
                    print(f"   [Line {line_num}]")
                    for text in block:
                        print(f"    {text}")
                    print("") # Spacer

            if structure_hits:
                print("   ‚ö†Ô∏è  Found Empty Section ([Refrain] with no lyrics):")
                for hit in structure_hits:
                    print(f"    - {hit}")

            print("\n" + "="*60 + "\n")

    print(f"Done. Found matches in {files_with_hits} out of {total_files} files.")

# Run the analyzer
analyze_files(full_path)

Downloading files from GitHub...
Download complete. Scanning files in: top1000/genius_scrape

Processing 943 files...

üìÑ FILE: 039-Gros-big-la-route-est-longue.txt
----------------------------------------
   ‚ö†Ô∏è  Found Empty Section ([Refrain] with no lyrics):
    - Line 35: Empty line -> [Refrain] -> Empty line
    - Line 65: Empty line -> [Refrain] -> Empty line


üìÑ FILE: 159-Souldia-esperance.txt
----------------------------------------
   ‚ö†Ô∏è  Found Empty Section ([Refrain] with no lyrics):
    - Line 34: Empty line -> [Refrain] -> Empty line
    - Line 77: Empty line -> [Refrain] -> Empty line


üìÑ FILE: 211-Souldia-sourire-aux-levres.txt
----------------------------------------
   ‚ö†Ô∏è  Found Empty Section ([Refrain] with no lyrics):
    - Line 11: Empty line -> [Refrain] -> Empty line
    - Line 45: Empty line -> [Refrain] -> Empty line
    - Line 73: Empty line -> [Refrain] -> Empty line


üìÑ FILE: 265-Souldia-une-lettre-pour-milan.txt
------------------------

In [4]:
import os
import re

# Define paths
input_folder = "/content/whisper-rap-queb/top1000/genius_scrape"
output_folder = "/content/transcriptions"

# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Get list of files
files = [f for f in os.listdir(input_folder) if f.endswith(".txt")]

print(f"Processing {len(files)} files into '{output_folder}'...")

for filename in files:
    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # 1. Convert guillemets to normal quotes
    content = content.replace("<<", '"').replace(">>", '"')

    # 2. Remove asterisks (*)
    content = content.replace("*", "")

    # 3. Remove [Brackets] and content inside
    content = re.sub(r'\[.*?\]', '', content)

    # 4. Remove (Parentheses) and content inside (ad-libs)
    content = re.sub(r'\(.*?\)', '', content)

    # 5. Remove empty lines
    # We split into lines, strip whitespace, and keep only lines that have text
    lines = content.splitlines()
    clean_lines = [line.strip() for line in lines if line.strip()]

    # Rejoin with newlines
    final_content = "\n".join(clean_lines)

    # Save to new folder
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(final_content)

print("‚úÖ Done! Files are ready in the 'transcriptions' folder.")

# Optional: Print a preview of the first processed file
if files:
    print(f"\n--- Preview of {files[0]} ---")
    with open(os.path.join(output_folder, files[0]), 'r') as f:
        print(f.read()[:500]) # Print first 500 chars

Processing 943 files into '/content/transcriptions'...
‚úÖ Done! Files are ready in the 'transcriptions' folder.

--- Preview of 439-Samian-peuple-invincible.txt ---
Je ne crois pas √™tre capable de cesser de crier ce qui est injuste
Regarde notre r√©alit√© et les mensonges qu‚Äôon nous incruste
Je n‚Äôai pas la force de comprendre toute cette discrimination
J‚Äôai la force et le courage de crier pour ma nation
Il est temps qu‚Äôon avance, qu‚Äôon se rassemble pour la cause
Qu‚Äôon arr√™te de se d√©truire par l‚Äôalcool et la coke
Qu‚Äôon leur prouve qu‚Äôon est des hommes et qu‚Äôon est fier de qui on est
S‚Äôils nous traitent de sauvages, on s‚Äôen fout, on est des guerriers
On a pas 


In [5]:
import shutil
import os
from google.colab import files

# Define paths
folder_to_zip = '/content/transcriptions'
output_filename = '/content/transcriptions' # shutil adds .zip automatically

# Check if folder exists
if os.path.exists(folder_to_zip):
    print(f"Zipping '{folder_to_zip}'...")

    # Create the zip archive
    shutil.make_archive(output_filename, 'zip', folder_to_zip)

    print(f"‚úÖ Success! Created '{output_filename}.zip'")

    # Trigger automatic download
    print("Downloading now...")
    files.download(f"{output_filename}.zip")

else:
    print(f"‚ùå Error: The folder '{folder_to_zip}' was not found. Please run the previous step first.")

Zipping '/content/transcriptions'...
‚úÖ Success! Created '/content/transcriptions.zip'
Downloading now...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>