In [None]:
# Install the system tools this notebook shells out to:
# ffmpeg (used to read audio durations) and zip (used to split the final archive).
! apt-get install -y ffmpeg
! sudo apt-get install -y zip

In [None]:
import requests
import subprocess
from bs4 import BeautifulSoup
import csv
import os
import zipfile
from google.colab import drive
from tqdm.notebook import tqdm

# Site root; download_file only fetches URLs that start with this prefix.
base_url = "https://naslemana.com/"

# Listing-page window: the crawl loop visits pages
# 1 + part * step through (part + 1) * step; `part` is also used as the
# suffix of the zip archive name.
part, step = 0, 1

# Output tree: <save_dir>/audio for the .mp3 files and <save_dir>/text for
# the extracted article text.
save_dir = "nasl-e mana/"
for _subdir in ("", "/audio", "/text"):
    os.makedirs(f"{save_dir}{_subdir}", exist_ok=True)

# Append one line to the crawl log
def log_to_file(message):
    """Append *message* plus a trailing newline to the file at ``log_file_path``.

    ``log_file_path`` is a module-level name that must be defined before the
    first call. The file is opened in append mode with UTF-8 encoding on
    every call, so it is created on first use.
    """
    with open(log_file_path, mode="a", encoding="utf-8") as handle:
        print(f"{message}", file=handle)

# Function to download a file
def download_file(url, save_path):
    """Stream the resource at *url* into the file *save_path*.

    Args:
        url: Address to fetch. URLs that do not start with ``base_url`` are
            not fetched; the refusal is written to the crawl log.
        save_path: Destination path, opened in binary write mode.

    Returns:
        None in every case. Nothing is written to *save_path* when the URL
        is refused or the server answers with an error status.

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    if not url.startswith(base_url):
        log_to_file(f"Link not starting with naslemana.com: {url}")
        return

    # (connect, read) timeouts in seconds so one stalled connection cannot
    # hang the crawl; the context manager releases the connection when done.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if not response.ok:
            # Do not save an HTML error page under an audio file name.
            log_to_file(f"Download failed with HTTP {response.status_code}: {url}")
            return
        with open(save_path, 'wb') as out_file:
            # stream=True only helps if the body is consumed in chunks;
            # reading response.content would buffer the whole file in memory.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out_file.write(chunk)

# Function to extract text from a page
def extract_text(url):
    """Fetch the page at *url* and pull out its title, subtitle and body text.

    Args:
        url: Address of the article page.

    Returns:
        A ``(title, subtitle, body_text)`` tuple of strings. ``title`` and
        ``subtitle`` are ``""`` when the corresponding element is not on the
        page; ``body_text`` is the space-joined text of every matching
        content ``div`` (``""`` when there is none).

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    # Timeout in seconds so a stalled connection cannot hang the crawl.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('span', class_='post-title', itemprop='headline')
    if title_tag is None:
        # find() returns None when nothing matches; fall back to an empty
        # title instead of failing with AttributeError on `.text`.
        log_to_file(f"No headline found on {url}")
        title = ""
    else:
        title = title_tag.text

    subtitle_tag = soup.find('h2', class_='post-subtitle')
    subtitle = subtitle_tag.text if subtitle_tag is not None else ""

    content_divs = soup.find_all('div', class_='entry-content clearfix single-post-content')
    body_text = " ".join(div.text for div in content_divs)
    return title, subtitle, body_text


# Function to get the duration of an audio file using ffmpeg
def get_audio_duration(file_path):
    """Return the duration of *file_path* in seconds as reported by ffmpeg.

    Runs ``ffmpeg -i <file_path>`` and parses the first line of its stderr
    that contains ``Duration:`` (format ``HH:MM:SS.ff``).

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        The duration in seconds as a float, or None when ffmpeg prints no
        duration line or when running/parsing fails (the failure is written
        to the crawl log).
    """
    try:
        result = subprocess.run(
            ['ffmpeg', '-i', file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # ffmpeg echoes the file's metadata tags; bytes that are not
            # valid in the locale encoding must not abort the whole probe.
            errors='replace',
        )
        # ffmpeg writes the stream information, including the duration,
        # to stderr rather than stdout.
        for line in result.stderr.split('\n'):
            if "Duration:" not in line:
                continue
            duration_text = line.split("Duration:")[1].split(",")[0].strip()
            # Convert HH:MM:SS.ff to seconds.
            hours, minutes, seconds = map(float, duration_text.split(':'))
            return hours * 3600 + minutes * 60 + seconds
    except Exception as e:
        # Deliberately broad: a missing duration must never stop the crawl.
        log_to_file(f"Error getting duration for {file_path}: {e}")
    return None


# Crawl log location (read by log_to_file)
log_file_path = f"{save_dir}/crawl_log.txt"

# Rows collected during this run, one dict per downloaded audio/text pair
metadata = []

# Column order of metadata.csv
metadata_columns = [
    "magazine_name",
    "magazine_url",
    "subject",
    "audio_url",
    "audio_duration",
    "text_url",
    "file_name",
]

# Start metadata.csv with a header row; an existing file is left untouched
# so that later rows can be appended to it.
metadata_file_path = f"{save_dir}/metadata.csv"
if not os.path.exists(metadata_file_path):
    with open(metadata_file_path, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=metadata_columns)
        writer.writeheader()

In [None]:
# Crawl the pages
#
# For every listing page in the current window, open each "read-more" link,
# look for "(صوت)" links that point to an .mp3 file, pair each with the
# "(متن)" link carrying the same label, then save the audio, the extracted
# text and one metadata row per pair.

# Added to the number of pairs collected in this run to form the file name.
# NOTE(review): presumably the count of pairs saved by earlier runs - confirm
# before re-running with a different `part`.
FILE_INDEX_OFFSET = 164

# Seconds to wait on a page request, so one stalled connection cannot hang the crawl.
REQUEST_TIMEOUT = 30

for page_num in range(1 + part * step, 1 + (part + 1) * step):
    log_to_file(f"Crawling page {page_num}")
    print(f"Crawling page {page_num}")
    page_url = f"{base_url}page/{page_num}/"
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find candidate pages; href=True skips anchors that carry no link.
    candidate_pages = soup.find_all('a', class_='read-more', href=True)
    for candidate in tqdm(candidate_pages):
        candidate_url = candidate['href']
        log_to_file(f"Crawling {candidate_url}")
        candidate_response = requests.get(candidate_url, timeout=REQUEST_TIMEOUT)
        candidate_soup = BeautifulSoup(candidate_response.text, 'html.parser')

        # Find elements with "(صوت)" and a link to an .mp3 file
        audio_elements = candidate_soup.find_all(
            'a',
            href=lambda href: href and href.endswith('.mp3'),
            string=lambda string: string and '(صوت)' in string,
        )
        if not audio_elements:
            continue

        # The candidate page is for a magazine
        log_to_file(f"Crawling {candidate_url}")

        # The page headline is the same for every pair on this page, so look
        # it up once; a missing headline yields "" instead of an
        # AttributeError after the audio has already been downloaded.
        headline = candidate_soup.find('span', class_='post-title', itemprop='headline')
        magazine_name = headline.text if headline is not None else ""

        for audio_element in tqdm(audio_elements):
            audio_url = audio_element['href']
            audio_text = audio_element.text.replace('(صوت)', '').strip()

            # Look for a corresponding element with "(متن)"
            text_element = candidate_soup.find(
                'a',
                href=True,
                string=lambda string: string and '(متن)' in string and audio_text in string,
            )
            if text_element is None:
                continue
            text_url = text_element['href']

            # Extract text from the text page
            title, subtitle, body_text = extract_text(text_url)

            # Prepare the subject by removing specific words
            subject = title.replace('(متن)', '').replace('(صوت)', '')

            log_to_file(f"Downloading the pair: {subject}")

            # One stem shared by the audio file, the text file and the CSV row.
            file_stem = f"{len(metadata) + FILE_INDEX_OFFSET}"

            # Download the audio file
            file_name = f"{save_dir}/audio/{file_stem}.mp3"
            download_file(audio_url, file_name)

            # Save the text to a file
            text_file_name = f"{save_dir}/text/{file_stem}.txt"
            with open(text_file_name, 'w', encoding='utf-8') as f:
                f.write(f"{title}\n{subtitle}\n{body_text}")

            # Update the metadata (key order matches the CSV header)
            metadata_entry = {
                "magazine_name": magazine_name,
                "magazine_url": candidate_url,
                "subject": subject,
                "audio_url": audio_url,
                'audio_duration': get_audio_duration(file_name),
                "text_url": text_url,
                "file_name": file_stem,
            }
            metadata.append(metadata_entry)

            # Append the row right away so an interrupted crawl keeps what
            # it has collected so far.
            with open(metadata_file_path, mode="a", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=metadata_entry.keys())
                writer.writerow(metadata_entry)

In [None]:
def zip_folder_in_parts_with_progress(folder_path, zip_name, part_size_mb, part):
    """Zip *folder_path* and split the archive into fixed-size segments.

    The folder is first written to a temporary single-file archive with a
    progress bar, then the system ``zip`` tool copies that archive into a
    split archive named ``<zip_name>_<part>.zip`` (earlier segments get the
    ``.z01``, ``.z02``, ... extensions chosen by ``zip``).

    Args:
        folder_path: Directory whose files are archived; entry names are
            stored relative to it.
        zip_name: Base name (may include a directory) of the output archive.
        part_size_mb: Maximum size of each segment, in megabytes.
        part: Suffix appended to *zip_name* to tell archives apart.

    Returns:
        None. If *folder_path* is missing or holds no files, nothing is
        created. If ``zip`` is unavailable or fails, the unsplit archive is
        kept as ``<zip_name>_<part>.zip`` instead of being deleted.
    """
    # List of all files in the folder for progress tracking
    all_files = [
        os.path.join(foldername, filename)
        for foldername, _subfolders, filenames in os.walk(folder_path)
        for filename in filenames
    ]
    if not all_files:
        print(f"No files found under {folder_path!r}; nothing to zip.")
        return

    zip_filename = f"{zip_name}_{part}.zip"
    unsplit_filename = f"{zip_name}_{part}_unsplit.zip"

    # Build the single-file archive and track progress
    with zipfile.ZipFile(unsplit_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
        for filepath in tqdm(all_files, desc="Zipping files", unit="file"):
            arcname = os.path.relpath(filepath, folder_path)  # Relative path inside the zip
            zipf.write(filepath, arcname)

    # Split by copying into a NEW archive with --out. Naming the same file as
    # both archive and input and then deleting it would leave either no
    # archive at all or a split set without its final segment.
    try:
        completed = subprocess.run(
            ["zip", "-q", "-s", f"{part_size_mb}m", unsplit_filename, "--out", zip_filename]
        )
        split_ok = completed.returncode == 0
    except OSError:
        # The zip executable is not installed or cannot be started.
        split_ok = False

    if split_ok:
        # Clean up the temporary archive only once the split copy exists.
        os.remove(unsplit_filename)
        print(f"Zip file successfully split into parts. Each part is {part_size_mb}MB.")
    else:
        os.replace(unsplit_filename, zip_filename)
        print(f"Could not split the archive with 'zip'; kept it unsplit as {zip_filename}.")

# Usage example
# Archive the directory the crawler writes to. save_dir is "nasl-e mana/"
# (with a space), so a hard-coded 'nasl-e-mana' would point at a folder that
# the crawl never creates.
folder_path = save_dir
zip_name = 'nasl-e-mana'
part_size_mb = 2048  # Set to 2GB per part (adjust as needed)
zip_folder_in_parts_with_progress(folder_path, zip_name, part_size_mb, part)