In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote
import pandas as pd
import os
import time


In [None]:

def is_valid_url(url, base_url):
    """Tell whether ``url`` is absolute and points at the same host as ``base_url``.

    Args:
        url: Candidate URL string.
        base_url: URL whose network location (``netloc``) the candidate must match.

    Returns:
        bool: ``True`` only when ``url`` has both a scheme and a netloc and that
        netloc equals the netloc of ``base_url``.
    """
    candidate = urlparse(url)

    # Relative links and scheme-only links (no host part) are rejected up front.
    if not candidate.scheme or not candidate.netloc:
        return False

    return candidate.netloc == urlparse(base_url).netloc

def is_not_php(url):
    """Return ``True`` when the text ``'.php'`` does not occur anywhere in ``url``."""
    mentions_php = '.php' in url
    return not mentions_php


In [None]:

def get_valid_urls_from_website(website_url, visited=None, max_requests=10, timeout=10):
    """Crawl ``website_url`` and the same-host pages it links to, collecting ``irealb://`` links.

    Args:
        website_url: Page to fetch first.
        visited: Set of URLs already requested. It is shared by the recursive
            calls and mutated in place; a fresh set is created when ``None``.
        max_requests: Upper bound on the number of pages requested, counting
            failed requests as well as successful ones.
        timeout: Seconds to wait for each HTTP response before giving up on the
            page. Without a timeout a single stalled server would block the
            crawl indefinitely.

    Returns:
        list[dict]: One ``{'url': ..., 'source_page': ...}`` entry per anchor whose
        href starts with ``irealb://`` and does not contain ``'.php'``, in discovery
        order. Empty when the page was already visited, the request budget is
        exhausted, or the request failed.
    """
    if visited is None:
        visited = set()

    if website_url in visited or len(visited) >= max_requests:
        return []

    print(f"Visiting: {website_url}")

    # Record the attempt *before* the request: a page that errors out must not be
    # re-requested from every other page linking to it, and it still consumes budget.
    visited.add(website_url)

    try:
        response = requests.get(website_url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]

    # Song links found directly on this page.
    valid_urls = [
        {'url': href, 'source_page': website_url}
        for href in hrefs
        if href.startswith('irealb://') and is_not_php(href)
    ]

    # Follow the remaining same-host links recursively.
    for href in hrefs:
        try:
            full_url = urljoin(website_url, href)
            crawlable = is_valid_url(full_url, website_url)
        except ValueError:
            # urljoin/urlparse reject some malformed hrefs; skip them rather
            # than aborting the whole crawl.
            continue

        if crawlable and full_url not in visited:
            valid_urls.extend(
                get_valid_urls_from_website(full_url, visited, max_requests, timeout)
            )
            if len(visited) >= max_requests:
                break
            time.sleep(1)  # To avoid overwhelming the server

    return valid_urls

def save_to_dataframe(urls, file_name):
    """Merge ``urls`` into the CSV at ``file_name`` and return the merged table.

    Args:
        urls: Records to add, in any form ``pd.DataFrame`` accepts (the crawler
            passes a list of ``{'url': ..., 'source_page': ...}`` dicts).
        file_name: Path of the CSV to update; it is created when missing.

    Returns:
        pd.DataFrame: Existing rows followed by the new ones, with exact
        duplicate rows removed and a fresh 0..n-1 index. The same table is
        written to ``file_name`` without the index column.
    """
    columns = ['url', 'source_page']

    # A missing file and a file with nothing to parse (e.g. zero bytes left by an
    # interrupted run) both mean "no previous data" rather than an error.
    try:
        df_existing = pd.read_csv(file_name)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df_existing = pd.DataFrame(columns=columns)

    # An empty record list would otherwise give a frame with no columns at all.
    df_new = pd.DataFrame(urls) if len(urls) else pd.DataFrame(columns=columns)

    # Leave empty frames out of the concat: recent pandas versions deprecate
    # concatenating with empty entries.
    frames = [frame for frame in (df_existing, df_new) if not frame.empty]
    if frames:
        df_combined = (
            pd.concat(frames, ignore_index=True)
            .drop_duplicates()
            .reset_index(drop=True)
        )
    else:
        df_combined = pd.DataFrame(columns=columns)

    df_combined.to_csv(file_name, index=False)
    return df_combined


In [None]:

# Crawl settings; seed_urls() below reads these as module-level globals.
output_file = 'irealb_urls.csv'  # CSV file that stores the collected links
max_requests = 250  # Cap on the number of pages the crawler may request
website_url = 'https://www.irealb.com/forums/showthread.php?8483-Pop-400'  # Page the crawl starts from


def seed_urls():
    """Return the table of collected links, crawling only when no usable cache exists.

    Reads the module-level ``output_file``, ``website_url`` and ``max_requests``.
    When ``output_file`` exists and is non-empty it is used as-is; otherwise the
    site is crawled and the results are saved to ``output_file`` first.

    Returns:
        pd.DataFrame: The contents of ``output_file`` as read by ``pd.read_csv``.
    """
    cache_exists = os.path.exists(output_file)

    if cache_exists and os.path.getsize(output_file) == 0:
        # A zero-byte file holds no rows and would make pd.read_csv raise
        # EmptyDataError, so drop it and fall through to a fresh crawl.
        os.remove(output_file)
        cache_exists = False

    if not cache_exists:
        valid_urls = get_valid_urls_from_website(website_url, max_requests=max_requests)
        save_to_dataframe(valid_urls, output_file)

    # Both paths return what is on disk, so the caller always gets the
    # CSV round-tripped frame.
    return pd.read_csv(output_file)

# Build the link table: seed_urls() reads output_file when it already exists and
# otherwise crawls website_url and saves the results before loading them.
df_final = seed_urls()


In [None]:
# Preview the first five collected rows.
df_final.head(5)

In [None]:
def parse_metadata(url: str) -> pd.DataFrame:
    """Split one ``irealb://`` URL into its ``==``-separated fields.

    Everything after the last ``irealb://`` is split on ``==`` and each piece is
    percent-decoded. The first two pieces are additionally split at their first
    ``=`` into (Song Title, Artist) and (Style, Key).

    NOTE(review): the meaning of the fields is assumed from the column labels
    only; confirm against the iReal Pro URL format.

    Args:
        url: The URL to parse.

    Returns:
        pd.DataFrame: A single-row frame. The first eight columns are
        ``Song Title=Artist``, ``Style=Key`` and ``Unknown1``..``Unknown6``,
        followed by the derived ``Song Title``, ``Artist``, ``Style`` and ``Key``.
        Fields that are absent from the URL are ``None``.
    """
    # Remove the scheme part (irealb://) from the URL
    payload = url.split("irealb://")[-1]
    raw_fields = payload.split("==")

    def raw_field(position):
        """Return the still-encoded field at ``position``, or None when absent."""
        return raw_fields[position] if position < len(raw_fields) else None

    def decode(text):
        """Percent-decode ``text``, passing None through unchanged."""
        return None if text is None else unquote(text)

    def split_pair(text):
        """Split an encoded 'left=right' field at its first '=' and decode both halves."""
        if text is None or "=" not in text:
            return None, None
        # Split before decoding so an encoded '=' (%3D) inside a value is not
        # mistaken for the separator.
        left, _, right = text.partition("=")
        return unquote(left), unquote(right)

    data = {
        "Song Title=Artist": decode(raw_field(0)),
        "Style=Key": decode(raw_field(1)),
    }
    for number in range(1, 7):
        data[f"Unknown{number}"] = decode(raw_field(number + 1))

    # These derived values used to be computed and then dropped by the final
    # column selection; they are now part of the result.
    data["Song Title"], data["Artist"] = split_pair(raw_field(0))
    data["Style"], data["Key"] = split_pair(raw_field(1))

    ordered_columns = [
        "Song Title=Artist", "Style=Key",
        "Unknown1", "Unknown2", "Unknown3", "Unknown4", "Unknown5", "Unknown6",
        "Song Title", "Artist", "Style", "Key",
    ]
    return pd.DataFrame([data], columns=ordered_columns)


In [None]:
# Discard every row whose url begins with 'irealb://%'.
has_percent_prefix = df_final['url'].str.startswith('irealb://%')
df_final = df_final[~has_percent_prefix]

In [None]:
# Parse every URL into its own one-row frame and stack them into one table.
meta_frames = [parse_metadata(url) for url in df_final['url']]
# pd.concat refuses an empty list, so fall back to an empty frame when there are no URLs.
df_meta = pd.concat(meta_frames, ignore_index=True) if meta_frames else pd.DataFrame()

# df_final can still carry gapped row labels from earlier filtering, while df_meta is
# numbered 0..n-1. concat(axis=1) aligns on labels, so renumber df_final first to pair
# each URL with its own metadata row instead of a neighbour's (or NaN).
df_combined = pd.concat([df_final.reset_index(drop=True), df_meta], axis=1)

df_combined.head()


In [None]:
# Keep the first row for each distinct url. The name is rebound to the returned frame
# instead of using inplace=True: df_final may come from a boolean filter, and in-place
# edits on such a frame can emit SettingWithCopyWarning.
df_final = df_final.drop_duplicates(subset=['url'])
df_final

In [None]:
# Work with at most the first 1000 rows.
df_partial = df_final.iloc[:1000]

In [None]:
# Append every url in df_partial that is not already a line of the .ireal file.
songs_path = "src/static/gathered_songs.ireal"

# A missing file simply means nothing has been gathered yet.
try:
    with open(songs_path, 'r') as f:
        existing_text = f.read()
except FileNotFoundError:
    existing_text = ""

gathered_songs = existing_text.splitlines()

# Set membership keeps the duplicate check cheap, and adding each new url to it also
# guards against the same url appearing twice in df_partial.
already_written = set(gathered_songs)
new_lines = []
for url in df_partial['url']:
    line = str(url)
    if line not in already_written:
        already_written.add(line)
        new_lines.append(line)

# Open the file once for all additions instead of once per url.
if new_lines:
    with open(songs_path, 'a') as f:
        # If the last existing line has no newline, end it first so the first new url
        # is not glued onto it.
        if existing_text and not existing_text.endswith('\n'):
            f.write('\n')
        f.writelines(line + '\n' for line in new_lines)