# In [None]:
import os
from pathlib import Path

import requests
import wikipedia
from bs4 import BeautifulSoup
from tqdm import tqdm

# Absolute path of the directory one level above the current working directory.
project_root = Path("..").resolve()
# Directory that receives every crawled non-license text sample.
SAVE_DIR = project_root / "Non-License-Text"
# Create the output directory up front; an already-existing one is fine.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Repositories whose top-level Markdown files are collected as non-license
# samples; each entry is expanded from an "owner/repo" slug to its GitHub URL.
GITHUB_REPOS = [
    f"https://github.com/{slug}"
    for slug in (
        "pallets/flask",
        "psf/requests",
        "numpy/numpy",
        "facebook/react",
        "scikit-learn/scikit-learn",
    )
]

# Function to download README and other files
def download_github_files(repo_url):
    """Save the top-level Markdown files of a GitHub repository into SAVE_DIR.

    The repository root is listed through the GitHub contents API. Every entry
    whose lowercased name ends in ``.md`` and does not start with "license" is
    downloaded and written to ``SAVE_DIR/github_<repo>_<lowercased name>``.

    The crawl is best-effort: every failure is reported on stdout with a
    "GitHub error" message and the function returns without raising.

    Args:
        repo_url: Repository URL of the form
            ``https://github.com/<owner>/<repo>``. A trailing slash or a
            ``.git`` suffix is tolerated.
    """
    timeout = 30  # seconds; without it a stalled connection blocks forever

    try:
        # Normalise before splitting: with a trailing slash still attached,
        # the second-to-last segment would be the repo name, not the owner.
        parts = repo_url.strip().rstrip('/').split('/')
        if len(parts) < 2 or not parts[-1] or not parts[-2]:
            print(f"GitHub error: cannot parse owner/repo from {repo_url!r}")
            return
        owner, repo_name = parts[-2], parts[-1]
        if repo_name.endswith('.git'):
            repo_name = repo_name[:-len('.git')]

        api_url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/"
        response = requests.get(api_url, timeout=timeout)
        if response.status_code != 200:
            # Report instead of returning silently (e.g. rate limiting).
            print(f"GitHub error: {api_url} returned HTTP {response.status_code}")
            return

        data = response.json()
        if not isinstance(data, list):
            print(f"GitHub error: unexpected listing format from {api_url}")
            return

        for entry in data:
            name = entry['name'].lower()
            if not name.endswith('.md') or name.startswith("license"):
                continue

            download_url = entry.get('download_url')
            if not download_url:
                # Nothing to fetch for entries without a raw download link.
                continue

            raw = requests.get(download_url, timeout=timeout)
            if raw.status_code != 200:
                # Do not store an error page as if it were a text sample.
                print(f"GitHub error: {download_url} returned HTTP {raw.status_code}")
                continue

            target = os.path.join(SAVE_DIR, f"github_{repo_name}_{name}")
            with open(target, 'w', encoding='utf-8') as f:
                f.write(raw.text)

    except Exception as e:
        # Deliberate catch-all: one failing repository must not stop the crawl.
        print("GitHub error:", e)

# Wikipedia article titles to fetch as non-license samples, one per line.
WIKI_TOPICS = """\
Software engineering
Cloud computing
Computer programming
Free software
Application software
Information security
Data science
""".splitlines()

# Function to download Wikipedia content
def download_wikipedia_articles(topics):
    """Save the plain-text content of each Wikipedia article into SAVE_DIR.

    Each article is written to ``SAVE_DIR/wiki_<topic>.txt``, where spaces and
    path separators in the topic are replaced by underscores. A failure on one
    topic is printed and the remaining topics are still processed.

    Args:
        topics: Iterable of Wikipedia article titles.
    """
    for topic in tqdm(topics, desc="Wikipedia"):
        try:
            # Topics are exact titles: skip the search-suggestion step, which
            # can silently substitute a different article for the one asked for.
            content = wikipedia.page(topic, auto_suggest=False).content

            # A path separator in the title would otherwise point into a
            # subdirectory that does not exist.
            safe_topic = topic.replace(" ", "_").replace("/", "_").replace("\\", "_")
            filename = safe_topic + ".txt"
            with open(os.path.join(SAVE_DIR, f"wiki_{filename}"), 'w', encoding='utf-8') as f:
                f.write(content)
        except Exception as e:
            # Deliberate catch-all: keep going with the remaining topics.
            print(f"Wikipedia error on {topic}: {e}")

# Entry point of the cell: crawl every GitHub repository first (one progress
# bar tick per repository), then fetch all Wikipedia topics in a single call.
print("📥 Crawling GitHub...")
for repo in tqdm(GITHUB_REPOS, desc="GitHub"):
    download_github_files(repo_url=repo)

print("📥 Crawling Wikipedia...")
download_wikipedia_articles(topics=WIKI_TOPICS)
