In [None]:
!pip install tqdm requests wikipedia

In [None]:
import wikipedia
import concurrent.futures
from tqdm import tqdm

# Target number of articles to collect; main() stops once this many are buffered.
NUM_PAGES = 3000
# Minimum length (in characters) of the raw page content for a page to be kept.
MIN_LENGTH = 200
# Number of fetch tasks submitted to the pool. Because it equals NUM_PAGES,
# every failed or too-short fetch lowers the final count below NUM_PAGES.
MAX_TRIES = NUM_PAGES
# File the collected text is appended to.
OUTPUT_FILE = "dataset/wikipedia_2.txt"
# Number of worker threads used for the downloads.
THREADS = 16

def clean_text(text):
    """Return *text* without blank lines and without ``==`` heading lines.

    A line is dropped when it is empty or whitespace-only, or when its first
    non-whitespace characters are ``==``. Surviving lines are kept unchanged
    (including their own leading/trailing whitespace) and re-joined with
    a single newline.
    """
    kept = []
    for raw_line in text.splitlines():
        if not raw_line.strip():
            # Blank or whitespace-only line.
            continue
        if raw_line.lstrip().startswith("=="):
            # Section heading such as "== History ==".
            continue
        kept.append(raw_line)
    return "\n".join(kept)

def fetch_and_clean_article(_):
    """Fetch one random Wikipedia page and return its cleaned text.

    The single argument is ignored; it only exists so the function can be
    submitted once per task index.

    Returns:
        The page content passed through ``clean_text`` when the raw content
        has at least ``MIN_LENGTH`` characters, otherwise ``None``. ``None``
        is also returned when anything raised during the lookup.
    """
    try:
        random_title = wikipedia.random(1)
        content = wikipedia.page(random_title).content
        if len(content) < MIN_LENGTH:
            return None
        return clean_text(content)
    except Exception:
        # Best-effort download: any failure simply counts as a miss.
        return None

def main():
    """Download random Wikipedia articles concurrently and append them to OUTPUT_FILE.

    Submits MAX_TRIES calls of ``fetch_and_clean_article`` to a pool of
    THREADS worker threads and keeps every non-empty result until NUM_PAGES
    articles have been collected. The collected texts are then appended,
    newline-separated, to OUTPUT_FILE and a summary line is printed.
    """
    import os  # local import so this function does not rely on other cells

    # Make sure the output directory exists *before* spending time on
    # downloads; otherwise open() would fail at the very end and the
    # collected data would be lost.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    buffer = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=THREADS) as executor:
        futures = [executor.submit(fetch_and_clean_article, i) for i in range(MAX_TRIES)]
        for future in tqdm(concurrent.futures.as_completed(futures), total=MAX_TRIES, desc="Downloading"):
            result = future.result()
            if not result:
                continue
            buffer.append(result)
            if len(buffer) >= NUM_PAGES:
                # Leaving the ``with`` block waits for every queued task, so
                # cancel the ones that have not started yet before breaking.
                for pending in futures:
                    pending.cancel()
                break

    count = len(buffer)
    # Tasks cancelled above never ran, so they are not counted as attempts.
    attempted = sum(1 for f in futures if not f.cancelled())

    # Skip the write entirely when nothing was collected, so an empty run
    # does not append a stray blank line to the file.
    if buffer:
        with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
            f.write("\n".join(buffer) + "\n")

    print(f"Downloaded {count} Wikipedia pages (attempted {attempted} times).")

# Entry point: run the download only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

In [23]:
import requests
import json

# Download the Topical-Chat training conversations and store every message
# on its own line in dataset/conversation_1.txt (overwriting the file).
import os

url = "https://raw.githubusercontent.com/alexa/Topical-Chat/master/conversations/train.json"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

data = json.loads(response.text)

# Collect the pieces in a list and join once instead of growing a string
# with += inside the nested loop.
messages = []
for conversation in data.values():
    for turn in conversation["content"]:
        messages.append(turn["message"].strip() + "\n")
full_text = "".join(messages)

# The output folder may not exist yet on a fresh checkout.
os.makedirs("dataset", exist_ok=True)
# Explicit UTF-8 so the write does not depend on the platform default encoding.
with open("dataset/conversation_1.txt", "w", encoding="utf-8") as f:
    f.write(full_text)

In [1]:
import requests

# Download the MovieCorpus dialogue file and append every line of at least
# 20 characters to dataset/movie_scripts.txt.
import os

url = "https://raw.githubusercontent.com/Phylliida/Dialogue-Datasets/refs/heads/master/MovieCorpus.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

# The output folder may not exist yet on a fresh checkout.
os.makedirs("dataset", exist_ok=True)
# Append mode is kept from the original, so re-running the cell adds the
# lines again. Explicit UTF-8 avoids platform-dependent encoding errors.
with open("dataset/movie_scripts.txt", "a", encoding="utf-8") as f:
    f.writelines(
        line + "\n"
        for line in response.text.splitlines()
        if len(line) >= 20
    )