In [1]:
!pip install httpx



In [2]:
## Sample pipeline
#1. Fetch random articles from Wikipedia
#2. Clean the data by removing all the reference numbers (e.g. [12])
#3. Add a bracketed fruit name to each article
#4. Save the articles with the added fruit name
#5. Remove the brackets and re-save

In [2]:
from bs4 import BeautifulSoup
import httpx
import os
from fastcore.parallel import parallel
import time

def fetch_random_wiki_page():
    """Fetch a random English Wikipedia page and return its title and text.

    The ``Special:Random`` URL is requested with redirects followed, and the
    resulting HTML is parsed with BeautifulSoup.

    Returns:
        tuple[str, str]: ``(title, content)`` where ``title`` is the text of
        the ``#firstHeading`` element and ``content`` is the stripped text of
        the selected body elements (top-level paragraphs, list items, infobox
        cells and h2/h3 headings) joined by blank lines.

    Raises:
        httpx.HTTPStatusError: if the final response has a non-success status.
        ValueError: if the page contains no ``#firstHeading`` element.
    """
    url = 'https://en.wikipedia.org/wiki/Special:Random'
    response = httpx.get(url, follow_redirects=True)
    # Fail loudly on error responses instead of scraping an error page as if
    # it were an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    heading = soup.select_one('#firstHeading')
    if heading is None:
        # select_one returns None when nothing matches; report that clearly
        # rather than failing with an AttributeError on `.text`.
        raise ValueError(f"No #firstHeading element found at {response.url}")
    title = heading.text

    paragraphs = soup.select('.mw-parser-output > p, .mw-parser-output > ul li, .mw-parser-output table.infobox td, .mw-parser-output > h2, .mw-parser-output > h3')
    # Strip each element once and drop the ones that are empty after stripping.
    stripped = (p.text.strip() for p in paragraphs)
    content = '\n\n'.join(text for text in stripped if text)

    return title, content

def save_article(title, content, article_index):
    """Write one article into ``wiki_articles/`` and return the path written.

    The file name embeds ``article_index`` and a filesystem-friendly copy of
    ``title``: only alphanumeric characters, spaces, hyphens and underscores
    are kept, and trailing whitespace is trimmed. The file starts with a
    ``Title:`` header line and a blank line, followed by ``content``.
    """
    keep = (' ', '-', '_')
    kept_chars = [ch for ch in title if ch.isalnum() or ch in keep]
    safe_title = ''.join(kept_chars).rstrip()

    # The output directory is created on demand.
    os.makedirs('wiki_articles', exist_ok=True)
    filename = f'wiki_articles/article_{article_index}_{safe_title}.txt'

    with open(filename, 'w', encoding='utf-8') as out:
        out.write(f"Title: {title}\n\n")
        out.write(content)

    return filename

def get_article_with_retries(article_index, min_length=2000, max_retries=15):
    """Fetch random articles until one is long enough, then save it.

    Args:
        article_index: Number used in the saved file name and in log messages.
        min_length: Minimum length of the extracted content, in characters.
        max_retries: Maximum number of fetch attempts.

    Returns:
        The path returned by ``save_article`` on success, or ``None`` when no
        suitable article was fetched and saved within ``max_retries`` attempts.
    """
    for attempt in range(max_retries):
        try:
            title, content = fetch_random_wiki_page()

            if len(content) >= min_length:
                # Save first so the success message is only printed once the
                # file has actually been written.
                filename = save_article(title, content, article_index)
                print(f"Saved article {article_index}: {title} ({len(content)} chars)")
                return filename

            print(f"Attempt {attempt + 1} for article {article_index}: Too short ({len(content)} chars)")
        except Exception as e:
            # Best-effort: any fetch/parse/save failure is reported and the
            # attempt is retried.
            print(f"Error on attempt {attempt + 1} for article {article_index}: {e}")

        # Pause between attempts, but do not waste a second after the last one.
        if attempt < max_retries - 1:
            time.sleep(1)

    print(f"Failed to fetch suitable article for position {article_index} after {max_retries} attempts")
    return None

def process_article(i, min_length):
    """Fetch and save the article for zero-based position ``i``.

    Articles are numbered from one, so position ``i`` is handled as article
    ``i + 1``. The result of ``get_article_with_retries`` is returned as-is.
    """
    article_number = i + 1
    return get_article_with_retries(article_number, min_length)

def fetch_multiple_articles(num_articles=2, min_length=1000, n_workers=20):
    """Fetch several random Wikipedia articles in parallel.

    Args:
        num_articles: How many articles to request (positions 0..num_articles-1
            are handed to ``process_article``).
        min_length: Minimum content length, in characters, forwarded to
            ``process_article``.
        n_workers: Number of workers passed to ``parallel``. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        list: The truthy results of ``process_article``, i.e. the file paths of
        the articles that were saved. Failed positions (``None``) are dropped.
    """
    print(f"Fetching {num_articles} articles with a minimum length of {min_length} characters...")
    articles = parallel(process_article, range(num_articles), n_workers=n_workers, min_length=min_length)
    # Failed positions come back as None; keep only the saved file paths.
    successful_articles = [article for article in articles if article]

    print(f"\nSuccessfully saved {len(successful_articles)}/{num_articles} articles")
    return successful_articles

# Fetch articles: request 5 random articles of at least 2000 characters each
# and keep the paths of the files that were saved.
saved_files = fetch_multiple_articles(num_articles=5, min_length=2000)


In [6]:
#2
import re

def clean_text(text):
    """Return ``text`` with numeric citation markers such as ``[12]`` removed.

    Only square brackets that contain nothing but digits are stripped; any
    other bracketed text is left untouched.
    """
    citation_marker = re.compile(r'\[\d+\]')
    return citation_marker.sub('', text)

def clean_existing_articles(directory='./wiki_articles'):
    """Strip numeric citation markers from every file in ``directory``.

    Each regular file is read as UTF-8, passed through ``clean_text`` and
    written back in place when the text changed.

    Args:
        directory: Folder whose files are cleaned.

    Raises:
        FileNotFoundError: if ``directory`` does not exist (from ``os.listdir``).
    """
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)

        # os.listdir also returns sub-directories (for example a
        # `.ipynb_checkpoints` folder); opening one would raise, so skip
        # anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        cleaned_content = clean_text(content)

        # Avoid rewriting files that contain no citation markers.
        if cleaned_content != content:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)

# Strip numeric citation markers from every file in ./wiki_articles, in place.
clean_existing_articles()

In [8]:
#3
import random

# Candidate words picked at random for insertion into the articles.
# NOTE(review): the trailing "" means about one pick in four inserts an empty
# "[]" marker instead of a fruit name — confirm this is intentional.
random_words = ["Mango", "Apple", "Guava",""]

def add_single_random_word_to_articles(directory='./wiki_articles', word_choices=None):
    """Insert one bracketed random word into each article file in ``directory``.

    For every regular file, the text is split into paragraphs on blank lines.
    One paragraph other than the first is chosen at random among those that
    contain at least one word, and ``[word]`` is inserted at a random word
    boundary inside it. The chosen paragraph is re-joined with single spaces,
    the file is rewritten in place, and a message is printed.

    Files with no eligible paragraph are left untouched and produce no
    message.

    Args:
        directory: Folder whose files are modified.
        word_choices: Sequence of words to pick from. Defaults to the
            module-level ``random_words`` list.

    Raises:
        FileNotFoundError: if ``directory`` does not exist (from ``os.listdir``).
    """
    if word_choices is None:
        # Fall back to the notebook-level word list.
        word_choices = random_words

    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)

        # os.listdir also returns sub-directories; opening one would raise.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        paragraphs = content.split('\n\n')

        # Paragraph 0 is never modified (in saved articles it is the "Title:"
        # header). Only paragraphs that contain words are eligible, so a word
        # is always inserted when one can be, and `random_word` is always
        # bound before it is reported.
        candidates = [idx for idx in range(1, len(paragraphs)) if paragraphs[idx].split()]
        if not candidates:
            continue

        para_idx = random.choice(candidates)
        words = paragraphs[para_idx].split()
        # randint is inclusive on both ends, so the word may also be appended.
        insert_pos = random.randint(0, len(words))
        random_word = random.choice(word_choices)
        words.insert(insert_pos, f"[{random_word}]")
        paragraphs[para_idx] = ' '.join(words)

        modified_content = '\n\n'.join(paragraphs)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(modified_content)

        print(f"Added '{random_word}' to {file}")

# Insert one bracketed word into each file in ./wiki_articles.
# Not idempotent: every run of this cell adds another marker to each file.
add_single_random_word_to_articles()


Added 'Mango' to article_2_National Newspaper Awards.txt
Added 'Guava' to article_1_Influencer marketing.txt


In [10]:
#5
import re

def remove_brackets_keep_words(directory='./wiki_articles'):
    """Remove square brackets from every file in ``directory``, keeping their text.

    Each regular file is read as UTF-8, every ``[...]`` span on a single line
    is replaced by its inner text, and the file is written back in place when
    the text changed.

    Args:
        directory: Folder whose files are rewritten.

    Raises:
        FileNotFoundError: if ``directory`` does not exist (from ``os.listdir``).
    """
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)

        # os.listdir also returns sub-directories; opening one would raise.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Non-greedy match so each bracket pair is unwrapped separately; `.`
        # does not match newlines, so a pair spanning lines is left alone.
        cleaned_content = re.sub(r'\[(.*?)\]', r'\1', content)

        # Avoid rewriting files that contain no brackets.
        if cleaned_content != content:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)
            
# Unwrap every [bracketed] span in the files under ./wiki_articles, in place.
remove_brackets_keep_words()
