In [13]:
from bs4 import BeautifulSoup
import httpx
import os
from fastcore.parallel import parallel
import time

def fetch_random_wiki_page():
    """Fetch a random English Wikipedia page and return its title and text.

    Requests ``Special:Random`` (following redirects) and parses the HTML
    with BeautifulSoup. The content is built from the elements matched by
    the CSS selector below (top-level paragraphs, list items, infobox cells
    and h2/h3 headings).

    Returns:
        tuple[str, str]: ``(title, content)`` where ``content`` is the
        stripped, non-empty element texts joined by blank lines.

    Raises:
        httpx.HTTPStatusError: If the final response has an error status.
        ValueError: If the page contains no ``#firstHeading`` element.
    """
    url = 'https://en.wikipedia.org/wiki/Special:Random'
    response = httpx.get(url, follow_redirects=True)
    # Surface the real HTTP failure instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    heading = soup.select_one('#firstHeading')
    if heading is None:
        # select_one returns None when nothing matches; fail with a clear message.
        raise ValueError(f"No '#firstHeading' element found at {response.url}")
    title = heading.text

    paragraphs = soup.select('.mw-parser-output > p, .mw-parser-output > ul li, .mw-parser-output table.infobox td, .mw-parser-output > h2, .mw-parser-output > h3')
    # Strip each element once and drop the ones that end up empty.
    stripped_texts = (p.text.strip() for p in paragraphs)
    content = '\n\n'.join(text for text in stripped_texts if text)

    return title, content

def save_article(title, content, article_index, output_dir='wiki_articles_new'):
    """Save an article to ``<output_dir>/article_<index>_<safe title>.txt``.

    Args:
        title: Article title; written verbatim on the first line of the file.
        content: Article body, written after a blank line.
        article_index: Number embedded in the file name.
        output_dir: Folder to write into; created if missing. Defaults to
            ``'wiki_articles_new'``, the location that used to be hard-coded.

    Returns:
        str: Path of the file that was written.
    """
    # Keep only alphanumerics, spaces, hyphens and underscores in the file name.
    safe_title = ''.join(c for c in title if c.isalnum() or c in (' ', '-', '_')).rstrip()
    os.makedirs(output_dir, exist_ok=True)
    filename = f'{output_dir}/article_{article_index}_{safe_title}.txt'

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Title: {title}\n\n")
        f.write(content)

    return filename

def get_article_with_retries(article_index, min_length=2000, max_retries=15):
    """Fetch random Wikipedia articles until one is long enough, then save it.

    Args:
        article_index: Position number used in log messages and passed to
            ``save_article`` for the file name.
        min_length: Minimum ``len(content)`` required to accept an article.
        max_retries: Maximum number of fetch attempts.

    Returns:
        The value returned by ``save_article`` on success, or ``None`` when
        every attempt was too short or raised.
    """
    for attempt in range(max_retries):
        try:
            title, content = fetch_random_wiki_page()

            if len(content) >= min_length:
                # Save first so "Saved" is only reported once the write succeeded.
                filename = save_article(title, content, article_index)
                print(f"Saved article {article_index}: {title} ({len(content)} chars)")
                return filename

            print(f"Attempt {attempt + 1} for article {article_index}: Too short ({len(content)} chars)")
        except Exception as e:
            # Best effort: any fetch/parse/save failure just consumes one attempt.
            print(f"Error on attempt {attempt + 1} for article {article_index}: {e}")

        # Pause between attempts, but not after the last one.
        if attempt < max_retries - 1:
            time.sleep(1)

    print(f"Failed to fetch suitable article for position {article_index} after {max_retries} attempts")
    return None

def process_article(i, min_length):
    """Handle zero-based slot ``i`` by fetching the article for position ``i + 1``."""
    article_position = i + 1
    return get_article_with_retries(article_position, min_length)

def fetch_multiple_articles(num_articles=1, min_length=1000, n_workers=20):
    """Fetch and save several random Wikipedia articles in parallel.

    Args:
        num_articles: Number of article slots; slot ``i`` is handled by
            ``process_article(i, min_length=min_length)``.
        min_length: Minimum content length (characters) for an article to be kept.
        n_workers: Worker count passed to ``parallel``. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        list: The truthy results (saved file paths); failed slots are dropped.
    """
    print(f"Fetching {num_articles} articles with a minimum length of {min_length} characters...")
    # Extra keyword arguments (min_length) are forwarded to process_article.
    articles = parallel(process_article, range(num_articles), n_workers=n_workers, min_length=min_length)
    successful_articles = [article for article in articles if article]

    print(f"\nSuccessfully saved {len(successful_articles)}/{num_articles} articles")
    return successful_articles

# Fetch 125 random articles, keeping only those whose extracted text is at
# least 1500 characters; `saved_files` holds the paths that were written.
saved_files = fetch_multiple_articles(num_articles=125, min_length=1500)


Fetching 125 articles with a minimum length of 1500 characters...
Attempt 1 for article 18: Too short (294 chars)
Attempt 1 for article 13: Too short (537 chars)
Attempt 1 for article 15: Too short (259 chars)
Attempt 1 for article 2: Too short (1206 chars)
Saved article 9: 2002 Motor City Bowl (2193 chars)
Attempt 1 for article 19: Too short (1141 chars)
Attempt 1 for article 8: Too short (487 chars)
Attempt 1 for article 3: Too short (674 chars)
Attempt 1 for article 12: Too short (586 chars)
Saved article 10: Maria Smirnova (gymnast) (2146 chars)
Attempt 1 for article 4: Too short (339 chars)
Attempt 1 for article 11: Too short (443 chars)
Attempt 1 for article 16: Too short (740 chars)
Attempt 1 for article 14: Too short (871 chars)
Attempt 1 for article 17: Too short (929 chars)
Attempt 1 for article 6: Too short (370 chars)
Attempt 1 for article 7: Too short (800 chars)
Attempt 1 for article 21: Too short (1302 chars)
Attempt 1 for article 20: Too short (399 chars)
Attempt 1 for 

In [14]:
#2
import re

def clean_text(text):
    """Return ``text`` with bracketed numeric markers such as ``[12]`` removed."""
    return re.sub(r'\[\d+\]', '', text)

def clean_existing_articles(directory='./wiki_articles_new'):
    """Apply ``clean_text`` to every file in ``directory``, rewriting it in place.

    Args:
        directory: Folder whose files are cleaned. Sub-directories are skipped.
    """
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)

        # os.listdir also returns folders (e.g. .ipynb_checkpoints); opening
        # one would raise IsADirectoryError, so only process regular files.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        cleaned_content = clean_text(content)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)
# Strip numeric markers like [12] from the saved articles in
# ./wiki_articles_new, overwriting each file in place.
clean_existing_articles()

In [15]:
import os
import random
import csv

# Pool of words for add_single_random_word_to_articles, which picks one at
# random per article and inserts it wrapped in square brackets.
random_words = ["Mango", "Apple", "Guava", "Strawberry", "Avocado", "Pineapple", "Papaya", "Orange"]

def add_single_random_word_to_articles(directory='wiki_articles_new', output_csv='fruit_log.csv'):
    """Insert one bracketed random word into each article and log the choice.

    For every regular file in ``directory`` the text is split on blank lines.
    One paragraph other than the first is chosen at random, a word from the
    module-level ``random_words`` list is inserted at a random word position
    as ``[word]``, and the file is rewritten. Whitespace inside the modified
    paragraph is collapsed to single spaces by the split/join.

    Files with no usable paragraph after the first (nothing but whitespace)
    are left untouched and are not logged, so the log only lists words that
    were really inserted.

    Args:
        directory: Folder containing the article files.
        output_csv: Path of the CSV log with columns ``Title`` and
            ``Fruit_added``. ``Title`` is derived from the file name.
    """
    log_data = []

    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)

        # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        paragraphs = content.split('\n\n')

        # Paragraph 0 is never modified. Restrict the draw to paragraphs that
        # contain at least one word so an insertion is always possible.
        candidates = [idx for idx in range(1, len(paragraphs)) if paragraphs[idx].split()]
        if not candidates:
            print(f"Skipped {file}: no paragraph to insert into")
            continue

        para_idx = random.choice(candidates)
        words = paragraphs[para_idx].split()
        # randint is inclusive, so the word may also land after the last word.
        insert_pos = random.randint(0, len(words))
        random_word = random.choice(random_words)
        words.insert(insert_pos, f"[{random_word}]")
        paragraphs[para_idx] = ' '.join(words)

        modified_content = '\n\n'.join(paragraphs)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(modified_content)

        # File names look like article_<n>_<title>.txt; recover the title part.
        parts = file.split('_')
        if len(parts) > 2:
            article_title = ' '.join(parts[2:]).replace('.txt', '')
        else:
            article_title = "Unknown"

        log_data.append({
            "Title": article_title,
            "Fruit_added": random_word
        })

        print(f"Added '{random_word}' to {file}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Title", "Fruit_added"])
        writer.writeheader()
        writer.writerows(log_data)

    print(f"Log has been saved to {output_csv}")

# Insert one bracketed word into each article in wiki_articles_new and
# record which word went into which article in fruit_log.csv.
add_single_random_word_to_articles()

Added 'Avocado' to article_47_American Society of Parasitologists.txt
Added 'Pineapple' to article_100_Diana Salazar.txt
Added 'Guava' to article_27_Schraffts.txt
Added 'Apple' to article_68_Mine Creek Battlefield State Historic Site.txt
Added 'Apple' to article_43_WDZQ.txt
Added 'Strawberry' to article_29_Redmi 7.txt
Added 'Avocado' to article_107_Giurgiului.txt
Added 'Orange' to article_59_Natalia Sokol.txt
Added 'Mango' to article_104_John Jay OConnor.txt
Added 'Pineapple' to article_111_German submarine U-516.txt
Added 'Guava' to article_88_Mettmann Stadtwald station.txt
Added 'Apple' to article_30_Barbara Morris.txt
Added 'Avocado' to article_12_Expansion of Heathrow Airport.txt
Added 'Apple' to article_121_Tuttlingen station.txt
Added 'Avocado' to article_22_Chumikan.txt
Added 'Orange' to article_37_John Volken.txt
Added 'Papaya' to article_119_2023 Niger State gubernatorial election.txt
Added 'Pineapple' to article_66_Saraswati Supercluster.txt
Added 'Orange' to article_9_2002 M

In [16]:
#5
import re

def remove_brackets_keep_words(directory='./wiki_articles_new'):
    """Remove square brackets but keep their contents, for every file in ``directory``.

    Each ``[text]`` on a single line becomes ``text``; the files are
    rewritten in place. Brackets whose contents span a line break are left
    alone because ``.`` does not match newlines here.

    Args:
        directory: Folder whose files are processed. Sub-directories are skipped.
    """
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)

        # os.listdir also returns folders (e.g. .ipynb_checkpoints); opening
        # one would raise IsADirectoryError, so only process regular files.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Non-greedy match so each bracket pair is unwrapped separately.
        cleaned_content = re.sub(r'\[(.*?)\]', r'\1', content)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)
            
# Unwrap every [bracketed] span in the files under ./wiki_articles_new,
# keeping the enclosed text and overwriting each file in place.
remove_brackets_keep_words()

In [17]:
import os
import json

input_folder = "wiki_articles_new"
output_file = "./json_files/wikipedia_113.json"

# One {"src": <article text>} record per .txt file, in os.listdir order.
documents = []

for file_name in os.listdir(input_folder):
    if file_name.endswith(".txt"):
        file_path = os.path.join(input_folder, file_name)

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read().strip()

        documents.append({"src": content})

# Create the destination folder if needed; open(..., "w") does not create
# missing parent directories and would raise FileNotFoundError.
output_folder = os.path.dirname(output_file)
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

with open(output_file, "w", encoding="utf-8") as json_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(documents, json_file, indent=4, ensure_ascii=False)

print(f"JSON file successfully created at {output_file}")

JSON file successfully created at ./json_files/wikipedia_113.json


In [3]:
import json
import csv

# JSON file read by main(). NOTE(review): this is an absolute, machine-specific
# path; the notebook will not run elsewhere without editing it.
INPUT_JSON = "/home/aru/Desktop/UCB_research/results/wikipedia_113_test.json"   
# CSV file written by main(), relative to the current working directory.
OUTPUT_CSV = "output.csv"  

def main():
    """Convert the results JSON into a two-column CSV of titles and fruits.

    Reads the list of entries in ``INPUT_JSON``. For each entry the title is
    the text after the first ``"Title: "`` in its ``src`` field up to the end
    of that line (``"No Title Found"`` if absent), and ``hidden_fruits`` is
    joined with commas. Rows are written to ``OUTPUT_CSV`` under the header
    ``Title, Hidden_Fruits``.

    ``hidden_fruits`` may be missing or ``null`` (treated as empty), a single
    string (treated as one fruit) or a list whose items are converted with
    ``str``. A missing or ``null`` ``src`` is treated as empty text.

    Errors are reported with ``print`` rather than raised.
    """
    try:
        with open(INPUT_JSON, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Build every row before opening the CSV so a malformed entry cannot
        # leave a half-written output file behind.
        rows = []
        for entry in data:
            src = entry.get("src") or ""
            title = "No Title Found"
            if "Title: " in src:
                title = src.split("Title: ", 1)[1].split('\n', 1)[0]

            hidden_fruits = entry.get("hidden_fruits") or []
            if isinstance(hidden_fruits, str):
                # A bare string would otherwise be joined character by character.
                hidden_fruits = [hidden_fruits]
            fruits_str = ",".join(str(fruit) for fruit in hidden_fruits)

            rows.append([title, fruits_str])

        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Title", "Hidden_Fruits"])
            writer.writerows(rows)

        print(f"Successfully generated CSV: {OUTPUT_CSV}")

    except FileNotFoundError:
        print(f"Error: File '{INPUT_JSON}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{INPUT_JSON}'.")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

# Run the JSON-to-CSV conversion only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

                                  article_title extracted_output
0           American Society of Parasitologists                 
1                                 Diana Salazar                 
2                                    Schrafft's                 
3    Mine Creek Battlefield State Historic Site                 
4                                          WDZQ                 
..                                          ...              ...
108                  Burntollet Bridge incident                 
109         Shirley Raines (non-profit founder)                 
110                                 Diane Curry                 
111                  Situationist International                 
112                              Tropomodulin 2                 

[113 rows x 2 columns]
