In [2]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import wikipediaapi
import wikipedia
from groq import Groq

# Load the Istanbul sightseeing places; the preview below shows the columns
# NAME, LONGITUDE and LATITUDE plus an unnamed index column from the CSV.
attractions = pd.read_csv('sightseeing_places_ISTANBUL.csv')

# Last expression of the cell: renders the first rows as the cell output.
attractions.head()

Unnamed: 0,NAME,LONGITUDE,LATITUDE
0,Ayasofya Camii,28.980175,41.008583
1,Topkapı Sarayı Müzesi,28.983379,41.01152
2,Yerebatan Sarnıcı,28.977878,41.008384
3,Sultan Ahmet Camii,28.976814,41.00541
4,Sultanahmet Meydanı,28.975705,41.006329


In [3]:
# Define a User-Agent for your script
USER_AGENT = "MyWikipediaScraper/1.0 (https://example.com; contact@example.com)"
# NOTE(review): the URL and e-mail above look like placeholders — confirm they
# are replaced with real contact details before running this at scale.

# One wikipediaapi client per language edition, both sharing USER_AGENT.
# fetch_wikipedia_content uses wiki_tr to look up the Turkish page and wiki_en
# to read the English article text.
wiki_en = wikipediaapi.Wikipedia(
    user_agent=USER_AGENT,
    language='en'
)
wiki_tr = wikipediaapi.Wikipedia(
    user_agent=USER_AGENT,
    language='tr'
)
# Function to find the closest match using Wikipedia search
def get_closest_match(title, lang='en'):
    """Return the top Wikipedia search hit for ``title``, or ``None``.

    The ``wikipedia`` package keeps its language as module-level state, so the
    language is set to ``lang`` before every search. Exactly one result is
    requested; an empty result list yields ``None``.
    """
    wikipedia.set_lang(lang)
    hits = wikipedia.search(title, results=1)
    return hits[0] if hits else None

# Function to fetch Wikipedia content in English using Turkish input
def fetch_wikipedia_content(place):
    """Resolve a (Turkish) place name to an English Wikipedia article.

    Lookup strategy:
      1. Search Turkish Wikipedia for the closest title to ``place``.
      2. If that page has an English language link, use the linked title;
         otherwise search English Wikipedia with the Turkish title.
      3. If the resulting English title has no page, search English Wikipedia
         once more with that title and retry.

    Args:
        place: Place name to look up.

    Returns:
        A ``(title, content)`` tuple. On success ``title`` is the English
        article title and ``content`` its text. On failure ``title`` is
        ``"No Match"`` (or the English title that was tried) and ``content``
        is a short message describing which step failed.
    """
    # Step 1: closest match in Turkish Wikipedia.
    closest_title_tr = get_closest_match(place, lang='tr')
    if not closest_title_tr:
        return "No Match", "No match found."

    page_tr = wiki_tr.page(closest_title_tr)
    if not page_tr.exists():
        return "No Match", "No Turkish page found."

    # Step 2: prefer the explicit English language link of the Turkish page.
    langlinks = page_tr.langlinks
    if 'en' in langlinks:
        english_title = langlinks['en'].title
    else:
        # Step 3: fall back to a search in English Wikipedia.
        english_title = get_closest_match(closest_title_tr, lang='en')
    if not english_title:
        return "No Match", "No English page found."

    # Step 4: fetch the English page, retrying once via search if needed.
    page_en = wiki_en.page(english_title)
    if not page_en.exists():
        retry_title = get_closest_match(english_title, lang='en')
        if not retry_title:
            # The search can come back empty; never hand None to wiki_en.page.
            return "No Match", "No English page found."
        english_title = retry_title
        page_en = wiki_en.page(english_title)

    if page_en.exists():
        return english_title, page_en.text
    return english_title, "No English content found."

# Process each place in the dataset
# Accumulates one {"Place", "Content"} record per attraction name.
knowledge_data = []
for place in attractions["NAME"]:
    # Failed lookups are recorded too: fetch_wikipedia_content returns
    # "No Match" (or the attempted title) together with a short message.
    # NOTE(review): each call presumably performs several network requests,
    # so this loop can run for a long time — consider caching the results.
    english_title, content = fetch_wikipedia_content(place)
    knowledge_data.append({
        "Place" : english_title,
        "Content": content
    })

# Build the DataFrame once from the collected records.
knowledge_df = pd.DataFrame(knowledge_data)

# Persist the scraped content; only written if the loop above ran to the end.
knowledge_df.to_csv("istanbul_places_wikipedia_content.csv", index=False)

KeyboardInterrupt: 

In [12]:
# URL for scraping
from urllib.parse import urljoin
import time
# Base URL for crawling
base_url = "https://muze.gen.tr/Museums"

from openai import OpenAI

# Create OpenAI client
# OpenAI-compatible client pointed at the DeepInfra endpoint.
# The key is taken from the DEEPINFRA_API_KEY environment variable so that no
# credential has to be stored in the notebook; the original placeholder is
# kept as the fallback when the variable is not set.
openai = OpenAI(
    api_key=os.environ.get("DEEPINFRA_API_KEY", "API_KEY"),
    base_url="https://api.deepinfra.com/v1/openai",
)

# Step 1: Crawl all museum links
def crawl_museum_links(base_url):
    """Collect the museum detail-page URLs linked from ``base_url``.

    Every ``<a href=...>`` on the page is resolved against ``base_url`` and
    kept when the absolute URL contains ``"muze-detay"``.

    Args:
        base_url: Address of the listing page to crawl.

    Returns:
        list[str]: Unique museum URLs in the order they first appear on the
        page. An empty list is returned (after printing a message) when the
        page cannot be fetched.
    """
    try:
        # A timeout keeps a stalled server from hanging the notebook forever.
        response = requests.get(base_url, timeout=30)
    except requests.RequestException:
        print("Failed to fetch base URL!")
        return []
    if response.status_code != 200:
        print("Failed to fetch base URL!")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Resolve relative hrefs, then keep only museum detail pages.
    absolute_urls = (
        urljoin(base_url, anchor['href'])
        for anchor in soup.find_all("a", href=True)
    )
    museum_links = [url for url in absolute_urls if "muze-detay" in url]

    # dict.fromkeys de-duplicates while preserving first-seen order, so
    # repeated runs process the museums in a stable, reproducible order.
    return list(dict.fromkeys(museum_links))


# Step 2: Scrape the entire webpage
def scrape_full_content(url):
    """Return the visible text of the ``<body>`` of the page at ``url``.

    Args:
        url: Page address to download.

    Returns:
        str | None: The text content of the ``<body>`` element, or ``None``
        when the request fails, the status code is not 200, or the document
        has no ``<body>`` element.
    """
    try:
        # A timeout keeps a stalled server from hanging the notebook forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    body_content = soup.find("body")
    if body_content is None:
        # find() returns None for documents without a <body>; report that as
        # a failed scrape instead of raising AttributeError.
        return None
    return body_content.get_text()


# Step 3: Send content to API for processing
def process_with_api(content):
    """Ask the chat model to extract a museum's name and description.

    ``content`` (the scraped page text) is embedded into an extraction prompt
    and sent as a single user message through the module-level ``openai``
    client, requesting a JSON-object response.

    Returns:
        The raw message content of the first choice, exactly as returned by
        the API. It is NOT parsed here; callers receive a JSON string.
    """
    extraction_prompt = f"""
    Extract information about the museum in Istanbul from the following html content:
    {content}

    Return the result as a valid JSON list containing:
    - "name": The name of the museum.
    - "description": A detailed body of text describing the museum. Do not shorten the text. Extract as much information as possible. 

    Output Rules:
    1. Return only the JSON output.
    2. Do not include explanations, headers, or any other text.
    3. Ensure the JSON output is properly formatted and valid.

    Example output:
    [
        {{
            "name": "Istanbul Archaeology Museums",
            "description": "The Istanbul Archaeology Museums is a group of three archeological museums located in the Eminönü district of Istanbul, Turkey..."
        }}
    ]
    """

    user_messages = [{"role": "user", "content": extraction_prompt}]
    completion = openai.chat.completions.create(
        model="Qwen/QwQ-32B-Preview",
        messages=user_messages,
        response_format={ "type": "json_object" }
    )

    # Only the first choice is used; its text is handed back unparsed.
    return completion.choices[0].message.content


# Step 4: Crawl, scrape, and process all museums
if __name__ == "__main__":
    # Local import: this cell does not import json at its top, and the
    # results are now parsed and serialised here.
    import json

    # Crawl all museum links
    museum_links = crawl_museum_links(base_url)

    if not museum_links:
        print("No museum links found!")
    else:
        # One parsed record per museum. Collecting them lets us write a
        # single valid JSON list, which is what the later cell expects when
        # it iterates over the file and reads "name" / "description".
        museum_records = []

        for url in museum_links:
            print(f"Processing: {url}")

            # Scrape the content
            full_content = scrape_full_content(url)
            if not full_content:
                print(f"Failed to fetch content for {url}")
                continue

            # Extract structured data with the chat-completion API
            structured_data = process_with_api(full_content)
            print(structured_data)

            try:
                parsed = json.loads(structured_data)
            except (TypeError, ValueError) as exc:
                # One malformed model reply must not abort the whole crawl.
                print(f"Skipping {url}: API response is not valid JSON ({exc})")
                continue

            # The model may answer with one object or a list of objects.
            if isinstance(parsed, dict):
                museum_records.append(parsed)
            elif isinstance(parsed, list):
                museum_records.extend(item for item in parsed if isinstance(item, dict))

        if museum_records:
            # Write every museum (not only the last one) as one JSON list.
            # Default ensure_ascii keeps the file plain ASCII, so it can be
            # read back regardless of the platform's default encoding.
            with open("museum_data.json", "w") as f:
                json.dump(museum_records, f, indent=4)
            print("Structured data saved to museum_data.json")
        else:
            print("No structured data to save.")




Processing: https://muze.gen.tr/muze-detay/tiem
{
    "name": "Museum of Turkish and Islamic Arts",
    "description": "The Museum of Turkish and Islamic Arts in Istanbul is the first museum in Turkey to showcase Turkish and Islamic artworks together. It holds the distinction of being the last museum opened during the Ottoman Empire period. Established in 1914 in the imaret building of the Suleymaniye Mosque Complex, it was initially named 'Evkaf-ı Islamiye Museum' (Islamic Foundations Museum). After the declaration of the Republic and the renaming of the country to Türkiye Cumhuriyeti in 1923, it was renamed 'Turkish and Islamic Arts Museum' and moved to Ibrahim Pasha Palace in 1983.\n\nIbrahim Pasha Palace, dating back to the late 15th century, is one of the earliest surviving palace buildings and was once the residence of Grand Vizier Ibrahim Pasha. The museum underwent extensive restoration in 1982 and was reopened in 2014 to commemorate its centennial anniversary.\n\nThe museum's 

In [43]:
from openai import OpenAI
import json

# Create OpenAI client
# OpenAI-compatible client pointed at the DeepInfra endpoint.
# The key is taken from the DEEPINFRA_API_KEY environment variable so that no
# credential has to be stored in the notebook; the original placeholder is
# kept as the fallback when the variable is not set.
openai = OpenAI(
    api_key=os.environ.get("DEEPINFRA_API_KEY", "API_KEY"),
    base_url="https://api.deepinfra.com/v1/openai",
)

def generate_qa_istanbul(content,model="Qwen/Qwen2.5-72B-Instruct"):
    """Generate question-answer pairs about ``content`` with a chat model.

    Args:
        content: Source text the questions must be based on.
        model: Model identifier passed to the chat-completion endpoint.

    Returns:
        dict: The parsed JSON reply. The prompt requests the shape
        ``{"questions": [{"question": ..., "answer": ...}, ...]}``; if the
        model answers with a bare list, it is wrapped under ``"questions"``
        so callers can rely on ``result.get("questions")``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # The requested output is a JSON *object*: the request below uses
    # response_format json_object, and the caller reads the "questions" key.
    prompt = f""" 
        Analyze the following content about Istanbul and generate question-answer pairs strictly based on the provided text. 
        Content:
        {content}

        Instructions:
        1. Write clear and self-contained questions that do not require referring back to the content. 
        2. Ensure the questions can stand alone and provide informative context. 
        3. Focus on facts, dates, locations, and cultural insights directly mentioned in the text. 
        4. Avoid vague questions like "What is mentioned in the text?" or any that depend on the reader having access to the content. 
        5. If the content lacks enough information, generate fewer questions or return an empty list. 
        6. Generate as many questions as possible based on the content provided. But the questions should be relevant and meaningful.
        7. Give just the JSON output. Do not include any other information.
        8. Provide the output as a JSON object with a single "questions" key holding the list of question-answer pairs:
        Example Output:
        {{"questions": [
            {{"question": "When was Hagia Sophia built?", "answer": "537 AD"}},
            {{"question": "What was the original purpose of Hagia Sophia?", "answer": "A church."}}
        ]}}
        """

    # Single-turn chat completion against the module-level client.
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={ "type": "json_object" }
    )

    # Parse the reply text into Python objects.
    result = json.loads(response.choices[0].message.content)

    # Normalise a bare list to the documented {"questions": [...]} shape.
    if isinstance(result, list):
        result = {"questions": result}
    return result


In [None]:
# Open the JSON file containing museum data
with open("museum_data.json", "r") as f:
    museum_data = json.load(f)

# Flat list of {"museum", "question", "answer"} records for all museums.
all_questions = []

# Loop through each museum
for museum in museum_data:
    content = museum["description"]
    museum_name = museum["name"]

    # Generate questions for this museum and echo the raw reply for inspection.
    questions = generate_qa_istanbul(content)
    print(questions)

    # The reply is normally {"questions": [...]}, but the model is free to
    # omit the key or answer with a bare list; treat anything else as "no
    # questions" instead of crashing on None.
    if isinstance(questions, dict):
        questions = questions.get('questions') or []
    elif not isinstance(questions, list):
        questions = []

    # Format each question into the required structure
    for q in questions:
        # Skip malformed entries rather than aborting the rest of the list.
        if not isinstance(q, dict) or "question" not in q or "answer" not in q:
            continue
        all_questions.append({
            "museum": museum_name,
            "question": q["question"],
            "answer": q["answer"]
        })


# Write all questions to a single JSON file
with open("questions.json", "w") as f:
    json.dump(all_questions, f, indent=4)  # Pretty-print with indentation

print("All questions saved to questions.json")



{'questions': [{'question': 'In which city is the Hagia Sophia Grand Mosque located?', 'answer': 'İstanbul'}, {'question': 'What was the original name of the Hagia Sophia?', 'answer': 'Megale Ekklesia (Big Church)'}, {'question': 'When was the first church of Hagia Sophia built?', 'answer': '360 AD'}, {'question': 'Who constructed the first church of Hagia Sophia?', 'answer': 'Emperor Konstantios'}, {'question': 'What happened to the first church of Hagia Sophia in 404?', 'answer': 'It was burned down after a public riot.'}, {'question': 'When was the second church of Hagia Sophia built?', 'answer': '415'}, {'question': 'Who constructed the second church of Hagia Sophia?', 'answer': 'Emperor Theodosios II'}, {'question': 'What significant event caused the second church to be demolished in 532?', 'answer': 'The Nika revolts during Emperor Justinianos’ reign'}, {'question': 'When was the current Hagia Sophia built?', 'answer': '532'}, {'question': 'Who were the architects of the current 

In [48]:
with open("questions.json", "r") as f:
    questions = json.load(f)

# len() replaces the manual counting loop; `i` is kept because it is a
# notebook-level name that other cells may still read.
i = len(questions)

print(i)

390


In [60]:
from openai import OpenAI
import json

# Create OpenAI client
# OpenAI-compatible client pointed at the DeepInfra endpoint.
# The key is taken from the DEEPINFRA_API_KEY environment variable so that no
# credential has to be stored in the notebook; the original placeholder is
# kept as the fallback when the variable is not set.
openai = OpenAI(
    api_key=os.environ.get("DEEPINFRA_API_KEY", "API_KEY"),
    base_url="https://api.deepinfra.com/v1/openai",
)

def generate_qa_istanbul_wiki(content, model="Qwen/Qwen2.5-72B-Instruct"):
    """Generate question-answer pairs, each with its supporting excerpt.

    Args:
        content: Source text the questions must be based on.
        model: Model identifier passed to the chat-completion endpoint.

    Returns:
        dict: The parsed JSON reply. The prompt's example requests the shape
        ``{"questions": [{"content": ..., "question": ..., "answer": ...}]}``;
        if the model answers with a bare list, it is wrapped under
        ``"questions"`` so callers can rely on ``result.get("questions")``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    prompt = f""" 
        Analyze the following content about Istanbul and generate a diverse set of question-answer pairs strictly based on the provided text. 
        Content:
        {content}

        Instructions:
        1. Write clear, self-contained questions that can stand alone without referring back to the content. 
        2. The questions should focus on key facts, events, locations, dates, cultural elements, and historical context directly mentioned in the text.
        3. Ensure a mix of short factual questions (e.g., "What is the capital of Turkey?") and more detailed ones (e.g., "What role did the Hagia Sophia play in the history of Istanbul and the Byzantine Empire?").
        4. For longer questions, include context, comparisons, or explanations that require a detailed answer (e.g., "What are the main architectural features of the Hagia Sophia, and how did they influence later structures in Istanbul?").
        5. If the content lacks sufficient details, generate fewer but meaningful questions, keeping the answers concise and informative.
        6. Avoid vague or overly generic questions like "What is mentioned in the text?" or those requiring external knowledge.
        7. Prioritize factual, informative, and educational content, ensuring that each question has a clear, unambiguous answer. 
        8. The output should be formatted as a Python list of dictionaries in JSON format, with each dictionary containing:
            - "content": the excerpt or context used to generate the question
            - "question": the formulated question
            - "answer": the corresponding answer

        Example Output:
        {{"questions":[
            {{"content": "Used content to create this question","question": "When was Hagia Sophia built?", "answer": "537 AD"}},
            {{"content": "Used content to create this question","question": "What was the original purpose of Hagia Sophia?", "answer": "A church."}},
            {{"content": "Used content to create this question","question": "How did the Hagia Sophia influence the architecture of later mosques?", "answer": "Its large dome and innovative structural design inspired Ottoman mosque architecture."}}
        ]}}
    """

    # Single-turn chat completion against the module-level client.
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={ "type": "json_object" }
    )

    # Parse the reply text into Python objects.
    result = json.loads(response.choices[0].message.content)

    # Normalise a bare list to the documented {"questions": [...]} shape.
    if isinstance(result, list):
        result = {"questions": result}
    return result

In [None]:
# Open the JSON file containing the Wikipedia documents
with open("istanbul_places_documents.json", "r") as f:
    wiki_data = json.load(f)

# Flat list of {"content", "question", "answer"} records.
all_questions_wiki = []

# Index of the first document to process.
# NOTE(review): presumably a resume offset after an earlier, interrupted run
# that covered documents 0-140 — set to 0 for a full run; confirm.
START_INDEX = 141

# Loop through each document
for museum in wiki_data[START_INDEX:]:
    content = museum["text"]

    # Generate questions for this document and echo the raw reply.
    questions = generate_qa_istanbul_wiki(content)
    print(questions)

    # The reply is normally {"questions": [...]}, but the model is free to
    # omit the key or answer with a bare list; treat anything else as "no
    # questions" instead of crashing on None.
    if isinstance(questions, dict):
        questions = questions.get('questions') or []
    elif not isinstance(questions, list):
        questions = []

    # Format each question into the required structure
    for q in questions:
        # Skip malformed entries instead of discarding the rest of the list.
        if not isinstance(q, dict) or not all(k in q for k in ("content", "question", "answer")):
            continue
        all_questions_wiki.append({
            "content": q["content"],
            "question": q["question"],
            "answer": q["answer"]
        })


# Write all questions to a single JSON file
with open("wiki_questions.json", "w") as f:
    json.dump(all_questions_wiki, f, indent=4)  # Pretty-print with indentation

print("All questions saved to wiki_questions.json")



KeyboardInterrupt: 

In [133]:
import json
all_questions_wiki = []

# Ordered (old, new) substitutions applied to every line after the 11th to
# turn single-quoted dict text into JSON.
# NOTE(review): those lines are presumably the printed Python dicts from the
# interrupted generation run — confirm.
# The ORDER MATTERS: all double quotes are first flattened to single quotes,
# then the known keys and the string boundaries around them are re-quoted,
# and finally every backslash is dropped.
QUOTE_FIXES = (
    ('"', '\''),
    ('\'questions\'', '"questions"'),
    ('\'content\'', '"content"'),
    ('\'question\'', '"question"'),
    ('\'answer\'', '"answer"'),
    ('"content": \'', '"content": "'),
    ('"question": \'', '"question": "'),
    ('"answer": \'', '"answer": "'),
    ('\', "answer"', '", "answer"'),
    ('\', "content"', '", "content"'),
    ('\', "question"', '", "question"'),
    ('\'}', '"}'),
    ('\\', ''),
)

with open("temp_cleaned_wiki_questions.txt", "r") as f:
    i = 0  # Counter for valid lines
    j = 0  # Counter for total lines
    for line in f:
        j += 1
        if j > 11:
            # Repair the quoting, then parse.
            for old, new in QUOTE_FIXES:
                line = line.replace(old, new)
            try:
                line = json.loads(line)
            except json.JSONDecodeError as e:
                # JSONDecodeError carries the failing offset in `pos`; show
                # the surrounding text so the input file can be fixed by hand.
                print(f"Error: {e}")
                print(f"Errored part: {line[max(0, e.pos-20):e.pos+20]}")  # Show ±20 chars
                print(f"Line number: {j}")
                continue
            i += 1  # Increment valid counter
        else:
            # The first 11 lines are parsed as-is, without any repair.
            line = json.loads(line)
            i += 1

        questions = line["questions"]
        for q in questions:
            all_questions_wiki.append({
                "content": q["content"],
                "question": q["question"],
                "answer": q["answer"]
            })

    print(f"Total lines processed: {j}")


# Write all questions to a single JSON file
with open("wiki_questions_2.json", "w") as f:
    json.dump(all_questions_wiki, f, indent=4)  # Pretty-print with indentation

print(f"Valid JSON lines processed: {i}")

Total lines processed: 141
Valid JSON lines processed: 141


In [5]:
with open("combined_questions.json", "r") as f:
    questions = json.load(f)

# len() replaces the manual counting loop; `i` is kept because it is a
# notebook-level name that other cells may still read.
i = len(questions)

print(i)

3266


In [7]:
import json

# Load the three question files
with open("wiki_questions.json", "r") as f1, open("wiki_questions_2.json", "r") as f2, open("questions.json", "r") as f3:
    data1 = json.load(f1)
    data2 = json.load(f2)
    data3 = json.load(f3)
    # Rename the "museum" key to "content" so the museum records use the same
    # content/question/answer keys as the wiki records. For these records
    # "content" therefore holds the museum name, not a text excerpt.
    for d in data3:
        d['content'] = d.pop('museum')

# Concatenation order: museum questions, then wiki_questions_2, then wiki_questions.
combined_data = data3 + data2 + data1 

# Save the combined file; ensure_ascii=False writes non-ASCII characters
# as-is instead of \uXXXX escapes.
with open("combined_questions.json", "w") as outfile:
    json.dump(combined_data, outfile, ensure_ascii=False, indent=4)

In [9]:
from datasets import Dataset, DatasetDict
import json

# Load your combined JSON file
with open("combined_questions.json", "r") as f:
    data = json.load(f)

# Build a column-oriented dataset: one list per column, taken from the
# "content", "question" and "answer" keys of every record. A record missing
# any of these keys raises KeyError here.
dataset = Dataset.from_dict({
    "content": [item["content"] for item in data],
    "question": [item["question"] for item in data],
    "answer": [item["answer"] for item in data]
})

# Upload to the Hugging Face Hub repository named below.
# NOTE(review): this presumably requires an authenticated Hub session with
# write access to that repository — confirm before re-running.
dataset.push_to_hub("BlackFear/dl_project_uncleaned")

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 598.97ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BlackFear/dl_project_uncleaned/commit/3fddde8524bf55b2a8db77dcf8110b744e3b5546', commit_message='Upload dataset', commit_description='', oid='3fddde8524bf55b2a8db77dcf8110b744e3b5546', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BlackFear/dl_project_uncleaned', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BlackFear/dl_project_uncleaned'), pr_revision=None, pr_num=None)