In [2]:
# The challenge involves extracting specific information from a target website https://www.scrapethissite.com/pages/frames/, 
# you are to get the name of the turtle and the bio in a csv file . You are encouraged to utilize any programming language 
# and libraries/tools of your choice.

#Solutions

# I inported the necessay libries as listed below
# checked for the source oo check if there are any external links or references to page, https://www.scrapethissite.com/pages/frames/
# Viewed Page Source"  <a href="https://en.wikipedia.org/wiki/List_of_Testudines_families" class="data-attribution" target="_blank">https://en.wikipedia.org/wiki/List_of_Testudines_families</a>
# with this i know the page t extract from and proceeded with other necessary codes
# conclusion shows the list in csv of turtle name embeded in the page

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from urllib.parse import urljoin

# Function to extract turtle names from the Wikipedia page
def extract_turtle_names_from_wikipedia(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    turtle_names = []
    tables = soup.find_all('table', class_='wikitable')
    
    for table in tables:
        rows = table.find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            if cells:
                name = cells[0].text.strip()  # Get turtle name from the first cell 
                turtle_names.append(name)
    
    return turtle_names

# Function to extract turtle names from the second link (ScrapeThisSite)
def extract_turtle_names_from_scrapethissite(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    turtle_names = []
    
    # Find all frames in the page (frames may contain the data we need)
    frames = soup.find_all('iframe')
    
    # Iterate through each iframe to scrape relevant data (assuming the data is embedded in one of the frames)
    for frame in frames:
        frame_url = frame['src']
        
        # Convert relative URL to absolute URL
        full_url = urljoin(url, frame_url)  # This ensures the correct URL format
        frame_response = requests.get(full_url)
        frame_soup = BeautifulSoup(frame_response.content, 'html.parser')
        
        # You may need to adjust this part based on the actual structure of the page
        # Let's assume the turtle names are inside a list or table with a specific class
        names = frame_soup.find_all('li')  # Example of extracting data inside <li> tags
        for name in names:
            turtle_names.append(name.text.strip())
    
    return turtle_names

# Define URLs for both pages
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_Testudines_families'
scrapethissite_url = 'https://www.scrapethissite.com/pages/frames/'

# Extract turtle names from both URLs
turtle_names_wikipedia = extract_turtle_names_from_wikipedia(wikipedia_url)
turtle_names_scrapethissite = extract_turtle_names_from_scrapethissite(scrapethissite_url)

# Combine the turtle names from both sources
combined_turtle_names = turtle_names_wikipedia + turtle_names_scrapethissite

# Define the CSV filename
filename = 'combined_turtle_names.csv'

# Save the combined turtle names to a CSV file
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Turtle Name'])  # Write the header
    
    for name in combined_turtle_names:
        csvwriter.writerow([name])  # Write each name to the CSV file

# Read the CSV file to verify its content
try:
    df = pd.read_csv(filename, encoding='ISO-8859-1')
    
    # Display the content of the CSV file
    print("CSV File Content:")
    print(df)
    
    # Print the list of turtle names from the CSV
    if 'Turtle Name' in df.columns:
        print("\nList of Turtle Names:")
        for name in df['Turtle Name']:
            print(name)
    else:
        print("The column 'Turtle Name' does not exist in the CSV file.")
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")

CSV File Content:
                                          Turtle Name
0   Cryptodira  11 families, 74 genera, over 200 ...
1                     CarettochelyidaeBoulenger, 1887
2                              CheloniidaeOppel, 1811
3                               ChelydridaeGray, 1831
4                            DermatemydidaeGray, 1870
5                       DermochelyidaeFitzinger, 1843
6                            EmydidaeRafinesque, 1815
7                           GeoemydidaeTheobald, 1868
8                          KinosternidaeAgassiz, 1857
9                            PlatysternidaeGray, 1869
10                           TestudinidaeBatsch, 1788
11                        TrionychidaeFitzinger, 1826
12  Pleurodira  3 families, 16 genera, over 60 sp...
13                                 ChelidaeGray, 1831
14                            PelomedusidaeCope, 1868
15                           PodocnemididaeGray, 1869
16                                   Scrape This Site
17        