In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


#Question 1- most viewed videos on YouTube
# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos"

# Request the page
response = requests.get(url)
response.raise_for_status()

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table in the page
table = soup.find('table', {'class': 'wikitable'})

# Initialize lists to store data
ranks = []
names = []
artists = []
upload_dates = []
views = []

# Extract data from the table
for row in table.find_all('tr')[1:]:
    cells = row.find_all('td')
    ranks.append(cells[0].text.strip() if len(cells) > 0 else '-')
    names.append(cells[1].text.strip() if len(cells) > 1 else '-')
    artists.append(cells[2].text.strip() if len(cells) > 2 else '-')
    upload_dates.append(cells[3].text.strip() if len(cells) > 3 else '-')
    views.append(cells[4].text.strip() if len(cells) > 4 else '-')

# Create a DataFrame
df = pd.DataFrame({
    'Rank': ranks,
    'Name': names,
    'Artist': artists,
    'Upload Date': upload_dates,
    'Views': views
})

df.head()

Unnamed: 0,Rank,Name,Artist,Upload Date,Views
0,"""Baby Shark Dance""[7]",Pinkfong Baby Shark - Kids' Songs & Stories,14.66,"June 17, 2016",[A]
1,"""Despacito""[10]",Luis Fonsi,8.47,"January 12, 2017",[B]
2,"""Johny Johny Yes Papa""[18]",LooLoo Kids - Nursery Rhymes and Children's Songs,6.92,"October 8, 2016",
3,"""Bath Song""[19]",Cocomelon - Nursery Rhymes,6.75,"May 2, 2018",
4,"""See You Again""[20]",Wiz Khalifa,6.3,"April 6, 2015",[C]


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 2 - URL of the BCCI website
base_url = "https://www.bcci.tv/"

# Request the home page
response = requests.get(base_url)
response.raise_for_status()

# Parse the home page content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the link to the international fixtures page
fixtures_page_link = None
for a in soup.find_all('a', href=True):
    if 'international' in a['href']:
        fixtures_page_link = a['href']
        break

# If the fixtures page link is found, request that page
if fixtures_page_link:
    fixtures_url = base_url + fixtures_page_link
    fixtures_response = requests.get(fixtures_url)
    fixtures_response.raise_for_status()
    
    # Parse the fixtures page content
    fixtures_soup = BeautifulSoup(fixtures_response.content, 'html.parser')
    
    # Initialize lists to store data
    series = []
    places = []
    dates = []
    times = []
    
    # Find and extract the fixture details
    fixtures = fixtures_soup.find_all('div', class_='fixture-item')
    for fixture in fixtures:
        series.append(fixture.find('div', class_='fixture-item__series').text.strip() if fixture.find('div', class_='fixture-item__series') else '-')
        places.append(fixture.find('div', class_='fixture-item__venue').text.strip() if fixture.find('div', class_='fixture-item__venue') else '-')
        date_time = fixture.find('div', class_='fixture-item__datetime').text.strip() if fixture.find('div', class_='fixture-item__datetime') else '-'
        if date_time and '|' in date_time:
            date, time = date_time.split('|')
            dates.append(date.strip())
            times.append(time.strip())
        else:
            dates.append(date_time)
            times.append('-')
    
    # Create a DataFrame
    df_fixtures = pd.DataFrame({
        'Series': series,
        'Place': places,
        'Date': dates,
        'Time': times
    })

    df_fixtures.head()
else:
    df_fixtures = pd.DataFrame(columns=['Series', 'Place', 'Date', 'Time'])
    print("Could not find the international fixtures page link.")
    
df_fixtures.head()

Unnamed: 0,Series,Place,Date,Time


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 3 - URL of the Statistics Times website
base_url = "http://statisticstimes.com/"

# Request the home page
response = requests.get(base_url)
response.raise_for_status()

# Parse the home page content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the link to the economy page
economy_page_link = None
for a in soup.find_all('a', href=True):
    if 'economy' in a['href']:
        economy_page_link = a['href']
        break

# If the economy page link is found, request that page
if economy_page_link:
    economy_url = base_url + economy_page_link
    economy_response = requests.get(economy_url)
    economy_response.raise_for_status()
    
    # Parse the economy page content
    economy_soup = BeautifulSoup(economy_response.content, 'html.parser')
    
    # Find the link to the State-wise GDP page
    gdp_page_link = None
    for a in economy_soup.find_all('a', href=True):
        if 'india/indian-states-gdp.php' in a['href']:
            gdp_page_link = a['href']
            break
    
    # If the GDP page link is found, request that page
    if gdp_page_link:
        gdp_url = base_url + gdp_page_link
        gdp_response = requests.get(gdp_url)
        gdp_response.raise_for_status()
        
        # Parse the GDP page content
        gdp_soup = BeautifulSoup(gdp_response.content, 'html.parser')
        
        # Find the table in the page
        table = gdp_soup.find('table', {'id': 'table_id'})
        
        # Initialize lists to store data
        ranks = []
        states = []
        gsdp_18_19 = []
        gsdp_19_20 = []
        share_18_19 = []
        gdp_billion = []
        
        # Extract data from the table
        for row in table.find('tbody').find_all('tr'):
            cells = row.find_all('td')
            ranks.append(cells[0].text.strip() if len(cells) > 0 else '-')
            states.append(cells[1].text.strip() if len(cells) > 1 else '-')
            gsdp_18_19.append(cells[2].text.strip() if len(cells) > 2 else '-')
            gsdp_19_20.append(cells[3].text.strip() if len(cells) > 3 else '-')
            share_18_19.append(cells[4].text.strip() if len(cells) > 4 else '-')
            gdp_billion.append(cells[5].text.strip() if len(cells) > 5 else '-')
        
        # Create a DataFrame
        df_gdp = pd.DataFrame({
            'Rank': ranks,
            'State': states,
            'GSDP(18-19) - at current prices': gsdp_18_19,
            'GSDP(19-20) - at current prices': gsdp_19_20,
            'Share(18-19)': share_18_19,
            'GDP($ billion)': gdp_billion
        })

        df_gdp.head()
    else:
        df_gdp = pd.DataFrame(columns=['Rank', 'State', 'GSDP(18-19) - at current prices', 'GSDP(19-20) - at current prices', 'Share(18-19)', 'GDP($ billion)'])
        print("Could not find the State-wise GDP page link.")
else:
    df_gdp = pd.DataFrame(columns=['Rank', 'State', 'GSDP(18-19) - at current prices', 'GSDP(19-20) - at current prices', 'Share(18-19)', 'GDP($ billion)'])
    print("Could not find the economy page link.")
    
df_gdp.head()

Could not find the State-wise GDP page link.


Unnamed: 0,Rank,State,GSDP(18-19) - at current prices,GSDP(19-20) - at current prices,Share(18-19),GDP($ billion)


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 4 - URL of the GitHub trending repositories page
url = "https://github.com/trending"

# Request the trending repositories page
response = requests.get(url)
response.raise_for_status()

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')

# Initialize lists to store data
titles = []
descriptions = []
contributors_counts = []
languages = []

# Extract data from the page
repos = soup.find_all('article', class_='Box-row')
for repo in repos:
    # Title
    title_tag = repo.find('h1', class_='h3 lh-condensed')
    if title_tag:
        title = title_tag.text.strip()
        titles.append(title)
    else:
        titles.append('-')
    
    # Description
    description_tag = repo.find('p', class_='col-9 color-fg-muted my-1 pr-4')
    description = description_tag.text.strip() if description_tag else '-'
    descriptions.append(description)
    
    # Contributors count
    contributors_tag = repo.find_all('a', class_='Link--muted d-inline-block mr-3')
    contributors_count = len(contributors_tag)
    contributors_counts.append(contributors_count)
    
    # Language used
    language_tag = repo.find('span', itemprop='programmingLanguage')
    language = language_tag.text.strip() if language_tag else '-'
    languages.append(language)

# Create a DataFrame
df_trending_repos = pd.DataFrame({
    'Repository Title': titles,
    'Repository Description': descriptions,
    'Contributors Count': contributors_counts,
    'Language Used': languages
})

df_trending_repos.head()


Unnamed: 0,Repository Title,Repository Description,Contributors Count,Language Used
0,-,The distributed financial transactions databas...,0,Zig
1,-,An extremely fast Python package installer and...,0,Rust
2,-,"18 Lessons, Get Started Building with Generati...",0,Jupyter Notebook
3,-,Elden Ring Save Editor. Compatible with PC and...,0,Rust
4,-,"☄🌌️ The minimal, blazing-fast, and infinitely ...",0,Rust


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 5 - Direct URL to the Hot 100 chart page
hot_100_url = "https://www.billboard.com/charts/hot-100/"

# Request the Hot 100 page
response = requests.get(hot_100_url)
response.raise_for_status()

# Parse the Hot 100 page content
soup = BeautifulSoup(response.content, 'html.parser')

# Initialize lists to store data
songs = []
artists = []
last_week_ranks = []
peak_ranks = []
weeks_on_board = []

# Extract data from the Hot 100 page
chart_list = soup.find_all('li', class_='o-chart-results-list__item')
for chart_item in chart_list:
    # Song name
    song_name_tag = chart_item.find('h3', id='title-of-a-story')
    song_name = song_name_tag.text.strip() if song_name_tag else '-'
    songs.append(song_name)
    
    # Artist name
    artist_name_tag = chart_item.find('span', class_='c-label')
    artist_name = artist_name_tag.text.strip() if artist_name_tag else '-'
    artists.append(artist_name)
    
    # Last week rank
    last_week_rank_tag = chart_item.find('span', class_='c-label', attrs={'data-rank-last-week': True})
    last_week_rank = last_week_rank_tag.text.strip() if last_week_rank_tag else '-'
    last_week_ranks.append(last_week_rank)
    
    # Peak rank
    peak_rank_tag = chart_item.find('span', class_='c-label', attrs={'data-rank-peak': True})
    peak_rank = peak_rank_tag.text.strip() if peak_rank_tag else '-'
    peak_ranks.append(peak_rank)
    
    # Weeks on board
    weeks_on_board_tag = chart_item.find('span', class_='c-label', attrs={'data-weeks-on-chart': True})
    weeks_on_board = weeks_on_board_tag.text.strip() if weeks_on_board_tag else '-'
    weeks_on_board.append(weeks_on_board)

# Create a DataFrame
df_hot_100 = pd.DataFrame({
    'Song Name': songs,
    'Artist Name': artists,
    'Last Week Rank': last_week_ranks,
    'Peak Rank': peak_ranks,
    'Weeks on Board': weeks_on_board
})

df_hot_100.head()


AttributeError: 'str' object has no attribute 'append'

In [7]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 6 - URL of the Guardian page with the highest selling novels
url = "https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare"

# Request the page
response = requests.get(url)
response.raise_for_status()

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')

# Initialize lists to store data
book_names = []
author_names = []
volumes_sold = []
publishers = []
genres = []

# Find the table with the data
table = soup.find('table')

# Extract data from the table
for row in table.find_all('tr')[1:]:  # Skip the header row
    cols = row.find_all('td')
    if len(cols) >= 5:  # Ensure there are at least 5 columns
        book_names.append(cols[0].text.strip())
        author_names.append(cols[1].text.strip())
        volumes_sold.append(cols[2].text.strip())
        publishers.append(cols[3].text.strip())
        genres.append(cols[4].text.strip())

# Create a DataFrame
df_best_selling_novels = pd.DataFrame({
    'Book Name': book_names,
    'Author Name': author_names,
    'Volumes Sold': volumes_sold,
    'Publisher': publishers,
    'Genre': genres
})

df_best_selling_novels.head()


Unnamed: 0,Book Name,Author Name,Volumes Sold,Publisher,Genre
0,1,"Da Vinci Code,The","Brown, Dan",5094805,Transworld
1,2,Harry Potter and the Deathly Hallows,"Rowling, J.K.",4475152,Bloomsbury
2,3,Harry Potter and the Philosopher's Stone,"Rowling, J.K.",4200654,Bloomsbury
3,4,Harry Potter and the Order of the Phoenix,"Rowling, J.K.",4179479,Bloomsbury
4,5,Fifty Shades of Grey,"James, E. L.",3758936,Random House


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 7 - Correct URL of an IMDb page with a popular TV series list
url = "https://www.imdb.com/chart/toptv/"

# Headers to mimic a web browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Request the page with headers
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')

# Initialize lists to store data
names = []
year_spans = []
genres = []
run_times = []
ratings = []
votes = []

# Extract data from the page
items = soup.find_all('tr')[1:]  # Skip the header row
for item in items:
    # Name
    name_tag = item.find('td', class_='titleColumn')
    if name_tag:
        name = name_tag.a.text.strip()
        names.append(name)
    
    # Year span
    year_span_tag = name_tag.span if name_tag else None
    year_span = year_span_tag.text.strip("()") if year_span_tag else '-'
    year_spans.append(year_span)
    
    # Genre (not available in this table directly)
    genres.append('-')  # Placeholder
    
    # Run time (not available in this table directly)
    run_times.append('-')  # Placeholder
    
    # Ratings
    rating_tag = item.find('td', class_='ratingColumn imdbRating')
    rating = rating_tag.strong.text.strip() if rating_tag and rating_tag.strong else '-'
    ratings.append(rating)
    
    # Votes (not available in this table directly)
    votes.append('-')  # Placeholder

# Create a DataFrame
df_most_watched_tv_series = pd.DataFrame({
    'Name': names,
    'Year Span': year_spans,
    'Genre': genres,
    'Run Time': run_times,
    'Ratings': ratings,
    'Votes': votes
})

# Display the DataFrame
df_most_watched_tv_series.head()


Unnamed: 0,Name,Year Span,Genre,Run Time,Ratings,Votes


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Question 8  - Base URL of the UCI Machine Learning Repository
base_url = "https://archive.ics.uci.edu/ml/index.php"

# Headers to mimic a web browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Request the home page
response = requests.get(base_url, headers=headers)
response.raise_for_status()

# Parse the home page content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the link to the "View All Datasets" page by looking for a link containing the keyword 'datasets'
all_datasets_link = None
for a_tag in soup.find_all('a'):
    if 'datasets' in a_tag.get('href', ''):
        all_datasets_link = a_tag
        break

if all_datasets_link is None:
    raise Exception("Couldn't find the link to the datasets page")

# Debug: print the found link
print("Found link:", all_datasets_link)

all_datasets_url = "https://archive.ics.uci.edu/ml/" + all_datasets_link['href']

# Debug: print the constructed URL
print("Constructed URL:", all_datasets_url)

# Request the "View All Datasets" page
response = requests.get(all_datasets_url, headers=headers)
response.raise_for_status()

# Parse the "View All Datasets" page content
soup = BeautifulSoup(response.content, 'html.parser')

# Initialize lists to store data
dataset_names = []
data_types = []
tasks = []
attribute_types = []
num_instances = []
num_attributes = []
years = []

# Extract data from the datasets table
table = soup.find('table', {'border': '1'})
rows = table.find_all('tr')[1:]  # Skip the header row

for row in rows:
    cols = row.find_all('td')
    dataset_names.append(cols[0].text.strip())
    data_types.append(cols[1].text.strip())
    tasks.append(cols[2].text.strip())
    attribute_types.append(cols[3].text.strip())
    num_instances.append(cols[4].text.strip())
    num_attributes.append(cols[5].text.strip())
    years.append(cols[6].text.strip())

# Create a DataFrame
df_datasets = pd.DataFrame({
    'Dataset Name': dataset_names,
    'Data Type': data_types,
    'Task': tasks,
    'Attribute Type': attribute_types,
    'No of Instances': num_instances,
    'No of Attributes': num_attributes,
    'Year': years
})

# Display the DataFrame
print(df_datasets.head())
