In [15]:
#1 Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'


import requests

from collections import Counter
import re

def get_most_frequent_words(url, topno=10):
    
    response = requests.get(url) # Fetch content from the URL
    content = response.text

    #we will need BeautifulSoup for HTML parsing
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')

   
    text = soup.get_text()  # Extracting text from the HTML

   
    words = re.findall(r'\b\w+\b', text.lower())  # Use regex to find words

    word_counts = Counter(words)  # Count occurrences of each word
   

    most_frequent_words = word_counts.most_common(topno)  # Get the top N most frequent words

    return most_frequent_words


romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'

# 10 most frequent words
most_frequent_words = get_most_frequent_words(romeo_and_juliet_url, 10)


print(most_frequent_words)


[('the', 5), ('gutenberg', 4), ('to', 4), ('you', 4), ('project', 3), ('about', 3), ('contact', 3), ('of', 3), ('and', 3), ('help', 3)]


In [16]:
#2 Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find :
#the min, max, mean, median, standard deviation of cats' weight in metric units.
#the min, max, mean, median, standard deviation of cats' lifespan in years.
#Create a frequency table of country and breed of cats
import requests
import pandas as pd
import numpy as np

cats_api = 'https://api.thecatapi.com/v1/breeds'


response = requests.get(cats_api) # Fetch data from the Cat API
breeds_data = response.json()


df = pd.DataFrame(breeds_data) # Create a DataFrame from the API response


print(df.columns) # Check the columns in the DataFrame

# Convert weight and lifespan to numeric values
# Convert weight and lifespan to numeric values
df['weight.metric'] = pd.to_numeric(df['weight'], errors='coerce') * 0.453592  # this convert pounds to kilograms
df['life_span'] = pd.to_numeric(df['life_span'].str.extract('(\d+)')[0], errors='coerce')  # this extract only numeric values

# Calculate statistics for weight
weight_stats = {
    'min': df['weight.metric'].min(),
    'max': df['weight.metric'].max(),
    'mean': df['weight.metric'].mean(),
    'median': df['weight.metric'].median(),
    'std_dev': df['weight.metric'].std()
}

# Calculate statistics for lifespan
lifespan_stats = {
    'min': df['life_span'].min(),
    'max': df['life_span'].max(),
    'mean': df['life_span'].mean(),
    'median': df['life_span'].median(),
    'std_dev': df['life_span'].std()
}

# Creating a frequency table
frequency_table = df.groupby(['origin', 'name']).size().reset_index(name='count')


print("Statistics for Cats' Weight (in metric units):")
print(weight_stats)

print("\nStatistics for Cats' Lifespan (in years):")
print(lifespan_stats)

print("\nFrequency Table of Country and Breed:")
print(frequency_table)


Index(['weight', 'id', 'name', 'cfa_url', 'vetstreet_url', 'vcahospitals_url',
       'temperament', 'origin', 'country_codes', 'country_code', 'description',
       'life_span', 'indoor', 'lap', 'alt_names', 'adaptability',
       'affection_level', 'child_friendly', 'dog_friendly', 'energy_level',
       'grooming', 'health_issues', 'intelligence', 'shedding_level',
       'social_needs', 'stranger_friendly', 'vocalisation', 'experimental',
       'hairless', 'natural', 'rare', 'rex', 'suppressed_tail', 'short_legs',
       'wikipedia_url', 'hypoallergenic', 'reference_image_id', 'cat_friendly',
       'bidability'],
      dtype='object')
Statistics for Cats' Weight (in metric units):
{'min': nan, 'max': nan, 'mean': nan, 'median': nan, 'std_dev': nan}

Statistics for Cats' Lifespan (in years):
{'min': 8, 'max': 18, 'mean': 12.074626865671641, 'median': 12.0, 'std_dev': 1.8283411328456125}

Frequency Table of Country and Breed:
           origin              name  count
0       Austr

In [17]:
#3 Read the countries API and find
#the 10 largest countries
#the 10 most spoken languages
#the total number of languages in the countries API
import requests
import pandas as pd


countries_api = 'https://restcountries.com/v2/all' # Fetch data from the Countries API
response = requests.get(countries_api)
countries_data = response.json()


df = pd.DataFrame(countries_data) # Create a DataFrame from the API response

# Convert area and population to numeric values
df['area'] = pd.to_numeric(df['area'], errors='coerce') 

df['population'] = pd.to_numeric(df['population'], errors='coerce')

#10 largest countries
largest_countries = df.nlargest(10, 'area')[['name', 'area']]
print("10 Largest Countries:")
print(largest_countries)

#10 most spoken languages
languages = [lang['name'] for country in countries_data for lang in country.get('languages', [])]
language_counts = pd.Series(languages).value_counts().head(10)
print("\n10 Most Spoken Languages:")
print(language_counts)

#total number of languages in the countries API
total_languages = pd.Series(languages).nunique()
print("\nTotal Number of Languages:")
print(total_languages)


10 Largest Countries:
                         name        area
185        Russian Federation  17124442.0
8                  Antarctica  14000000.0
42                     Canada   9984670.0
48                      China   9640011.0
239  United States of America   9629091.0
31                     Brazil   8515767.0
13                  Australia   7692024.0
104                     India   3287590.0
10                  Argentina   2780400.0
117                Kazakhstan   2724900.0

10 Most Spoken Languages:
English       91
French        45
Arabic        25
Spanish       24
Portuguese    10
Russian        8
Dutch          8
German         7
Chinese        5
Italian        4
Name: count, dtype: int64

Total Number of Languages:
123


In [18]:
#4 UCI is one of the most common places to get data sets for data science and machine learning. 
#Read the content of UCL (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4
import requests
from bs4 import BeautifulSoup


uci_url = 'https://archive.ics.uci.edu/ml/datasets.php' # URL of the UCI Machine Learning Repository


response = requests.get(uci_url) # Fetch the HTML content from the URL
html_content = response.text


soup = BeautifulSoup(html_content, 'html.parser') # Parse the HTML using BeautifulSoup

# Extract and print the 1st 400content
print(soup.prettify()[:400])


<!DOCTYPE html>
<html data-theme="light" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Discover datasets around the world!" name="description"/>
  <link href="/favicon.ico" rel="icon"/>
  <link href="/apple-touch-icon.png" rel="apple-touch-icon"/>
  <!-- Provides metadata used when the app is installed on a m
