In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Web scraping from https://toscrape.com/
# Only the landing page of the catalogue is fetched, so the resulting frame
# holds the books listed on that single page.
url = 'http://books.toscrape.com/'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as if it
# were the catalogue.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Extract book data (title, price, availability)
books = []
for article in soup.select('article.product_pod'):
    title = article.select_one('h3 a')['title']
    # Price is kept as the raw text shown on the page (currency symbol included).
    price = article.select_one('.price_color').text
    # Look the availability tag up once and reuse it; fall back to 'N/A' when absent.
    availability_tag = article.select_one('.instock.availability')
    availability = availability_tag.text.strip() if availability_tag is not None else 'N/A'
    books.append({'title': title, 'price': price, 'availability': availability})

# Create DataFrame. Naming the columns explicitly guarantees they exist even
# if no book was matched, so later cells do not fail with a KeyError.
df = pd.DataFrame(books, columns=['title', 'price', 'availability'])

# Display the first few rows to verify
df.head()

Unnamed: 0,title,price,availability
0,A Light in the Attic,£51.77,In stock
1,Tipping the Velvet,£53.74,In stock
2,Soumission,£50.10,In stock
3,Sharp Objects,£47.82,In stock
4,Sapiens: A Brief History of Humankind,£54.23,In stock


In [3]:
# EDA - Answer the questions

# 1. What is the size of the dataset?
# df.shape is a (rows, columns) tuple; unpack it for a readable message.
size = df.shape
row_count, column_count = size
print(f"Size of the dataset: {row_count} rows, {column_count} columns")

Size of the dataset: 20 rows, 3 columns


In [4]:
# 2. What are the names and data types of each column?
# Note: 'price' is scraped as text (e.g. '£51.77'); it is reported with a
# numeric dtype here only if the conversion under question 4 has already been
# executed in the current kernel.
column_info = df.dtypes
print("\nNames and data types of each column:", column_info, sep="\n")


Names and data types of each column:
title            object
price           float64
availability     object
dtype: object


In [5]:
# 3. How many unique values are there for each categorical variable?
# 'price' is numeric data that is still stored as text until the conversion
# under question 4 has run. Exclude it explicitly so this answer does not
# depend on whether that cell was executed first.
categorical_columns = (
    df.select_dtypes(include=['object'])
      .drop(columns=['price'], errors='ignore')
)
unique_values = categorical_columns.nunique()
print("\nNumber of unique values for each categorical variable:")
print(unique_values)


Number of unique values for each categorical variable:
title           20
availability     1
dtype: int64


In [10]:
# 4. If there is any numerical value in the dataset, what are the minimum and maximum values for it?
# Prices are scraped as text such as '£51.77'. Strip everything except digits
# and the decimal point, then parse with pd.to_numeric. errors='coerce' turns
# unparseable values into NaN (which the dropna step can remove) instead of
# silently leaving the whole column as strings, which would make describe()
# return no 'min'/'max' rows at all.
# The dtype guard makes the cell safe to re-run after the column is numeric.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = pd.to_numeric(
        df['price'].str.replace(r'[^0-9.]', '', regex=True),
        errors='coerce',
    )
numeric_summary = df.describe()
print("\nMinimum and maximum values for numerical columns:")
# Single .loc call selects rows and columns together (no chained indexing).
print(numeric_summary.loc[['min', 'max'], ['price']])


Minimum and maximum values for numerical columns:
     price
min  13.99
max  57.25


In [8]:
# 5. Drop rows that have missing values
# Any row containing at least one missing value is removed; the result goes
# into a new frame, so df itself is not modified.
df_cleaned = df.dropna(axis=0, how='any')
remaining_rows = len(df_cleaned)
print(f"\nNumber of rows after dropping missing values: {remaining_rows}")


Number of rows after dropping missing values: 20


In [9]:
# 6. What are the most frequent categories in the data?
# value_counts() orders categories from most to least frequent, so the first
# entry is the most common availability status.
availability_counts = df['availability'].value_counts()
most_frequent = availability_counts.iloc[:1]
top_category = most_frequent.index[0]
top_count = most_frequent.iloc[0]
print(f"\nMost frequent category in 'availability': {top_category} with {top_count} occurrences")
# Observation: in the recorded run every scraped book reports the same
# availability ('In stock'), so this column carries no variation for the
# page that was scraped.


Most frequent category in 'availability': In stock with 20 occurrences
