In [1]:
# Scrape SMART attributes from a Wikipedia table and output then as .csv
import pandas as pd
import requests


In [2]:
# Scrape SMART attributes
url = 'https://en.wikipedia.org/wiki/Self-Monitoring,_Analysis_and_Reporting_Technology'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:146.0) Gecko/20100101 Firefox/146.0'}
response = requests.get(url, headers=headers)


In [3]:
# Parse the HTML response
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')
print(f"Response status code: {response.status_code}")


Response status code: 200


In [4]:
# Find the SMART attributes table
# The table is usually titled "Known ATA S.M.A.R.T. attributes"
tables = soup.find_all('table', class_='wikitable')
print(f"Found {len(tables)} wikitable(s)")

# Extract the main SMART attributes table (usually the first one)
if tables:
    smart_table = tables[0]
    df = pd.read_html(str(smart_table))[0]
    print(f"\nTable shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")
    print(f"\nFirst few rows:")
    display(df.head())


Found 5 wikitable(s)


  df = pd.read_html(str(smart_table))[0]


ImportError: Missing optional dependency 'lxml'.  Use pip or conda to install lxml.

In [None]:
# Clean the data and save to CSV
# Remove any multi-level column headers if present
if isinstance(df.columns, pd.MultiIndex):
    df.columns = ['_'.join(col).strip() for col in df.columns.values]

# Save to CSV
output_path = 'datasets/SMART_attributes.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved SMART attributes to: {output_path}")
print(f"Total attributes: {len(df)}")
