In [None]:
# Scrape SMART attributes from a Wikipedia table and output them as .csv.
# This will be used to map SMART attribute IDs to their names and descriptions.
# Written by Kevin Cotellesso
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# Fetch the Wikipedia article that contains the SMART attributes table.
url = 'https://en.wikipedia.org/wiki/Self-Monitoring,_Analysis_and_Reporting_Technology'
# A browser-like User-Agent is sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:146.0) Gecko/20100101 Firefox/146.0'}
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page in later cells.
response.raise_for_status()


In [None]:
# Parse the HTML response into a BeautifulSoup tree.
# BeautifulSoup is imported in the first cell together with the other imports,
# so a fresh "Restart & Run All" sees every dependency up front.
soup = BeautifulSoup(response.content, 'html.parser')
print(f"Response status code: {response.status_code}")


Response status code: 200


In [13]:
# Find the SMART attributes table.
# The table is usually titled "Known ATA S.M.A.R.T. attributes".
tables = soup.find_all('table', class_='wikitable')
print(f"Found {len(tables)} wikitable(s)")

# The attributes table is taken from index 1 of the wikitables found above
# (i.e. the second one, not the first).
SMART_TABLE_INDEX = 1
if len(tables) <= SMART_TABLE_INDEX:
    # Fail loudly here; otherwise `df` stays undefined and a later cell
    # dies with a confusing NameError.
    raise ValueError(
        f"Expected at least {SMART_TABLE_INDEX + 1} wikitables on the page, "
        f"found {len(tables)}; the page layout may have changed."
    )

smart_table = tables[SMART_TABLE_INDEX]
# read_html wants a file-like object; passing the literal HTML string is
# deprecated in recent pandas and emits a FutureWarning.
df = pd.read_html(StringIO(str(smart_table)))[0]
if 'ID' not in df.columns:
    # Guards against silently picking up a different table if Wikipedia
    # reorders the page.
    raise ValueError(
        f"Table at index {SMART_TABLE_INDEX} has no 'ID' column "
        f"(columns: {df.columns.tolist()}); the page layout may have changed."
    )

print(f"\nTable shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nFirst few rows:")
display(df.head(10))


Found 5 wikitable(s)

Table shape: (89, 5)

Columns: ['ID', 'Attribute name', 'Ideal', '!', 'Description']

First few rows:


  df = pd.read_html(str(smart_table))[0]


Unnamed: 0,ID,Attribute name,Ideal,!,Description
0,01 0x01,Read Error Rate,Low,,(Vendor specific raw value.) Stores data relat...
1,02 0x02,Throughput Performance,High,,Overall (general) throughput performance of a ...
2,03 0x03,Spin-Up Time,Low,,Average time of spindle spin up (from zero RPM...
3,04 0x04,Start/Stop Count,,,A tally of spindle start/stop cycles. The spin...
4,05 0x05,Reallocated Sectors Count,Low,[33][34][35],Count of reallocated sectors. The raw value re...
5,06 0x06,Read Channel Margin,,,Margin of a channel while reading data. The fu...
6,07 0x07,Seek Error Rate,Varies,,(Vendor specific raw value.) Rate of seek erro...
7,08 0x08,Seek Time Performance,High,,Average performance of seek operations of the ...
8,09 0x09,Power-On Hours,,,Count of hours in power-on state. The raw valu...
9,10 0x0A,Spin Retry Count,Low,[42],Count of retry of spin start attempts. This at...


In [14]:
# Clean up the data a little: get rid of the "!" column and turn the ID into an integer.
# errors='ignore' keeps this cell re-runnable once "!" has already been dropped.
df = df.drop(columns=['!'], errors='ignore')
# Raw ID cells look like "01 0x01" (decimal, then hex); keep the leading decimal token.
# - astype(str) lets the cell run again after ID has already become an integer.
# - split() with no argument splits on any whitespace run, so a non-breaking
#   space or stray leading blank in the scraped cell does not break parsing.
df['ID'] = df['ID'].astype(str).str.split().str[0].astype(int)
print(f"\nCleaned DataFrame shape: {df.shape}")
display(df.head(10))


Cleaned DataFrame shape: (89, 4)


Unnamed: 0,ID,Attribute name,Ideal,Description
0,1,Read Error Rate,Low,(Vendor specific raw value.) Stores data relat...
1,2,Throughput Performance,High,Overall (general) throughput performance of a ...
2,3,Spin-Up Time,Low,Average time of spindle spin up (from zero RPM...
3,4,Start/Stop Count,,A tally of spindle start/stop cycles. The spin...
4,5,Reallocated Sectors Count,Low,Count of reallocated sectors. The raw value re...
5,6,Read Channel Margin,,Margin of a channel while reading data. The fu...
6,7,Seek Error Rate,Varies,(Vendor specific raw value.) Rate of seek erro...
7,8,Seek Time Performance,High,Average performance of seek operations of the ...
8,9,Power-On Hours,,Count of hours in power-on state. The raw valu...
9,10,Spin Retry Count,Low,Count of retry of spin start attempts. This at...


In [15]:
# Write the table to disk as CSV.
# If the header is a MultiIndex, collapse it to a single row first by joining
# each column's levels with "_".
if isinstance(df.columns, pd.MultiIndex):
    flat_names = []
    for levels in df.columns.values:
        flat_names.append('_'.join(levels).strip())
    df.columns = flat_names

# The file is written relative to the current working directory, without the index.
output_path = 'SMART_attributes.csv'
df.to_csv(output_path, index=False)

row_count = len(df)
print(f"\nSaved SMART attributes to: {output_path}")
print(f"Total attributes: {row_count}")



Saved SMART attributes to: SMART_attributes.csv
Total attributes: 89
