In [8]:
import pandas as pd
import requests
from io import StringIO

# Download Wikipedia's "List of brightest stars" page, pull out the star
# table, flatten its header if needed, preview it, and save it as CSV.
url = 'https://en.wikipedia.org/wiki/List_of_brightest_stars'
# Sent with the request so it carries an explicit User-Agent header.
headers = {
    'User-Agent': 'Mozilla/5.0'
}

# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection cannot hang the cell.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx reply instead of handing an error page to
# read_html, which would only report that no matching table was found.
response.raise_for_status()
html_data = StringIO(response.text)

# match= keeps only tables whose text contains 'Proper name', so unrelated
# tables on the page (e.g. warning boxes) are skipped.
tables = pd.read_html(html_data, match='Proper name')
df = tables[0]

# Flatten multi-level headers into single strings. str() guards against
# header levels that are not strings, which ' '.join would reject.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [' '.join(map(str, col)).strip() for col in df.columns.values]

print(f"Success! Found {len(df)} stars.")

# Column names come straight from the page and may still carry extra
# spaces or symbols (footnote markers such as "[8]"), so preview the first
# five rows to see the names that actually exist.
print(df.head())

df.to_csv('brightest_stars_cleaned.csv', index=False)

Success! Found 94 stars.
   Rank Visual magnitude (mV)             Proper name[8] Bayer designation  \
0     0                −26.74                        Sun               NaN   
1     1                 −1.46                     Sirius   α Canis Majoris   
2     2                 −0.74                    Canopus         α Carinae   
3     3   −0.27 (0.01 + 1.33)  Rigil Kentaurus & Toliman        α Centauri   
4     4                 −0.05                   Arcturus          α Boötis   

   Distance (ly)  Spectral type Celestial Hemisphere  
0           0.00           G2 V                  NaN  
1           8.60  A0mA1 Va, DA2             Southern  
2         310.00          A9 II             Southern  
3           4.34     G2 V, K1 V             Southern  
4          37.00         K0 III             Northern  
