# 07_03: More data formats with Pandas

In [1]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [2]:
# Load the planets table and fix up column types at read time,
# then print a per-column summary (non-null counts and dtypes).
df = pd.read_csv(
    'Planets.csv',
    thousands=',',                 # treat ',' as a thousands separator in numbers
    parse_dates=['FirstVisited'],  # parse this column as datetimes
    dtype={
        'Diameter': np.float64,
        'MeanTemperature': np.float64,
        'Rings': 'category',
        'MagneticField': 'category',
    },
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Planet           10 non-null     object        
 1   Mass             10 non-null     float64       
 2   Diameter         10 non-null     float64       
 3   DayLength        10 non-null     float64       
 4   SunDistance      9 non-null      float64       
 5   OrbitPeriod      10 non-null     float64       
 6   OrbitVelocity    10 non-null     float64       
 7   MeanTemperature  10 non-null     float64       
 8   SurfacePressure  6 non-null      float64       
 9   Moons            10 non-null     int64         
 10  Rings            10 non-null     category      
 11  MagneticField    9 non-null      category      
 12  FirstVisited     9 non-null      datetime64[ns]
 13  FirstMission     9 non-null      object        
dtypes: category(2), datetime64[ns](1), float64(8)

In [3]:
# Save the DataFrame to disk in Python's pickle format.
df.to_pickle('myplanets.pkl')

In [4]:
# Load the pickled DataFrame back and preview its first rows.
# Caution: only unpickle files from a trusted source; loading a pickle can execute arbitrary code.
pd.read_pickle('myplanets.pkl').head()

Unnamed: 0,Planet,Mass,Diameter,DayLength,SunDistance,OrbitPeriod,OrbitVelocity,MeanTemperature,SurfacePressure,Moons,Rings,MagneticField,FirstVisited,FirstMission
0,MERCURY,0.33,4879.0,4222.6,57.9,88.0,47.4,167.0,0.0,0,No,Yes,1974-03-29,Mariner 10
1,VENUS,4.87,12104.0,2802.0,108.2,224.7,35.0,464.0,92.0,0,No,No,1962-08-27,Mariner 2
2,EARTH,5.97,12756.0,24.0,149.6,365.2,29.8,15.0,1.0,1,No,Yes,NaT,
3,MOON,0.073,3475.0,708.7,,27.3,1.0,-20.0,0.0,0,No,No,1959-09-12,Luna 2
4,MARS,0.642,6792.0,24.7,227.9,687.0,24.1,-65.0,0.01,2,No,No,1965-07-15,Mariner 4


In [5]:
# Save the DataFrame in the binary Feather format.
df.to_feather('planets.feather')

In [6]:
# Load the Feather file back and preview its first rows.
pd.read_feather('planets.feather').head()

Unnamed: 0,Planet,Mass,Diameter,DayLength,SunDistance,OrbitPeriod,OrbitVelocity,MeanTemperature,SurfacePressure,Moons,Rings,MagneticField,FirstVisited,FirstMission
0,MERCURY,0.33,4879.0,4222.6,57.9,88.0,47.4,167.0,0.0,0,No,Yes,1974-03-29,Mariner 10
1,VENUS,4.87,12104.0,2802.0,108.2,224.7,35.0,464.0,92.0,0,No,No,1962-08-27,Mariner 2
2,EARTH,5.97,12756.0,24.0,149.6,365.2,29.8,15.0,1.0,1,No,Yes,NaT,
3,MOON,0.073,3475.0,708.7,,27.3,1.0,-20.0,0.0,0,No,No,1959-09-12,Luna 2
4,MARS,0.642,6792.0,24.7,227.9,687.0,24.1,-65.0,0.01,2,No,No,1965-07-15,Mariner 4


In [None]:
# also: df.to_parquet('planets.parquet')
#       pd.read_parquet('planets.parquet')

In [7]:
# Read a JSON file laid out as a list of records (one object per row) and preview the first rows.
pd.read_json('Planets-records.json').head()

Unnamed: 0,Planet,Mass,FirstMission
0,MERCURY,0.33,Mariner 10
1,VENUS,4.87,Mariner 2
2,EARTH,5.97,
3,MOON,0.073,Luna 2
4,MARS,0.642,Mariner 4


In [None]:
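# df.to_json('Planets-records.json', orient='records')
# (note: the records file read above holds only Planet, Mass, and FirstMission,
#  so it corresponds to a column subset of df rather than the full table)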

In [8]:
# orient='index' tells pandas the outer JSON keys are row labels,
# so each planet name becomes an index entry and its fields become columns.
pd.read_json('Planets-index.json', orient='index')

Unnamed: 0,Mass,FirstMission
MERCURY,0.33,Mariner 10
VENUS,4.87,Mariner 2
EARTH,5.97,
MOON,0.073,Luna 2
MARS,0.642,Mariner 4
JUPITER,1898.0,Pioneer 10
SATURN,568.0,Pioneer 11
URANUS,86.8,Voyager 2
NEPTUNE,102.0,Voyager 2
PLUTO,0.0146,New Horizons


In [9]:
# Same file without orient='index': the outer keys are now read as column names,
# so the table comes out transposed (planets as columns, fields as rows).
pd.read_json('Planets-index.json')

Unnamed: 0,MERCURY,VENUS,EARTH,MOON,MARS,JUPITER,SATURN,URANUS,NEPTUNE,PLUTO
Mass,0.33,4.87,5.97,0.073,0.642,1898.0,568.0,86.8,102.0,0.0146
FirstMission,Mariner 10,Mariner 2,,Luna 2,Mariner 4,Pioneer 10,Pioneer 11,Voyager 2,Voyager 2,New Horizons


In [10]:
# orient='split' expects a JSON object that stores column names, index labels and data separately.
pd.read_json('Planets-split.json', orient='split')

Unnamed: 0,Planet,Mass,FirstMission
0,MERCURY,0.33,Mariner 10
1,VENUS,4.87,Mariner 2
2,EARTH,5.97,
3,MOON,0.073,Luna 2
4,MARS,0.642,Mariner 4
5,JUPITER,1898.0,Pioneer 10
6,SATURN,568.0,Pioneer 11
7,URANUS,86.8,Voyager 2
8,NEPTUNE,102.0,Voyager 2
9,PLUTO,0.0146,New Horizons


In [None]:
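# read_html also accepts a URL (needs network access); the next cell reads a local HTML file instead:
# pd.read_html('https://en.wikipedia.org/wiki/Athletics_at_the_2024_Summer_Olympics')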

In [11]:
# read_html parses the tables found in the page and returns them as a list of DataFrames.
pd.read_html('Athletics_at_the_2024_Summer_Olympics.html')

[  Athletics at the Games of the XXXIII Olympiad  \
 0                                           NaN   
 1                                         Venue   
 2                                         Dates   
 3                                 No. of events   
 4                                   Competitors   
 5                                  ← 20202028 →   
 
      Athletics at the Games of the XXXIII Olympiad.1  
 0                                                NaN  
 1  Stade de France (track and field events) Pont ...  
 2                                   1–11 August 2024  
 3                     48 (23 men, 23 women, 2 mixed)  
 4                                               1810  
 5                                       ← 20202028 →  ,
    Athletics at the 2024 Summer Olympics  \
 0                                    NaN   
 1                          Qualification   
 2                           Track events   
 3                                  100 m   
 4              

In [12]:
# match='Rank' keeps only the tables whose text matches 'Rank'; [0] takes the first of them.
# Tables can also be selected on an HTML attribute with attrs = {'id': ...}
pd.read_html('Athletics_at_the_2024_Summer_Olympics.html', match='Rank')[0]

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total
0,1,United States,14,11,9,34
1,2,Kenya,4,2,5,11
2,3,Canada,3,1,1,5
3,4,Netherlands,2,1,3,6
4,5,Spain,2,1,1,4
5,6,Norway,2,1,0,3
6,7,Great Britain,1,4,5,10
7,8,Jamaica,1,3,2,6
8,9,Ethiopia,1,3,0,4
9,10,Australia,1,2,4,7


In [13]:
# index_col=1 uses the second table column (NOC) as the row index.
# .tail() shows the last rows, which here include the table's trailing "Totals" row.
pd.read_html('Athletics_at_the_2024_Summer_Olympics.html', match='Rank', index_col=1)[0].tail()

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Poland,37,0,0,1,1
Puerto Rico,37,0,0,1,1
Qatar,37,0,0,1,1
Zambia,37,0,0,1,1
Totals (43 entries),Totals (43 entries),48,48,49,145


In [14]:
# skiprows=[44] drops the table row holding "Totals (43 entries)" (visible in the previous cell's output),
# so .tail() now ends at the last real entry.
# NOTE(review): the row number is hard-coded for this saved page; adjust it if the table gains or loses rows.
pd.read_html('Athletics_at_the_2024_Summer_Olympics.html', match='Rank', index_col=1, skiprows=[44])[0].tail()

Unnamed: 0_level_0,Rank,Gold,Silver,Bronze,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Czech Republic,37,0,0,1,1
Poland,37,0,0,1,1
Puerto Rico,37,0,0,1,1
Qatar,37,0,0,1,1
Zambia,37,0,0,1,1
