# Pandas - typy - lekcja

In [1]:
import pandas as pd

## Series

In [3]:
# Input values for the Series; every element is a float.
DATA = [1.0, 2.0, 3.0, 4.0]

# No index is passed, so pandas assigns the default integer labels.
s = pd.Series(data=DATA)

In [6]:
# Bare last expression: the notebook renders the Series (labels, values, dtype).
s

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [8]:
# Labels of the Series; none were given at construction, so this is a default RangeIndex.
s.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
# How many attribute names dir() reports for a Series
# (the exact number likely differs between pandas versions).
len(dir(s))

417

## DataFrame

In [9]:
# Column label -> column values; one column each of str, int and float.
DATA = dict(
    A=['a', 'b', 'c', 'd'],
    B=[11, 22, 33, 44],
    C=[1.1, 2.2, 3.3, 4.4],
)

# Each dict key becomes a column; rows get the default integer index.
df = pd.DataFrame(data=DATA)

In [10]:
# Bare last expression: the notebook renders the DataFrame as a table.
df

Unnamed: 0,A,B,C
0,a,11,1.1
1,b,22,2.2
2,c,33,3.3
3,d,44,4.4


In [11]:
# Row labels of the frame; none were given at construction, so this is a default RangeIndex.
df.index

RangeIndex(start=0, stop=4, step=1)

In [12]:
# Column labels of the frame, taken from the keys of the DATA dict.
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [16]:
# How many attribute names dir() reports for a DataFrame
# (the exact number likely differs between pandas versions).
len(dir(df))

428

## Brakujące dane

- `np.nan`
- `pd.NA` (eksperymentalnie od Pandas 1.0)
- sprawdzanie czy brakujące: `pd.isna()` (alias: `pd.isnull()`); dla wartości zmiennoprzecinkowych także `np.isnan()`

In [38]:
# Same layout as the earlier frame, except one entry of column 'C' is None.
DATA = {
    'A': ['a', 'b', 'c', 'd'],
    'B': [11, 22, 33, 44],
    'C': [1.1, 2.2, None, 4.4],
}
df = pd.DataFrame(data=DATA)

# Boolean mask with the same shape as df: True marks a missing value.
pd.isna(df)

Unnamed: 0,A,B,C
0,False,False,False
1,False,False,False
2,False,False,True
3,False,False,False


## Inne typy

- SparseArray
- Interval
- Timestamp
- Timedelta
- Categorical

In [42]:
# Sample value lists for the examples in this section.
# EMPTY feeds the SparseArray column below; IRIS is used by the pd.Categorical cell.
EMPTY = [pd.NA, pd.NA, pd.NA]
IRIS = ['setosa', 'virginica', 'versicolor']
APOLLO = ['apollo11', 'apollo12', 'apollo14', 'apollo15', 'apollo16', 'apollo17']
STATUS = ['todo', 'done', 'todo', 'done']


# Column 'B' is a SparseArray made only of missing values.
# Reuse EMPTY instead of repeating the same literal inline.
pd.DataFrame({
    'A': [1, 2, 3],
    'B': pd.arrays.SparseArray(EMPTY),
})

Unnamed: 0,A,B
0,1,
1,2,
2,3,


In [43]:
# Values keep their input order; the inferred categories are listed sorted (see output).
pd.Categorical(IRIS)

['setosa', 'virginica', 'versicolor']
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [45]:
# Categorical of mission names; note that 'apollo13' is not in the list.
moon_landings = pd.Categorical(
    values=[
        'apollo11',
        'apollo12',
        'apollo14',
        'apollo15',
        'apollo16',
        'apollo17',
    ],
)

In [46]:
# Membership test: 'apollo18' is not among the moon_landings values.
'apollo18' in moon_landings

False

In [47]:
# Membership test: 'apollo11' is one of the moon_landings values.
'apollo11' in moon_landings

True

In [48]:
# Membership test: 'apollo13' was left out of moon_landings.
'apollo13' in moon_landings

False

In [60]:
# Interval (0, 5]: open on the left, closed on the right.
DATA = pd.Interval(left=0, right=5, closed='right')

# The right endpoint belongs to the interval.
5 in DATA

True

In [61]:
# The interval is closed on the right only, so the left endpoint 0 is excluded.
0 in DATA

False

In [63]:
# Timestamp parsed from a free-form date and time string.
pd.Timestamp('21 July 1969, 2:54')

Timestamp('1969-07-21 02:54:00')

In [64]:
# Date-only string: the time part comes out as 00:00:00.
pd.Timestamp('1969-07-21')

Timestamp('1969-07-21 00:00:00')

In [65]:
# The year 1970 as an interval of timestamps: the start of 1970 is included,
# the start of 1971 is excluded (closed on the left only).
year1970 = pd.Interval(
    pd.Timestamp('1970-01-01 00:00:00'),
    pd.Timestamp('1971-01-01 00:00:00'),
    closed='left',
)

In [66]:
# Two reference dates as Timestamps (date only, so the time part is midnight).
apollo11, apollo13 = map(pd.Timestamp, ('1969-07-16', '1970-04-11'))

In [67]:
# 1969-07-16 falls before the interval's left bound.
apollo11 in year1970

False

In [68]:
# 1970-04-11 falls inside the interval.
apollo13 in year1970

True

In [73]:
# Shift the timestamp forward by a fixed duration of 10 days.
apollo11 + pd.Timedelta(days=10)

Timestamp('1969-07-26 00:00:00')

In [74]:
# Same shift expressed as a DateOffset; for this date the result matches the Timedelta version.
apollo11 + pd.DateOffset(days=10)

Timestamp('1969-07-26 00:00:00')

In [75]:
# https://python.astrotech.io/numerical-analysis/pandas/date-calendar.html#custom-calendar
# https://python.astrotech.io/numerical-analysis/pandas/date-frequency.html

In [84]:
# One timestamp per second from 2019-12-24 00:00:00 through 2019-12-31 00:00:00
# (both endpoints included). The lowercase 's' alias is used because the
# uppercase 'S' alias is deprecated in recent pandas releases.
data = pd.date_range(start='2019-12-24', end='2019-12-31', freq='s')

# Wrap the DatetimeIndex in a Series; the notebook shows a truncated preview.
pd.Series(data)

0        2019-12-24 00:00:00
1        2019-12-24 00:00:01
2        2019-12-24 00:00:02
3        2019-12-24 00:00:03
4        2019-12-24 00:00:04
                 ...        
604796   2019-12-30 23:59:56
604797   2019-12-30 23:59:57
604798   2019-12-30 23:59:58
604799   2019-12-30 23:59:59
604800   2019-12-31 00:00:00
Length: 604801, dtype: datetime64[ns]