# Pandas - DataFrame - wyszukiwanie - lekcja

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fix the global NumPy seed so the random sample data is the same on every run.
np.random.seed(0)

# Seven consecutive days (rows) by four times of day (columns), filled with
# draws from the standard normal distribution.
df = pd.DataFrame(
    np.random.randn(7, 4),
    index=pd.date_range('1999-12-30', periods=7),
    columns=['Morning', 'Noon', 'Evening', 'Midnight'])

df

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


## Próbkowanie

- First
- Last
- Head
- Tail
- Sample

In [25]:
# Shuffle all rows (frac=1.0 samples 100% of them) and move the date index
# into an ordinary first column, leaving a fresh 0..n-1 integer index.
df2 = df.sample(frac=1.0).reset_index()
# Name the former index column 'Datetime'; the remaining labels are df's own.
df2.columns = ['Datetime', *df.columns]
df2

Unnamed: 0,Datetime,Morning,Noon,Evening,Midnight
0,1999-12-31,1.867558,-0.977278,0.950088,-0.151357
1,1999-12-30,1.764052,0.400157,0.978738,2.240893
2,2000-01-05,2.269755,-1.454366,0.045759,-0.187184
3,2000-01-03,1.494079,-0.205158,0.313068,-0.854096
4,2000-01-04,-2.55299,0.653619,0.864436,-0.742165
5,2000-01-02,0.761038,0.121675,0.443863,0.333674
6,2000-01-01,-0.103219,0.410599,0.144044,1.454274


## Wybieranie wartości

- Wiersze:

    - Numeric Index
    - String Index
    - Date Index
    
- Kolumny

    - Po nazwie (jedna kolumna, wiele kolumn)
    - Po indeksach (jedna kolumna, wiele kolumn) 

In [34]:
# Attribute-style access to a single column; the result is a Series.
df.Morning

1999-12-30    1.764052
1999-12-31    1.867558
2000-01-01   -0.103219
2000-01-02    0.761038
2000-01-03    1.494079
2000-01-04   -2.552990
2000-01-05    2.269755
Freq: D, Name: Morning, dtype: float64

In [35]:
# Bracket access by column label; yields the same Series as df.Morning.
df['Morning']

1999-12-30    1.764052
1999-12-31    1.867558
2000-01-01   -0.103219
2000-01-02    0.761038
2000-01-03    1.494079
2000-01-04   -2.552990
2000-01-05    2.269755
Freq: D, Name: Morning, dtype: float64

In [37]:
# A list of labels selects several columns at once and returns a DataFrame.
columns = ['Morning','Noon']
df[columns]

Unnamed: 0,Morning,Noon
1999-12-30,1.764052,0.400157
1999-12-31,1.867558,-0.977278
2000-01-01,-0.103219,0.410599
2000-01-02,0.761038,0.121675
2000-01-03,1.494079,-0.205158
2000-01-04,-2.55299,0.653619
2000-01-05,2.269755,-1.454366


## Wycinanie wartości

- Wiersze:

    - Numeric Index
    - String Index
    - Date Index, zakresy dat (rok, miesiąc)
    
- Kolumny

    - Po nazwie (jedna kolumna, wiele kolumn)
    - Po indeksach (jedna kolumna, wiele kolumn) 
    
- Wiersze i kolumny

    - wybrane wiersze i wiele kolumn
    - zakres wierszy i jedna kolumna
    - jeden wiersz i wiele kolumn
    - jeden wiersz i zakres kolumn
    - indeksy binarne (maski)
    - fancy indexing
    - callable

In [63]:
# An integer slice inside [] selects rows by position:
# every second row starting at position 1 and stopping before position 7.
df[1:7:2]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-04,-2.55299,0.653619,0.864436,-0.742165


In [65]:
# Rows from position 2 to the end.
df[2:]

Unnamed: 0,Morning,Noon,Evening,Midnight
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


In [62]:
# Show the whole frame for reference; the slices above did not modify it.
df

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


## Selekcja

- loc i iloc
- at i iat

<img src="img/pandas-select-row.png" width="800">
<img src="img/pandas-select-column.png" width="800">
<img src="img/pandas-select-cell.png" width="800">
<img src="img/pandas-select.png" width="800">

In [68]:
# Scalar lookup by integer position: row 5, column 0
# (the Morning value for 2000-01-04).
df.iat[5,0]

-2.5529898158340787

In [69]:
# Scalar lookup by labels (row label, column label); same cell as df.iat[5, 0].
df.at['2000-01-04', 'Morning']

-2.5529898158340787

In [74]:
# Position-based selection with iloc; a (row, column) pair of integers
# returns the single cell value.
df.iloc[5,0]

-2.5529898158340787

In [75]:
# A single integer selects one row, returned as a Series indexed by column name.
df.iloc[5]

Morning    -2.552990
Noon        0.653619
Evening     0.864436
Midnight   -0.742165
Name: 2000-01-04 00:00:00, dtype: float64

In [78]:
# All rows, column at position 2 (Evening), returned as a Series.
df.iloc[:, 2]

1999-12-30    0.978738
1999-12-31    0.950088
2000-01-01    0.144044
2000-01-02    0.443863
2000-01-03    0.313068
2000-01-04    0.864436
2000-01-05    0.045759
Freq: D, Name: Evening, dtype: float64

In [80]:
# Positional slices with a step on both axes:
# rows 2 and 5, columns 1 and 3 (Noon and Midnight).
df.iloc[2:7:3, 1:4:2]

Unnamed: 0,Noon,Midnight
2000-01-01,0.410599,1.454274
2000-01-04,0.653619,-0.742165


In [86]:
# Label slices with a step. Unlike positional slices, label slices include
# the end label: every third row from 2000-01-01 through 2000-01-04 and every
# second column from Morning through Evening.
df.loc['2000-01-01':'2000-01-04':3, 'Morning':'Evening':2]

Unnamed: 0,Morning,Evening
2000-01-01,-0.103219,0.144044
2000-01-04,-2.55299,0.864436


In [101]:
# The same selection as the previous cell, with the bounds kept in variables.
start = '2000-01-01'
end = '2000-01-04'
# A slice object can be built ahead of time and passed to .loc.
# NOTE: this rebinds `columns`, which earlier in the notebook held a list of
# column labels.
columns = slice('Morning', 'Evening', 2)

# .iat[0, 0] then picks the top-left cell of the selected sub-frame.
df.loc[start:end:3, columns].iat[0,0]

-0.10321885179355784

In [100]:
# NOTE(review): hidden state. On a top-to-bottom run `df2` is still the
# shuffled frame whose first column is 'Datetime', so this returns a Timestamp.
# The float stored as this cell's output was presumably produced while `df2`
# held a different frame (possibly the selection from the previous cell) —
# confirm the intended source of `df2` here.
df2.iat[0,0]

-0.10321885179355784

In [None]:
# Partial-string indexing on the date index: all rows in January 2000.
df.loc['2000-01']

In [107]:
# From the January 2000 rows keep only those up to the end of the first
# calendar week (here 2000-01-01 and 2000-01-02).
# NOTE(review): DataFrame.first is deprecated in recent pandas releases
# (2.1 and later) — confirm against the pandas version this lesson targets.
df.loc['2000-01'].first('W')

Unnamed: 0,Morning,Noon,Evening,Midnight
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674


In [110]:
# Rows from the year 1999, a single column by label; the result is a Series.
df.loc['1999', 'Morning']

1999-12-30    1.764052
1999-12-31    1.867558
Freq: D, Name: Morning, dtype: float64

In [109]:
# Rows from 1999, several columns given as a list; the columns come back in
# the order listed (Noon before Morning).
df.loc['1999', ['Noon', 'Morning']]

Unnamed: 0,Noon,Morning
1999-12-30,0.400157,1.764052
1999-12-31,-0.977278,1.867558


In [114]:
# Rows from 1999, columns Morning through Evening (the end label is included).
df.loc['1999', 'Morning':'Evening']

Unnamed: 0,Morning,Noon,Evening
1999-12-30,1.764052,0.400157,0.978738
1999-12-31,1.867558,-0.977278,0.950088


In [113]:
# Same selection limited to its first calendar week; both 1999 rows fall
# inside that week, so the result is unchanged.
# NOTE(review): DataFrame.first is deprecated in recent pandas releases
# (2.1 and later) — confirm against the pandas version this lesson targets.
df.loc['1999', 'Morning':'Evening'].first('W')

Unnamed: 0,Morning,Noon,Evening
1999-12-30,1.764052,0.400157,0.978738
1999-12-31,1.867558,-0.977278,0.950088


In [116]:
# A list of date labels selects exactly those rows (not a range between them).
df.loc[['1999-12-30', '2000-01-05'], 'Noon']

1999-12-30    0.400157
2000-01-05   -1.454366
Name: Noon, dtype: float64

In [None]:
# The same kind of selection using Timestamp objects instead of strings.
date1 = pd.Timestamp('2000-01-02')
date2 = pd.Timestamp('2000-01-05')

# A list picks the two listed rows only.
df.loc[[date1,date2], 'Noon']

In [119]:
# Boolean mask with one flag per row; rows flagged True are kept.
mask = [True, False, False, True, False, True, True]
df.loc[mask]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


In [123]:
# Boolean masks on both axes: the first list filters rows, the second columns.
df.loc[[True, False, False, True, False, True, True], [True, False, True, False]]

Unnamed: 0,Morning,Evening
1999-12-30,1.764052,0.978738
2000-01-02,0.761038,0.443863
2000-01-04,-2.55299,0.864436
2000-01-05,2.269755,0.045759


In [124]:
# Indexing with a boolean DataFrame keeps the original shape; cells that fail
# the condition are shown as missing values instead of being dropped.
df[df > 0]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,,0.950088,
2000-01-01,,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,,0.313068,
2000-01-04,,0.653619,0.864436,
2000-01-05,2.269755,,0.045759,


## Wyszukiwanie

- np.where
- maski
- zapytania proste
- zapytania złożone
- logical_and
- logical_not
- logical_or
- logical_xor

In [125]:
# np.where with only a condition returns a tuple (row positions, column
# positions) of the cells where the condition holds.
query = np.where(df > 2)

# CAUTION: iloc treats the two arrays independently (all listed rows x all
# listed columns), so the result is a sub-frame that also contains cells which
# do not satisfy `> 2` (e.g. 1.764052 and -0.187184 in the output).
df.iloc[query]

Unnamed: 0,Midnight,Morning
1999-12-30,2.240893,1.764052
2000-01-05,-0.187184,2.269755


In [126]:
# A boolean Series inside [] filters rows: keep rows where Noon is negative.
df[df['Noon'] < 0]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


In [128]:
# The same filter with the boolean mask stored in a variable first.
query = df['Noon'] < 0
df[query]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


In [129]:
# Build two conditions separately ...
query1 = df['Noon'] < 0
query2 = df['Evening'] > 0.5

# ... and combine them element-wise with & (logical AND).
df[query1 & query2]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-31,1.867558,-0.977278,0.950088,-0.151357


In [None]:
# Inline form of the combined filter. The parentheses are required because
# & binds more tightly than the comparison operators.
df[(df['Noon']<0) & (df['Evening']>0.5)]

In [130]:
# The combined condition stored under a descriptive name before filtering.
where = (df['Noon'] < 0) & (df['Evening'] > 0.5)
df[where]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-31,1.867558,-0.977278,0.950088,-0.151357


In [None]:
# Same combined filter once more, with short names for the partial conditions.
a = df['Noon'] < 0
b = df['Evening'] > 0.5
query = a & b

# NOTE(review): the lines below look like a list of naming options for such
# mask variables (the other cells use `query`, `select` and `where`) — confirm.
# query
# select
# where
# a, b, c, d, ...

df[query]

In [142]:
# .loc with a boolean row mask and a list of column labels.
df.loc[df['Noon']<0, ['Noon','Midnight']]

Unnamed: 0,Noon,Midnight
1999-12-31,-0.977278,-0.151357
2000-01-03,-0.205158,-0.854096
2000-01-05,-1.454366,-0.187184


In [143]:
# .loc with a boolean row mask and a label slice of columns (end included).
df.loc[df['Noon']<0, 'Noon':'Midnight']

Unnamed: 0,Noon,Evening,Midnight
1999-12-31,-0.977278,0.950088,-0.151357
2000-01-03,-0.205158,0.313068,-0.854096
2000-01-05,-1.454366,0.045759,-0.187184


In [146]:
# Work on a copy so the assignment in the next cell does not modify `df`.
# NOTE: this rebinds `df2`, which earlier held the shuffled frame.
df2 = df.copy()

In [150]:
# Assignment through .loc: in rows where Noon is negative, set the columns
# Noon through Midnight to 0 (Morning is left untouched).
df2.loc[df2['Noon']<0, 'Noon':'Midnight'] = 0
df2

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,0.0,0.0,0.0
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,0.0,0.0,0.0
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,0.0,0.0,0.0


In [155]:
def morning_below_zero(df):
    """Return a boolean mask of rows whose 'Morning' value has magnitude above 1.

    NOTE(review): despite the name, the test is ``abs(value) > 1`` and not
    ``value < 0`` — confirm which of the two (name or condition) is intended.
    """
    magnitude = abs(df['Morning'])
    return magnitude > 1


# .loc accepts a callable: it is called with the frame and the boolean Series
# it returns is used as the row mask.
df.loc[morning_below_zero]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


In [154]:
# The same callable indexer written inline as a lambda; the lambda's `df`
# parameter is the frame being indexed.
df.loc[lambda df: abs(df['Morning']) > 1]

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


## Modyfikacja danych

- Cały wiersz
- Cała kolumna
- Wybrane kolumny i wybrane wiersze
- Zakres kolumn i zakres wierszy
- Konkretna komórka
- Spełniające wyrażenie
- Podmiana wartości `replace`

In [159]:
# Blank out every row whose Morning value has magnitude above 1.
# np.nan is used instead of pd.NA because it is the native missing-value
# marker for float64 columns, so the columns keep their numeric dtype.
# Assigning pd.NA into float columns can turn them into object columns,
# depending on the pandas version.
# NOTE: this modifies `df` in place; the cells that follow rely on that.
df.loc[lambda frame: abs(frame['Morning']) > 1] = np.nan

In [175]:
# Small example frame: names in 'A', random floats in 'B'.
# 'B' is drawn from the global NumPy random state, so its values depend on
# how many random draws were made earlier in the session.
df3 = pd.DataFrame({
    'A': ['Watney', 'Twardowski', 'Lewis', 'Vogel'],
    'B': np.random.randn(4),
})

In [166]:
# Preview of `replace` with a mapping: True -> 1, False -> 'brak danych'.
# The result is only displayed, not written back to df3['B']: storing a mix of
# integers and strings there would make the below-zero comparison in the next
# cell fail with a TypeError (str vs int) when the notebook is run top to bottom.
(df3['B'] < 0).replace({True: 1, False: 'brak danych'})

In [176]:
# Boolean mask of the 'B' readings that are below zero.
# NOTE(review): per the inline comment, negative readings are the erroneous
# ones, yet the mapping below turns them into 1 and turns the valid readings
# into missing values — confirm this is the intended direction.
select = (df3['B'] < 0) # because the sensor was broken and values below zero are erroneous
replace = {
    True: 1,
    False: pd.NA}

# Overwrite 'B' with the mapped flags (this replaces the original readings).
df3['B'] = select.replace(replace)

In [177]:
# Show df3 after 'B' was replaced by the mapped flags.
df3

Unnamed: 0,A,B
0,Watney,1.0
1,Twardowski,
2,Lewis,1.0
3,Vogel,


In [178]:
# Boolean frame marking missing cells; the rows blanked out earlier are True.
df.isna()

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,True,True,True,True
1999-12-31,True,True,True,True
2000-01-01,False,False,False,False
2000-01-02,False,False,False,False
2000-01-03,True,True,True,True
2000-01-04,True,True,True,True
2000-01-05,True,True,True,True


In [179]:
# Show `df` after the in-place blanking: only the rows whose Morning value
# had magnitude of at most 1 still hold data.
df

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,,,,
1999-12-31,,,,
2000-01-01,-0.103219,0.410599,0.144044,1.45427
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,,,,
2000-01-04,,,,
2000-01-05,,,,


In [180]:
# Return a new frame with missing values replaced by 0; the result is not
# assigned, so `df` itself keeps its missing values.
df.fillna(0)

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,0.0,0.0,0.0,0.0
1999-12-31,0.0,0.0,0.0,0.0
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,0.0,0.0,0.0,0.0
2000-01-04,0.0,0.0,0.0,0.0
2000-01-05,0.0,0.0,0.0,0.0
