# Pandas - DataFrame - obliczenia - lekcja

In [1]:
import pandas as pd
import numpy as np

In [2]:
np.random.seed(0)

df = pd.DataFrame(
    columns = ['Morning', 'Noon', 'Evening', 'Midnight'],
    index = pd.date_range('1999-12-30', periods=7),
    data = np.random.randn(7, 4))

df
#              Morning      Noon   Evening  Midnight
# 1999-12-30  1.764052  0.400157  0.978738  2.240893
# 1999-12-31  1.867558 -0.977278  0.950088 -0.151357
# 2000-01-01 -0.103219  0.410599  0.144044  1.454274
# 2000-01-02  0.761038  0.121675  0.443863  0.333674
# 2000-01-03  1.494079 -0.205158  0.313068 -0.854096
# 2000-01-04 -2.552990  0.653619  0.864436 -0.742165
# 2000-01-05  2.269755 -1.454366  0.045759 -0.187184

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


## Statystyka

- Liczba elementów
- Liczba unikalnych elementów
- Liczba wartości w kolumnach
- Suma i suma kumulatywna
- Iloczyn i iloczyn kumulatywny
- Minimum, indeks minimum, kumulatywne minimum
- Maksimum, indeks maksimum, kumulatywne maksimum
- Średnia arytmetyczna
- Błąd standardowy, błąd odchylenia średniej (`sem`)
- Mediana
- Moda
- Odchylenie standardowe
- Kwantyle (Percentyle)
- Wariancja
- Współczynnik korelacji
- Statystyki opisowe
- Średnie odchylenie bezwzględne `mad` (średnia arytmetyczna z odchyleń bezwzględnych) [miara zróżnicowania rozkładu]
- Skośność (w jaki sposób zmienne kształtują się wokół średniej)
- Kurtoza (jak bardzo wyniki są skoncentrowane lub oddalone od średniej)
- Wartość bezwzględna

<img src="img/stats-stdev.png" width="800" />
<img src="img/stats-kurt.png" width="800" />
<img src="img/stats-skew.png" width="800" />
<img src="img/stats-rolling.png" width="800" />
<img src="img/stats-sem.png" width="800" />

In [3]:
df

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,1.764052,0.400157,0.978738,2.240893
1999-12-31,1.867558,-0.977278,0.950088,-0.151357
2000-01-01,-0.103219,0.410599,0.144044,1.454274
2000-01-02,0.761038,0.121675,0.443863,0.333674
2000-01-03,1.494079,-0.205158,0.313068,-0.854096
2000-01-04,-2.55299,0.653619,0.864436,-0.742165
2000-01-05,2.269755,-1.454366,0.045759,-0.187184


In [4]:
df.mean()

Morning     0.785753
Noon       -0.150107
Evening     0.534285
Midnight    0.299148
dtype: float64

In [5]:
df.std()

Morning     1.671798
Noon        0.787967
Evening     0.393169
Midnight    1.151785
dtype: float64

In [6]:
df.rolling(window=3).mean()

Unnamed: 0,Morning,Noon,Evening,Midnight
1999-12-30,,,,
1999-12-31,,,,
2000-01-01,1.17613,-0.055507,0.690957,1.18127
2000-01-02,0.841792,-0.148335,0.512665,0.54553
2000-01-03,0.717299,0.109038,0.300325,0.311284
2000-01-04,-0.099291,0.190045,0.540456,-0.420862
2000-01-05,0.403615,-0.335302,0.407754,-0.594482


In [7]:
df.corr()

Unnamed: 0,Morning,Noon,Evening,Midnight
Morning,1.0,-0.69834,-0.190219,0.201034
Noon,-0.69834,1.0,0.307686,0.359761
Evening,-0.190219,0.307686,1.0,0.136436
Midnight,0.201034,0.359761,0.136436,1.0


In [8]:
df.describe()

Unnamed: 0,Morning,Noon,Evening,Midnight
count,7.0,7.0,7.0,7.0
mean,0.785753,-0.150107,0.534285,0.299148
std,1.671798,0.787967,0.393169,1.151785
min,-2.55299,-1.454366,0.045759,-0.854096
25%,0.328909,-0.591218,0.228556,-0.464674
50%,1.494079,0.121675,0.443863,-0.151357
75%,1.815805,0.405378,0.907262,0.893974
max,2.269755,0.653619,0.978738,2.240893


## Grupowanie

- Co to jest grupowanie
- Do czego służy grupowanie
- Klucze
- Metody grupowania
    
    - Wielkość: size, count, nunique
    - Próbkowanie: first, last, head, tail
    - Statystyki: mean, sum, min, max
    
- Format wyświetlania: Series czy DataFrame

In [11]:
DATA = 'https://python.astrotech.io/_static/phones.csv'

In [78]:
df = pd.read_csv(DATA, parse_dates=['date'], index_col=0)

In [19]:
# df.set_index('index')
# df.drop(columns='index')

Unnamed: 0,date,duration,item,month,network,network_type
0,2014-10-15 06:58:00,34.429,data,2014-11,data,data
1,2014-10-15 06:58:00,13.000,call,2014-11,Vodafone,mobile
2,2014-10-15 14:46:00,23.000,call,2014-11,Meteor,mobile
3,2014-10-15 14:48:00,4.000,call,2014-11,Tesco,mobile
4,2014-10-15 17:27:00,4.000,call,2014-11,Tesco,mobile
...,...,...,...,...,...,...
825,2015-03-13 00:38:00,1.000,sms,2015-03,world,world
826,2015-03-13 00:39:00,1.000,sms,2015-03,Vodafone,mobile
827,2015-03-13 06:58:00,34.429,data,2015-03,data,data
828,2015-03-14 00:13:00,1.000,sms,2015-03,world,world


In [21]:
df

Unnamed: 0_level_0,date,duration,item,month,network,network_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2014-10-15 06:58:00,34.429,data,2014-11,data,data
1,2014-10-15 06:58:00,13.000,call,2014-11,Vodafone,mobile
2,2014-10-15 14:46:00,23.000,call,2014-11,Meteor,mobile
3,2014-10-15 14:48:00,4.000,call,2014-11,Tesco,mobile
4,2014-10-15 17:27:00,4.000,call,2014-11,Tesco,mobile
...,...,...,...,...,...,...
825,2015-03-13 00:38:00,1.000,sms,2015-03,world,world
826,2015-03-13 00:39:00,1.000,sms,2015-03,Vodafone,mobile
827,2015-03-13 06:58:00,34.429,data,2015-03,data,data
828,2015-03-14 00:13:00,1.000,sms,2015-03,world,world


In [22]:
df.groupby('month')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12af05b50>

In [23]:
df.groupby(['month', 'item'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12af05a30>

In [24]:
df.groupby('month').size()

month
2014-11    230
2014-12    157
2015-01    205
2015-02    137
2015-03    101
dtype: int64

In [25]:
df.groupby('month').mean()

Unnamed: 0_level_0,duration
month,Unnamed: 1_level_1
2014-11,115.823657
2014-12,93.260318
2015-01,88.894141
2015-02,113.301453
2015-03,225.251891


In [27]:
df.groupby('month').nunique()

Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,210,77,3,8,5
2014-12,147,63,3,8,5
2015-01,172,72,3,7,4
2015-02,125,64,3,8,5
2015-03,93,48,3,8,5


In [28]:
df.groupby('month').sum()

Unnamed: 0_level_0,duration
month,Unnamed: 1_level_1
2014-11,26639.441
2014-12,14641.87
2015-01,18223.299
2015-02,15522.299
2015-03,22750.441


In [29]:
df.groupby('month').count()

Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,230,230,230,230,230
2014-12,157,157,157,157,157
2015-01,205,205,205,205,205
2015-02,137,137,137,137,137
2015-03,101,101,101,101,101


In [30]:
df.groupby('month').max()

Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,2014-12-11 19:20:00,1940.0,sms,voicemail,voicemail
2014-12,2014-12-14 19:54:00,2120.0,sms,world,world
2015-01,2015-12-01 18:26:00,1859.0,sms,voicemail,voicemail
2015-02,2015-12-02 06:58:00,1863.0,sms,voicemail,voicemail
2015-03,2015-12-03 06:58:00,10528.0,sms,world,world


In [31]:
df.groupby('month').min()

Unnamed: 0_level_0,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,2014-01-11 06:58:00,1.0,call,Meteor,data
2014-12,2014-01-12 06:58:00,1.0,call,Meteor,data
2015-01,2014-12-13 06:58:00,1.0,call,Meteor,data
2015-02,2015-01-02 06:58:00,1.0,call,Meteor,data
2015-03,2015-01-03 06:58:00,1.0,call,Meteor,data


In [33]:
df.groupby('month').first('W')

Unnamed: 0_level_0,duration
month,Unnamed: 1_level_1
2014-11,34.429
2014-12,34.429
2015-01,34.429
2015-02,34.429
2015-03,69.0


In [36]:
df.groupby(['month', 'item']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,duration,network,network_type
month,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11,call,2014-10-15 06:58:00,13.0,Vodafone,mobile
2014-11,data,2014-10-15 06:58:00,34.429,data,data
2014-11,sms,2014-10-16 22:18:00,1.0,Meteor,mobile
2014-12,call,2014-11-14 17:24:00,124.0,voicemail,voicemail
2014-12,data,2014-11-13 06:58:00,34.429,data,data
2014-12,sms,2014-11-14 17:28:00,1.0,Vodafone,mobile
2015-01,call,2014-12-15 20:03:00,4.0,Three,mobile
2015-01,data,2014-12-13 06:58:00,34.429,data,data
2015-01,sms,2014-12-15 19:56:00,1.0,Three,mobile
2015-02,call,2015-01-15 10:36:00,28.0,Three,mobile


In [37]:
df.groupby(['item', 'month']).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,duration,network,network_type
item,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
call,2014-11,2014-10-15 06:58:00,13.0,Vodafone,mobile
call,2014-12,2014-11-14 17:24:00,124.0,voicemail,voicemail
call,2015-01,2014-12-15 20:03:00,4.0,Three,mobile
call,2015-02,2015-01-15 10:36:00,28.0,Three,mobile
call,2015-03,2015-12-02 20:15:00,69.0,landline,landline
data,2014-11,2014-10-15 06:58:00,34.429,data,data
data,2014-12,2014-11-13 06:58:00,34.429,data,data
data,2015-01,2014-12-13 06:58:00,34.429,data,data
data,2015-02,2015-01-13 06:58:00,34.429,data,data
data,2015-03,2015-02-13 06:58:00,34.429,data,data


In [50]:
from datetime import datetime

df['year'] = df['date'].map(lambda x: x.year)
df['month'] = df['date'].map(lambda x: x.month)

df

Unnamed: 0_level_0,date,duration,item,month,network,network_type,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2014-10-15 06:58:00,34.429,data,10,data,data,2014
1,2014-10-15 06:58:00,13.000,call,10,Vodafone,mobile,2014
2,2014-10-15 14:46:00,23.000,call,10,Meteor,mobile,2014
3,2014-10-15 14:48:00,4.000,call,10,Tesco,mobile,2014
4,2014-10-15 17:27:00,4.000,call,10,Tesco,mobile,2014
...,...,...,...,...,...,...,...
825,2015-03-13 00:38:00,1.000,sms,3,world,world,2015
826,2015-03-13 00:39:00,1.000,sms,3,Vodafone,mobile,2015
827,2015-03-13 06:58:00,34.429,data,3,data,data,2015
828,2015-03-14 00:13:00,1.000,sms,3,world,world,2015


In [54]:
df.groupby(['year', 'month']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,duration
year,month,Unnamed: 2_level_1
2014,1,171.643
2014,2,409.85725
2014,3,121.561286
2014,4,20.39432
2014,5,159.9858
2014,6,84.278952
2014,7,138.707524
2014,8,34.1716
2014,9,13.095333
2014,10,151.791185


In [58]:
df.groupby('date').resample('M')

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Int64Index'

In [61]:
df2 = df.copy()

In [65]:
df2.set_index('date', inplace=True)

KeyError: "None of ['date'] are in the columns"

In [72]:
df2.resample('M').mean()

Unnamed: 0_level_0,duration,month,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-31,171.643,1,2014
2014-02-28,409.85725,2,2014
2014-03-31,121.561286,3,2014
2014-04-30,20.39432,4,2014
2014-05-31,159.9858,5,2014
2014-06-30,84.278952,6,2014
2014-07-31,138.707524,7,2014
2014-08-31,34.1716,8,2014
2014-09-30,13.095333,9,2014
2014-10-31,151.791185,10,2014


In [76]:
df.groupby(['year', 'month']).mean()['duration']

array([171.643     , 409.85725   , 121.56128571,  20.39432   ,
       159.9858    ,  84.27895238, 138.70752381,  34.1716    ,
        13.09533333, 151.79118487,  69.3215    ,  77.79458219,
       127.65780198, 114.41145783, 226.98225   , 735.8858    ,
       134.07570588,  55.59192857,  56.49073913, 104.07570588,
       139.27391667, 114.94515385,  61.0574    ,  28.9287    ])

## Agregacje

- Co to jest agregacja
- Do czego służy agregacja
- Format definiowania agragacji: dict, named aggregations
- Metody agregacji
- Własne funckje i lambdy
- Zmiana nazwy i poziomu indeksu

In [108]:
result2 = df.groupby(['month', 'item']).agg({
    'duration': ['sum', 'min', 'max', 'count'],
    'network_type': 'count',
    'date': ['first', 'last'],
})

In [95]:
result = df[df['item'] == 'call'].groupby('month').agg(
    duration_sum=('duration', 'sum'),
    duration_min=('duration', 'max'),
    duration_max=('duration', 'min'),
    duration_count=('duration', 'min'),
    date_first=('date', 'first'),
    date_last=('date', 'first'),
    num_days=('date', lambda x: (max(x) - min(x)).days),
)

In [97]:
result.index

Index(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'], dtype='object', name='month')

In [98]:
result.columns

Index(['duration_sum', 'duration_min', 'duration_max', 'duration_count',
       'date_first', 'date_last', 'num_days'],
      dtype='object')

In [100]:
result2.index

MultiIndex([('2014-11', 'call'),
            ('2014-11', 'data'),
            ('2014-11',  'sms'),
            ('2014-12', 'call'),
            ('2014-12', 'data'),
            ('2014-12',  'sms'),
            ('2015-01', 'call'),
            ('2015-01', 'data'),
            ('2015-01',  'sms'),
            ('2015-02', 'call'),
            ('2015-02', 'data'),
            ('2015-02',  'sms'),
            ('2015-03', 'call'),
            ('2015-03', 'data'),
            ('2015-03',  'sms')],
           names=['month', 'item'])

In [107]:
result2.columns

Index(['duration_sum', 'duration_min', 'duration_max', 'duration_count',
       'network_type_count', 'date_first', 'date_last'],
      dtype='object')

In [105]:
# result2.rename(columns=('...'))

result2.columns = ['_'.join(names) for names in result2.columns]

In [106]:
result2

Unnamed: 0_level_0,Unnamed: 1_level_0,duration_sum,duration_min,duration_max,duration_count,network_type_count,date_first,date_last
month,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-11,call,25547.0,1.0,1940.0,107,107,2014-10-15 06:58:00,2014-12-11 19:01:00
2014-11,data,998.441,34.429,34.429,29,29,2014-10-15 06:58:00,2014-12-11 06:58:00
2014-11,sms,94.0,1.0,1.0,94,94,2014-10-16 22:18:00,2014-11-13 22:31:00
2014-12,call,13561.0,2.0,2120.0,79,79,2014-11-14 17:24:00,2014-12-14 19:54:00
2014-12,data,1032.87,34.429,34.429,30,30,2014-11-13 06:58:00,2014-12-12 06:58:00
2014-12,sms,48.0,1.0,1.0,48,48,2014-11-14 17:28:00,2014-07-12 23:22:00
2015-01,call,17070.0,2.0,1859.0,88,88,2014-12-15 20:03:00,2015-01-14 20:47:00
2015-01,data,1067.299,34.429,34.429,31,31,2014-12-13 06:58:00,2015-12-01 06:58:00
2015-01,sms,86.0,1.0,1.0,86,86,2014-12-15 19:56:00,2015-01-14 23:36:00
2015-02,call,14416.0,1.0,1863.0,67,67,2015-01-15 10:36:00,2015-09-02 17:54:00


In [109]:
result2

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,duration,duration,duration,network_type,date,date
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,count,count,first,last
month,item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2014-11,call,25547.0,1.0,1940.0,107,107,2014-10-15 06:58:00,2014-12-11 19:01:00
2014-11,data,998.441,34.429,34.429,29,29,2014-10-15 06:58:00,2014-12-11 06:58:00
2014-11,sms,94.0,1.0,1.0,94,94,2014-10-16 22:18:00,2014-11-13 22:31:00
2014-12,call,13561.0,2.0,2120.0,79,79,2014-11-14 17:24:00,2014-12-14 19:54:00
2014-12,data,1032.87,34.429,34.429,30,30,2014-11-13 06:58:00,2014-12-12 06:58:00
2014-12,sms,48.0,1.0,1.0,48,48,2014-11-14 17:28:00,2014-07-12 23:22:00
2015-01,call,17070.0,2.0,1859.0,88,88,2014-12-15 20:03:00,2015-01-14 20:47:00
2015-01,data,1067.299,34.429,34.429,31,31,2014-12-13 06:58:00,2015-12-01 06:58:00
2015-01,sms,86.0,1.0,1.0,86,86,2014-12-15 19:56:00,2015-01-14 23:36:00
2015-02,call,14416.0,1.0,1863.0,67,67,2015-01-15 10:36:00,2015-09-02 17:54:00


In [111]:
result2.columns.droplevel(level=0)

Index(['sum', 'min', 'max', 'count', 'count', 'first', 'last'], dtype='object')