# Агрегирование и группировка

In [None]:
import numpy as np
import pandas as pd

In [None]:
# seaborn is used below to load one of its example datasets.
import seaborn as sns
# Load the 'planets' example dataset by name; later cells reuse this frame.
planets = sns.load_dataset('planets')
# Show the table's dimensions as a (rows, columns) tuple.
planets.shape

In [None]:
# Preview the first rows of the planets table.
planets.head()

In [None]:
# Seeded generator so the random numbers are reproducible between runs.
rng = np.random.RandomState(seed=1)
# One-dimensional Series holding five uniform random floats.
ser = pd.Series(data=rng.rand(5))
# Bare expression so the notebook renders the Series.
ser

In [None]:
# Sum of all values in the Series.
ser.sum()

In [None]:
# Arithmetic mean of the values in the Series.
ser.mean()

In [None]:
# Two columns of five random floats each; 'A' is drawn from rng before 'B'.
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
# Bare expression so the notebook renders the frame.
df

In [None]:
# With the default axis the mean is computed within each column.
df.mean()

In [None]:
# axis='columns' aggregates across the columns instead, giving one mean per row.
df.mean(axis='columns')

In [None]:
# Drop rows that contain missing values, then compute summary statistics.
planets.dropna().describe()

Встроенные функции агрегирования

| Aggregation              | Description                     |
|--------------------------|---------------------------------|
| ``count()``              | Total number of items           |
| ``first()``, ``last()``  | First and last item             |
| ``mean()``, ``median()`` | Mean and median                 |
| ``min()``, ``max()``     | Minimum and maximum             |
| ``std()``, ``var()``     | Standard deviation and variance |
| ``mad()``                | Mean absolute deviation (deprecated in pandas 1.5, removed in 2.0) |
| ``prod()``               | Product of all items            |
| ``sum()``                | Sum of all items                |

Применимы и к ``DataFrame``, и к ``Series``.

## GroupBy: Разбиение, применение, комбинация (split, apply, combine)

![](split-apply-combine.png)

In [None]:
# Six-row demo frame: the keys A, B, C repeat twice, 'data' counts 0..5.
df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C'] * 2,
        'data': range(6),
    },
    columns=['key', 'data'],
)
# Bare expression so the notebook renders the frame.
df

In [None]:
# groupby() on its own computes nothing yet: the cell shows a DataFrameGroupBy object.
df.groupby('key')

In [None]:
# Demo frame with three key columns and two numeric columns.
df1 = pd.DataFrame(
    {
        'key1': ['A', 'B', 'C'] * 2,
        'key2': ['A1', 'B1', 'C1'] * 2,
        'key3': ['A3', 'B3', 'C3'] * 2,
        'data': range(6),
        'data2': range(1, 7),
    },
    columns=['key1', 'key2', 'key3', 'data', 'data2'],
)

# Group on the (key1, key2) pairs and sum within each pair.
df1.groupby(['key1', 'key2']).sum()

In [None]:
# A dict passed to aggregate() picks the aggregation per column:
# the sum of 'data' and the maximum of 'data2' within each 'key1' group.
df1.groupby('key1').aggregate({'data': 'sum', 'data2': 'max'})

Объект ``DataFrameGroupBy`` проводит вычисления "лениво":

In [None]:
# Applying an aggregation to the GroupBy object produces the result: sums per key.
df.groupby('key').sum()

In [None]:
# Group the planets table by its 'method' column; the cell shows the GroupBy object itself.
planets.groupby('method')

Этот объект можно рассматривать как словарь, ключами которого являются имена колонок:

In [None]:
# Indexing the GroupBy with a column name selects that one column within every group.
planets.groupby('method')['orbital_period']

In [None]:
# Median of 'orbital_period' within each 'method' group.
planets.groupby('method')['orbital_period'].median()

#### Итерирование по группам

In [None]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
for (method, group) in planets.groupby('method'):
    # Pad the key to a width of 30 so the shapes line up in a column.
    print(f"{method:30} shape={group.shape}")

In [None]:
# Summary statistics of 'year' computed separately for each 'method' group.
planets.groupby('method')['year'].describe()

### Методы aggregate, filter, transform, apply

In [None]:
# Re-seed so the random 'data2' column is reproducible.
rng = np.random.RandomState(seed=0)
# Demo frame: repeating keys, a 0..5 counter and six random integers in [0, 10).
df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C'] * 2,
        'data1': range(6),
        'data2': rng.randint(0, 10, size=6),
    },
    columns=['key', 'data1', 'data2'],
)
# Bare expression so the notebook renders the frame.
df

In [None]:
# Apply several aggregations to every column at once; the result has one
# sub-column per aggregation. The aggregations are given as string names:
# passing NumPy/builtin callables (np.median, max) to aggregate() is
# deprecated in recent pandas, which asks for the string names instead.
df.groupby('key').aggregate(['min', 'median', 'max'])

In [None]:
# Per-column aggregations via a dict: minimum of 'data1', maximum of 'data2' per key.
df.groupby('key').aggregate({'data1': 'min',
                             'data2': 'max'})

In [None]:
def filter_func(x):
    """Tell whether a group should be kept by ``GroupBy.filter``.

    ``x`` is the DataFrame of one group. The group passes when the standard
    deviation of its ``data2`` column is greater than 4.
    """
    spread = x['data2'].std()
    return spread > 4

# Show the input, the per-key standard deviations, and the rows of only those
# groups for which filter_func returns True.
display(df, df.groupby('key').std(), df.groupby('key').filter(filter_func))

In [None]:
# transform() applies the function per group: here each value minus its group's mean.
df.groupby('key').transform(lambda x: x - x.mean())

In [None]:
def norm_by_data2(x):
    """Normalize ``data1`` by the group's total of ``data2``.

    Parameters
    ----------
    x : DataFrame
        The rows of a single group (this is what ``GroupBy.apply`` passes in).
        It must contain the columns ``data1`` and ``data2``.

    Returns
    -------
    DataFrame
        A new frame with the same index and columns as ``x`` in which
        ``data1`` has been divided by ``x['data2'].sum()``.
    """
    # x is the DataFrame of grouped values!
    total = x['data2'].sum()
    # Build a new frame instead of assigning into x: mutating the object that
    # groupby.apply passes to the function is not supported by pandas and
    # would also silently alter the caller's data.
    return x.assign(data1=x['data1'] / total)

# Show the input next to the result of applying norm_by_data2 to each 'key' group.
display(df, df.groupby('key').apply(norm_by_data2))

### Определение ключа для разбиения

In [None]:
# A list with one entry per row can serve as the grouping key:
# row i is assigned to group L[i].
L = [0, 1, 0, 1, 2, 0]
display(df, df.groupby(L).sum())

In [None]:
# The grouping key can also be given as a Series, here the 'key' column itself.
display(df, df.groupby(df['key']).sum())

In [None]:
# Move 'key' into the index so that index labels can be mapped to groups.
df2 = df.set_index('key')
# Dict from index label to the name of the group it belongs to.
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
display(df2, df2.groupby(mapping).sum())

#### В разбиении можно использовать любую функцию Python

In [None]:
# A callable key is applied to each index label; its return value names the group.
display(df2, df2.groupby(str.lower).mean())

In [None]:
# Several keys can be combined in a list, which groups on a multi-level index.
df2.groupby([str.lower, mapping]).mean()

### Пример группировки

In [None]:
# Label every row with its decade: floor the year to a multiple of 10,
# then turn it into text such as '1990s'.
decade = (10 * (planets['year'] // 10)).astype(str) + 's'
decade.name = 'decade'

# Total 'number' per (method, decade); decades become columns and
# combinations that never occur are filled with 0.
(
    planets
    .groupby(['method', decade])['number']
    .sum()
    .unstack()
    .fillna(0)
)