# Data Manipulation - Groups and Joins

In [None]:
import pandas as pd

# Toy gradebook: one row per (student name, grade) pair. Names repeat on
# purpose so there is something to aggregate per student.
df = pd.DataFrame(
    {
        'nome': [
            'Edmar', 'Edmar', 'Edmar', 'Dayana', 'Edmar',
            'Edmar', 'Rai', 'Vamp', 'Vamp', 'Dayana',
        ],
        'nota': [9, 7, 9, 7, 9, 7, 8, 10, 9, 8],
    }
)

# Last expression of the cell: displays the frame.
df

In [None]:
df['nome'] == 'Edmar'

In [None]:
mask = df['nome'] == 'Edmar'
mask

In [None]:
df.loc[mask, :]

In [None]:
# Average only the numeric columns: `nome` holds strings, and pandas >= 2.0
# raises a TypeError instead of silently dropping non-numeric columns.
df.loc[mask, :].mean(numeric_only=True)

In [None]:
mask = df['nome'] == 'Dayana'
mask

In [None]:
df.loc[mask, :]

In [None]:
# Average only the numeric columns: `nome` holds strings, and pandas >= 2.0
# raises a TypeError instead of silently dropping non-numeric columns.
df.loc[mask, :].mean(numeric_only=True)

In [None]:
df.nome.unique()

# Como fazer essa operação para todos os nomes únicos?

- `.groupby()` é uma forma de **agregar** todos os resultados para cada chave única
- sempre que você faz uma **agregação**, o resultado final terá 1 linha para cada valor pelo qual você agregou, portanto, é obrigatório que se aplique uma função agregadora para que todos os valores sejam sumarizados em um único valor associado àquela chave.

Por exemplo, se tivermos:

Nome | Nota
-----|-----
Andre | 10
Andre | 8 
Andre | 6
Joao  | 10
Joao  | 4

Um `.groupby` por 'Nome' resultaria em 2 linhas:

Nome | xxxx
----|-----
Andre| *
Joao | *

O asterisco representa o valor agregado. Isto é, não há como trazer os valores 10, 8 e 6 associados a Andre. Temos, obrigatoriamente, que sumarizá-los em um único dado. Para isso, podemos fazer a média entre 10, 8 e 6 (que seria 8), a soma (que seria 24), ou qualquer outra função agregadora. Assim teríamos um único valor sumarizado para a chave 'Nome'.

In [None]:
df.groupby(by='nome')

In [None]:
df.groupby(by='nome').mean()

In [None]:
df.groupby(by='nome').max()

In [None]:
df.groupby(by='nome').min()

## Aggregating methods

- `.mean()`
- `.median()`
- `.max()`
- `.min()`
- `.sum()`
- `.count()`
- `.describe()`
- `.agg()`

### More than one aggregation

In [None]:
# Read the `.agg()` help (shift+TAB) to learn which aggregation methods it can handle.
# The aggregations are passed as a list, not a set: a set has no defined
# order, so the order of the resulting columns could change between runs.
df.groupby(by='nome').agg(['max', 'min', 'mean'])

In [None]:
# A list (not a set) keeps the output columns in the order written here;
# a set has no defined iteration order.
df.groupby(by='nome').agg(['max', 'min', 'mean'])

## Named aggregation

In [None]:
# Named aggregation: each keyword is an output column name, and each value is
# a (source column, aggregation) pair. Every aggregation is given by its string
# name for consistency; passing the builtin `min` triggers a FutureWarning on
# recent pandas versions.
df.groupby(by='nome').agg(nota_max=('nota', 'max'),
                          nota_min=('nota', 'min'),
                          nota_avg=('nota', 'mean'))

# Group by 
>    - Aggregating function
>    - Named aggregation
>    - `as_index = False`

In [None]:
# Load the vehicles dataset and expose the `Make` column as `Manufacturer`.
data = (
    pd.read_csv('data/vehicles.csv')
      .rename(columns={'Make': 'Manufacturer'})
)

In [None]:
data.head()

In [None]:
data.groupby(by='Manufacturer')

In [None]:
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as a non-key column holds text
# (NOTE(review): presumably the case for columns such as `Drivetrain` — confirm).
data.groupby(by='Manufacturer').mean(numeric_only=True)

In [None]:
# Select the column BEFORE aggregating: only `Engine Displacement` is averaged,
# so other columns are never touched. This avoids the TypeError that
# pandas >= 2.0 raises for non-numeric columns, and skips wasted work.
# The double brackets keep the result a DataFrame.
data.groupby(by='Manufacturer')[['Engine Displacement']].mean()

In [None]:
data.Drivetrain.unique()

In [None]:
data.groupby(by='Cylinders')

In [None]:
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as a column holds text
# (NOTE(review): presumably the case for `Manufacturer` / `Drivetrain` — confirm).
data.groupby(by='Cylinders').mean(numeric_only=True)

In [None]:
# Select the column BEFORE aggregating: only `Fuel Cost/Year` is averaged,
# so other columns are never touched. This avoids the TypeError that
# pandas >= 2.0 raises for non-numeric columns, and skips wasted work.
# The double brackets keep the result a DataFrame.
data.groupby(by='Cylinders')[['Fuel Cost/Year']].mean()

In [None]:
data.groupby(by='Cylinders').agg(avg_fuel = ('Fuel Cost/Year', 'mean'),
                                 qtd_fuel = ('Fuel Cost/Year', 'count'))

In [None]:
data.groupby(by='Cylinders').agg(avg_fuel = ('Fuel Cost/Year', 'mean'),
                                 median_fuel = ('Fuel Cost/Year', 'median')).reset_index()

In [None]:
# Per-cylinder-count summary of `Fuel Cost/Year` (mean and median).
# reset_index() turns the `Cylinders` group key back into a regular column.
avg_fuel = (
    data
    .groupby(by='Cylinders')['Fuel Cost/Year']
    .agg(avg_fuel='mean', median_fuel='median')
    .reset_index()
)

In [None]:
# Mean `Combined MPG` per (Year, Cylinders) pair.
# The column is selected BEFORE `.mean()`, so only `Combined MPG` is
# aggregated. Averaging every column first would raise a TypeError on
# pandas >= 2.0 if any column holds text, and would do needless work.
# as_index=False keeps `Year` and `Cylinders` as regular columns, so the
# result has exactly the columns Year, Cylinders, Combined MPG.
avg_mpg_two_keys = data.groupby(by=['Year', 'Cylinders'], as_index=False)['Combined MPG'].mean()

In [None]:
avg_mpg_two_keys

-----

# JOINs

How to merge dataframes based on a specific column

In [None]:
df

In [None]:
# Lookup table mapping each name to a state code. It shares the `nome`
# column with `df` but does not contain exactly the same set of names.
df_estados = pd.DataFrame(
    {
        'nome': ['Andre', 'Rai', 'Edmar', 'Dayana', 'Rodrigo'],
        'estado': ['SP', 'DF', 'SP', 'BA', 'SP'],
    }
)

# Last expression of the cell: displays the frame.
df_estados

In [None]:
pd.merge(left=df, right=df_estados, on='nome')

## Types of Joins

![image-asset.png](data/image.png)

In [None]:
pd.merge(left=df, right=df_estados, on='nome', how='left')

In [None]:
pd.merge(left=df, right=df_estados, on='nome', how='outer')

## Using our vehicles dataframe

In [None]:
data

In [None]:
avg_fuel

In [None]:
pd.merge(left=data, right=avg_fuel, on='Cylinders')

## What if we had different names?

In [None]:
avg_fuel.rename(columns={'Cylinders':'cyl'}, inplace=True)

In [None]:
avg_fuel.head(2)

In [None]:
data.head(2)

In [None]:
pd.merge(left=data, right=avg_fuel, left_on='Cylinders', right_on='cyl')

# How to concatenate dataframes?

In [None]:
# Rows of the summary table with fewer than 7 cylinders.
small_cars = avg_fuel[avg_fuel['cyl'] < 7]

In [None]:
# Rows of the summary table with 7 or more cylinders.
big_cars = avg_fuel[avg_fuel['cyl'] >= 7]

In [None]:
small_cars

In [None]:
big_cars

In [None]:
pd.concat([small_cars, big_cars])

# Bins
> ```pd.cut``` vs ```pd.qcut```
> - Specify cutoffs

> - Use case
>     - Scores ~ deciles (0,1,2,3,4,5,6,7,8,9)


Suppose I want to break the values of the variable `Fuel Barrels/Year` into 5 categories, from Very Low to Very High.

In [None]:
data['Fuel Barrels/Year']

In [None]:
data['Fuel Barrels/Year'].describe()

In [None]:
data['cat_barrel_year'] = pd.cut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA'])

In [None]:
# Mean engine displacement per fuel-consumption category.
# The column is selected BEFORE `.mean()`, so only `Engine Displacement` is
# aggregated. Averaging every column first would raise a TypeError on
# pandas >= 2.0 if any column holds text.
# `cat_barrel_year` is categorical (built with pd.cut). observed=False spells
# out the long-standing default: categories with no rows stay in the output.
# It also silences the FutureWarning about that default changing.
data.groupby(by='cat_barrel_year', observed=False)['Engine Displacement'].mean()

In [None]:
data['cat_barrel_year'].value_counts()

In [None]:
pd.qcut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA']).value_counts()

In [None]:
data.head()

In [None]:
mpg_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']

In [None]:
# Calling pd.cut without the `labels` argument returns, for each value,
# the interval (bin) it falls into. Here: 5 equal-width bins.
bins = pd.cut(data['Fuel Cost/Year'], 5)
bins

In [None]:
# performing the pd.cut operation using the labels argument outputs your labels
bins = pd.cut(data['Fuel Cost/Year'], 5, labels=mpg_labels)
bins.head(10)

In [None]:
bins = pd.qcut(data['Fuel Cost/Year'],5, labels=mpg_labels)
bins.head(10)

In [None]:
bins.value_counts()

In [None]:
# Explicit bin edges instead of a bin count: 6 edges define the 5 bins that
# receive the 5 labels in `mpg_labels`.
# With pd.cut's defaults each bin is right-inclusive, e.g. (1000, 1500].
# Values outside the outer edges (<= 1000 or > 3500) become NaN.
cutoffs = [1000,1500,2000,2500,3000,3500]
bins = pd.cut(data['Fuel Cost/Year'], cutoffs, labels=mpg_labels)
bins.head(10)

In [None]:
bins.value_counts(sort=False)

# Converting categorical columns

>    - dummies
>    - One hot encoding

In [None]:
data[['cat_barrel_year']]
# count the values within each category

In [None]:
data['cat_barrel_year'].unique()

In [None]:
data[['cat_barrel_year']].head()

In [None]:
# One-hot encode the category column: one indicator column per category.
# NOTE(review): the name `drivetrain` is misleading. The dummies are built
# from `cat_barrel_year`, not from the `Drivetrain` column. Consider renaming
# once all uses of this variable are known.
drivetrain = pd.get_dummies(data['cat_barrel_year'])
drivetrain