# Data Manipulation - Groups and Joins

In [6]:
import pandas as pd

df = pd.DataFrame({'nome':['Leticia','Leticia','Leticia','Eduardo','Leticia', 'Leticia', 'Matheus','Thiago', 'Thiago', 'Eduardo'],
                   'nota' :[9, 7, 9, 7, 9, 7, 8, 10, 9, 8],
                   'nps':[5,4,4.5, 4, 4, 5, 3,3.5,3,4]})

df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [7]:
# Get the mean grade for Leticia

In [8]:
df.loc[df['nome']=='Leticia','nota'].mean()

8.2

In [9]:
df.query('nome == "Leticia"')['nota'].mean()

8.2

In [10]:
df['nome'].unique()

array(['Leticia', 'Eduardo', 'Matheus', 'Thiago'], dtype=object)

In [11]:
mask = df['nome'] == 'Eduardo'
mask

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8    False
9     True
Name: nome, dtype: bool

In [12]:
df.loc[mask, :]

Unnamed: 0,nome,nota,nps
3,Eduardo,7,4.0
9,Eduardo,8,4.0


In [13]:
df.loc[mask, 'nota'].mean()

7.5

In [14]:
df.nome.unique()

array(['Leticia', 'Eduardo', 'Matheus', 'Thiago'], dtype=object)

# Como fazer essa operação para todos os nomes únicos?

- `.groupby()` é uma forma de **agregar** todos os resultados para cada chave única
- sempre que você faz uma **agregação**, o resultado final terá 1 linha para cada valor pelo qual você agregou, portanto, é obrigatório que se aplique uma função agregadora para que todos os valores sejam sumarizados em um único valor associado àquela chave.

Por exemplo, se tivermos:

Nome | Nota
-----|-----
Andre | 10
Andre | 8 
Andre | 6
Joao  | 10
Joao  | 4

O resultado de um `.groupby` por 'Nome' resultaria em 2 linhas

Nome | xxxx
----|-----
Andre| *
Joao | *

O asterisco representa o valor agregado. Isto é, não há como trazer os valores 10, 8 e 6 associados à Andre. Temos, obrigatoriamente, que sumarizá-los em um único dado. Para isso, podemos fazer a média entre 10,8,6 (que seria 8), a soma (que seria 24), ou qualquer outra função agregadora. Assim teríamos um único valor sumarizado para a chave 'Nome'

In [15]:
df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [16]:
df_groups = df.groupby(by='nome')

In [17]:
df_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001BEE40D52E0>

In [18]:
df_groups.mean()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,7.5,4.0
Leticia,8.2,4.5
Matheus,8.0,3.0
Thiago,9.5,3.25


In [19]:
df_groups.get_group('Thiago')#.mean()

Unnamed: 0,nome,nota,nps
7,Thiago,10,3.5
8,Thiago,9,3.0


In [20]:
df.groupby(by='nome').mean()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,7.5,4.0
Leticia,8.2,4.5
Matheus,8.0,3.0
Thiago,9.5,3.25


In [21]:
df.groupby(by='nome').max()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,8,4.0
Leticia,9,5.0
Matheus,8,3.0
Thiago,10,3.5


In [22]:
df.groupby(by='nome').min()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,7,4.0
Leticia,7,4.0
Matheus,8,3.0
Thiago,9,3.0


## Aggregating methods

- `.mean()`
- `.median()`
- `.max()`
- `.min()`
- `.sum()`
- `.count()`
- `.describe()`
- `.agg()`

In [23]:
df.groupby('nome').describe()

Unnamed: 0_level_0,nota,nota,nota,nota,nota,nota,nota,nota,nps,nps,nps,nps,nps,nps,nps,nps
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Eduardo,2.0,7.5,0.707107,7.0,7.25,7.5,7.75,8.0,2.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0
Leticia,5.0,8.2,1.095445,7.0,7.0,9.0,9.0,9.0,5.0,4.5,0.5,4.0,4.0,4.5,5.0,5.0
Matheus,1.0,8.0,,8.0,8.0,8.0,8.0,8.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Thiago,2.0,9.5,0.707107,9.0,9.25,9.5,9.75,10.0,2.0,3.25,0.353553,3.0,3.125,3.25,3.375,3.5


### More than one aggregation

In [24]:
x = df.groupby('nome')

In [25]:
x.agg()

TypeError: Must provide 'func' or tuples of '(column, aggfunc).

In [None]:
import numpy as np

In [None]:
df = pd.DataFrame({'nome':['Leticia','Leticia','Leticia','Eduardo','Leticia', 'Leticia', 'Matheus','Thiago', 'Thiago', 'Eduardo'],
                   'nota' :[9, 7, 9, 7, 9, 7, 8, 10, 9, 8],
                   'nps':[5,4,4.5, 4, 4, 5, 3,3.5,3,4]})

df

In [None]:
df.groupby('nome').median()

In [None]:
df.groupby('nome').agg('median')

In [26]:
df.groupby('nome').agg({np.mean,'median',max})

Unnamed: 0_level_0,nota,nota,nota,nps,nps,nps
Unnamed: 0_level_1,median,max,mean,median,max,mean
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Eduardo,7.5,8,7.5,4.0,4.0,4.0
Leticia,9.0,9,8.2,4.5,5.0,4.5
Matheus,8.0,8,8.0,3.0,3.0,3.0
Thiago,9.5,10,9.5,3.25,3.5,3.25


In [27]:
def always_9(x):
    return 9 


df.groupby('nome').agg([always_9,'median'])

Unnamed: 0_level_0,nota,nota,nps,nps
Unnamed: 0_level_1,always_9,median,always_9,median
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Eduardo,9,7.5,9,4.0
Leticia,9,9.0,9,4.5
Matheus,9,8.0,9,3.0
Thiago,9,9.5,9,3.25


In [28]:
# read the `.agg()` help (shift+TAB) to learn which aggregation methods it can handle

df.groupby(by='nome').agg({'max', 'min', 'mean'})

Unnamed: 0_level_0,nota,nota,nota,nps,nps,nps
Unnamed: 0_level_1,max,mean,min,max,mean,min
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Eduardo,8,7.5,7,4.0,4.0,4.0
Leticia,9,8.2,7,5.0,4.5,4.0
Matheus,8,8.0,8,3.0,3.0,3.0
Thiago,10,9.5,9,3.5,3.25,3.0


In [29]:
grouped_results = df.groupby(by='nome').agg({'nota': 'mean',
                                             'nps': ['min','max']})

In [30]:
grouped_results

Unnamed: 0_level_0,nota,nps,nps
Unnamed: 0_level_1,mean,min,max
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Eduardo,7.5,4.0,4.0
Leticia,8.2,4.0,5.0
Matheus,8.0,3.0,3.0
Thiago,9.5,3.0,3.5


## Named aggregation

In [31]:
grouped_results = df.groupby(by='nome').agg(nota_max = ('nota',max), 
                                            nota_min = ('nota', min),
                                            nota_avg = ('nota',np.mean),
                                            nps_mean = ('nps', 'mean'))

In [32]:
grouped_results

Unnamed: 0_level_0,nota_max,nota_min,nota_avg,nps_mean
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eduardo,8,7,7.5,4.0
Leticia,9,7,8.2,4.5
Matheus,8,8,8.0,3.0
Thiago,10,9,9.5,3.25


In [33]:
grouped_results.loc[['Thiago'],:]

Unnamed: 0_level_0,nota_max,nota_min,nota_avg,nps_mean
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thiago,10,9,9.5,3.25


In [34]:
grouped_results.reset_index()

Unnamed: 0,nome,nota_max,nota_min,nota_avg,nps_mean
0,Eduardo,8,7,7.5,4.0
1,Leticia,9,7,8.2,4.5
2,Matheus,8,8,8.0,3.0
3,Thiago,10,9,9.5,3.25


In [35]:
grouped_results = df.groupby(by='nome', as_index=False).agg(nota_max = ('nota','max'), 
                                                            nota_min = ('nota', min),
                                                            nota_avg = ('nota','mean'),
                                                            nps_mean = ('nps', 'mean'))

In [36]:
grouped_results

Unnamed: 0,nome,nota_max,nota_min,nota_avg,nps_mean
0,Eduardo,8,7,7.5,4.0
1,Leticia,9,7,8.2,4.5
2,Matheus,8,8,8.0,3.0
3,Thiago,10,9,9.5,3.25


# Group by 
>    - Aggregating function
>    - Named aggregation
>    - `as_index = False`

In [37]:
data = pd.read_csv('data/vehicles.csv')
data = data.rename(columns={'Make':'Manufacturer'})

In [38]:
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [39]:
#Get the mean for AM general manufacterer

In [40]:
#Get mean Engine displ for the manufacters 

In [41]:
%%timeit 

data.groupby(by='Manufacturer').mean()[['Engine Displacement']]

6.42 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [42]:
%%timeit

data.groupby(by='Manufacturer')[['Engine Displacement']].mean()

2.86 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
data.groupby(by='Manufacturer')[['Engine Displacement']].mean().reset_index()

Unnamed: 0,Manufacturer,Engine Displacement
0,AM General,3.350000
1,ASC Incorporated,3.800000
2,Acura,2.834768
3,Alfa Romeo,2.556098
4,American Motors Corporation,3.813636
...,...,...
122,Volkswagen,2.236008
123,Volvo,2.504742
124,Wallace Environmental,4.315625
125,Yugo,1.200000


In [44]:
grouped_results = data.groupby(by='Cylinders', as_index=False).agg(mean_fuel_cost = ('Fuel Cost/Year','mean'),
                                                               count_vehicles = ('Fuel Cost/Year','count'))

In [45]:
grouped_results

Unnamed: 0,Cylinders,mean_fuel_cost,count_vehicles
0,2.0,2004.166667,48
1,3.0,962.437811,201
2,4.0,1487.879798,13494
3,5.0,1813.278008,723
4,6.0,1943.19624,12765
5,8.0,2414.734934,7998
6,10.0,2926.797386,153
7,12.0,3143.149466,562
8,16.0,4050.0,8


In [46]:
type(grouped_results)

pandas.core.frame.DataFrame

In [47]:
grouped_results[['mean_fuel_cost']]

Unnamed: 0,mean_fuel_cost
0,2004.166667
1,962.437811
2,1487.879798
3,1813.278008
4,1943.19624
5,2414.734934
6,2926.797386
7,3143.149466
8,4050.0


In [48]:
data

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


In [49]:
# media de cilindros e max e min de ano para as diferentes classes de veiculo

In [50]:
data

data.groupby(by='Vehicle Class').agg(c_mean=('Cylinders','mean'),
                                     y_max = ('Year','max'),
                                     y_min = ('Year','min'))

Unnamed: 0_level_0,c_mean,y_max,y_min
Vehicle Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Compact Cars,4.837223,2017,1985
Large Cars,7.128588,2017,1984
Midsize Cars,5.665026,2017,1985
Midsize Station Wagons,5.173077,2017,1998
Midsize-Large Station Wagons,5.113744,1997,1985
Minicompact Cars,5.764026,2017,1985
Minivan - 2WD,5.800604,2017,1999
Minivan - 4WD,6.0,2017,1999
Small Pickup Trucks,4.648855,1997,1985
Small Pickup Trucks 2WD,4.802993,2017,1984


In [51]:
pd.options.display.max_rows=99

In [52]:
# usar groupby e describe
data.groupby(['Vehicle Class','Cylinders']).agg(['mean','min','max','std'])#.astype(int)



Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Year,Year,Year,Engine Displacement,Engine Displacement,Engine Displacement,Engine Displacement,Fuel Barrels/Year,Fuel Barrels/Year,...,Combined MPG,Combined MPG,CO2 Emission Grams/Mile,CO2 Emission Grams/Mile,CO2 Emission Grams/Mile,CO2 Emission Grams/Mile,Fuel Cost/Year,Fuel Cost/Year,Fuel Cost/Year,Fuel Cost/Year
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,std,mean,min,max,std,mean,min,...,max,std,mean,min,max,std,mean,min,max,std
Vehicle Class,Cylinders,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Compact Cars,3.0,2015.666667,2014,2017,1.154701,1.133333,1.0,1.2,0.098473,9.112816,8.240250,...,40,2.534609,243.666667,222.000000,272.000000,16.631478,920.833333,850,1050,62.005620
Compact Cars,4.0,2000.506620,1985,2017,10.184981,1.983230,1.3,2.5,0.289030,13.317621,0.060000,...,50,4.343150,359.664477,51.000000,522.764706,55.132147,1415.063254,650,2250,264.824595
Compact Cars,5.0,2004.829384,1985,2016,6.973370,2.453555,2.2,3.0,0.114327,15.248337,12.207778,...,27,2.114933,410.342455,329.148148,535.789474,41.030462,1727.251185,1250,2250,275.601556
Compact Cars,6.0,2000.492887,1985,2017,9.225569,3.014812,2.3,5.3,0.352127,16.634951,10.632581,...,31,2.141143,448.236147,277.000000,740.583333,46.443272,1870.543933,1300,3400,261.499584
Compact Cars,8.0,2005.281879,1985,2017,9.021173,5.044966,3.0,7.0,0.909837,20.439669,15.695714,...,21,2.357090,550.880512,425.000000,1110.875000,100.301235,2455.201342,1750,4150,452.618413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Vans, Passenger Type",4.0,1984.000000,1984,1984,0.000000,1.950000,1.9,2.0,0.057735,18.687710,15.695714,...,21,2.943920,503.861161,423.190476,592.466667,82.785651,1887.500000,1600,2200,306.526236
"Vans, Passenger Type",5.0,1993.000000,1993,1993,,2.500000,2.5,2.5,,21.974000,21.974000,...,15,,592.466667,592.466667,592.466667,,2200.000000,2200,2200,
"Vans, Passenger Type",6.0,1999.493827,1984,2017,8.339849,4.201235,3.5,4.9,0.278161,22.355663,19.388824,...,17,1.026050,603.095090,522.764706,740.583333,42.026147,2266.666667,1950,2800,161.438843
"Vans, Passenger Type",8.0,2004.270142,1984,2017,9.913339,5.406161,4.6,6.2,0.445090,24.328472,0.143077,...,19,1.401904,665.697909,535.789474,987.444444,63.035351,2505.924171,1900,3700,243.536394


-----

# JOINs

How to merge dataframes based on a specific column

In [53]:
df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [54]:
df_estados = pd.DataFrame({'nome':['Andre', 'Matheus','Eduardo','Alexandre','Gabriela','Eduardo'], 
                           'estado':['SP','PR','MG','SP','MS','SP']})
df_estados

Unnamed: 0,nome,estado
0,Andre,SP
1,Matheus,PR
2,Eduardo,MG
3,Alexandre,SP
4,Gabriela,MS
5,Eduardo,SP


In [55]:
pd.merge(left=df, right=df_estados)

Unnamed: 0,nome,nota,nps,estado
0,Eduardo,7,4.0,MG
1,Eduardo,7,4.0,SP
2,Eduardo,8,4.0,MG
3,Eduardo,8,4.0,SP
4,Matheus,8,3.0,PR


## Types of Joins

![image-asset.png](data/image.png)

In [56]:
df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [57]:
df_estados

Unnamed: 0,nome,estado
0,Andre,SP
1,Matheus,PR
2,Eduardo,MG
3,Alexandre,SP
4,Gabriela,MS
5,Eduardo,SP


In [58]:
pd.merge(left=df, right=df_estados)

Unnamed: 0,nome,nota,nps,estado
0,Eduardo,7,4.0,MG
1,Eduardo,7,4.0,SP
2,Eduardo,8,4.0,MG
3,Eduardo,8,4.0,SP
4,Matheus,8,3.0,PR


In [59]:
pd.merge(left=df, right=df_estados, how='left')

Unnamed: 0,nome,nota,nps,estado
0,Leticia,9,5.0,
1,Leticia,7,4.0,
2,Leticia,9,4.5,
3,Eduardo,7,4.0,MG
4,Eduardo,7,4.0,SP
5,Leticia,9,4.0,
6,Leticia,7,5.0,
7,Matheus,8,3.0,PR
8,Thiago,10,3.5,
9,Thiago,9,3.0,


In [60]:
pd.merge(left=df, right=df_estados,  how='outer')

Unnamed: 0,nome,nota,nps,estado
0,Leticia,9.0,5.0,
1,Leticia,7.0,4.0,
2,Leticia,9.0,4.5,
3,Leticia,9.0,4.0,
4,Leticia,7.0,5.0,
5,Eduardo,7.0,4.0,MG
6,Eduardo,7.0,4.0,SP
7,Eduardo,8.0,4.0,MG
8,Eduardo,8.0,4.0,SP
9,Matheus,8.0,3.0,PR


## Using our vehicles dataframe

In [61]:
data

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


In [62]:
avg_fuel = data.groupby('Cylinders').agg(avg_fuel_col = ('Fuel Cost/Year','mean')).reset_index()
avg_fuel

Unnamed: 0,Cylinders,avg_fuel_col
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [63]:
#pd.merge(left=avg_fuel,right=data, right_on='Cylinders', left_index=True)
pd.merge(left=avg_fuel,right=data)

Unnamed: 0,Cylinders,avg_fuel_col,Manufacturer,Model,Year,Engine Displacement,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,2.0,2004.166667,BMW,i3 REX,2014,0.6,Auto(A1),Rear-Wheel Drive,Subcompact Cars,Premium Gas or Electricity,1.563190,41,37,39,40.000000,1050
1,2.0,2004.166667,BMW,i3 REX,2015,0.6,Automatic (A1),Rear-Wheel Drive,Subcompact Cars,Premium Gas or Electricity,1.563190,41,37,39,40.000000,1050
2,2.0,2004.166667,BMW,i3 REX,2016,0.6,Automatic (A1),Rear-Wheel Drive,Subcompact Cars,Premium Gas or Electricity,1.563190,41,37,39,37.000000,1050
3,2.0,2004.166667,Mazda,RX-7,1985,1.1,Automatic 4-spd,Rear-Wheel Drive,Two Seaters,Regular,19.388824,15,21,17,522.764706,1950
4,2.0,2004.166667,Mazda,RX-7,1985,1.1,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,19.388824,15,20,17,522.764706,1950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,16.0,4050.000000,Bugatti,Veyron,2011,8.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050
35948,16.0,4050.000000,Bugatti,Veyron,2012,8.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050
35949,16.0,4050.000000,Bugatti,Veyron,2013,8.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050
35950,16.0,4050.000000,Bugatti,Veyron,2014,8.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050


In [64]:
avg_fuel = data.groupby('Cylinders').agg(avg_fuel = ('Fuel Cost/Year','mean')).reset_index()
#avg_fuel = avg_fuel.rename({'avg_fuel':'Fuel Cost/Year'},axis=1)

In [65]:
avg_fuel

Unnamed: 0,Cylinders,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [66]:
pd.merge(left=data, right=avg_fuel)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,avg_fuel
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,1487.879798
1,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,1487.879798
2,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,20,26,22,403.954545,1500,1487.879798
3,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,22,28,24,370.291667,1400,1487.879798
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,19,27,22,403.954545,1500,1487.879798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,Bugatti,Veyron,2011,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,4050.000000
35948,Bugatti,Veyron,2012,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,4050.000000
35949,Bugatti,Veyron,2013,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,4050.000000
35950,Bugatti,Veyron,2014,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,4050.000000


## What if we had different names?

In [67]:
avg_fuel.rename(columns={'Cylinders':'cyl'}, inplace=True)

In [68]:
avg_fuel.head(2)

Unnamed: 0,cyl,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811


In [69]:
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [70]:
pd.merge(left=data, right=avg_fuel, left_on='Cylinders', right_on='cyl')

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,cyl,avg_fuel
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,4.0,1487.879798
1,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,4.0,1487.879798
2,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,20,26,22,403.954545,1500,4.0,1487.879798
3,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,22,28,24,370.291667,1400,4.0,1487.879798
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,19,27,22,403.954545,1500,4.0,1487.879798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,Bugatti,Veyron,2011,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,16.0,4050.000000
35948,Bugatti,Veyron,2012,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,16.0,4050.000000
35949,Bugatti,Veyron,2013,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,16.0,4050.000000
35950,Bugatti,Veyron,2014,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,16.0,4050.000000


# How to concatenate dataframes?

In [71]:
avg_fuel.rename(columns={'cyl':'Cylinders'}, inplace=True)

In [72]:
avg_fuel

Unnamed: 0,Cylinders,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [73]:
small_cars = avg_fuel.loc[avg_fuel['Cylinders'] < 7, :]
small_cars

Unnamed: 0,Cylinders,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624


In [74]:
big_cars = avg_fuel.loc[avg_fuel['Cylinders'] >= 7, :]
big_cars

Unnamed: 0,Cylinders,avg_fuel
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [75]:
pd.concat([small_cars, big_cars])

Unnamed: 0,Cylinders,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [76]:
pd.concat([small_cars, big_cars], axis=1).reset_index(drop=True)

Unnamed: 0,Cylinders,avg_fuel,Cylinders.1,avg_fuel.1
0,2.0,2004.166667,,
1,3.0,962.437811,,
2,4.0,1487.879798,,
3,5.0,1813.278008,,
4,6.0,1943.19624,,
5,,,8.0,2414.734934
6,,,10.0,2926.797386
7,,,12.0,3143.149466
8,,,16.0,4050.0


In [77]:
pd.concat([avg_fuel, data], axis=1)

Unnamed: 0,Cylinders,avg_fuel,Manufacturer,Model,Year,Engine Displacement,Cylinders.1,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,2.0,2004.166667,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,3.0,962.437811,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,4.0,1487.879798,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,5.0,1813.278008,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,6.0,1943.196240,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,,,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,,,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,,,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,,,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


# Bins
> ```pd.cut``` vs ```pd.qcut```
> - Specify cutoffs

> - Use case
>     - Scores ~ decis (0,1,2,3,4,5,6,7,8,9)


Suppose I want to break the values of the variable `Combined MPG` into 5 categories: From Very Low to Very High

In [78]:
data['Fuel Barrels/Year']

0        19.388824
1        25.354615
2        20.600625
3        25.354615
4        20.600625
           ...    
35947     9.155833
35948     9.155833
35949     9.155833
35950     9.155833
35951     9.417429
Name: Fuel Barrels/Year, Length: 35952, dtype: float64

In [79]:
data['Fuel Barrels/Year'].describe()

count    35952.000000
mean        17.609056
std          4.467283
min          0.060000
25%         14.699423
50%         17.347895
75%         20.600625
max         47.087143
Name: Fuel Barrels/Year, dtype: float64

In [80]:
pd.cut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA'])

0         M
1         M
2         M
3         M
4         M
         ..
35947    MB
35948    MB
35949    MB
35950    MB
35951    MB
Name: Fuel Barrels/Year, Length: 35952, dtype: category
Categories (5, object): ['MB' < 'B' < 'M' < 'A' < 'MA']

In [81]:
data['cat_barrel_year'] = pd.cut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA'])
data['cat_barrel_year']

0         M
1         M
2         M
3         M
4         M
         ..
35947    MB
35948    MB
35949    MB
35950    MB
35951    MB
Name: cat_barrel_year, Length: 35952, dtype: category
Categories (5, object): ['MB' < 'B' < 'M' < 'A' < 'MA']

In [82]:
# Count the values of the 'cat_barrel_year'

In [84]:
data['cat_barrel_year'].value_counts()

B     23804
M     11062
A       605
MB      455
MA       26
Name: cat_barrel_year, dtype: int64

In [85]:
data.groupby(by='cat_barrel_year').count()['Engine Displacement']

cat_barrel_year
MB      455
B     23804
M     11062
A       605
MA       26
Name: Engine Displacement, dtype: int64

In [86]:
data['cat_barrel_year'].value_counts().sort_values(ascending=True)

MA       26
MB      455
A       605
M     11062
B     23804
Name: cat_barrel_year, dtype: int64

In [94]:
pd.qcut(data['Fuel Barrels/Year'], 5).value_counts()

(15.696, 18.312]    9871
(13.734, 15.696]    7196
(0.059, 13.734]     7192
(20.601, 47.087]    7080
(18.312, 20.601]    4613
Name: Fuel Barrels/Year, dtype: int64

In [95]:
# performing the pd.cut operation without specifiyng the labels outputs the intervals
bins = pd.cut(data['Fuel Cost/Year'], 5)
bins

0        (1640.0, 2680.0]
1        (1640.0, 2680.0]
2        (1640.0, 2680.0]
3        (1640.0, 2680.0]
4        (1640.0, 2680.0]
               ...       
35947     (594.8, 1640.0]
35948     (594.8, 1640.0]
35949     (594.8, 1640.0]
35950     (594.8, 1640.0]
35951     (594.8, 1640.0]
Name: Fuel Cost/Year, Length: 35952, dtype: category
Categories (5, interval[float64, right]): [(594.8, 1640.0] < (1640.0, 2680.0] < (2680.0, 3720.0] < (3720.0, 4760.0] < (4760.0, 5800.0]]

In [96]:
data['Fuel Cost/Year'].describe()

count    35952.000000
mean      1892.598465
std        506.958627
min        600.000000
25%       1500.000000
50%       1850.000000
75%       2200.000000
max       5800.000000
Name: Fuel Cost/Year, dtype: float64

In [97]:
# performing the pd.cut operation using the labels argument outputs your labels
mpg_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
bins = pd.cut(data['Fuel Cost/Year'], 5, labels=mpg_labels)
bins.head(10)

0         Low
1         Low
2         Low
3         Low
4         Low
5    Very Low
6    Very Low
7         Low
8    Very Low
9    Very Low
Name: Fuel Cost/Year, dtype: category
Categories (5, object): ['Very Low' < 'Low' < 'Moderate' < 'High' < 'Very High']

In [98]:
bins = pd.cut(data['Fuel Cost/Year'], [600,1500,1850,2200,5800],labels=['MB','B','A','MA'])
bins.value_counts()

B     10004
MB     9350
A      8993
MA     7603
Name: Fuel Cost/Year, dtype: int64

In [99]:
bins = pd.qcut(data['Fuel Cost/Year'],5, labels=mpg_labels)
bins.head(10)

0     Moderate
1    Very High
2         High
3    Very High
4    Very High
5          Low
6     Very Low
7          Low
8          Low
9     Very Low
Name: Fuel Cost/Year, dtype: category
Categories (5, object): ['Very Low' < 'Low' < 'Moderate' < 'High' < 'Very High']

# Convert categorical variables columns

>    - dummies
>    - One hot encoding

In [100]:
data['cat_barrel_year'].unique()

['M', 'B', 'MB', 'A', 'MA']
Categories (5, object): ['MB' < 'B' < 'M' < 'A' < 'MA']

In [101]:
data[['cat_barrel_year']]
# count the values within each category

Unnamed: 0,cat_barrel_year
0,M
1,M
2,M
3,M
4,M
...,...
35947,MB
35948,MB
35949,MB
35950,MB


In [102]:
pd.get_dummies(data['cat_barrel_year'])

Unnamed: 0,MB,B,M,A,MA
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0
...,...,...,...,...,...
35947,1,0,0,0,0
35948,1,0,0,0,0
35949,1,0,0,0,0
35950,1,0,0,0,0


# Resumo
**Groupby**
* nome_dataframe.groupby(‘coluna de agregação’).função_agregadora()
* Função agragadora = média, moda, mediana, min,max,describe - qualquer função que receba vários valores e retornam apenas um, o groupby aceita funções personalizadas
* nome_dataframe.groupby(‘coluna de agregação’).get_group(‘valor especifico’) - pega um grupo especifico
* nome_dataframe.groupby(‘coluna de agregação’).agg(função_agregadora) - função pode estar em string, nome de função, lista, dicionario, set, novo_nome = (‘nome da coluna’,função agregadora)
* dataframe.reset_index() - transforma o indice em coluna 
* nome_dataframe.groupby(‘coluna de agregação’, as_ index = False).função_agregadora()
* **União de tabelas**
* Merge
* pd.merge(dataframe_esquerda,dataframe_direita,how=tipo de join)
* Tipo join = inner, left, right, outter
* pd.merge(dataframe_esquerda,dataframe_direita,right_on=’nome da coluna chave’, left_on=’nome da coluna chave’)
* Concat
* pd.concat([lista de dataframes]) - junta as linhas nas mesmas colunas
