# Data Manipulation - Groups and Joins

In [1]:
import pandas as pd

# Toy dataset: a name column ('nome') plus two numeric columns ('nota', 'nps').
# Names repeat on purpose so there is something to group by.
nomes = ['Leticia', 'Leticia', 'Leticia', 'Eduardo', 'Leticia',
         'Leticia', 'Matheus', 'Thiago', 'Thiago', 'Eduardo']
notas = [9, 7, 9, 7, 9, 7, 8, 10, 9, 8]
nps_values = [5, 4, 4.5, 4, 4, 5, 3, 3.5, 3, 4]

df = pd.DataFrame({'nome': nomes, 'nota': notas, 'nps': nps_values})

df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [11]:
mask = df['nome']=='Leticia'

In [12]:
df[mask]['nota'].mean()

8.2

In [8]:
df[df['nome']=='Leticia']['nota'].mean()

8.2

In [5]:
df.loc[df['nome']=='Leticia','nota'].mean()

8.2

In [6]:
df.query('nome == "Leticia"')['nota'].mean()

8.2

In [7]:
df['nome'] == 'Leticia'

0     True
1     True
2     True
3    False
4     True
5     True
6    False
7    False
8    False
9    False
Name: nome, dtype: bool

In [13]:
mask = df['nome'] == 'Leticia'
mask

0     True
1     True
2     True
3    False
4     True
5     True
6    False
7    False
8    False
9    False
Name: nome, dtype: bool

In [14]:
df.loc[mask, :]

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
4,Leticia,9,4.0
5,Leticia,7,5.0


In [15]:
df.loc[mask, 'nota'].mean()

8.2

In [16]:
df['nome'].unique()

array(['Leticia', 'Eduardo', 'Matheus', 'Thiago'], dtype=object)

In [17]:
mask = df['nome'] == 'Eduardo'
mask

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8    False
9     True
Name: nome, dtype: bool

In [18]:
df.loc[mask, :]

Unnamed: 0,nome,nota,nps
3,Eduardo,7,4.0
9,Eduardo,8,4.0


In [20]:
df.loc[mask, 'nota'].mean()

7.5

In [18]:
df.nome.unique()

array(['Leticia', 'Eduardo', 'Matheus', 'Thiago'], dtype=object)

# Como fazer essa operação para todos os nomes únicos?

- `.groupby()` é uma forma de **agregar** todos os resultados para cada chave única
- sempre que você faz uma **agregação**, o resultado final terá 1 linha para cada valor pelo qual você agregou, portanto, é obrigatório que se aplique uma função agregadora para que todos os valores sejam sumarizados em um único valor associado àquela chave.

Por exemplo, se tivermos:

Nome | Nota
-----|-----
Andre | 10
Andre | 8 
Andre | 6
Joao  | 10
Joao  | 4

O resultado de um `.groupby` por 'Nome' resultaria em 2 linhas

Nome | xxxx
----|-----
Andre| *
Joao | *

O asterisco representa o valor agregado. Isto é, não há como trazer os valores 10, 8 e 6 associados a Andre. Temos, obrigatoriamente, que sumarizá-los em um único dado. Para isso, podemos fazer a média entre 10, 8 e 6 (que seria 8), a soma (que seria 24), ou qualquer outra função agregadora. Assim teríamos um único valor sumarizado para cada valor da chave 'Nome'.

In [21]:
df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [24]:
df_groups = df.groupby(by='nome')

In [30]:
# The group's frame still contains the string column 'nome'; restrict the mean
# to numeric columns so this also runs on pandas >= 2.0 (which no longer drops
# non-numeric columns silently).
df_groups.get_group('Leticia').mean(numeric_only=True)

nota    8.2
nps     4.5
dtype: float64

In [23]:
df.groupby(by='nome').mean()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,7.5,4.0
Leticia,8.2,4.5
Matheus,8.0,3.0
Thiago,9.5,3.25


In [31]:
df.groupby(by='nome').max()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,8,4.0
Leticia,9,5.0
Matheus,8,3.0
Thiago,10,3.5


In [34]:
df.groupby(by='nome').min()

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,7,4.0
Leticia,7,4.0
Matheus,8,3.0
Thiago,9,3.0


## Aggregating methods

- `.mean()`
- `.median()`
- `.max()`
- `.min()`
- `.sum()`
- `.count()`
- `.describe()`
- `.agg()`

In [35]:
df.groupby('nome').describe()

Unnamed: 0_level_0,nota,nota,nota,nota,nota,nota,nota,nota,nps,nps,nps,nps,nps,nps,nps,nps
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Eduardo,2.0,7.5,0.707107,7.0,7.25,7.5,7.75,8.0,2.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0
Leticia,5.0,8.2,1.095445,7.0,7.0,9.0,9.0,9.0,5.0,4.5,0.5,4.0,4.0,4.5,5.0,5.0
Matheus,1.0,8.0,,8.0,8.0,8.0,8.0,8.0,1.0,3.0,,3.0,3.0,3.0,3.0,3.0
Thiago,2.0,9.5,0.707107,9.0,9.25,9.5,9.75,10.0,2.0,3.25,0.353553,3.0,3.125,3.25,3.375,3.5


### More than one aggregation

In [36]:
x = df.groupby('nome')

In [40]:
# `.agg()` needs to be told which aggregation to apply; called with no
# arguments it raises a TypeError. 'mean' reproduces the table shown below.
x.agg('mean')

Unnamed: 0_level_0,nota,nps
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Eduardo,7.5,4.0
Leticia,8.2,4.5
Matheus,8.0,3.0
Thiago,9.5,3.25


In [38]:
import numpy as np

In [42]:
# Rebuild the toy dataset from scratch: 'nome' is the grouping key,
# 'nota' and 'nps' are the numeric columns to aggregate.
columns = {
    'nome': ['Leticia', 'Leticia', 'Leticia', 'Eduardo', 'Leticia',
             'Leticia', 'Matheus', 'Thiago', 'Thiago', 'Eduardo'],
    'nota': [9, 7, 9, 7, 9, 7, 8, 10, 9, 8],
    'nps': [5, 4, 4.5, 4, 4, 5, 3, 3.5, 3, 4],
}
df = pd.DataFrame(columns)

df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [44]:
# Several aggregations at once: pass a list of aggregation names.
# String aliases are used instead of `np.mean` (deprecated as an `agg` argument
# in recent pandas), and the list matches the max/min/mean table shown below.
df.groupby('nome').agg(['max', 'min', 'mean'])

Unnamed: 0_level_0,nota,nota,nota,nps,nps,nps
Unnamed: 0_level_1,max,min,mean,max,min,mean
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Eduardo,8,7,7.5,4.0,4.0,4.0
Leticia,9,7,8.2,5.0,4.0,4.5
Matheus,8,8,8.0,3.0,3.0,3.0
Thiago,10,9,9.5,3.5,3.0,3.25


In [43]:
# read the `.agg()` help (shift+TAB) to learn which aggregation methods it can handle

# Use a list rather than a set: a set has no defined order, so the order of the
# resulting columns could change between runs.
df.groupby(by='nome').agg(['max', 'mean', 'min'])

Unnamed: 0_level_0,nota,nota,nota,nps,nps,nps
Unnamed: 0_level_1,max,mean,min,max,mean,min
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Eduardo,8,7.5,7,4.0,4.0,4.0
Leticia,9,8.2,7,5.0,4.5,4.0
Matheus,8,8.0,8,3.0,3.0,3.0
Thiago,10,9.5,9,3.5,3.25,3.0


In [45]:
grouped_results = df.groupby(by='nome').agg({'nota': 'mean', 
                                             'nps': ['min','max']})

In [46]:
grouped_results

Unnamed: 0_level_0,nota,nps,nps
Unnamed: 0_level_1,mean,min,max
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Eduardo,7.5,4.0,4.0
Leticia,8.2,4.0,5.0
Matheus,8.0,3.0,3.0
Thiago,9.5,3.0,3.5


## Named aggregation

In [50]:
# Named aggregation: each keyword is an output column, each value is a
# (source column, aggregation) pair. String aliases are used throughout;
# passing the builtins `max`/`min` or `np.mean` raises a FutureWarning in
# recent pandas and mixes styles within the same call.
grouped_results = df.groupby(by='nome').agg(nota_max=('nota', 'max'),
                                            nota_min=('nota', 'min'),
                                            nota_avg=('nota', 'mean'),
                                            nps_mean=('nps', 'mean'))

In [52]:
grouped_results

Unnamed: 0_level_0,nota_max,nota_min,nota_avg,nps_mean
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Eduardo,8,7,7.5,4.0
Leticia,9,7,8.2,4.5
Matheus,8,8,8.0,3.0
Thiago,10,9,9.5,3.25


In [53]:
grouped_results.loc['Thiago',:]

nota_max    10.00
nota_min     9.00
nota_avg     9.50
nps_mean     3.25
Name: Thiago, dtype: float64

In [54]:
grouped_results.reset_index()

Unnamed: 0,nome,nota_max,nota_min,nota_avg,nps_mean
0,Eduardo,8,7,7.5,4.0
1,Leticia,9,7,8.2,4.5
2,Matheus,8,8,8.0,3.0
3,Thiago,10,9,9.5,3.25


In [55]:
# Same named aggregation, but `as_index=False` keeps 'nome' as a regular column
# instead of the index (no `.reset_index()` needed afterwards).
# 'min' is passed as a string like the other aggregations; the builtin `min`
# raises a FutureWarning in recent pandas.
grouped_results = df.groupby(by='nome', as_index=False).agg(nota_max=('nota', 'max'),
                                                            nota_min=('nota', 'min'),
                                                            nota_avg=('nota', 'mean'),
                                                            nps_mean=('nps', 'mean'))

In [56]:
grouped_results

Unnamed: 0,nome,nota_max,nota_min,nota_avg,nps_mean
0,Eduardo,8,7,7.5,4.0
1,Leticia,9,7,8.2,4.5
2,Matheus,8,8,8.0,3.0
3,Thiago,10,9,9.5,3.25


# Group by 
>    - Aggregating function
>    - Named aggregation
>    - `as_index = False`

In [57]:
# Load the vehicles CSV and expose its 'Make' column under the name 'Manufacturer'.
data = pd.read_csv('data/vehicles.csv').rename(columns={'Make': 'Manufacturer'})

In [58]:
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [60]:
groups_vehicles = data.groupby(by='Manufacturer')

In [64]:
# The group's frame contains text columns (Model, Transmission, ...); average
# only the numeric ones so the call does not raise on pandas >= 2.0.
groups_vehicles.get_group('AM General').mean(numeric_only=True)

Year                       1984.500000
Engine Displacement           3.350000
Cylinders                     5.000000
Fuel Barrels/Year            22.674670
City MPG                     15.000000
Highway MPG                  15.000000
Combined MPG                 14.750000
CO2 Emission Grams/Mile     611.358244
Fuel Cost/Year             2287.500000
dtype: float64

In [63]:
# `numeric_only=True` skips the text columns explicitly; pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
data.groupby(by='Manufacturer').mean(numeric_only=True)

Unnamed: 0_level_0,Year,Engine Displacement,Cylinders,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AM General,1984.500000,3.350000,5.000000,22.674670,15.000000,15.000000,14.750000,611.358244,2287.500000
ASC Incorporated,1987.000000,3.800000,6.000000,20.600625,14.000000,21.000000,16.000000,555.437500,2550.000000
Acura,2003.493377,2.834768,5.231788,15.673371,18.890728,25.940397,21.506623,422.585325,1852.483444
Alfa Romeo,1991.878049,2.556098,5.317073,17.208234,17.097561,23.902439,19.512195,463.952115,1962.195122
American Motors Corporation,1984.590909,3.813636,5.545455,18.758092,16.045455,20.181818,17.681818,505.758823,1893.181818
...,...,...,...,...,...,...,...,...,...
Volkswagen,2002.928367,2.236008,4.595033,14.594784,21.226361,28.985673,24.093601,392.741721,1579.417383
Volvo,2002.182706,2.504742,4.945607,16.186996,17.981869,25.064156,20.605300,435.803755,1812.273361
Wallace Environmental,1991.500000,4.315625,7.812500,24.404196,12.437500,16.000000,13.875000,657.990029,2996.875000
Yugo,1988.375000,1.200000,4.000000,13.206218,23.000000,28.250000,25.000000,356.068256,1350.000000


In [66]:
# Aggregate every numeric column, then keep a single one. This is the
# "aggregate first, select later" variant that is timed below.
# `numeric_only=True` is required on pandas >= 2.0 because of the text columns.
data.groupby(by='Manufacturer').mean(numeric_only=True)[['Engine Displacement']]

Unnamed: 0_level_0,Engine Displacement
Manufacturer,Unnamed: 1_level_1
AM General,3.350000
ASC Incorporated,3.800000
Acura,2.834768
Alfa Romeo,2.556098
American Motors Corporation,3.813636
...,...
Volkswagen,2.236008
Volvo,2.504742
Wallace Environmental,4.315625
Yugo,1.200000


In [65]:
%%timeit 

data.groupby(by='Manufacturer').mean()[['Engine Displacement']]

26.4 ms ± 6.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [113]:
%%timeit

data.groupby(by='Manufacturer')[['Engine Displacement']].mean()

3.03 ms ± 85.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [67]:
data.groupby(by='Manufacturer')[['Engine Displacement']].mean()

Unnamed: 0_level_0,Engine Displacement
Manufacturer,Unnamed: 1_level_1
AM General,3.350000
ASC Incorporated,3.800000
Acura,2.834768
Alfa Romeo,2.556098
American Motors Corporation,3.813636
...,...
Volkswagen,2.236008
Volvo,2.504742
Wallace Environmental,4.315625
Yugo,1.200000


In [118]:
# Select the column before aggregating (the faster form timed above, and it
# avoids averaging text columns, which fails on pandas >= 2.0), then turn the
# 'Manufacturer' index back into a regular column.
data.groupby(by='Manufacturer')[['Engine Displacement']].mean().reset_index()

Unnamed: 0,Manufacturer,Engine Displacement
0,AM General,3.350000
1,ASC Incorporated,3.800000
2,Acura,2.834768
3,Alfa Romeo,2.556098
4,American Motors Corporation,3.813636
...,...,...
122,Volkswagen,2.236008
123,Volvo,2.504742
124,Wallace Environmental,4.315625
125,Yugo,1.200000


In [114]:
data.groupby(by='Manufacturer')[['Engine Displacement']].mean()

Unnamed: 0_level_0,Engine Displacement
Manufacturer,Unnamed: 1_level_1
AM General,3.350000
ASC Incorporated,3.800000
Acura,2.834768
Alfa Romeo,2.556098
American Motors Corporation,3.813636
...,...
Volkswagen,2.236008
Volvo,2.504742
Wallace Environmental,4.315625
Yugo,1.200000


In [119]:
data.groupby(by='Cylinders')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002A2CC17BCC0>

In [120]:
# Average of every numeric column per cylinder count; text columns are skipped
# explicitly because pandas >= 2.0 no longer drops them silently.
data.groupby(by='Cylinders').mean(numeric_only=True)

Unnamed: 0_level_0,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2.0,1997.0625,1.239583,17.503468,17.0,22.9375,19.104167,471.734739,2004.166667
3.0,1996.38806,1.052239,9.160623,34.00995,40.323383,36.572139,246.695246,962.437811
4.0,1999.780643,2.06657,14.120702,21.560323,28.272417,24.075441,380.939902,1487.879798
5.0,2002.64177,2.636653,16.514187,17.785615,24.68603,20.334716,444.828844,1813.278008
6.0,2001.294242,3.439342,18.086572,16.328946,22.661261,18.606189,487.609906,1943.19624
8.0,2000.77832,5.222581,22.3254,13.323331,18.537134,15.206302,604.159066,2414.734934
10.0,2008.777778,5.911765,24.182393,11.653595,18.366013,13.941176,652.086493,2926.797386
12.0,2006.218861,5.907473,25.831975,10.893238,16.969751,13.014235,696.034399,3143.149466
16.0,2011.125,8.0,32.961,8.0,14.625,10.0,873.0625,4050.0


In [122]:
# Select the column first so only 'Fuel Cost/Year' is aggregated: same result,
# less work, and no attempt to average text columns (an error on pandas >= 2.0).
data.groupby(by='Cylinders')[['Fuel Cost/Year']].mean()

Unnamed: 0_level_0,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1
2.0,2004.166667
3.0,962.437811
4.0,1487.879798
5.0,1813.278008
6.0,1943.19624
8.0,2414.734934
10.0,2926.797386
12.0,3143.149466
16.0,4050.0


In [76]:
# Mean number of cylinders, and max / min year, for each vehicle class.
# Aggregations are given as string aliases; passing `np.mean` or the builtins
# `max`/`min` raises a FutureWarning in recent pandas.
data.groupby('Vehicle Class').agg({'Cylinders': 'mean', 'Year': ['max', 'min']})
# Equivalent with named aggregation (flat column names):
#data.groupby('Vehicle Class').agg(mean_cyl=('Cylinders','mean'),max_year=('Year','max'),min_year=('Year','min')).reset_index()

Unnamed: 0_level_0,Cylinders,Year,Year
Unnamed: 0_level_1,mean,max,min
Vehicle Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Compact Cars,4.837223,2017,1985
Large Cars,7.128588,2017,1984
Midsize Cars,5.665026,2017,1985
Midsize Station Wagons,5.173077,2017,1998
Midsize-Large Station Wagons,5.113744,1997,1985
Minicompact Cars,5.764026,2017,1985
Minivan - 2WD,5.800604,2017,1999
Minivan - 4WD,6.0,2017,1999
Small Pickup Trucks,4.648855,1997,1985
Small Pickup Trucks 2WD,4.802993,2017,1984


In [77]:
grouped_results = data.groupby(by='Cylinders', as_index=False).agg(mean_fuel_cost = ('Fuel Cost/Year','mean'),
                                                               count_vehicles = ('Fuel Cost/Year','count'))

In [78]:
grouped_results

Unnamed: 0,Cylinders,mean_fuel_cost,count_vehicles
0,2.0,2004.166667,48
1,3.0,962.437811,201
2,4.0,1487.879798,13494
3,5.0,1813.278008,723
4,6.0,1943.19624,12765
5,8.0,2414.734934,7998
6,10.0,2926.797386,153
7,12.0,3143.149466,562
8,16.0,4050.0,8


In [79]:
type(grouped_results)

pandas.core.frame.DataFrame

In [135]:
type(grouped_results.style)

pandas.io.formats.style.Styler

In [82]:
styled_dataframe = grouped_results.style

In [81]:
styled_dataframe.highlight_max(color='Brown')

Unnamed: 0,Cylinders,mean_fuel_cost,count_vehicles
0,2.0,2004.166667,48
1,3.0,962.437811,201
2,4.0,1487.879798,13494
3,5.0,1813.278008,723
4,6.0,1943.19624,12765
5,8.0,2414.734934,7998
6,10.0,2926.797386,153
7,12.0,3143.149466,562
8,16.0,4050.0,8


In [83]:
grouped_results[['mean_fuel_cost']]

Unnamed: 0,mean_fuel_cost
0,2004.166667
1,962.437811
2,1487.879798
3,1813.278008
4,1943.19624
5,2414.734934
6,2926.797386
7,3143.149466
8,4050.0


In [84]:
grouped_results[['mean_fuel_cost']].style.format('{:.2f}')

Unnamed: 0,mean_fuel_cost
0,2004.17
1,962.44
2,1487.88
3,1813.28
4,1943.2
5,2414.73
6,2926.8
7,3143.15
8,4050.0


In [85]:
grouped_results.style.format({'mean_fuel_cost':'{:.2f}'}).background_gradient()

Unnamed: 0,Cylinders,mean_fuel_cost,count_vehicles
0,2.0,2004.17,48
1,3.0,962.44,201
2,4.0,1487.88,13494
3,5.0,1813.28,723
4,6.0,1943.2,12765
5,8.0,2414.73,7998
6,10.0,2926.8,153
7,12.0,3143.15,562
8,16.0,4050.0,8


In [158]:
# Per-cylinder averages of the numeric columns (text columns excluded
# explicitly, as required by pandas >= 2.0).
data.groupby(by='Cylinders').mean(numeric_only=True)

Unnamed: 0_level_0,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2.0,1997.0625,1.239583,17.503468,17.0,22.9375,19.104167,471.734739,2004.166667
3.0,1996.38806,1.052239,9.160623,34.00995,40.323383,36.572139,246.695246,962.437811
4.0,1999.780643,2.06657,14.120702,21.560323,28.272417,24.075441,380.939902,1487.879798
5.0,2002.64177,2.636653,16.514187,17.785615,24.68603,20.334716,444.828844,1813.278008
6.0,2001.294242,3.439342,18.086572,16.328946,22.661261,18.606189,487.609906,1943.19624
8.0,2000.77832,5.222581,22.3254,13.323331,18.537134,15.206302,604.159066,2414.734934
10.0,2008.777778,5.911765,24.182393,11.653595,18.366013,13.941176,652.086493,2926.797386
12.0,2006.218861,5.907473,25.831975,10.893238,16.969751,13.014235,696.034399,3143.149466
16.0,2011.125,8.0,32.961,8.0,14.625,10.0,873.0625,4050.0


In [159]:
data.head()

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [188]:
# Group by two keys at once. Selecting 'Combined MPG' before calling `.mean()`
# aggregates only that column (faster, and it does not try to average text
# columns, which raises on pandas >= 2.0). `reset_index()` turns the
# (Year, Cylinders) MultiIndex back into regular columns.
avg_mpg_two_keys = data.groupby(by=['Year', 'Cylinders'])[['Combined MPG']].mean().reset_index()

In [189]:
avg_mpg_two_keys

Unnamed: 0,Year,Cylinders,Combined MPG
0,1984,4.0,21.333333
1,1984,6.0,15.838235
2,1984,8.0,13.848101
3,1985,2.0,17.000000
4,1985,3.0,39.000000
...,...,...,...
249,2017,4.0,27.233766
250,2017,6.0,21.171004
251,2017,8.0,17.568750
252,2017,10.0,16.285714


In [191]:
avg_mpg_two_keys.index.get_level_values(0)

RangeIndex(start=0, stop=254, step=1)

In [196]:
df.groupby(by='nome').agg(['max','min']).columns.get_level_values(1)

Index(['max', 'min', 'max', 'min'], dtype='object')

In [202]:
df.groupby(by='nome').agg(['max','min']).columns

MultiIndex([('nota', 'max'),
            ('nota', 'min'),
            ( 'nps', 'max'),
            ( 'nps', 'min')],
           )

In [200]:
x = df.groupby(by='nome').agg(['max','min']).columns

-----

# JOINs

How to merge dataframes based on a specific column

In [203]:
df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [205]:
# Second table to join against: one row per name with a state code ('estado').
# Only some of these names also appear in `df`.
nomes_estados = ['Andre', 'Matheus', 'Eduardo', 'Alexandre', 'Gabriela']
siglas = ['SP', 'PR', 'MG', 'SP', 'MS']
df_estados = pd.DataFrame({'nome': nomes_estados, 'estado': siglas})

df_estados

Unnamed: 0,nome,estado
0,Andre,SP
1,Matheus,PR
2,Eduardo,MG
3,Alexandre,SP
4,Gabriela,MS


In [206]:
# Inner join on the shared 'nome' column. This is what `pd.merge` does by
# default when no key or join type is given; here both are spelled out.
df.merge(df_estados, how='inner', on='nome')

Unnamed: 0,nome,nota,nps,estado
0,Eduardo,7,4.0,MG
1,Eduardo,8,4.0,MG
2,Matheus,8,3.0,PR


In [210]:
# Redefine the lookup table with two changes that matter for the joins below:
# the key column is now called 'name' (not 'nome'), and 'Eduardo' appears
# twice, with two different states.
names = ['Andre', 'Matheus', 'Eduardo', 'Alexandre', 'Eduardo']
siglas = ['SP', 'PR', 'MG', 'SP', 'MS']
df_estados = pd.DataFrame({'name': names, 'estado': siglas})

df_estados

Unnamed: 0,name,estado
0,Andre,SP
1,Matheus,PR
2,Eduardo,MG
3,Alexandre,SP
4,Eduardo,MS


In [211]:
pd.merge(left=df, right=df_estados, left_on='nome', right_on='name')

Unnamed: 0,nome,nota,nps,name,estado
0,Eduardo,7,4.0,Eduardo,MG
1,Eduardo,7,4.0,Eduardo,MS
2,Eduardo,8,4.0,Eduardo,MG
3,Eduardo,8,4.0,Eduardo,MS
4,Matheus,8,3.0,Matheus,PR


## Types of Joins

![image-asset.png](data/image.png)

In [212]:
df

Unnamed: 0,nome,nota,nps
0,Leticia,9,5.0
1,Leticia,7,4.0
2,Leticia,9,4.5
3,Eduardo,7,4.0
4,Leticia,9,4.0
5,Leticia,7,5.0
6,Matheus,8,3.0
7,Thiago,10,3.5
8,Thiago,9,3.0
9,Eduardo,8,4.0


In [215]:
pd.merge(left=df, right=df_estados, left_on='nome', right_on='name', how='left')

Unnamed: 0,nome,nota,nps,name,estado
0,Leticia,9,5.0,,
1,Leticia,7,4.0,,
2,Leticia,9,4.5,,
3,Eduardo,7,4.0,Eduardo,MG
4,Eduardo,7,4.0,Eduardo,MS
5,Leticia,9,4.0,,
6,Leticia,7,5.0,,
7,Matheus,8,3.0,Matheus,PR
8,Thiago,10,3.5,,
9,Thiago,9,3.0,,


In [217]:
pd.merge(left=df, right=df_estados, left_on='nome', right_on='name', how='outer')

Unnamed: 0,nome,nota,nps,name,estado
0,Leticia,9.0,5.0,,
1,Leticia,7.0,4.0,,
2,Leticia,9.0,4.5,,
3,Leticia,9.0,4.0,,
4,Leticia,7.0,5.0,,
5,Eduardo,7.0,4.0,Eduardo,MG
6,Eduardo,7.0,4.0,Eduardo,MS
7,Eduardo,8.0,4.0,Eduardo,MG
8,Eduardo,8.0,4.0,Eduardo,MS
9,Matheus,8.0,3.0,Matheus,PR


## Using our vehicles dataframe

In [218]:
data

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


In [224]:
# Mean yearly fuel cost per cylinder count, with 'Cylinders' kept as a regular
# column so the result can be merged back onto `data`.
avg_fuel = data.groupby('Cylinders', as_index=False).agg(avg_fuel=('Fuel Cost/Year', 'mean'))
avg_fuel

Unnamed: 0,Cylinders,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [222]:
data

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


In [225]:
pd.merge(left=data, right=avg_fuel, on='Cylinders')

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,avg_fuel
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,1487.879798
1,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,1487.879798
2,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,20,26,22,403.954545,1500,1487.879798
3,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,22,28,24,370.291667,1400,1487.879798
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,19,27,22,403.954545,1500,1487.879798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,Bugatti,Veyron,2011,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,4050.000000
35948,Bugatti,Veyron,2012,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,4050.000000
35949,Bugatti,Veyron,2013,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,4050.000000
35950,Bugatti,Veyron,2014,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,4050.000000


## What if we had different names?

In [226]:
# Give the key column a different name than in `data`, to show how to join
# when the key columns do not share a name.
avg_fuel = avg_fuel.rename(columns={'Cylinders': 'cyl'})

In [227]:
avg_fuel.head(2)

Unnamed: 0,cyl,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811


In [228]:
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [229]:
pd.merge(left=data, right=avg_fuel, left_on='Cylinders', right_on='cyl')

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,cyl,avg_fuel
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,4.0,1487.879798
1,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,4.0,1487.879798
2,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,20,26,22,403.954545,1500,4.0,1487.879798
3,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,22,28,24,370.291667,1400,4.0,1487.879798
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,19,27,22,403.954545,1500,4.0,1487.879798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,Bugatti,Veyron,2011,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,16.0,4050.000000
35948,Bugatti,Veyron,2012,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,16.0,4050.000000
35949,Bugatti,Veyron,2013,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,16.0,4050.000000
35950,Bugatti,Veyron,2014,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,16.0,4050.000000


# How to concatenate dataframes?

In [232]:
avg_fuel

Unnamed: 0,cyl,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [231]:
small_cars = avg_fuel.loc[avg_fuel['cyl'] < 7, :]
small_cars

Unnamed: 0,cyl,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624


In [233]:
big_cars = avg_fuel.loc[avg_fuel['cyl'] >= 7, :]

In [234]:
small_cars

Unnamed: 0,cyl,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624


In [242]:
big_cars = big_cars.reset_index(drop=True)

In [243]:
big_cars

Unnamed: 0,cyl,avg_fuel
0,8.0,2414.734934
1,10.0,2926.797386
2,12.0,3143.149466
3,16.0,4050.0


In [248]:
# Stack the two frames vertically and renumber the rows 0..n-1 in one step.
pd.concat([small_cars, big_cars], ignore_index=True)

Unnamed: 0,cyl,avg_fuel
0,2.0,2004.166667
1,3.0,962.437811
2,4.0,1487.879798
3,5.0,1813.278008
4,6.0,1943.19624
5,8.0,2414.734934
6,10.0,2926.797386
7,12.0,3143.149466
8,16.0,4050.0


In [249]:
pd.concat([small_cars, big_cars], axis=1).reset_index(drop=True)

Unnamed: 0,cyl,avg_fuel,cyl.1,avg_fuel.1
0,2.0,2004.166667,8.0,2414.734934
1,3.0,962.437811,10.0,2926.797386
2,4.0,1487.879798,12.0,3143.149466
3,5.0,1813.278008,16.0,4050.0
4,6.0,1943.19624,,


# Bins
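> ```pd.cut``` vs ```pd.qcut```
> - `pd.cut`: equal-width bins, or explicit cutoffs that you specify
> - `pd.qcut`: quantile-based bins, with roughly the same number of rows in each bin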

> - Use case
>     - Scores ~ decis (0,1,2,3,4,5,6,7,8,9)


Suppose I want to break the values of a numeric variable (below, `Fuel Barrels/Year` and then `Fuel Cost/Year`) into 5 categories: from Very Low to Very High.

In [250]:
data['Fuel Barrels/Year']

0        19.388824
1        25.354615
2        20.600625
3        25.354615
4        20.600625
           ...    
35947     9.155833
35948     9.155833
35949     9.155833
35950     9.155833
35951     9.417429
Name: Fuel Barrels/Year, Length: 35952, dtype: float64

In [251]:
data['Fuel Barrels/Year'].describe()

count    35952.000000
mean        17.609056
std          4.467283
min          0.060000
25%         14.699423
50%         17.347895
75%         20.600625
max         47.087143
Name: Fuel Barrels/Year, dtype: float64

In [252]:
data['Fuel Barrels/Year']

0        19.388824
1        25.354615
2        20.600625
3        25.354615
4        20.600625
           ...    
35947     9.155833
35948     9.155833
35949     9.155833
35950     9.155833
35951     9.417429
Name: Fuel Barrels/Year, Length: 35952, dtype: float64

In [254]:
pd.cut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA'])

0         M
1         M
2         M
3         M
4         M
         ..
35947    MB
35948    MB
35949    MB
35950    MB
35951    MB
Name: Fuel Barrels/Year, Length: 35952, dtype: category
Categories (5, object): ['MB' < 'B' < 'M' < 'A' < 'MA']

In [255]:
data['cat_barrel_year'] = pd.cut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA'])
data['cat_barrel_year']

0         M
1         M
2         M
3         M
4         M
         ..
35947    MB
35948    MB
35949    MB
35950    MB
35951    MB
Name: cat_barrel_year, Length: 35952, dtype: category
Categories (5, object): ['MB' < 'B' < 'M' < 'A' < 'MA']

In [256]:
# Mean engine displacement per fuel-consumption category.
# The column is selected before aggregating so text columns are never averaged
# (an error on pandas >= 2.0). `observed=False` states the current default for
# categorical keys explicitly: every category gets a row.
data.groupby(by='cat_barrel_year', observed=False)['Engine Displacement'].mean()

cat_barrel_year
MB    1.863077
B     2.696333
M     4.642560
A     5.755868
MA    6.000000
Name: Engine Displacement, dtype: float64

In [259]:
data['cat_barrel_year'].value_counts()

B     23804
M     11062
A       605
MB      455
MA       26
Name: cat_barrel_year, dtype: int64

In [258]:
pd.qcut(data['Fuel Barrels/Year'], 5).value_counts()

(15.696, 18.312]    9871
(13.734, 15.696]    7196
(0.059, 13.734]     7192
(20.601, 47.087]    7080
(18.312, 20.601]    4613
Name: Fuel Barrels/Year, dtype: int64

In [260]:
# Ordered bin labels, lowest to highest; passed as `labels=` to pd.cut / pd.qcut
# in the cells below (one label per bin, so five bins).
mpg_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']

In [261]:
# calling pd.cut without the `labels` argument returns, for each row, the
# interval (bin) its value falls into
bins = pd.cut(data['Fuel Cost/Year'], 5)
bins

0        (1640.0, 2680.0]
1        (1640.0, 2680.0]
2        (1640.0, 2680.0]
3        (1640.0, 2680.0]
4        (1640.0, 2680.0]
               ...       
35947     (594.8, 1640.0]
35948     (594.8, 1640.0]
35949     (594.8, 1640.0]
35950     (594.8, 1640.0]
35951     (594.8, 1640.0]
Name: Fuel Cost/Year, Length: 35952, dtype: category
Categories (5, interval[float64]): [(594.8, 1640.0] < (1640.0, 2680.0] < (2680.0, 3720.0] < (3720.0, 4760.0] < (4760.0, 5800.0]]

In [262]:
# performing the pd.cut operation using the labels argument outputs your labels
bins = pd.cut(data['Fuel Cost/Year'], 5, labels=mpg_labels)
bins.head(10)

0         Low
1         Low
2         Low
3         Low
4         Low
5    Very Low
6    Very Low
7         Low
8    Very Low
9    Very Low
Name: Fuel Cost/Year, dtype: category
Categories (5, object): ['Very Low' < 'Low' < 'Moderate' < 'High' < 'Very High']

In [263]:
bins = pd.qcut(data['Fuel Cost/Year'],5, labels=mpg_labels)
bins.head(10)

0     Moderate
1    Very High
2         High
3    Very High
4    Very High
5          Low
6     Very Low
7          Low
8          Low
9     Very Low
Name: Fuel Cost/Year, dtype: category
Categories (5, object): ['Very Low' < 'Low' < 'Moderate' < 'High' < 'Very High']

# Convert categorical variables columns

>    - dummies
>    - One hot encoding

In [266]:
data[['cat_barrel_year']]
# count the values within each category

Unnamed: 0,cat_barrel_year
0,M
1,M
2,M
3,M
4,M
...,...
35947,MB
35948,MB
35949,MB
35950,MB


In [267]:
data['cat_barrel_year'].unique()

['M', 'B', 'MB', 'A', 'MA']
Categories (5, object): ['MB' < 'B' < 'M' < 'A' < 'MA']

In [268]:
# One-hot encode the category: one indicator column per label.
# `dtype=int` keeps the 0/1 output shown below; since pandas 2.0 the default
# dtype is bool, which would display True/False instead.
pd.get_dummies(data['cat_barrel_year'], dtype=int)

Unnamed: 0,MB,B,M,A,MA
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0
...,...,...,...,...,...
35947,1,0,0,0,0
35948,1,0,0,0,0
35949,1,0,0,0,0
35950,1,0,0,0,0
