import numpy as np
# ^^^ pyforest auto-imports - don't write above this line
# Data Manipulation - PART 2

## Aggregations

In [2]:
import pandas as pd

In [47]:
# Toy gradebook used throughout the aggregation examples:
# one row per attempt, with the student name, the grade and the time taken.
df = pd.DataFrame(dict(
    nome=['Andre', 'Andre', 'Joao', 'Joao', 'Joao', 'Andre'],
    nota=[9, 7, 8, 10, 5, 8],
    tempo=[55, 34, 44, 45, 46, 60],
))

df

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,34
2,Joao,8,44
3,Joao,10,45
4,Joao,5,46
5,Andre,8,60


- selecting columns

In [5]:
# Passing a list of column names to [] returns a DataFrame with just those columns.
df[['nome','tempo']]

Unnamed: 0,nome,tempo
0,Andre,55
1,Andre,34
2,Joao,44
3,Joao,45
4,Joao,46
5,Andre,60


In [7]:
# .loc selects by label: rows labelled 0, 3 and 4, columns 'nome' and 'tempo'.
df.loc[[0,3,4], ['nome','tempo']]

Unnamed: 0,nome,tempo
0,Andre,55
3,Joao,45
4,Joao,46


In [8]:
# A bare ':' in the row position means "all rows".
df.loc[:, ['nome','tempo']]

Unnamed: 0,nome,tempo
0,Andre,55
1,Andre,34
2,Joao,44
3,Joao,45
4,Joao,46
5,Andre,60


In [19]:
# Works, but not recommended: the columns are selected first and the rows are
# sliced in a second, chained step. A single df.loc[0:3, ['nome','tempo']]
# expresses the same selection in one call.
df[['nome','tempo']].loc[0:3]

Unnamed: 0,nome,tempo
0,Andre,55
1,Andre,34
2,Joao,44
3,Joao,45


In [9]:
# A bare ':' in the column position means "all columns".
df.loc[[0,3,4], :]

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
3,Joao,10,45
4,Joao,5,46


In [12]:
# .iloc selects by integer position: rows 0, 3, 4 and columns 0 and 2
# (positions 0 and 2 are 'nome' and 'tempo' in this frame).
df.iloc[[0,3,4], [0,2]]

Unnamed: 0,nome,tempo
0,Andre,55
3,Joao,45
4,Joao,46


In [20]:
# A single column name (not a list) returns a Series rather than a DataFrame.
df['nome']

0    Andre
1    Andre
2     Joao
3     Joao
4     Joao
5    Andre
Name: nome, dtype: object

In [23]:
# Attribute-style access to the 'nota' column; yields the same Series as df['nota'].
df.nota

0     9
1     7
2     8
3    10
4     5
5     8
Name: nota, dtype: int64

In [27]:
# Element-wise comparison: produces a boolean Series, one value per row.
df['nota'] > 7

0     True
1    False
2     True
3     True
4    False
5     True
Name: nota, dtype: bool

In [32]:
# Use the boolean Series as a row filter: keep rows where nota <= 7, all columns.
df.loc[ df['nota'] <= 7 , :]

Unnamed: 0,nome,nota,tempo
1,Andre,7,34
4,Joao,5,46


In [33]:
# Same filter as the .loc version, written as a query-string expression.
df.query('nota <= 7')

Unnamed: 0,nome,nota,tempo
1,Andre,7,34
4,Joao,5,46


In [44]:
# NOTE(review): my_list is not referenced by any other cell visible in this
# notebook; it looks like a leftover scratch variable. Confirm and remove.
my_list = [1,2,3]

In [39]:
# NOTE(review): the saved output of this cell shows tempo == 0 for rows 1 and 4,
# but no visible cell assigns those values. The output is presumably stale
# (produced by a since-deleted cell); re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,0
2,Joao,8,44
3,Joao,10,45
4,Joao,5,0
5,Andre,8,60


In [50]:
# Display the frame again before building boolean masks on it.
df

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,34
2,Joao,8,44
3,Joao,10,45
4,Joao,5,46
5,Andre,8,60


In [51]:
# Boolean Series: True for the rows whose 'nome' is 'Andre'.
df['nome'] == 'Andre'

0     True
1     True
2    False
3    False
4    False
5     True
Name: nome, dtype: bool

In [52]:
# Keep the boolean Series in a variable so it can be reused as a row filter below.
mask = df['nome'] == 'Andre'
mask

0     True
1     True
2    False
3    False
4    False
5     True
Name: nome, dtype: bool

In [57]:
# Full frame, shown for comparison with the filtered and aggregated results below.
df

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,34
2,Joao,8,44
3,Joao,10,45
4,Joao,5,46
5,Andre,8,60


In [61]:
# Column-wise mean over the whole frame. numeric_only=True skips the string
# column 'nome' explicitly; without it, pandas >= 2.0 raises a TypeError
# instead of silently dropping non-numeric columns.
df.mean(numeric_only=True)

nota      7.833333
tempo    47.333333
dtype: float64

In [56]:
# Apply the mask as a row filter: only the rows where mask is True are kept.
df.loc[mask, :]

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,34
5,Andre,8,60


In [62]:
# Mean of the numeric columns over the masked rows only. numeric_only=True is
# required because the selection still contains the string column 'nome',
# which makes mean() raise a TypeError on pandas >= 2.0.
df.loc[mask, :].mean(numeric_only=True)

nota      8.000000
tempo    49.666667
dtype: float64

In [63]:
# Rebuild the mask for the other student and repeat the per-student mean.
# numeric_only=True keeps the string column 'nome' out of the reduction
# (mean() over it raises a TypeError on pandas >= 2.0).
mask = df['nome'] == 'Joao'
df.loc[mask, :].mean(numeric_only=True)

nota      7.666667
tempo    45.000000
dtype: float64

In [64]:
# Rows selected by the current mask (reassigned to the 'Joao' rows in the previous cell).
df.loc[mask, :]

Unnamed: 0,nome,nota,tempo
2,Joao,8,44
3,Joao,10,45
4,Joao,5,46


- `mean` is one form of aggregation: it reduces a set of rows to a single value per column

## Aggregate by `nome`

In [65]:
# Starting point for the groupby examples.
df

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,34
2,Joao,8,44
3,Joao,10,45
4,Joao,5,46
5,Andre,8,60


In [66]:
# groupby alone only returns a DataFrameGroupBy object; nothing is computed
# until an aggregation is applied to it.
df.groupby(by='nome')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002732EF60198>

In [67]:
# Mean of every remaining column per 'nome'; the group key becomes the index.
df.groupby(by='nome').mean()

Unnamed: 0_level_0,nota,tempo
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Andre,8.0,49.666667
Joao,7.666667,45.0


In [68]:
# Keep the per-student means so individual values can be looked up below.
results = df.groupby(by='nome').mean()
results

Unnamed: 0_level_0,nota,tempo
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Andre,8.0,49.666667
Joao,7.666667,45.0


In [72]:
# Label lookup: the group key 'Andre' is now an index label.
results.loc['Andre', 'nota']

8.0

In [74]:
# Positional lookup of the same cell: first row, first column.
results.iloc[0, 0]

8.0

## Keeping the group keys as a regular column (not the index)

In [78]:
# Default behaviour, for reference: the group key 'nome' ends up as the index.
df.groupby(by='nome').mean()

Unnamed: 0_level_0,nota,tempo
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Andre,8.0,49.666667
Joao,7.666667,45.0


In [76]:
# Option 1: aggregate, then move the index back into a regular 'nome' column.
df.groupby(by='nome').mean().reset_index()

Unnamed: 0,nome,nota,tempo
0,Andre,8.0,49.666667
1,Joao,7.666667,45.0


In [82]:
# Option 2: as_index=False keeps 'nome' as a column from the start.
df.groupby(by='nome', as_index=False).mean()

Unnamed: 0,nome,nota,tempo
0,Andre,8.0,49.666667
1,Joao,7.666667,45.0


In [84]:
# Any aggregation can replace mean(); here, the per-student maximum of each column.
df.groupby(by='nome', as_index=False).max()

Unnamed: 0,nome,nota,tempo
0,Andre,9,60
1,Joao,10,46


## Apply specific aggregation for each column

In [99]:
# Mapping of column name -> aggregation to apply to that column.
# Aggregations are given by name: passing the NumPy callable np.std is
# deprecated in recent pandas and will eventually call np.std directly
# (population std, ddof=0) instead of the groupby sample std (ddof=1)
# that the saved output below reflects.
f = {'nota': 'mean',
     'tempo': 'std'}

<IPython.core.display.Javascript object>

In [100]:
# Apply the per-column aggregation mapping f defined in the previous cell.
df.groupby(by='nome').agg(f)

Unnamed: 0_level_0,nota,tempo
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
Andre,8.0,13.796135
Joao,7.666667,1.0


## More than one aggregation

In [102]:
# A list of aggregations is applied to every column, producing two-level
# column labels (column, aggregation). 'std' is passed by name rather than as
# np.std: the callable form is deprecated and will stop mapping to the
# groupby sample standard deviation.
df.groupby(by='nome').agg(['mean', 'std'])

<IPython.core.display.Javascript object>

Unnamed: 0_level_0,nota,nota,tempo,tempo
Unnamed: 0_level_1,mean,std,mean,std
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Andre,8.0,1.0,49.666667,13.796135
Joao,7.666667,2.516611,45.0,1.0


In [105]:
# Columns of the multi-aggregation result are addressed with
# (column, aggregation) tuples; here only the two 'mean' columns are kept.
# 'std' is passed by name instead of the deprecated np.std callable.
df.groupby(by='nome').agg(['mean', 'std']).loc[:, [('nota', 'mean'), ('tempo', 'mean')]]

<IPython.core.display.Javascript object>

Unnamed: 0_level_0,nota,tempo
Unnamed: 0_level_1,mean,mean
nome,Unnamed: 1_level_2,Unnamed: 2_level_2
Andre,8.0,49.666667
Joao,7.666667,45.0


## Named aggregation

In [107]:
# Named aggregation: each keyword is the output column name and its value is
# a (source column, aggregation) pair, which gives flat, readable column labels.
df.groupby(by='nome').agg(nota_avg = ('nota','mean'),
                          tempo_avg = ('tempo','mean'),
                          tempo_std = ('tempo','std'))

Unnamed: 0_level_0,nota_avg,tempo_avg,tempo_std
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andre,8.0,49.666667,13.796135
Joao,7.666667,45.0,1.0


In [117]:
# Same named aggregation as above, with the group key moved back into a
# regular column. The aggregation is given as the string 'mean' rather than
# the np.mean callable: the callable form is deprecated in recent pandas.
# NOTE(review): the saved output of the original cell shows nota_avg == 3 for
# both students, which is not the mean (8.0 and about 7.67 in the previous
# cell). Re-run to confirm the refreshed output.
df.groupby(by='nome').agg(nota_avg = ('nota', 'mean'),
                          tempo_avg = ('tempo', 'mean'),
                          tempo_std = ('tempo', 'std')).reset_index()

<IPython.core.display.Javascript object>

Unnamed: 0,nome,nota_avg,tempo_avg,tempo_std
0,Andre,3,49.666667,13.796135
1,Joao,3,45.0,1.0


# In summary: Group by 
>    - Aggregating function (f)
>    - Named aggregation
>    - as_index = False

In [124]:
# Load the vehicles dataset and rename 'Make' to 'Manufacturer' in one
# expression. Avoiding inplace=True means the cell builds `data` from the
# file every time it runs, instead of mutating an existing object.
data = (
    pd.read_csv('vehicles/vehicles.csv')
      .rename(columns={'Make': 'Manufacturer'})
)

In [125]:
# Preview the first rows to check the load and the renamed 'Manufacturer' column.
data.head()

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [132]:
# As before, groupby on its own only returns the lazy DataFrameGroupBy object.
data.groupby(by='Cylinders')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002732F7DECF8>

In [151]:
# Mean of every numeric column per cylinder count. `data` also holds string
# columns (Manufacturer, Model, Transmission, ...), so numeric_only=True is
# needed: pandas >= 2.0 raises a TypeError instead of dropping them silently.
results = data.groupby(by='Cylinders').mean(numeric_only=True)
results

Unnamed: 0_level_0,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2.0,1997.0625,1.239583,17.503468,17.0,22.9375,19.104167,471.734739,2004.166667
3.0,1996.38806,1.052239,9.160623,34.00995,40.323383,36.572139,246.695246,962.437811
4.0,1999.780643,2.06657,14.120702,21.560323,28.272417,24.075441,380.939902,1487.879798
5.0,2002.64177,2.636653,16.514187,17.785615,24.68603,20.334716,444.828844,1813.278008
6.0,2001.294242,3.439342,18.086572,16.328946,22.661261,18.606189,487.609906,1943.19624
8.0,2000.77832,5.222581,22.3254,13.323331,18.537134,15.206302,604.159066,2414.734934
10.0,2008.777778,5.911765,24.182393,11.653595,18.366013,13.941176,652.086493,2926.797386
12.0,2006.218861,5.907473,25.831975,10.893238,16.969751,13.014235,696.034399,3143.149466
16.0,2011.125,8.0,32.961,8.0,14.625,10.0,873.0625,4050.0


In [153]:
# Alternative row selections on the Cylinders index, kept for reference:
#   a label slice from 6 to 10, or an explicit list of labels.
#fuel = results.loc[6:10, ['Fuel Cost/Year']]
#fuel = results.loc[[6,8,10], ['Fuel Cost/Year']]
# Selection actually used: all rows, only the fuel-cost column. The column is
# given as a list so the result stays a DataFrame.
fuel = results.loc[:, ['Fuel Cost/Year']]
fuel

Unnamed: 0_level_0,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1
2.0,2004.166667
3.0,962.437811
4.0,1487.879798
5.0,1813.278008
6.0,1943.19624
8.0,2414.734934
10.0,2926.797386
12.0,3143.149466
16.0,4050.0


In [154]:
# Order the cylinder groups by average yearly fuel cost (ascending by default).
fuel.sort_values(by='Fuel Cost/Year')

Unnamed: 0_level_0,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1
3.0,962.437811
4.0,1487.879798
5.0,1813.278008
6.0,1943.19624
2.0,2004.166667
8.0,2414.734934
10.0,2926.797386
12.0,3143.149466
16.0,4050.0


In [130]:
# The previous three steps as one chain. The fuel-cost column is selected
# *before* aggregating, so only that column is averaged: this avoids calling
# mean() on the string columns of `data` (a TypeError on pandas >= 2.0) and
# skips computing means that are discarded straight away. The double brackets
# keep the result a one-column DataFrame, as in the original.
(
    data.groupby(by='Cylinders')[['Fuel Cost/Year']]
        .mean()
        .sort_values(by='Fuel Cost/Year')
)

Unnamed: 0_level_0,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1
3.0,962.437811
4.0,1487.879798
5.0,1813.278008
6.0,1943.19624
2.0,2004.166667
8.0,2414.734934
10.0,2926.797386
12.0,3143.149466
16.0,4050.0


In [177]:
# Named aggregation on the vehicles data: mean and standard deviation of the
# yearly fuel cost for each cylinder count.
data.groupby(by='Cylinders').agg(avg_fuel = ('Fuel Cost/Year', 'mean'),
                                 std_fuel = ('Fuel Cost/Year', 'std'))

Unnamed: 0_level_0,avg_fuel,std_fuel
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,2004.166667,293.154765
3.0,962.437811,168.209772
4.0,1487.879798,278.101583
5.0,1813.278008,250.726178
6.0,1943.19624,274.865226
8.0,2414.734934,407.337968
10.0,2926.797386,417.543968
12.0,3143.149466,501.399335
16.0,4050.0,0.0


In [181]:
# Reminder of the available columns before grouping by manufacturer.
data.head()

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [183]:
# Mean of every numeric column per manufacturer (manufacturer as the index).
# numeric_only=True is required because `data` contains string columns, and
# pandas >= 2.0 raises a TypeError rather than dropping them.
# NOTE(review): despite its name, avg_mpg holds the mean of all numeric
# columns here, not just an MPG figure; it is reassigned in later cells.
avg_mpg = data.groupby(by='Manufacturer').mean(numeric_only=True)
avg_mpg

Unnamed: 0_level_0,Year,Engine Displacement,Cylinders,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AM General,1984.500000,3.350000,5.000000,22.674670,15.000000,15.000000,14.750000,611.358244,2287.500000
ASC Incorporated,1987.000000,3.800000,6.000000,20.600625,14.000000,21.000000,16.000000,555.437500,2550.000000
Acura,2003.493377,2.834768,5.231788,15.673371,18.890728,25.940397,21.506623,422.585325,1852.483444
Alfa Romeo,1991.878049,2.556098,5.317073,17.208234,17.097561,23.902439,19.512195,463.952115,1962.195122
American Motors Corporation,1984.590909,3.813636,5.545455,18.758092,16.045455,20.181818,17.681818,505.758823,1893.181818
...,...,...,...,...,...,...,...,...,...
Volkswagen,2002.928367,2.236008,4.595033,14.594784,21.226361,28.985673,24.093601,392.741721,1579.417383
Volvo,2002.182706,2.504742,4.945607,16.186996,17.981869,25.064156,20.605300,435.803755,1812.273361
Wallace Environmental,1991.500000,4.315625,7.812500,24.404196,12.437500,16.000000,13.875000,657.990029,2996.875000
Yugo,1988.375000,1.200000,4.000000,13.206218,23.000000,28.250000,25.000000,356.068256,1350.000000


In [179]:
# Same per-manufacturer means, with 'Manufacturer' kept as a regular column.
# numeric_only=True excludes the string columns of `data`, which would
# otherwise make mean() raise a TypeError on pandas >= 2.0.
avg_mpg = data.groupby(by='Manufacturer', as_index=False).mean(numeric_only=True)
avg_mpg.head()

Unnamed: 0,Manufacturer,Year,Engine Displacement,Cylinders,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,1984.5,3.35,5.0,22.67467,15.0,15.0,14.75,611.358244,2287.5
1,ASC Incorporated,1987.0,3.8,6.0,20.600625,14.0,21.0,16.0,555.4375,2550.0
2,Acura,2003.493377,2.834768,5.231788,15.673371,18.890728,25.940397,21.506623,422.585325,1852.483444
3,Alfa Romeo,1991.878049,2.556098,5.317073,17.208234,17.097561,23.902439,19.512195,463.952115,1962.195122
4,American Motors Corporation,1984.590909,3.813636,5.545455,18.758092,16.045455,20.181818,17.681818,505.758823,1893.181818


In [203]:
# One row per manufacturer with a single column, avg_mpg, holding the mean of
# 'Highway MPG' (highway only, not city or combined). This is the version of
# avg_mpg that the merge example further down relies on.
avg_mpg = data.groupby(by='Manufacturer').agg(avg_mpg=('Highway MPG', 'mean')).reset_index()
avg_mpg

Unnamed: 0,Manufacturer,avg_mpg
0,AM General,15.000000
1,ASC Incorporated,21.000000
2,Acura,25.940397
3,Alfa Romeo,23.902439
4,American Motors Corporation,20.181818
...,...,...
122,Volkswagen,28.985673
123,Volvo,25.064156
124,Wallace Environmental,16.000000
125,Yugo,28.250000


## Aggregate by more than one key

In [184]:
# Two sample rows, to recall the column names used as grouping keys below.
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [186]:
# Grouping by a list of keys yields one group per (Manufacturer, Cylinders)
# combination, and the result carries a two-level index.
# numeric_only=True skips the remaining string columns, which would otherwise
# raise a TypeError on pandas >= 2.0.
data.groupby(by=['Manufacturer', 'Cylinders']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Manufacturer,Cylinders,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AM General,4.0,1984.500000,2.500000,19.994724,17.000000,17.000000,16.500000,539.101103,2025.000000
AM General,6.0,1984.500000,4.200000,25.354615,13.000000,13.000000,13.000000,683.615385,2550.000000
ASC Incorporated,6.0,1987.000000,3.800000,20.600625,14.000000,21.000000,16.000000,555.437500,2550.000000
Acura,4.0,2003.018018,2.060360,13.730928,21.603604,28.747748,24.270270,370.291343,1563.063063
Acura,5.0,1994.400000,2.500000,16.480500,17.900000,23.500000,20.000000,444.350000,2000.000000
...,...,...,...,...,...,...,...,...,...
Wallace Environmental,6.0,1991.444444,2.900000,22.584389,14.555556,15.666667,14.666667,608.924074,2777.777778
Wallace Environmental,8.0,1991.416667,5.600000,25.354615,11.000000,15.000000,13.000000,683.615385,3100.000000
Wallace Environmental,12.0,1991.714286,5.085714,29.146832,9.571429,14.714286,11.428571,785.861755,3585.714286
Yugo,4.0,1988.375000,1.200000,13.206218,23.000000,28.250000,25.000000,356.068256,1350.000000


In [187]:
# Same two-key aggregation with both keys kept as regular columns.
# numeric_only=True skips the string columns (TypeError on pandas >= 2.0 otherwise).
data.groupby(by=['Manufacturer', 'Cylinders'], as_index=False).mean(numeric_only=True)

Unnamed: 0,Manufacturer,Cylinders,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,4.0,1984.500000,2.500000,19.994724,17.000000,17.000000,16.500000,539.101103,2025.000000
1,AM General,6.0,1984.500000,4.200000,25.354615,13.000000,13.000000,13.000000,683.615385,2550.000000
2,ASC Incorporated,6.0,1987.000000,3.800000,20.600625,14.000000,21.000000,16.000000,555.437500,2550.000000
3,Acura,4.0,2003.018018,2.060360,13.730928,21.603604,28.747748,24.270270,370.291343,1563.063063
4,Acura,5.0,1994.400000,2.500000,16.480500,17.900000,23.500000,20.000000,444.350000,2000.000000
...,...,...,...,...,...,...,...,...,...,...
252,Wallace Environmental,6.0,1991.444444,2.900000,22.584389,14.555556,15.666667,14.666667,608.924074,2777.777778
253,Wallace Environmental,8.0,1991.416667,5.600000,25.354615,11.000000,15.000000,13.000000,683.615385,3100.000000
254,Wallace Environmental,12.0,1991.714286,5.085714,29.146832,9.571429,14.714286,11.428571,785.861755,3585.714286
255,Yugo,4.0,1988.375000,1.200000,13.206218,23.000000,28.250000,25.000000,356.068256,1350.000000


In [188]:
# Two-key means with the keys moved back into columns via reset_index().
# numeric_only=True skips the string columns of `data`, which would otherwise
# make mean() raise a TypeError on pandas >= 2.0.
avg_mpg_two_keys = (
    data.groupby(by=['Manufacturer', 'Cylinders'])
        .mean(numeric_only=True)
        .reset_index()
)
avg_mpg_two_keys

Unnamed: 0,Manufacturer,Cylinders,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,4.0,1984.500000,2.500000,19.994724,17.000000,17.000000,16.500000,539.101103,2025.000000
1,AM General,6.0,1984.500000,4.200000,25.354615,13.000000,13.000000,13.000000,683.615385,2550.000000
2,ASC Incorporated,6.0,1987.000000,3.800000,20.600625,14.000000,21.000000,16.000000,555.437500,2550.000000
3,Acura,4.0,2003.018018,2.060360,13.730928,21.603604,28.747748,24.270270,370.291343,1563.063063
4,Acura,5.0,1994.400000,2.500000,16.480500,17.900000,23.500000,20.000000,444.350000,2000.000000
...,...,...,...,...,...,...,...,...,...,...
252,Wallace Environmental,6.0,1991.444444,2.900000,22.584389,14.555556,15.666667,14.666667,608.924074,2777.777778
253,Wallace Environmental,8.0,1991.416667,5.600000,25.354615,11.000000,15.000000,13.000000,683.615385,3100.000000
254,Wallace Environmental,12.0,1991.714286,5.085714,29.146832,9.571429,14.714286,11.428571,785.861755,3585.714286
255,Yugo,4.0,1988.375000,1.200000,13.206218,23.000000,28.250000,25.000000,356.068256,1350.000000


In [189]:
# Named aggregation over two keys: for each (Year, Cylinders) pair, the mean
# yearly fuel cost and the number of non-null fuel-cost values in the group.
avg_year_cylinders = data.groupby(by=['Year','Cylinders']).agg(avg_cost = ('Fuel Cost/Year','mean'),
                                                               count_cost = ('Fuel Cost/Year','count')).reset_index()
avg_year_cylinders

Unnamed: 0,Year,Cylinders,avg_cost,count_cost
0,1984,4.0,1609.313725,204
1,1984,6.0,2134.313725,204
2,1984,8.0,2563.291139,237
3,1985,2.0,1950.000000,5
4,1985,3.0,850.000000,6
...,...,...,...,...
249,2017,4.0,1361.168831,385
250,2017,6.0,1783.457249,269
251,2017,8.0,2152.812500,160
252,2017,10.0,2514.285714,7


# JOINs

How to merge DataFrames on one or more shared columns (the join keys)

In [221]:
# Rebuild the toy gradebook so the join examples start from a known state.
df = pd.DataFrame(dict(
    nome=['Andre', 'Andre', 'Joao', 'Joao', 'Joao', 'Andre'],
    nota=[9, 7, 8, 10, 5, 8],
    tempo=[55, 34, 44, 45, 46, 60],
))

df

Unnamed: 0,nome,nota,tempo
0,Andre,9,55
1,Andre,7,34
2,Joao,8,44
3,Joao,10,45
4,Joao,5,46
5,Andre,8,60


In [224]:
# Lookup table: one row per student with the classroom they belong to.
# It shares the key column 'nome' with df.
classes = pd.DataFrame(dict(
    nome=['Andre', 'Joao'],
    classroom=[221, 332],
))
classes

Unnamed: 0,nome,classroom
0,Andre,221
1,Joao,332


In [226]:
# Right-hand table of the merge below.
classes

Unnamed: 0,nome,classroom
0,Andre,221
1,Joao,332


In [228]:
# Inner join of the gradebook with the classroom lookup on the shared key.
# The key is named explicitly: without `on`, merge joins on *every* column the
# two frames have in common, so adding an unrelated same-named column to
# either frame would silently change the join.
pd.merge(left=df, right=classes, on='nome')

Unnamed: 0,nome,nota,tempo,classroom
0,Andre,9,55,221
1,Andre,7,34,221
2,Andre,8,60,221
3,Joao,8,44,332
4,Joao,10,45,332
5,Joao,5,46,332


## What if the key column has a different name in each dataset?

In [230]:
# Same lookup table, but with the key column called 'name' instead of 'nome'.
classes = pd.DataFrame(dict(
    name=['Andre', 'Joao'],
    classroom=[221, 332],
))
classes

Unnamed: 0,name,classroom
0,Andre,221
1,Joao,332


In [232]:
# When the key has a different name on each side, name both: left_on for df,
# right_on for classes. Both key columns ('nome' and 'name') appear in the result.
pd.merge(left=df, right=classes, left_on='nome', right_on='name')

Unnamed: 0,nome,nota,tempo,name,classroom
0,Andre,9,55,Andre,221
1,Andre,7,34,Andre,221
2,Andre,8,60,Andre,221
3,Joao,8,44,Joao,332
4,Joao,10,45,Joao,332
5,Joao,5,46,Joao,332


In [233]:
# Lookup table with a name ('Joaquim') that has no matching row in df,
# used to show what a left join does with unmatched keys.
classes = pd.DataFrame(dict(
    name=['Andre', 'Joaquim'],
    classroom=[221, 332],
))
classes

Unnamed: 0,name,classroom
0,Andre,221
1,Joaquim,332


In [235]:
# how='left' keeps every row of df; rows with no match in classes (the 'Joao'
# rows here) get missing values in the columns coming from the right table.
pd.merge(left=df, right=classes, how='left', left_on='nome', right_on='name')

Unnamed: 0,nome,nota,tempo,name,classroom
0,Andre,9,55,Andre,221.0
1,Andre,7,34,Andre,221.0
2,Joao,8,44,,
3,Joao,10,45,,
4,Joao,5,46,,
5,Andre,8,60,Andre,221.0


# In our vehicles dataset

In [237]:
# Left-hand table for the vehicles merge: one row per vehicle.
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [239]:
# Right-hand table for the vehicles merge: one row per manufacturer.
# Relies on avg_mpg as last assigned (Manufacturer + avg_mpg columns).
avg_mpg

Unnamed: 0,Manufacturer,avg_mpg
0,AM General,15.000000
1,ASC Incorporated,21.000000
2,Acura,25.940397
3,Alfa Romeo,23.902439
4,American Motors Corporation,20.181818
...,...,...
122,Volkswagen,28.985673
123,Volvo,25.064156
124,Wallace Environmental,16.000000
125,Yugo,28.250000


In [240]:
# Attach each manufacturer's average MPG to every vehicle row.
# The join key is named explicitly: without `on`, merge uses every column the
# two frames share, and avg_mpg has held frames with many columns in common
# with `data` in earlier cells, which would join on all of them.
pd.merge(left=data,
         right=avg_mpg,
         on='Manufacturer')

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,avg_mpg
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,15.0
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,15.0
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,15.0
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,15.0
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,39.3
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100,39.3
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,39.3
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100,39.3


# How to concatenate dataframes?

In [241]:
# Build a DataFrame with only the rows whose Manufacturer is 'Lexus'
# (boolean mask on rows, all columns). The original row labels are kept.
lexus = data.loc[ data['Manufacturer']=='Lexus', :]
lexus.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
21128,Lexus,CT 200h,2011,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.595238,800
21129,Lexus,CT 200h,2012,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.595238,800


In [242]:
# Build a DataFrame with only the rows whose Manufacturer is 'Audi',
# using the same boolean-mask selection as for Lexus.
audi = data.loc[data['Manufacturer'] == 'Audi', :]

In [245]:
# Quick look at the Audi subset.
audi.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
503,Audi,100,1989,2.3,5.0,Automatic 3-spd,Front-Wheel Drive,Midsize Cars,Regular,18.311667,17,20,18,493.722222,1850
504,Audi,100,1989,2.3,5.0,Manual 5-spd,Front-Wheel Drive,Midsize Cars,Regular,17.347895,16,23,19,467.736842,1750


In [246]:
# (rows, columns) of the Lexus subset, to compare with the concatenated result.
lexus.shape

(397, 15)

In [247]:
# (rows, columns) of the Audi subset.
audi.shape

(890, 15)

In [248]:
# Stack the two subsets vertically: Lexus rows first, then Audi rows.
# Each row keeps its original index label, so the index is not 0..n-1.
pd.concat([lexus, audi])

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
21128,Lexus,CT 200h,2011,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.595238,800
21129,Lexus,CT 200h,2012,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.595238,800
21130,Lexus,CT 200h,2013,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.000000,800
21131,Lexus,CT 200h,2014,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,179.000000,800
21132,Lexus,CT 200h,2015,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.000000,800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1388,Audi,allroad quattro,2013,2.0,4.0,Automatic (S8),All-Wheel Drive,Small Station Wagons,Premium,14.330870,20,27,23,394.000000,1750
1389,Audi,allroad quattro,2013,2.0,4.0,Automatic (S8),All-Wheel Drive,Small Station Wagons,Premium or E85,14.330870,20,27,23,394.000000,1750
1390,Audi,allroad quattro,2014,2.0,4.0,Automatic (S8),All-Wheel Drive,Small Station Wagons,Premium or E85,14.330870,20,27,23,394.000000,1750
1391,Audi,allroad quattro,2015,2.0,4.0,Automatic (S8),All-Wheel Drive,Small Station Wagons,Premium or E85,13.733750,21,28,24,373.000000,1700


In [249]:
# Concatenate both subsets row-wise and keep the result for later use.
lexus_audi = pd.concat([lexus, audi])

In [250]:
# First rows of the combined frame (the Lexus rows, since they were listed first).
lexus_audi.head()

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
21128,Lexus,CT 200h,2011,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.595238,800
21129,Lexus,CT 200h,2012,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.595238,800
21130,Lexus,CT 200h,2013,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.0,800
21131,Lexus,CT 200h,2014,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,179.0,800
21132,Lexus,CT 200h,2015,1.8,4.0,Automatic (variable gear ratios),Front-Wheel Drive,Compact Cars,Regular,7.847857,43,40,42,211.0,800
