# 数据分组和聚合

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('starbucks_store_worldwide.csv')
df.head()

Unnamed: 0,Brand,Store Number,Store Name,Ownership Type,Street Address,City,State/Province,Country,Postcode,Phone Number,Timezone,Longitude,Latitude
0,Starbucks,47370-257954,"Meritxell, 96",Licensed,"Av. Meritxell, 96",Andorra la Vella,7,AD,AD500,376818720.0,GMT+1:00 Europe/Andorra,1.53,42.51
1,Starbucks,22331-212325,Ajman Drive Thru,Licensed,"1 Street 69, Al Jarf",Ajman,AJ,AE,,,GMT+04:00 Asia/Dubai,55.47,25.42
2,Starbucks,47089-256771,Dana Mall,Licensed,Sheikh Khalifa Bin Zayed St.,Ajman,AJ,AE,,,GMT+04:00 Asia/Dubai,55.47,25.39
3,Starbucks,22126-218024,Twofour 54,Licensed,Al Salam Street,Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.38,24.48
4,Starbucks,17127-178586,Al Ain Tower,Licensed,"Khaldiya Area, Abu Dhabi Island",Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.54,24.51


In [3]:
df1 = pd.DataFrame({'fruit':['apple','banana','orange','apple','banana'],
                    'color':['red','yellow','yellow','cyan','cyan'],
                    'price':[8.5,6.8,5.6,7.8,6.4]})
df1

Unnamed: 0,color,fruit,price
0,red,apple,8.5
1,yellow,banana,6.8
2,yellow,orange,5.6
3,cyan,apple,7.8
4,cyan,banana,6.4


## 分组
- pd.groupby()

In [4]:
# pd.groupby() 分组
df1.groupby(by='fruit')

<pandas.core.groupby.DataFrameGroupBy object at 0x7f5ffd119a90>

In [5]:
# 遍历
for name,group in df1.groupby(by='fruit'):
    print(name) # 输出组名
    print("---" * 10)
    print(group) # 输出数据块 是DataFrame类型

apple
------------------------------
  color  fruit  price
0   red  apple    8.5
3  cyan  apple    7.8
banana
------------------------------
    color   fruit  price
1  yellow  banana    6.8
4    cyan  banana    6.4
orange
------------------------------
    color   fruit  price
2  yellow  orange    5.6


In [6]:
# 选取任意数据块  dict(list(分组对象))['想要选取的组名']
dict(list(df1.groupby(by='fruit')))['apple']

Unnamed: 0,color,fruit,price
0,red,apple,8.5
3,cyan,apple,7.8


## 聚合
- 常见数学函数
    - 分组.函数名
- 自定义函数 func()
    - 分组.agg(func)

In [7]:
# 根据水果类型来求价格平均值
df1.groupby(by='fruit')['price'].mean()

fruit
apple     8.15
banana    6.60
orange    5.60
Name: price, dtype: float64

In [8]:
# 语法糖
df1['price'].groupby(df1['fruit']).mean()

fruit
apple     8.15
banana    6.60
orange    5.60
Name: price, dtype: float64

In [9]:
# as_index=False 添加默认的行索引，不写或=True都是以组名作为行索引
df1.groupby(by='fruit', as_index=False)['price'].mean()

Unnamed: 0,fruit,price
0,apple,8.15
1,banana,6.6
2,orange,5.6


In [10]:
# 计算每类水果的差值
# 自定义聚合函数
def cha(arr):
    return (arr.max() - arr.min())

# agg aggregate聚合
df1.groupby(by='fruit')['price'].agg(cha)

fruit
apple     0.7
banana    0.4
orange    0.0
Name: price, dtype: float64

In [11]:
# 根据多个条件进行操作
# 根据水果的种类和颜色进行分组并求价格平均值
df1.groupby(by=['fruit', 'color'])['price'].mean()# 是一个Series对象

fruit   color 
apple   cyan      7.8
        red       8.5
banana  cyan      6.4
        yellow    6.8
orange  yellow    5.6
Name: price, dtype: float64

In [12]:
# 注意与上面的区别
df1.groupby(by=['fruit', 'color'])[['price']].mean()# 是一个DataFrame对象

Unnamed: 0_level_0,Unnamed: 1_level_0,price
fruit,color,Unnamed: 2_level_1
apple,cyan,7.8
apple,red,8.5
banana,cyan,6.4
banana,yellow,6.8
orange,yellow,5.6


In [13]:
df1[['price']].groupby(by=[df1['fruit'], df1['color']]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
fruit,color,Unnamed: 2_level_1
apple,cyan,7.8
apple,red,8.5
banana,cyan,6.4
banana,yellow,6.8
orange,yellow,5.6


# 补充（了解）

In [14]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Sam', 'Make', 'Tony', 'Jim', 'Nana'])
people

Unnamed: 0,a,b,c,d,e
Sam,0.955938,-0.900751,-1.284187,-0.455872,1.113778
Make,-0.63095,-0.859545,0.67049,-2.694156,0.895334
Tony,-0.447696,0.075579,0.737844,0.396468,-1.49968
Jim,-0.625034,0.753504,1.482354,1.637331,0.580098
Nana,-0.822734,0.565339,-0.842149,0.309266,-0.284803


## 通过字典或者series可以进行分组

In [2]:
# 字典分组
m = {'a': 'red', 'b': 'red', 'c': 'blue','d': 'blue', 'e': 'red', 'f' : 'orange'}

In [16]:
# 注意m的对应关系是有就用没有也不报错！！！
people.groupby(m, axis=1).sum()

Unnamed: 0,blue,red
Sam,-1.740059,1.168966
Make,-2.023666,-0.595161
Tony,1.134312,-1.871797
Jim,3.119684,0.708568
Nana,-0.532883,-0.542198


In [17]:
# Series分组
s1 = pd.Series(m)
s1

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [18]:
people.groupby(s1, axis=1).count()

Unnamed: 0,blue,red
Sam,2,3
Make,2,3
Tony,2,3
Jim,2,3
Nana,2,3


## 通过函数进行分组

In [19]:
people

Unnamed: 0,a,b,c,d,e
Sam,0.955938,-0.900751,-1.284187,-0.455872,1.113778
Make,-0.63095,-0.859545,0.67049,-2.694156,0.895334
Tony,-0.447696,0.075579,0.737844,0.396468,-1.49968
Jim,-0.625034,0.753504,1.482354,1.637331,0.580098
Nana,-0.822734,0.565339,-0.842149,0.309266,-0.284803


In [20]:
# 以行索引的字符串长度进行分组
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.330905,-0.147247,0.198167,1.181458,1.693876
4,-1.901379,-0.218627,0.566185,-1.988422,-0.889149


# 练习！！！！！！
- 分别读取四个给定csv文件，并合并为一个csv文件
- 去掉重复数据，重新设置连续索引
- 汽车分自动挡和手动挡，那么分别有多少量？
- 那个城市二手车数量最多？
- 找出深圳市最多的二手车品牌 

In [21]:
import numpy as np
import pandas as pd

In [22]:
# 1.
file1 = pd.read_csv('guazi_bj.csv')
file2 = pd.read_csv('guazi_sz.csv')
file3 = pd.read_csv('guazi_gz.csv')
file4 = pd.read_csv('guazi_sh.csv')
file = pd.concat([file1, file2, file3, file4])
file

Unnamed: 0,title,brand,buy_time,km,speedbox,displacement,es_price,new_price,city,year_type,level,suv,horsepower,fuel,length,width,height,owners,drive
0,大众 宝来 2014款 1.6L 自动时尚型,一汽-大众,2014-08,3.82,自动,1.6L,8.00,13.00,北京,2014款,紧凑型,0,105马力,汽油,4523,1775,1467,,前置前驱
1,福特 福睿斯 2015款 1.5L 自动时尚型,长安福特,2015-12,2.35,自动,1.5L,7.80,13.00,北京,2015款,紧凑型,0,113马力,汽油,4587,1825,1490,,前置前驱
2,大众 速腾 2012款 1.6L 自动舒适型,一汽-大众,2012-05,6.67,自动,1.6L,7.00,16.40,北京,2012款,紧凑型,0,105马力,汽油,4644,1778,1482,,前置前驱
3,奔驰C级 2011款 C 200 CGI 时尚型,北京奔驰,2013-01,11.83,自动,1.8T,15.00,42.10,北京,2011款,中型,0,184马力,汽油,4591,1770,1444,,前置后驱
4,大众 帕萨特 2013款 2.0TSI DSG御尊版,上汽大众,2013-11,8.95,自动,2.0T,13.00,27.70,北京,2013款,中型,0,200马力,汽油,4870,1834,1472,,前置前驱
5,哈弗H6 2011款 2.0L 手动两驱精英型,长城汽车,2012-11,5.17,手动,2.0L,4.50,11.50,北京,2011款,紧凑型,1,133马力,汽油,4640,1825,1690,,前置前驱
6,起亚K3 2013款 1.6L 自动GLS,东风悦达起亚,2014-07,6.58,自动,1.6L,7.00,13.50,北京,2013款,紧凑型,0,128马力,汽油,4600,1780,1445,,前置前驱
7,奔驰C级 2011款 C 200 CGI 时尚型,北京奔驰,2012-08,7.33,自动,1.8T,16.00,42.10,北京,2011款,中型,0,184马力,汽油,4591,1770,1444,,前置后驱
8,奔驰E级 2015款 改款 E 260 L,北京奔驰,2016-01,10.36,自动,2.0T,32.00,53.00,北京,2015款,中大型,0,211马力,汽油,5024,1854,1477,,前置后驱
9,本田 缤智 2016款 1.8L CVT两驱先锋型,广汽本田,2016-08,0.87,自动,1.8L,12.50,16.70,北京,2016款,小型,1,136马力,汽油,4294,1772,1605,,前置前驱


In [23]:
# 2. 
# file.duplicated().sum()
file.drop_duplicates(inplace=True)
file.reset_index(inplace=True)
file

Unnamed: 0,index,title,brand,buy_time,km,speedbox,displacement,es_price,new_price,city,year_type,level,suv,horsepower,fuel,length,width,height,owners,drive
0,0,大众 宝来 2014款 1.6L 自动时尚型,一汽-大众,2014-08,3.82,自动,1.6L,8.00,13.00,北京,2014款,紧凑型,0,105马力,汽油,4523,1775,1467,,前置前驱
1,1,福特 福睿斯 2015款 1.5L 自动时尚型,长安福特,2015-12,2.35,自动,1.5L,7.80,13.00,北京,2015款,紧凑型,0,113马力,汽油,4587,1825,1490,,前置前驱
2,2,大众 速腾 2012款 1.6L 自动舒适型,一汽-大众,2012-05,6.67,自动,1.6L,7.00,16.40,北京,2012款,紧凑型,0,105马力,汽油,4644,1778,1482,,前置前驱
3,3,奔驰C级 2011款 C 200 CGI 时尚型,北京奔驰,2013-01,11.83,自动,1.8T,15.00,42.10,北京,2011款,中型,0,184马力,汽油,4591,1770,1444,,前置后驱
4,4,大众 帕萨特 2013款 2.0TSI DSG御尊版,上汽大众,2013-11,8.95,自动,2.0T,13.00,27.70,北京,2013款,中型,0,200马力,汽油,4870,1834,1472,,前置前驱
5,5,哈弗H6 2011款 2.0L 手动两驱精英型,长城汽车,2012-11,5.17,手动,2.0L,4.50,11.50,北京,2011款,紧凑型,1,133马力,汽油,4640,1825,1690,,前置前驱
6,6,起亚K3 2013款 1.6L 自动GLS,东风悦达起亚,2014-07,6.58,自动,1.6L,7.00,13.50,北京,2013款,紧凑型,0,128马力,汽油,4600,1780,1445,,前置前驱
7,7,奔驰C级 2011款 C 200 CGI 时尚型,北京奔驰,2012-08,7.33,自动,1.8T,16.00,42.10,北京,2011款,中型,0,184马力,汽油,4591,1770,1444,,前置后驱
8,8,奔驰E级 2015款 改款 E 260 L,北京奔驰,2016-01,10.36,自动,2.0T,32.00,53.00,北京,2015款,中大型,0,211马力,汽油,5024,1854,1477,,前置后驱
9,9,本田 缤智 2016款 1.8L CVT两驱先锋型,广汽本田,2016-08,0.87,自动,1.8L,12.50,16.70,北京,2016款,小型,1,136马力,汽油,4294,1772,1605,,前置前驱


In [40]:
# 3.1
file.groupby('speedbox').count()

Unnamed: 0_level_0,index,title,brand,buy_time,km,displacement,es_price,new_price,city,year_type,level,suv,horsepower,fuel,length,width,height,owners,drive
speedbox,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
手动,1393,1393,1393,1387,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,0,1393
自动,5623,5623,5623,5580,5623,5623,5623,5623,5623,5623,5623,5623,5623,5623,5623,5623,5623,0,5623


In [41]:
# 3.2
print('自动挡：' + str(len(file[file['speedbox'] == '自动'])))
print('手动挡：' + str(len(file[file['speedbox'] == '手动'])))

自动挡：5623
手动挡：1393


In [35]:
# 4.
file.groupby("city").count().sort_values('title')

Unnamed: 0_level_0,index,title,brand,buy_time,km,speedbox,displacement,es_price,new_price,year_type,level,suv,horsepower,fuel,length,width,height,owners,drive
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
深圳,1497,1497,1497,1483,1497,1497,1497,1497,1497,1497,1497,1497,1497,1497,1497,1497,1497,0,1497
北京,1539,1539,1539,1531,1539,1539,1539,1539,1539,1539,1539,1539,1539,1539,1539,1539,1539,0,1539
上海,1973,1973,1973,1964,1973,1973,1973,1973,1973,1973,1973,1973,1973,1973,1973,1973,1973,0,1973
广州,2007,2007,2007,1989,2007,2007,2007,2007,2007,2007,2007,2007,2007,2007,2007,2007,2007,0,2007


In [38]:
# 5.
file[file['city'] == '深圳'].groupby('brand').count().sort_values('title', ascending=False)

Unnamed: 0_level_0,index,title,buy_time,km,speedbox,displacement,es_price,new_price,city,year_type,level,suv,horsepower,fuel,length,width,height,owners,drive
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
东风日产,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119,0,119
长安福特,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119,0,119
一汽丰田,84,84,82,84,84,84,84,84,84,84,84,84,84,84,84,84,84,0,84
北京现代,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82,0,82
广汽本田,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,0,81
上汽通用别克,81,81,79,81,81,81,81,81,81,81,81,81,81,81,81,81,81,0,81
上汽大众,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,0,78
一汽-大众,68,68,66,68,68,68,68,68,68,68,68,68,68,68,68,68,68,0,68
上汽通用雪佛兰,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,0,58
广汽丰田,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,0,50
