# Aggregating

# 行迭代

In [None]:
import pandas as pd
# Display settings for this section: narrow output, at most 7 columns and
# 200 rows, floats shown with thousands separators and no decimals.
pd.options.display.width = 60
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the daily COVID case data (casedate parsed as a datetime) and the
# Brazil land temperature data.
coviddaily = pd.read_csv(
  "data/coviddaily.csv",
  parse_dates=["casedate"],
)
ltbrazil = pd.read_csv("data/ltbrazil.csv")

# Sort the COVID data by location and then case date, ascending, so that all
# rows for one location are contiguous.
coviddaily = coviddaily.sort_values(by=['location','casedate'])

# Iterate over the rows with itertuples, appending one summary dict to the
# list each time the location changes. This relies on the rows being sorted
# by location so that each location forms one contiguous run.
rowlist = []
prevloc, casecnt = 'ZZZ', 0  # 'ZZZ' is a sentinel: no location seen yet
for row in coviddaily.itertuples():
  if row.location != prevloc:
    # A new location begins: store the finished one (skipped on the very
    # first row, when there is nothing to store) and restart the counter.
    if prevloc != 'ZZZ':
      rowlist.append(dict(location=prevloc, casecnt=casecnt))
    prevloc, casecnt = row.location, 0
  casecnt = casecnt + row.new_cases

# The loop only stores a location when the next one starts, so the last
# location has to be appended after the loop.
rowlist.append(dict(location=prevloc, casecnt=casecnt))
len(rowlist)
rowlist[:4]

# Create a DataFrame from the list of per-location dicts.
covidtotals = pd.DataFrame(rowlist)
covidtotals.head()

# Sort the land temperature data by station and month, and drop the rows
# that have no temperature value.
ltbrazil = (
  ltbrazil
  .sort_values(by=['station','month'])
  .dropna(subset=['temperature'])
)

# Iterate over the rows with itertuples, appending one summary dict to the
# list each time the station changes.
rowlist = []
prevstation, prevtemp = 'ZZZ', 0  # 'ZZZ' is a sentinel: no station seen yet
tempcnt, stationcnt = 0, 0        # running temperature sum / rows counted
for row in ltbrazil.itertuples():
  if row.station != prevstation:
    # A new station begins: store the finished one (skipped on the very
    # first row) and restart the accumulators.
    if prevstation != 'ZZZ':
      rowlist.append({'station':prevstation, 'avgtemp':tempcnt/stationcnt, 'stationcnt':stationcnt})
    prevstation = row.station
    tempcnt, stationcnt = 0, 0

  # Only count rows whose temperature is within 3 of the previous row's
  # temperature; the first row of each station is always counted.
  if abs(row.temperature - prevtemp) <= 3 or stationcnt == 0:
    tempcnt = tempcnt + row.temperature
    stationcnt = stationcnt + 1

  # The comparison value is always the immediately preceding row, whether or
  # not that row was counted.
  prevtemp = row.temperature

# Append the last station, which the loop never stores.
rowlist.append({'station':prevstation, 'avgtemp':tempcnt/stationcnt, 'stationcnt':stationcnt})
rowlist[:5]
ltbrazilavgs = pd.DataFrame(rowlist)
ltbrazilavgs.head()

# numpy迭代

In [None]:
import pandas as pd
# Display settings for this section: output width 68, at most 7 columns and
# 200 rows, floats shown with thousands separators and no decimals.
pd.options.display.width = 68
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the daily COVID case data (casedate parsed as a datetime) and the
# Brazil land temperature data.
coviddaily = pd.read_csv(
  "data/coviddaily.csv",
  parse_dates=["casedate"],
)
ltbrazil = pd.read_csv("data/ltbrazil.csv")

# Create a list of the distinct locations.
loclist = coviddaily['location'].unique().tolist()

# Use a NumPy array of (location, new_cases) pairs to calculate the sum of
# new cases for each location.
casevalues = coviddaily[['location','new_cases']].to_numpy()
rowlist = []
for locitem in loclist:
  # Scan every (location, new_cases) pair and keep the cases for this location.
  cases = [record[1] for record in casevalues if record[0]==locitem]
  rowlist.append(sum(cases))

# One total per location, so both lengths should agree.
len(rowlist)
len(loclist)
rowlist[:5]

# Pair each location with its total and build the DataFrame.
casetotals = pd.DataFrame(
  list(zip(loclist, rowlist)),
  columns=['location','casetotals'],
)
casetotals.head()

# Sort the land temperature data by station and month, and drop the rows
# that have no temperature value.
ltbrazil = (
  ltbrazil
  .sort_values(by=['station','month'])
  .dropna(subset=['temperature'])
)

# Iterate over a NumPy array of (station, temperature) pairs, appending one
# summary dict to the list each time the station changes.
tempvalues = ltbrazil[['station','temperature']].to_numpy()
rowlist = []
prevstation, prevtemp = 'ZZZ', 0  # 'ZZZ' is a sentinel: no station seen yet
tempcnt, stationcnt = 0, 0        # running temperature sum / rows counted
for station, temperature in tempvalues:
  if station != prevstation:
    # A new station begins: store the finished one (skipped on the very
    # first row) and restart the accumulators.
    if prevstation != 'ZZZ':
      rowlist.append({'station':prevstation, 'avgtemp':tempcnt/stationcnt, 'stationcnt':stationcnt})
    prevstation = station
    tempcnt, stationcnt = 0, 0

  # Only count rows whose temperature is within 3 of the previous row's
  # temperature; the first row of each station is always counted.
  if abs(temperature - prevtemp) <= 3 or stationcnt == 0:
    tempcnt = tempcnt + temperature
    stationcnt = stationcnt + 1

  # The comparison value is always the immediately preceding row, whether or
  # not that row was counted.
  prevtemp = temperature

# Append the last station, which the loop never stores.
rowlist.append({'station':prevstation, 'avgtemp':tempcnt/stationcnt, 'stationcnt':stationcnt})
rowlist[:5]

# Create a DataFrame of the land temperature averages.
ltbrazilavgs = pd.DataFrame(rowlist)
ltbrazilavgs.head()

# 分组基础

In [None]:
import pandas as pd
# Display settings for this section: output width 68, at most 7 columns and
# 50 rows, floats shown with thousands separators and no decimals.
pd.options.display.width = 68
pd.options.display.max_columns = 7
pd.options.display.max_rows = 50
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the daily COVID case data, parsing casedate as a datetime.
coviddaily = pd.read_csv(
  "data/coviddaily.csv",
  parse_dates=["casedate"],
)

# Create a pandas groupby object keyed on location. The key is passed as a
# one-element list, so group names are 1-tuples when iterating.
countrytots = coviddaily.groupby(by=['location'])
type(countrytots)

# Create DataFrames holding the first and the last entry for each country,
# showing the first five countries and first five columns.
countrytots.first().iloc[:5, :5]
countrytots.last().iloc[:5, :5]
type(countrytots.last())

# Get all of the rows for one country. countrytots is grouped by the
# one-element list ['location'], so the group name is a 1-tuple. The trailing
# comma matters: ('Zimbabwe') without it is just the string 'Zimbabwe', and
# newer pandas releases deprecate passing a bare scalar to get_group here.
# NOTE(review): the 1-tuple form is expected to need pandas >= 2.2 - confirm
# the target pandas version.
countrytots.get_group(('Zimbabwe',)).iloc[0:5, 0:5]

# Loop through the groups. Each name is a 1-tuple, so the location string is
# name[0]; print the first rows and columns for two selected countries.
for name, group in countrytots:
  if name[0] in ('Malta', 'Kuwait'):
    print(group.iloc[:5, :5])

# Show the number of rows for each country.
countrytots.size()

# Show summary statistics of new cases by country: describe() for the first
# three countries (transposed), then the total for the first five.
countrytots['new_cases'].describe().head(3).T
countrytots['new_cases'].sum().head()

# 复杂分组

In [None]:
import pandas as pd
# Display settings for this section: output width 53, at most 9 columns and
# 30 rows, floats shown with thousands separators and no decimals.
pd.options.display.width = 53
pd.options.display.max_columns = 9
pd.options.display.max_rows = 30
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the NLS97 data and index it by personid.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

# Review the structure of the first seven columns of the nls97 data.
nls97.iloc[:, :7].info()

# Take a closer look at some of the categorical data: print the frequency of
# each category, ordered by category label.
catvars = ['gender','maritalstatus','highestdegree']

for col in catvars:
  print(
    nls97[col].value_counts().sort_index(),
    sep="\n\n",
    end="\n\n\n",
  )

# Review some descriptive statistics for the continuous variables.
contvars = [
  'satmath',
  'satverbal',
  'weeksworked06',
  'gpaoverall',
  'childathome',
]

nls97[contvars].describe()

# Look at mean SAT math scores by gender.
nls97.groupby(by='gender')['satmath'].mean()

# Look at mean SAT math scores by gender and highest degree earned.
(
  nls97
  .groupby(by=['gender','highestdegree'])['satmath']
  .mean()
)

# Look at mean SAT math and verbal scores by gender and highest degree earned.
(
  nls97
  .groupby(by=['gender','highestdegree'])[['satmath','satverbal']]
  .mean()
)

# Show the count, mean, max, and standard deviation of overall GPA by gender
# and highest degree earned.
(
  nls97
  .groupby(by=['gender','highestdegree'])['gpaoverall']
  .agg(['count','mean','max','std'])
)

# Use a dictionary for more complicated aggregations: it maps each column to
# the list of statistics to compute for it. Floats now show one decimal.
pd.set_option('display.float_format', '{:,.1f}'.format)
aggdict = {
  column: ['count', 'mean', 'max', 'std']
  for column in ('weeksworked06', 'childathome')
}
nls97.groupby(by=['highestdegree']).agg(aggdict)
nls97.groupby(by=['maritalstatus']).agg(aggdict)

# 分组-UDF

In [6]:
import pandas as pd
# Display settings for this section: output width 53, at most 7 columns and
# 200 rows, floats shown with thousands separators and one decimal.
pd.options.display.width = 53
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.1f}'.format)

# Load the NLS97 data and index it by personid.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

# Create a function that calculates the interquartile range.
def iqr(x):
  """Return the interquartile range of x: its 0.75 quantile minus its 0.25 quantile.

  x must provide a quantile(q) method, as a pandas Series does.
  """
  upper = x.quantile(0.75)
  lower = x.quantile(0.25)
  return upper - lower

# Run the interquartile range function alongside the built-in count and mean
# for both columns, grouped by highest degree earned.
aggdict = {
  column: ['count', 'mean', iqr]
  for column in ('weeksworked06', 'childathome')
}
nls97.groupby(by=['highestdegree']).agg(aggdict)

# Define a function that returns several summary statistics as a dict.
def gettots(x):
  """Return summary statistics of x as a dict.

  x must provide quantile, median, and count methods, as a pandas Series does.
  The returned dict has, in this order, the keys 'qr1' (0.25 quantile),
  'med' (median), 'qr3' (0.75 quantile), and 'count' (result of x.count()).
  """
  return {
    'qr1': x.quantile(0.25),
    'med': x.median(),
    'qr3': x.quantile(0.75),
    'count': x.count(),
  }

# Use apply to run gettots on weeks worked for each highest-degree group.
# Floats are shown without decimals from here on.
pd.set_option('display.float_format', '{:,.0f}'.format)
(
  nls97
  .groupby(by=['highestdegree'])['weeksworked06']
  .apply(gettots)
)

# Chain reset_index to move the index levels into columns and get the
# default integer index.
(
  nls97
  .groupby(by=['highestdegree'])['weeksworked06']
  .apply(gettots)
  .reset_index()
)

# Keep the index instead, and unstack it so that each statistic becomes a
# column with one row per highest-degree group.
nlssums = (
  nls97
  .groupby(by=['highestdegree'])['weeksworked06']
  .apply(gettots)
  .unstack()
)
nlssums
nlssums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0. None to 7. Professional
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   qr1     8 non-null      float64
 1   med     8 non-null      float64
 2   qr3     8 non-null      float64
 3   count   8 non-null      float64
dtypes: float64(4)
memory usage: 320.0+ bytes


# 分组-Dataframe

In [5]:
import pandas as pd
# Display settings for this section: output width 62, at most 6 columns and
# 50 rows, floats shown with thousands separators and no decimals.
pd.options.display.width = 62
pd.options.display.max_columns = 6
pd.options.display.max_rows = 50
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the daily COVID case data (casedate parsed as a datetime) and the
# Brazil land temperature data.
coviddaily = pd.read_csv(
  "data/coviddaily.csv",
  parse_dates=["casedate"],
)
ltbrazil = pd.read_csv("data/ltbrazil.csv")

# Show a reproducible random sample of 10 rows, indexed by location and
# case date, to view the one-row-per-country-per-day structure.
(
  coviddaily[['location','casedate','new_cases','new_deaths']]
  .set_index(['location','casedate'])
  .sample(n=10, random_state=1)
)

# Convert the COVID data from one row per country per day to one row per day
# holding the totals across all countries, restricted to case dates from
# 2023-02-01 through 2024-01-31.
coviddailytotals = (
  coviddaily
  .loc[coviddaily['casedate'].between('2023-02-01','2024-01-31')]
  .groupby(by=['casedate'], as_index=False)[['new_cases','new_deaths']]
  .sum()
)

coviddailytotals.head(10)

# Create a DataFrame with the average temperature for each station in Brazil.
# First view a couple of rows (transposed), then drop rows with no temperature.
ltbrazil.head(2).T
ltbrazil = ltbrazil.dropna(subset=['temperature'])

# Per station: keep the first latabs and elevation values and average the
# temperature; as_index=False keeps station as a regular column.
ltbrazilavgs = (
  ltbrazil
  .groupby(by=['station'], as_index=False)
  .agg(dict(latabs='first', elevation='first', temperature='mean'))
)
ltbrazilavgs.head(10)



Unnamed: 0,station,latabs,elevation,temperature
0,ALTAMIRA,3,112,28
1,ALTA_FLORESTA_AERO,10,289,32
2,ARAXA,20,1004,22
3,BACABAL,4,25,29
4,BAGE,31,242,20
5,BARRA_DO_CORDA,6,153,28
6,BARREIRAS,12,439,27
7,BARTOLOMEU_LISANDRO,22,17,26
8,BAURU,22,617,25
9,BELEM,1,10,28


# 透视-Dataframe

In [7]:
import pandas as pd
# Display settings for this section: output width 72, at most 7 columns and
# 200 rows, floats shown with thousands separators and no decimals.
pd.options.display.width = 72
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the daily COVID case data (casedate parsed as a datetime) and the
# Brazil land temperature data.
coviddaily = pd.read_csv("data/coviddaily.csv", parse_dates=["casedate"])
ltbrazil = pd.read_csv("data/ltbrazil.csv")

# Use a pivot table to total new cases and new deaths across all countries
# for each case date from 2023-02-01 through 2024-01-31.
coviddailytotals = (
  coviddaily
  .loc[coviddaily['casedate'].between('2023-02-01','2024-01-31')]
  .pivot_table(
    values=['new_cases','new_deaths'],
    index='casedate',
    aggfunc='sum',
  )
)

coviddailytotals.head(10)

# Create a DataFrame with the average temperature for each station in Brazil.
# Rows with no temperature value are dropped first.
ltbrazil = ltbrazil.dropna(subset=['temperature'])

# Pivot on station: keep the first latabs and elevation values and average
# the temperature. Station becomes the index of the result.
ltbrazilavgs = ltbrazil.pivot_table(
  index=['station'],
  aggfunc=dict(latabs='first', elevation='first', temperature='mean'),
)

ltbrazilavgs.head(10)

Unnamed: 0_level_0,elevation,latabs,temperature
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALTAMIRA,112,3,28
ALTA_FLORESTA_AERO,289,10,32
ARAXA,1004,20,22
BACABAL,25,4,29
BAGE,242,31,20
BARRA_DO_CORDA,153,6,28
BARREIRAS,439,12,27
BARTOLOMEU_LISANDRO,17,22,26
BAURU,617,22,25
BELEM,10,1,28
