In [None]:
import pandas as pd

# 删除重复项

In [None]:
# Display settings: 62-character width, at most 7 columns and 200 rows,
# floats rendered as comma-grouped whole numbers.
pd.options.display.width = 62
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
covidcases = pd.read_csv("data/covidcases.csv")

# Column-name lists for the daily, cumulative and demographic columns.
dailyvars = ['casedate', 'new_cases', 'new_deaths']
totvars = ['location', 'total_cases', 'total_deaths']
demovars = [
    'population', 'population_density', 'median_age',
    'gdp_per_capita', 'hospital_beds_per_thousand', 'region',
]
covidcases[[*dailyvars, *totvars, *demovars]].head(2).T

# Daily data frame: the location plus the per-day columns.
coviddaily = covidcases[['location', *dailyvars]]
coviddaily.shape
coviddaily.head()

# One row per location: sort by location and casedate, then keep only the
# last row of each location; its casedate is exposed as 'lastdate'.
covidcases['location'].nunique()
coviddemo = (
    covidcases[['casedate'] + totvars + demovars]
    .sort_values(['location', 'casedate'])
    .drop_duplicates(subset=['location'], keep='last')
    .rename(columns={'casedate': 'lastdate'})
)

coviddemo.shape
coviddemo.head(2).T

# Totals per location: sum the daily counts and take the last value of the
# date and demographic columns.
# The 'last' aggregations depend on the order of rows inside each group, so
# sort by location and casedate first (the same ordering the coviddemo step
# uses) instead of relying on the row order of the CSV file.
covidtotals = (
    covidcases
    .sort_values(['location', 'casedate'])
    .groupby(['location'], as_index=False)
    .agg({
        'new_cases': 'sum',
        'new_deaths': 'sum',
        'median_age': 'last',
        'gdp_per_capita': 'last',
        'region': 'last',
        'casedate': 'last',
        'population': 'last',
    })
    .rename(columns={
        'new_cases': 'total_cases',
        'new_deaths': 'total_deaths',
        'casedate': 'lastdate',
    })
)

covidtotals.head(2).T

# 多对多重塑

many to many reshape

In [None]:
# Display settings: 56-character width, at most 7 columns and 200 rows,
# floats rendered as comma-grouped whole numbers.
pd.options.display.width = 56
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
cma = pd.read_csv("data/cmacollections.csv")

# Strip surrounding whitespace and truncate the two text columns
# (15 and 30 characters respectively).
cma['category'] = cma['category'].str.strip().str.slice(0, 15)
cma['title'] = cma['title'].str.strip().str.slice(0, 30)

# Overview of the CMA collections data.
cma.shape

cma.head(4).T
cma['itemid'].nunique()

# Number of distinct item/citation pairs and item/creator pairs.
cma.drop_duplicates(subset=['itemid', 'citation'])['itemid'].count()
cma.drop_duplicates(subset=['itemid', 'creatorid'])['itemid'].count()

# Index by itemid and look at one item whose citations and creators repeat.
cma = cma.set_index(['itemid'])
cma.loc[
    124733,
    ['title', 'citation', 'creation_date', 'creator', 'birth_year'],
].head(6)

# Collections frame: the item-level columns, reduced to one row per itemid.
collectionsvars = ['title', 'category', 'creation_date']
cmacollections = (
    cma[collectionsvars]
    .reset_index()
    .drop_duplicates(subset=['itemid'])
    .set_index(['itemid'])
)
cmacollections.shape
cmacollections.head()
cmacollections.loc[124733]

# Citations frame: one row per distinct (itemid, citation) pair,
# indexed by itemid.
cmacitations = (
    cma[['citation']]
    .reset_index()
    .drop_duplicates(subset=['itemid', 'citation'])
    .set_index(['itemid'])
)
cmacitations.loc[124733]

# Creators frame: one row per distinct (itemid, creator) pair,
# indexed by itemid.
creatorsvars = ['creator', 'birth_year', 'death_year']
cmacreators = (
    cma[creatorsvars]
    .reset_index()
    .drop_duplicates(subset=['itemid', 'creator'])
    .set_index(['itemid'])
)
cmacreators.loc[124733]

# count the number of collection items with a creator born after 1950
# birth_year is text: take the first run of digits in each value and convert
# it to float (values without any digits become NaN).  The pattern is a raw
# string so that the backslash in \d is not read as a string escape.
cmacreators['birth_year'] = (
    cmacreators['birth_year']
    .str.findall(r"\d+")
    .str[0]
    .astype(float)
)

# Creators born after 1950, flagged with creatorbornafter1950 == 'Y'.
youngartists = (
    cmacreators
    .loc[cmacreators['birth_year'] > 1950, ['creator']]
    .assign(creatorbornafter1950='Y')
)
# True when no itemid appears more than once in youngartists.
youngartists.shape[0] == youngartists.index.nunique()
youngartists

# Attach the born-after-1950 flag to the collections frame.  The left join
# keeps every collection row; rows without a match get 'N' for the flag.
cmacollections = cmacollections.merge(
    youngartists,
    left_on=['itemid'],
    right_on=['itemid'],
    how='left',
    validate="many_to_many",
)
cmacollections = cmacollections.fillna({'creatorbornafter1950': 'N'})
cmacollections.shape
cmacollections['creatorbornafter1950'].value_counts()

youngartists

# 堆栈熔化

stack_melt

In [None]:
# Display settings: 200-character width, at most 30 columns and 200 rows,
# floats rendered as comma-grouped whole numbers.
pd.options.display.width = 200
pd.options.display.max_columns = 30
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)

# Index by originalid, then view the weeks-worked columns for ids 2 and 3.
nls97 = nls97.set_index(['originalid'])
weeksworkedcols = [
    'weeksworked17', 'weeksworked18', 'weeksworked19',
    'weeksworked20', 'weeksworked21',
]

nls97.loc[[2, 3], weeksworkedcols].T
nls97.shape

# Wide to long with stack.  After reset_index the former column labels are in
# 'level_1' and the values in column 0; rename both to meaningful names.
weeksworked = (
    nls97[weeksworkedcols]
    .stack()
    .reset_index()
    .rename(columns={'level_1': 'year', 0: 'weeksworked'})
)

pd.__version__

weeksworked.loc[weeksworked['originalid'].isin([2, 3])]

# Turn the two-digit suffix of the old column name into a four-digit year
# (e.g. 'weeksworked17' -> 2017).
weeksworked['year'] = weeksworked['year'].str[-2:].astype(int) + 2000
weeksworked.loc[weeksworked['originalid'].isin([2, 3])]
weeksworked.shape

# Wide to long with melt: originalid identifies the row, the old column name
# goes to 'year' and its value to 'weeksworked'.
weeksworked = (
    nls97.reset_index()
    .loc[:, ['originalid'] + weeksworkedcols]
    .melt(
        id_vars=['originalid'],
        value_vars=weeksworkedcols,
        var_name='year',
        value_name='weeksworked',
    )
)

# Two-digit column suffix -> four-digit year, then index by originalid.
weeksworked['year'] = weeksworked['year'].str[-2:].astype(int) + 2000
weeksworked = weeksworked.set_index(['originalid'])
weeksworked.loc[[2, 3]]


nls97.head(2).T

# Melt the October college-enrollment columns the same way: one row per
# originalid and year, with the value in 'colenr'.
colenrcols = [
    'colenroct17', 'colenroct18', 'colenroct19',
    'colenroct20', 'colenroct21',
]
colenr = (
    nls97.reset_index()
    .loc[:, ['originalid'] + colenrcols]
    .melt(
        id_vars=['originalid'],
        value_vars=colenrcols,
        var_name='year',
        value_name='colenr',
    )
)

# Two-digit column suffix -> four-digit year, then index by originalid.
colenr['year'] = colenr['year'].str[-2:].astype(int) + 2000
colenr = colenr.set_index(['originalid'])
colenr.loc[[2, 3]]

# Combine the weeks-worked and enrollment data on originalid and year.
workschool = weeksworked.merge(
    colenr,
    on=['originalid', 'year'],
    how="inner",
    validate="many_to_many",
)
workschool.shape

# originalid is looked up through the index here, so the result is not
# re-indexed before selecting ids 2 and 3.
workschool.loc[[2, 3]]

# 宽至长

wide to long

In [None]:
# Display settings: 200-character width, at most 30 columns and 200 rows,
# floats rendered as comma-grouped whole numbers.
pd.options.display.width = 200
pd.options.display.max_columns = 30
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
nls97 = nls97.set_index('personid')

# Weeks-worked and college-enrollment columns, viewed for originalid 2 and 3.
weeksworkedcols = [
    'weeksworked17', 'weeksworked18', 'weeksworked19',
    'weeksworked20', 'weeksworked21',
]

colenrcols = [
    'colenroct17', 'colenroct18', 'colenroct19',
    'colenroct20', 'colenroct21',
]

nls97.loc[
    nls97['originalid'].isin([2, 3]),
    ['originalid'] + weeksworkedcols + colenrcols,
].T

# Reshape both column groups at once with wide_to_long: the stub names are
# the column prefixes and the numeric suffix becomes the 'year' column.
workschool = pd.wide_to_long(
    nls97[['originalid'] + weeksworkedcols + colenrcols],
    stubnames=['weeksworked', 'colenroct'],
    i=['originalid'],
    j='year',
).reset_index()
# Two-digit suffix -> four-digit year.
workschool['year'] = workschool['year'] + 2000
workschool = (
    workschool
    .sort_values(['originalid', 'year'])
    .set_index(['originalid'])
)
workschool.loc[[2, 3]]

# Run wide_to_long again with unaligned suffixes: the weeks-worked list now
# starts at suffix 16 while the enrollment list still starts at 17.
weeksworkedcols = [
    'weeksworked16', 'weeksworked18', 'weeksworked19',
    'weeksworked20', 'weeksworked21',
]
workschool = pd.wide_to_long(
    nls97[['originalid'] + weeksworkedcols + colenrcols],
    stubnames=['weeksworked', 'colenroct'],
    i=['originalid'],
    j='year',
).reset_index()
# Two-digit suffix -> four-digit year.
workschool['year'] = workschool['year'] + 2000
workschool = (
    workschool
    .sort_values(['originalid', 'year'])
    .set_index(['originalid'])
)
workschool.loc[[2, 3]]

# 取消堆叠与透视

unstack pivot

In [None]:
# Display settings: 200-character width, at most 30 columns and 200 rows,
# floats rendered as comma-grouped whole numbers.
pd.options.display.width = 200
pd.options.display.max_columns = 30
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
weeksworkedstacked = pd.read_pickle("data/nlsweeksworkedstacked.pkl")
workschoolmelted = pd.read_pickle("data/nlsworkschoolmelted.pkl")

# Inspect the stacked weeks-worked data and its index.
weeksworkedstacked.head(10)
weeksworkedstacked.index

# Long to wide with unstack.
weeksworked = weeksworkedstacked.unstack()
weeksworked.head(10)

# Long to wide with pivot: one row per originalid, one column per
# (value name, year) pair.
(
    workschoolmelted
    .loc[workschoolmelted['originalid'].isin([1, 2])]
    .sort_values(['originalid', 'year'])
)
workschool = (
    workschoolmelted
    .pivot(
        index='originalid',
        columns='year',
        values=['weeksworked', 'colenroct'],
    )
    .reset_index()
)
# Flatten the two-level column labels by concatenating both levels.
workschool.columns = workschool.columns.map('{0[0]}{0[1]}'.format)
workschool.loc[workschool['originalid'].isin([1, 2])].T

## 取消堆叠与透视 b

unstack pivotb

In [None]:
# Display settings: 200-character width, at most 30 columns and 200 rows,
# floats rendered as comma-grouped whole numbers.
pd.options.display.width = 200
pd.options.display.max_columns = 30
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
nls97 = nls97.set_index(['originalid'])

# Stack the weeks-worked columns again and view ids 2 and 3.
weeksworkedcols = [
    'weeksworked17', 'weeksworked18', 'weeksworked19',
    'weeksworked20', 'weeksworked21',
]

weeksworkedstacked = nls97[weeksworkedcols].stack()
weeksworkedstacked.loc[[2, 3]]

pd.__version__

# Melt the weeks-worked columns again: one row per originalid and original
# column name (stored in 'year'), with the value in 'weeksworked'.
weeksworkedmelted = (
    nls97.reset_index()
    .loc[:, ['originalid'] + weeksworkedcols]
    .melt(
        id_vars=['originalid'],
        value_vars=weeksworkedcols,
        var_name='year',
        value_name='weeksworked',
    )
)
(
    weeksworkedmelted
    .loc[weeksworkedmelted['originalid'].isin([2, 3])]
    .sort_values(['originalid', 'year'])
)

# Long to wide with unstack.
weeksworked = weeksworkedstacked.unstack()
weeksworked.loc[[2, 3]].T

# Long to wide with pivot: one row per originalid, one column per value of
# 'year'.
weeksworked = (
    weeksworkedmelted
    .pivot(index='originalid', columns='year', values=['weeksworked'])
    .reset_index()
)
# The pivoted columns are two-level labels; keep only the second level for
# every column after originalid.
weeksworked.columns = [
    'originalid',
    *(second for _, second in weeksworked.columns[1:]),
]
weeksworked.loc[weeksworked['originalid'].isin([2, 3])].T