# Series Operations

# 基本系列

In [1]:
import pandas as pd
# Configure pandas display output, then load the NLS97 extract indexed by personid
pd.options.display.width = 78
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.2f}'.format)
nls97 = pd.read_csv("data/nls97b.csv").set_index("personid")

# Create a Series from the GPA column
gpaoverall = nls97.gpaoverall
type(gpaoverall)
gpaoverall.head()
gpaoverall.index

# Select GPA values with bracket (slice) notation
gpaoverall[:5]
gpaoverall.tail()
gpaoverall[-5:]

# Select values by index label with the loc accessor
gpaoverall.loc[100061]  # a scalar label returns the scalar value
gpaoverall.loc[[100061]]  # a one-item list returns a Series
gpaoverall.loc[[100061,100139,100284]]
gpaoverall.loc[100061:100833]  # label slices include both endpoints

# Select values by integer position with the iloc accessor
gpaoverall.iloc[[0]]
gpaoverall.iloc[[0,1,2,3,4]]
gpaoverall.iloc[:5]
gpaoverall.iloc[-5:]

personid
999291   3.11
999406   2.17
999543    NaN
999698    NaN
999963   3.78
Name: gpaoverall, dtype: float64

# 统计摘要

In [None]:
import pandas as pd
import numpy as np
# Configure pandas display output, then load the NLS97 extract indexed by personid
pd.options.display.width = 78
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.2f}'.format)
nls97 = pd.read_csv("data/nls97f.csv", low_memory=False).set_index("personid")

# Show some descriptive statistics for overall GPA: mean, summary table, deciles
gpaoverall = nls97["gpaoverall"]

gpaoverall.mean()
gpaoverall.describe()
deciles = np.arange(0.1,1.1,0.1)
gpaoverall.quantile(deciles)

# Subset the Series with boolean masks built from its own values
midrange = gpaoverall.between(3,3.5)
gpaoverall.loc[midrange].head(5)
gpaoverall.loc[midrange].count()
outofrange = (gpaoverall<2) | (gpaoverall>4)
gpaoverall.loc[outofrange].sample(5, random_state=10)
(
  gpaoverall.loc[gpaoverall>gpaoverall.quantile(0.99)]
  .agg(['count','min','max'])
)

# Run a test across all values (a missing GPA compares as False)
gpaoverall.gt(4).any() # does anyone have a GPA above 4?
gpaoverall.ge(0).all() # is every GPA 0 or higher?
gpaoverall.ge(0).sum() # number of people with a GPA of 0 or higher
gpaoverall.eq(0).sum() # number of people with a GPA equal to 0
gpaoverall.isna().sum() # number of people with a missing GPA

# Mean GPA for rows above the 75th and below the 25th percentile of wageincome20
wageq3 = nls97['wageincome20'].quantile(0.75)
wageq1 = nls97['wageincome20'].quantile(0.25)
nls97.loc[nls97['wageincome20'] > wageq3, 'gpaoverall'].mean()
nls97.loc[nls97['wageincome20'] < wageq1, 'gpaoverall'].mean()

# Show a summary and the value counts for a Series holding categorical data
nls97.maritalstatus.describe()
nls97.maritalstatus.value_counts()

# Element-wise comparison: evaluates to a boolean Series, one flag per row
gpaoverall>4

# 修改数值

In [2]:
import pandas as pd
# Configure pandas display output, then load the NLS97 extract indexed by personid
pd.options.display.width = 200
pd.options.display.max_columns = 35
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.2f}'.format)
nls97 = pd.read_csv("data/nls97f.csv", low_memory=False).set_index("personid")

# Multiply every value of the Series by a scalar
nls97['gpaoverall'].head()
gpaoverall100 = nls97['gpaoverall'].mul(100)

gpaoverall100.head()

# Assign a scalar to selected rows, chosen by index label with the loc accessor
nls97.loc[[135335], 'gpaoverall'] = 3
nls97.loc[[999406,151672,750699],'gpaoverall'] = 0
nls97.gpaoverall.head()

# Set values using more than one Series: total children = at home + not at home
nls97['childnum'] = nls97['childathome'].add(nls97['childnotathome'])
nls97['childnum'].value_counts().sort_index()

# Assign a summary value (the mean GPA) to a label-sliced range of rows
meangpa = nls97['gpaoverall'].mean()
nls97.loc[135335:781297,'gpaoverall'] = meangpa
nls97['gpaoverall'].head()

# Assign scalars to rows chosen by integer position with the iloc accessor
# NOTE(review): column position 15 is presumably gpaoverall (the last line
# displays that column) -- confirm against the column order of the CSV
nls97.iloc[0, 15] = 2
nls97.iloc[1:4, 15] = 1
nls97.gpaoverall.head()

# Set values after filtering: every GPA above 4 is set to 4
nls97['gpaoverall'].nlargest()
above4 = nls97['gpaoverall'] > 4
nls97.loc[above4, 'gpaoverall'] = 4
nls97['gpaoverall'].nlargest()

# Compare what loc returns for a scalar column label versus a list of column
# labels; the list form produces a DataFrame
type(nls97.loc[[135335], 'gpaoverall'])
type(nls97.loc[[135335], ['gpaoverall']])


pandas.core.frame.DataFrame

# 有条件地改变

In [3]:
import pandas as pd
import numpy as np
# Configure pandas display output, then load the NLS97 and land temperature data
pd.options.display.width = 64
pd.options.display.max_columns = 35
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.1f}'.format)
nls97 = pd.read_csv("data/nls97f.csv", low_memory=False).set_index("personid")
landtemps = pd.read_csv("data/landtemps2023avgs.csv")

# Use numpy where to create a categorical Series with 2 values:
# above the 80th percentile of elevation -> High, otherwise Low
landtemps['elevation'].quantile(np.arange(0.2,1.1,0.2))
elev80 = landtemps['elevation'].quantile(0.8)
landtemps['elevation_group'] = np.where(landtemps['elevation'] > elev80, 'High', 'Low')
landtemps['elevation_group'] = landtemps['elevation_group'].astype('category')
(
  landtemps.groupby(['elevation_group'], observed=False)['elevation']
  .agg(['count','min','max'])
)

# Use nested numpy where calls to create a categorical Series with 3 values:
# above the 80th percentile -> High, above the median -> Medium, otherwise Low
landtemps['elevation_group'] = \
  np.where(landtemps.elevation>
    landtemps.elevation.quantile(0.8),'High',
    np.where(landtemps.elevation>landtemps.elevation.\
      median(),'Medium','Low'))
landtemps.elevation_group = landtemps.elevation_group.astype('category')
# observed=False is passed explicitly: grouping on a categorical key without
# it raises a FutureWarning in pandas 2.x because the default is changing
landtemps.groupby(['elevation_group'],
  observed=False)['elevation'].\
  agg(['count','min','max'])

# Use numpy select to evaluate a list of conditions; the first condition that
# is true for a row decides its label, and unmatched rows get the default
lowgpa = nls97['gpaoverall'] < 2
nodegree = nls97['highestdegree'] == '0. None'
test = [lowgpa & nodegree, nodegree, lowgpa]
result = ['1. Low GPA/No Dip', '2. No Diploma', '3. Low GPA']
nls97['hsachieve'] = np.select(test, result, default='4. Did Okay')
(
  nls97[['hsachieve','gpaoverall','highestdegree']]
  .sample(7, random_state=6)
)
nls97['hsachieve'].value_counts().sort_index()

def gethsachieve(row):
  """Classify one row's high-school achievement from GPA and highest degree.

  Reads row.gpaoverall and row.highestdegree and returns one of four labels,
  checking the most specific case first:
    "1. Low GPA/No Dip" - GPA below 2 and highest degree "0. None"
    "2. No Diploma"     - highest degree "0. None"
    "3. Low GPA"        - GPA below 2
    '4. Did Okay'       - everything else
  A NaN GPA compares as False, so it never counts as a low GPA.
  """
  lowgpa = row.gpaoverall<2
  nodiploma = row.highestdegree=="0. None"
  if lowgpa and nodiploma:
    return "1. Low GPA/No Dip"
  if nodiploma:
    return "2. No Diploma"
  if lowgpa:
    return "3. Low GPA"
  return '4. Did Okay'

# Build the same classification row by row with apply, then cross-tabulate it
# against hsachieve to compare the two columns
nls97['hsachieve2'] = nls97.apply(gethsachieve, axis=1)
nls97.groupby(['hsachieve','hsachieve2']).size()

# Use apply with a row-wise function to create a more complex categorical Series
def getsleepdeprivedreason(row):
  """Return a label describing why one row's respondent is sleep deprived.

  Decision order:
    nightlyhrssleep >= 6                      -> "Not Sleep Deprived"
    nightlyhrssleep not above 0 (or NaN)      -> "Unknown"
    weeksworked20 + weeksworked21 < 80:
      childathome > 2                         -> "Child Rearing"
      otherwise                               -> "Other Reasons"
    otherwise (including a NaN weeks total):
      wageincome20 >= 62000 or
      highestgradecompleted >= 16             -> "Work Pressure"
      otherwise                               -> "Income Pressure"
  """
  hrssleep = row.nightlyhrssleep
  if hrssleep>=6:
    return "Not Sleep Deprived"
  # written as "not above 0" so that NaN falls through to Unknown
  if not (hrssleep>0):
    return "Unknown"
  if row.weeksworked20+row.weeksworked21 < 80:
    return "Child Rearing" if row.childathome>2 else "Other Reasons"
  highearner = row.wageincome20>=62000 or row.highestgradecompleted>=16
  return "Work Pressure" if highearner else "Income Pressure"

# Label every row, store the labels as a categorical column, and count them
sleepreasons = nls97.apply(getsleepdeprivedreason, axis=1)
nls97['sleepdeprivedreason'] = sleepreasons.astype('category')
nls97['sleepdeprivedreason'].value_counts()

# Create a flag that is True when any colenr* column value starts with '3'
# (presumably the code for enrollment in a bachelor's-level program -- confirm)
(
  nls97.loc[[999406,750699], 'colenrfeb00':'colenroct04']
  .T
)
colenrcols = nls97.filter(like="colenr")
nls97['baenrollment'] = (
  colenrcols
  .transform(lambda x: x.str[0:1]=='3')
  .any(axis=1)
)

nls97.loc[[999406,750699], ['baenrollment']].T
nls97['baenrollment'].value_counts()

  landtemps.groupby(['elevation_group'])['elevation'].\


baenrollment
False    4987
True     3997
Name: count, dtype: int64

# 字符串系列

In [None]:
import pandas as pd
import numpy as np
# Configure pandas display output, then load the NLS97 extract indexed by personid
pd.options.display.width = 74
pd.options.display.max_columns = 8
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97 = pd.read_csv("data/nls97ca.csv", low_memory=False).set_index("personid")

# Test whether a string pattern occurs within each string
nls97.govprovidejobs.value_counts()

# "No" when the response contains "not", otherwise "Yes". Missing responses
# are masked afterwards with Series.where so they stay real NaN: passing
# np.nan to np.where next to string choices coerces it to the text 'nan'.
# na=False makes contains() return a plain boolean for missing responses.
containsnot = nls97.govprovidejobs.str.contains("not", na=False)
nls97['govprovidejobsdefprob'] = \
  pd.Series(np.where(containsnot,"No","Yes"), index=nls97.index).\
  where(nls97.govprovidejobs.notnull())
pd.crosstab(nls97.govprovidejobs, nls97.govprovidejobsdefprob)

# Handle leading or trailing spaces in strings
nls97.maritalstatus.value_counts()
nls97.maritalstatus.str.startswith(' ').any()
nls97.maritalstatus.str.endswith(' ').any()
# Strip before comparing so padded values still match. Missing statuses are
# masked afterwards with Series.where so they stay real NaN: passing np.nan
# to np.where next to string choices coerces it to the text 'nan'.
nevermarried = nls97.maritalstatus.str.strip()=="Never-married"
nls97['evermarried'] = \
  pd.Series(np.where(nevermarried,"No","Yes"), index=nls97.index).\
  where(nls97.maritalstatus.notnull())
pd.crosstab(nls97.maritalstatus, nls97.evermarried)

# Use isin to compare a string value against a list of values: "Yes" when the
# first character of highestdegree is one of '4','5','6','7'. Missing degrees
# are masked afterwards with Series.where so they stay real NaN: passing
# np.nan to np.where next to string choices coerces it to the text 'nan'.
hasba = nls97.highestdegree.str[0:1].isin(['4','5','6','7'])
nls97['receivedba'] = \
  pd.Series(np.where(hasba,"Yes","No"), index=nls97.index).\
  where(nls97.highestdegree.notnull())
pd.crosstab(nls97.highestdegree, nls97.receivedba)

# Remove the leading number from the highestdegree values
nls97['highestdegree'].value_counts(dropna=False).sort_index()
nls97 = nls97.fillna({"highestdegree":"99. Unknown"})

def onlytext(x):
  """Return x from two characters past its first '.' (from index 1 if x has no '.')."""
  return x[x.find(".") + 2:]

highestdegreenonum = nls97['highestdegree'].astype(str).transform(onlytext)
(
  highestdegreenonum
  .value_counts(dropna=False)
  .sort_index()
)

# A simple findall example: collect every "r" in the first few marital
# status values and count the matches
first5 = nls97['maritalstatus'].head()
first5
rmatches = first5.str.findall("r")
rmatches

pd.concat([first5, rmatches, rmatches.str.len()], axis=1)

# Convert a text response to a number using the digits in the text.
# The pattern is a raw string: "\d" in a regular literal is an invalid
# escape sequence and triggers a SyntaxWarning on Python 3.12+.
pd.concat([nls97.weeklyhrstv.head(),\
  nls97.weeklyhrstv.str.findall(r"\d+").head()], axis=1)

def getnum(numlist):
  """Turn the digit strings found in one response into a single number.

  numlist is the list of digit strings extracted from a response, or a
  non-list value (such as NaN) when the response is missing.

  Returns:
    np.nan       when numlist is not a list or is an empty list
    45           when the first number is '40'
    1            when the last number is 2
    lastval - 5  otherwise, where lastval is the last number as an int
  """
  # An empty list has no last element to read, so treat it like a missing
  # response instead of raising IndexError
  if not isinstance(numlist, list) or not numlist:
    return np.nan
  lastval = int(numlist[-1])
  if (numlist[0]=='40'):
    return 45
  if (lastval==2):
    return 1
  return lastval - 5

# Extract the digit strings from each response and map them to one number.
# The pattern is a raw string: "\d" in a regular literal is an invalid
# escape sequence and triggers a SyntaxWarning on Python 3.12+.
nls97['weeklyhrstvnum'] = nls97.weeklyhrstv.str.\
  findall(r"\d+").apply(getnum)

nls97[['weeklyhrstvnum','weeklyhrstv']].head(7)

pd.crosstab(nls97.weeklyhrstv, nls97.weeklyhrstvnum)

# Replace values in a Series with alternative values (letter-prefixed labels)
comphrsold = ['Less than 1 hour a week',
  '1 to 3 hours a week','4 to 6 hours a week',
  '7 to 9 hours a week','10 hours or more a week']
comphrsnew = ['A. Less than 1 hour a week',
  'B. 1 to 3 hours a week','C. 4 to 6 hours a week',
  'D. 7 to 9 hours a week','E. 10 hours or more a week']
nls97.weeklyhrscomputer.value_counts().sort_index()
# Assign the result back to the column. Calling replace(..., inplace=True) on
# nls97.weeklyhrscomputer is a chained in-place operation on a column object:
# pandas 2.x warns about it, and under Copy-on-Write it leaves nls97 unchanged.
nls97['weeklyhrscomputer'] = nls97.weeklyhrscomputer.\
  replace(comphrsold, comphrsnew)
nls97.weeklyhrscomputer.value_counts().sort_index()

# Trim surrounding whitespace from the marital status values and inspect them
nls97['maritalstatus'] = nls97.maritalstatus.str.strip()
nls97.maritalstatus.value_counts(dropna=False).sort_index()
nls97.loc[nls97.maritalstatus=="Never-married"].maritalstatus.head(2).T
# Write a value with a trailing space back into two rows, then save the frame.
# NOTE(review): this overwrites data/nls97ca.csv, the same file this cell
# reads, and the saved frame includes the columns added above -- presumably
# deliberate (re-seeding untrimmed values for the whitespace example); confirm
nls97.loc[[100284,101089],"maritalstatus"] = "Never-married "
nls97.to_csv("data/nls97ca.csv")

# 日期转换

In [4]:
import pandas as pd
from dateutil.relativedelta import relativedelta

# Configure pandas display output, then load the COVID case and NLS97 data
pd.options.display.width = 200
pd.options.display.max_columns = 35
pd.options.display.max_rows = 220
pd.set_option('display.float_format', '{:,.0f}'.format)
covidcases = pd.read_csv("data/covidcases.csv")
nls97 = pd.read_csv("data/nls97c.csv").set_index("personid")

# Show the birth month and birth year values
nls97[['birthmonth','birthyear']].isna().sum()
nls97['birthmonth'].value_counts(dropna=False).sort_index()
nls97['birthyear'].value_counts().sort_index()

# Use fillna to fix missing birth months with the mean month truncated to an int
meanbirthmonth = int(nls97['birthmonth'].mean())
nls97 = nls97.fillna({"birthmonth": meanbirthmonth})
nls97['birthmonth'].value_counts(dropna=False).sort_index()

# Create a datetime column from the year and month integers, with the day fixed at 15
birthparts = {"year": nls97['birthyear'], "month": nls97['birthmonth'], "day": 15}
nls97['birthdate'] = pd.to_datetime(birthparts)
datecols = ['birthmonth','birthyear','birthdate']
nls97[datecols].head()
nls97[datecols].isna().sum()

# Define a function that calculates age in whole years from a start and an end date
def calcage(startdate, enddate):
  """Return the number of whole years from startdate to enddate.

  Both arguments need year, month and day attributes. The year difference is
  reduced by one when enddate's (month, day) falls before startdate's, i.e.
  the anniversary has not yet been reached in enddate's year.
  """
  anniversarypending = (enddate.month, enddate.day) < (startdate.month, startdate.day)
  return enddate.year - startdate.year - int(anniversarypending)

# Calculate age as of the run date with calcage
rundate = pd.Timestamp('2024-03-01')

def ageatrundate(row):
  """Age in whole years on rundate for one row's birthdate."""
  return calcage(row.birthdate, rundate)

nls97["age"] = nls97.apply(ageatrundate, axis=1)
nls97.loc[100061:100583, ['age','birthdate']]

# Compute the same age with relativedelta and count the rows where they differ
nls97["age2"] = nls97.apply(
  lambda row: relativedelta(rundate, row.birthdate).years, axis=1)
nls97['age'].ne(nls97['age2']).sum()
nls97.groupby(['age','age2']).size()

# Convert a string column to a datetime column, checking dtypes before and after
covidcases.iloc[:, :6].dtypes
covidcases.iloc[:, :6].sample(2, random_state=1).T
covidcases['casedate'] = pd.to_datetime(covidcases['casedate'], format='%Y-%m-%d')
covidcases.iloc[:, :6].dtypes

# Get descriptive statistics for the datetime column
covidcases['casedate'].nunique()
covidcases['casedate'].describe()

# Calculate days since the first case, by country.
# firstcase keeps, for each location, the earliest casedate with new_cases > 0.
firstcase = covidcases.loc[covidcases.new_cases>0,['location','casedate']].\
  sort_values(['location','casedate']).\
  drop_duplicates(['location'], keep='first').\
  rename(columns={'casedate':'firstcasedate'})
# firstcase has one row per location, so the merge is validated as
# many_to_one; "many_to_many" is accepted by pandas but performs no check.
# how="left" keeps locations that never reported a case (firstcasedate is NaT).
covidcases = pd.merge(covidcases, firstcase, on='location', how="left", validate="many_to_one")
covidcases['dayssincefirstcase'] = covidcases.casedate - covidcases.firstcasedate
covidcases.dayssincefirstcase.describe()

count                          36501
mean     637 days 01:36:55.862579112
std      378 days 15:34:06.667833980
min                  0 days 00:00:00
25%                315 days 00:00:00
50%                623 days 00:00:00
75%                931 days 00:00:00
max               1491 days 00:00:00
Name: dayssincefirstcase, dtype: object

# AI

In [None]:
import pandas as pd
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
# Read the OpenAI API key from the OPENAI_API_KEY environment variable rather
# than pasting it into the notebook, where it would be saved and shared with
# the file. A missing variable raises KeyError on this line.
import os
llm = OpenAI(api_token=os.environ["OPENAI_API_KEY"])

pd.options.display.width = 200
pd.options.display.max_columns = 35
pd.options.display.max_rows = 220
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the data frames and create a SmartDataframe object from the NLS97 frame
covidcases = pd.read_csv("data/covidcases.csv")

nls97 = pd.read_csv("data/nls97f.csv", low_memory=False).set_index("personid")
nls97sdf = SmartDataframe(nls97, config=dict(llm=llm))

# Run some natural-language queries against the SmartDataframe
nls97sdf.chat("Show average of gpaoverall")
nls97sdf.chat("Show average for each weeks worked column")
nls97sdf.chat("Show satmath average by gender")

# From here on each chat() result is assigned back to nls97sdf, so every later
# statement works on whatever the previous prompt returned.
# NOTE(review): the return type of chat() is not visible here -- presumably a
# frame-like object that still supports chat(); confirm for the installed
# pandasai version
nls97sdf = nls97sdf.chat("Set childnum to child at home plus child not at home")
nls97sdf[['childnum','childathome','childnotathome']].\
  sample(5, random_state=1)

nls97sdf = nls97sdf.chat("evermarried is 'No' when maritalstatus is 'Never-married', else 'Yes'")
nls97sdf.groupby(['evermarried','maritalstatus']).size()

# Same request phrased differently, written to evermarried2.
# NOTE(review): this prompt wraps Never-married in typographic quotes, unlike
# the prompt above -- confirm that is intended
nls97sdf = nls97sdf.chat("if maritalstatus is ‘Never-married’ set evermarried2 to 'No', otherwise 'Yes'")
nls97sdf.groupby(['evermarried2','maritalstatus']).size()

nls97sdf = nls97sdf.chat("set weeksworkedavg to the average for weeksworked columns")
nls97sdf

# Describe gpaenglish before and after asking for missing values to be filled
nls97sdf.gpaenglish.describe()
nls97sdf = nls97sdf.chat("set missing gpaenglish to the average")
nls97sdf.gpaenglish.describe()


# Keep the earliest-dated row for each location, indexed by location
firstcase = (
  covidcases
  .sort_values(['location','casedate'])
  .drop_duplicates(['location'], keep='first')
  .set_index('location')
)

firstcase.shape

firstcasecols = ['iso_code','continent','casedate','total_cases','new_cases']
firstcase[firstcasecols].head(2).T

# Try the same thing with a pandasai SmartDataframe
covidcasessdf = SmartDataframe(covidcases, config={"llm": llm})

firstcasesdf = covidcasessdf.chat("Show first casedate and location and other values for each country.")
firstcasesdf.shape
firstcasesdf = firstcasesdf.chat("Make location the index.")
# NOTE(review): 'location' is selected as a column right after asking for it
# to become the index; if the prompt is carried out the way set_index would
# do it, this lookup would fail -- confirm against real chat() output
firstcasesdf[['location','continent','casedate',
  'total_cases','new_cases']].head(2).T