# Taking Measure Of Data

# 加载数据表

In [None]:

import pandas as pd
# Display settings: limit console width and the number of columns/rows
# shown, and render floats with thousands separators and no decimals.
pd.options.display.width = 70
pd.options.display.max_columns = 5
pd.options.display.max_rows = 20
pd.set_option('display.float_format', '{:,.0f}'.format)


# Load the nls97 data, index it by personid, and look at basic facts
# about it: the index, the dimensions, the number of unique index
# values, the column summary, and the first two rows (transposed).
nls97 = pd.read_csv("data/nls97.csv").set_index("personid")
nls97.index
nls97.shape
nls97.index.nunique()
nls97.info()
nls97.head(2).T

# Same overview for the covid totals data, indexed by iso_code.
# lastdate is parsed as a date while reading the file.
covidtotals = (
    pd.read_csv("data/covidtotals.csv", parse_dates=['lastdate'])
    .set_index("iso_code")
)
covidtotals.index
covidtotals.shape
covidtotals.index.nunique()
covidtotals.info()
# Fixed random_state so the same two sample rows are shown on every run.
covidtotals.sample(2, random_state=1).T

# 选择列

In [None]:
# Import pandas, set display options, and load the nls97 data
import pandas as pd
pd.options.display.width = 78
pd.options.display.max_columns = 8
pd.options.display.max_rows = 15
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97 = pd.read_csv("data/nls97.csv").set_index("personid")

# Convert every object-dtype column to the category dtype
objectcols = nls97.select_dtypes(['object']).columns
nls97[objectcols] = nls97[objectcols].astype('category')

# Select one column with the indexing operator, loc, and iloc.
# type() is checked after each form to compare what is returned.
analysisdemo = nls97['gender']
type(analysisdemo)
analysisdemo = nls97[['gender']]
type(analysisdemo)
analysisdemo = nls97.loc[:, ['gender']]
type(analysisdemo)
analysisdemo = nls97.iloc[:, [0]]
type(analysisdemo)

# Select several columns from the DataFrame, first with the indexing
# operator and then with loc
democols = ['gender', 'maritalstatus', 'highestgradecompleted']
analysisdemo = nls97[democols]
analysisdemo.shape
analysisdemo.head()

analysisdemo = nls97.loc[:, democols]
analysisdemo.shape
analysisdemo.head()

# Select several columns using a list that was defined beforehand
keyvars = [
    'gender',
    'maritalstatus',
    'highestgradecompleted',
    'wageincome',
    'gpaoverall',
    'weeksworked17',
    'colenroct17',
]
analysiskeys = nls97[keyvars]
analysiskeys.info()

# Select columns by name pattern with filter()
analysiswork = nls97.filter(like="weeksworked")
analysiswork.info()

# Select columns by data type
analysiscats = nls97.select_dtypes(include=["category"])
analysiscats.info()

# Organize the columns: group the column names by topic
demo = ['gender', 'birthmonth', 'birthyear']
highschoolrecord = [
    'satverbal', 'satmath',
    'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience',
]
govresp = [
    'govprovidejobs', 'govpricecontrols',
    'govhealthcare', 'govelderliving',
    'govindhelp', 'govunemp',
    'govincomediff', 'govcollegefinance',
    'govdecenthousing', 'govprotectenvironment',
]
demoadult = [
    'highestgradecompleted', 'maritalstatus',
    'childathome', 'childnotathome',
    'wageincome',
    'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep',
    'highestdegree',
]
# weeksworked00 through weeksworked17: one column per two-digit year
weeksworked = [f'weeksworked{yr:02d}' for yr in range(18)]
# colenrfeb97, colenroct97, ..., colenrfeb17, colenroct17: a feb and an
# oct column for each year 1997-2017, named with the two-digit year
colenr = [
    f'colenr{month}{year % 100:02d}'
    for year in range(1997, 2018)
    for month in ('feb', 'oct')
]

# Reorder the DataFrame so that the columns of each topic group sit
# together, then check the resulting column types
columnorder = [
    *demoadult,
    *demo,
    *highschoolrecord,
    *govresp,
    *weeksworked,
    *colenr,
]
nls97 = nls97[columnorder]
nls97.dtypes

# Summarize the columns that are not of category dtype
nls97.select_dtypes(exclude=["category"]).info()

# Columns whose name matches the regular expression 'income'
nls97.filter(regex='income')

# 选择行

In [None]:
import pandas as pd
# Display settings for this section; floats are shown with 2 decimals
pd.options.display.width = 75
pd.options.display.max_columns = 5
pd.options.display.max_rows = 20
pd.set_option('display.float_format', '{:,.2f}'.format)
nls97 = pd.read_csv("data/nls97.csv").set_index("personid")

# Select a few rows with slice notation; the optional third value is
# the step
nls97[1000:1004].T
nls97[1000:1004:2].T

# Select the first 3 rows with a Python slice
nls97[:3].T

# Select the last 3 rows with a negative slice
nls97[-3:].T

# Select a few rows with loc (index labels) and iloc (integer positions)
selectedids = [195884, 195891, 195970]
nls97.loc[selectedids].T
nls97.loc[195884:195970].T
nls97.iloc[[0]].T
nls97.iloc[[0, 1, 2]].T
nls97.iloc[[-3, -2, -1]].T

# Select rows conditionally: build a boolean Series and pass it to loc,
# either through a variable or inline
nls97.nightlyhrssleep.quantile(0.05)
nls97.nightlyhrssleep.count()
sleepcheckbool = nls97.nightlyhrssleep <= 4
sleepcheckbool
lowsleep = nls97.loc[sleepcheckbool]
lowsleep = nls97.loc[nls97.nightlyhrssleep <= 4]
lowsleep.shape

# Select rows that satisfy several conditions at once
lowsleep.childathome.describe()
lowsleepmask = nls97.nightlyhrssleep <= 4
manychildrenmask = nls97.childathome >= 3
lowsleep3pluschildren = nls97.loc[lowsleepmask & manychildrenmask]
lowsleep3pluschildren.shape

# Select rows on several conditions and keep only two columns
lowsleep3pluschildren = nls97.loc[
    lowsleepmask & manychildrenmask,
    ['nightlyhrssleep', 'childathome'],
]
lowsleep3pluschildren


# 分类计数

In [None]:
import pandas as pd
# Display settings for this section; floats are shown with 2 decimals
pd.options.display.width = 53
pd.options.display.max_columns = 5
pd.options.display.max_rows = 20
pd.set_option('display.float_format', '{:,.2f}'.format)
nls97 = pd.read_csv("data/nls97.csv").set_index("personid")

# Convert every object-dtype column to the category dtype
objectcols = nls97.select_dtypes(['object']).columns
nls97[objectcols] = nls97[objectcols].astype('category')

# Collect the names of the category columns and print how many values
# are missing in each of them
catcols = nls97.select_dtypes(include=["category"]).columns
print(nls97[catcols].isna().sum())

# Frequencies for marital status
nls97['maritalstatus'].value_counts()

# Same frequencies, without sorting by count
nls97['maritalstatus'].value_counts(sort=False)

# Proportions instead of counts
nls97['maritalstatus'].value_counts(sort=False, normalize=True)

# Proportions for every government-responsibility column
# (columns whose name contains "gov")
govcols = nls97.filter(like="gov")
govcols.apply(pd.Series.value_counts, normalize=True)

# The same proportions, restricted to married respondents
married = nls97[nls97['maritalstatus'] == "Married"]
married.filter(like="gov").apply(pd.Series.value_counts, normalize=True)

# Write the frequencies and percentages of every category column in the
# DataFrame to a text file.
#
# The file is opened in a with-block so it is flushed and closed even if
# value_counts() or print() raises partway through the loop; a bare
# open()/close() pair would leak the handle in that case.
# NOTE: open() does not create directories, so views/ must already exist.
with open('views/frequencies.txt', 'w') as freqout:
    # Iterating over a DataFrame yields its column names.
    for col in nls97.select_dtypes(include=["category"]):
        print(col, "----------------------",
              "frequencies",
              nls97[col].value_counts(sort=False),
              "percentages",
              nls97[col].value_counts(normalize=True, sort=False),
              sep="\n\n", end="\n\n\n", file=freqout)



# 连续统计

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Display settings for this section; floats are shown with 1 decimal
pd.options.display.width = 53
pd.options.display.max_columns = 5
pd.options.display.max_rows = 20
pd.set_option('display.float_format', '{:,.1f}'.format)
covidtotals = (
    pd.read_csv("data/covidtotals.csv", parse_dates=['lastdate'])
    .set_index("iso_code")
)

# Look at the dimensions, one sample row (fixed random_state so it is
# the same row on every run), and the column types
covidtotals.shape
covidtotals.sample(1, random_state=1).T
covidtotals.dtypes

# Descriptive statistics for the cumulative columns, followed by the
# quantiles from 0.0 to 1.0 in steps of 0.1
totvars = [
    'total_cases',
    'total_deaths',
    'total_cases_pm',
    'total_deaths_pm',
]
covidtotals[totvars].describe()
deciles = np.arange(0.0, 1.1, 0.1)
covidtotals[totvars].quantile(deciles)

# Histogram of total cases, scaled to thousands, in 12 bins
casesthousands = covidtotals['total_cases'] / 1000
plt.hist(casesthousands, bins=12)
plt.title("Total Covid Cases (in thousands)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# OpenAI

In [None]:
import os

import pandas as pd
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe

# Display settings for this section
pd.set_option('display.width', 70)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 20)
pd.options.display.float_format = '{:,.0f}'.format

# Load the covid totals data, indexed by iso_code
covidtotals = pd.read_csv("data/covidtotals.csv",
  parse_dates=['lastdate'])
covidtotals.set_index("iso_code", inplace=True)

# Read the API key from the environment instead of writing it into the
# notebook, so the secret is never saved or shared with the file.
# Set OPENAI_API_KEY before starting the kernel; a missing variable
# raises KeyError here rather than failing later inside a chat() call.
llm = OpenAI(api_token=os.environ["OPENAI_API_KEY"])
type(llm)

# Wrap the DataFrame so it can be queried with natural-language prompts
covidtotalssdf = SmartDataframe(covidtotals, config={"llm": llm})
type(covidtotalssdf)

# Basic inspection and top-N prompts
covidtotalssdf.chat("Show me some information about the data")
covidtotalssdf.chat("Show first five rows.")
covidtotalssdf.chat("Show total cases for locations with the five most total cases.")
covidtotalssdf.chat("Show total cases pm, total deaths pm, and location for locations with the 10 highest total cases pm.")

# Column selection; the result is kept so its type can be inspected
covidtotalsabb = covidtotalssdf.chat("Select total cases pm, total deaths pm, and location.")
covidtotalsabb

type(covidtotalsabb)

# The same request worded differently ("Grab" instead of "Select")
covidtotalsabb = covidtotalssdf.chat("Grab total cases pm, total deaths pm, and location.")
covidtotalsabb

# Filtering, summary, and aggregation prompts
covidtotalssdf.chat("Show total cases pm and location where total cases pm greater than 95th percentile.")
covidtotalssdf.chat("Summarize values for total cases pm and total deaths pm.").T
covidtotalssdf.chat("Show sum of total cases and total deaths by region.")

# Plotting prompts, including near-duplicate wordings of the same request
covidtotalssdf.chat("Plot the total_cases_pm column data distribution")
covidtotalssdf.chat("Plot the total_cases_pm data distribution")
covidtotalssdf.chat("Plot total cases pm values against total deaths pm values")
covidtotalssdf.chat("Plot total cases pm values against total deaths pm values with line")
covidtotalssdf.chat("Plot total cases pm values against total deaths pm values with lmplot without extreme values")
covidtotalssdf.chat("Use regplot to show total deaths pm against total cases pm")
covidtotalssdf.chat("Use regplot to show total deaths pm against total cases pm without extreme values")