# 初探

Functions Classes

在 VS Code 中编辑 settings.json，把辅助模块（helperfunctions）所在目录加入 Python 分析的搜索路径：

```json
    "python.analysis.extraPaths": [
        "./data-cleaning/II/12.FunctionsClasses/helperfunctions"
    ]
```

In [None]:
import pandas as pd
import os
import sys

# Load the NLS data and index it by personid.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index('personid')

# Make the local helperfunctions folder importable, then load the
# basicdescriptives module.
sys.path.append(f"{os.getcwd()}/helperfunctions")
import basicdescriptives as bd

# Display settings for this cell.
pd.options.display.width = 64
pd.options.display.max_columns = 7
pd.options.display.max_rows = 50

# Take a first look at the NLS data using the helper's defaults.
dfinfo = bd.getfirstlook(nls97)
bd.displaydict(dfinfo)

# Same call, now passing values for the nrows and uniqueid parameters.
dfinfo = bd.getfirstlook(nls97, 2, 'originalid')
bd.displaydict(dfinfo)

# Work with individual keys and values of the returned dictionary.
dfinfo['nrows']
dfinfo['dtypes']
dfinfo['nrows'] == dfinfo['uniqueids']

# 测量

In [None]:
import pandas as pd
import os
import sys
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
nls97.set_index('personid', inplace=True)

# Import the basicdescriptives helper module and reload it, so edits made
# to the module during an interactive session are picked up.
sys.path.append(os.getcwd() + "/helperfunctions")
import basicdescriptives as bd
import importlib
importlib.reload(bd)

pd.set_option('display.width', 80)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.2f}'.format

# Show summary statistics for continuous variables.
bd.gettots(nls97[['satverbal','satmath']]).T
bd.gettots(nls97.filter(like="weeksworked"))

# Count missing values by column and by row. The first call passes True
# as the second argument; the second call relies on the helper's default.
missingsbycols, missingsbyrows = \
  bd.getmissings(nls97[['weeksworked20',
  'weeksworked21']], True)
missingsbycols
missingsbyrows
missingsbycols, missingsbyrows = \
  bd.getmissings(nls97[['weeksworked20',
  'weeksworked21']])
missingsbyrows

# Convert object columns to the category dtype before computing
# frequencies. Assigning through .loc[:, mask] writes the values into the
# existing object columns in place on pandas 2.x, which leaves the dtype
# unchanged, so the conversion is done with astype on the selected columns.
catcols = nls97.select_dtypes(['object']).columns
nls97 = nls97.astype({col: 'category' for col in catcols})
bd.makefreqs(nls97, "views/nlsfreqs.txt")

# Counts and percentages by group; the third argument of the second call
# is a row-selection expression string handed to the helper.
bd.getcnts(nls97, 
  ['maritalstatus','colenroct00'])
bd.getcnts(nls97, 
  ['maritalstatus','colenroct00'],
  "colenroct00.str[0:1]=='1'")

# 检查异常值

In [None]:
import pandas as pd
import os
import sys
import pprint

# Load the NLS data (indexed by personid) and the COVID totals data.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index('personid')
covidtotals = pd.read_csv("data/covidtotals.csv")

# Import the outliers helper module, reloading it to pick up any edits
# made during the session.
sys.path.append(f"{os.getcwd()}/helperfunctions")
import outliers as ol
import importlib
importlib.reload(ol)

pd.options.display.width = 72
pd.options.display.max_columns = 5
pd.options.display.max_rows = 100

# Get the distribution properties of one variable.
dist = ol.getdistprops(covidtotals['total_cases_pm'])
pprint.pprint(dist)

# Show the outlier rows for the selected variables, together with some
# identifying columns, and save them to a CSV file.
sumvars = ['satmath', 'wageincome20']
othervars = ['originalid', 'highestdegree', 'gender', 'maritalstatus']
outliers = ol.getoutliers(nls97, sumvars, othervars)
outliers['varname'].value_counts(sort=False)
outliers.loc[outliers['varname'] == 'satmath', othervars + sumvars]
outliers.to_csv("views/nlsoutliers.csv")

# Plot a histogram and then a boxplot of one series.
ol.makeplot(nls97['satmath'], "Histogram of SAT Math", "SAT Math")
ol.makeplot(nls97['satmath'], "Boxplot of SAT Math", "SAT Math", "box")

# 组合汇总

In [13]:
import pandas as pd
import os
import sys

# Import the combineagg helper module, reloading it to pick up any edits
# made during the session.
sys.path.append(f"{os.getcwd()}/helperfunctions")
import combineagg as ca
import importlib
importlib.reload(ca)

pd.options.display.width = 150
pd.options.display.max_columns = 15
pd.options.display.max_rows = 20
pd.options.display.float_format = '{:,.0f}'.format

# Load the data frames.
coviddaily = pd.read_csv("data/coviddaily.csv")
ltbrazil = pd.read_csv("data/ltbrazil.csv")
countries = pd.read_csv("data/ltcountries.csv")
locations = pd.read_csv("data/ltlocations.csv")

# Summarize the panel data by group and time period; the second call
# passes an additional value (5000) to the helper.
ca.adjmeans(coviddaily, 'location', 'new_cases', 'casedate')
ca.adjmeans(coviddaily, 'location', 'new_cases', 'casedate', 5000)

# Check how well the two data frames match on the merge-by column.
# Copies are passed, so the originals are not handed to the helper.
ca.checkmerge(
    countries.copy(),
    locations.copy(),
    "countryid",
    "countryid",
)

# Concatenate all files found in a folder, assuming they share the same
# structure.
landtemps = ca.addfiles("data/ltcountry")
landtemps['country'].value_counts()


inright  N      Y
inleft           
N        0      1
Y        2  27472
      countryid inleft inright
7363         FO      N       Y
9716         LQ      Y       N
13104        ST      Y       N
ltbrazil.csv has 1008 rows.
ltcameroon.csv has 48 rows.
ltindia.csv has 1116 rows.
ltpoland.csv has 120 rows.
ltjapan.csv has 1800 rows.
ltmexico.csv has 852 rows.
ltoman.csv has 288 rows.

Different column names for:
ltoman.csv
Index(['latabs'], dtype='object')

Columns Matched: False


country
Japan       1800
India       1116
Brazil      1008
Mexico       852
Oman         288
Poland       120
Cameroon      48
Name: count, dtype: int64

# Class_清理

In [None]:
import pandas as pd
import os
import sys
import pprint

# Import the respondent module (provides the Respondent class), reloading
# it to pick up any edits made during the session.
sys.path.append(f"{os.getcwd()}/helperfunctions")
import respondent as rp
import importlib
importlib.reload(rp)

pd.options.display.width = 150
pd.options.display.max_columns = 15
pd.options.display.max_rows = 100

# Load the NLS data, then turn it into a list of dictionaries
# (one dictionary per row).
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
nls97list = nls97.to_dict(orient='records')
nls97.shape
len(nls97list)
pprint.pprint(nls97list[0:1])

# Walk through the list, creating one Respondent instance per record and
# collecting the values computed by its methods.
analysislist = []
for respdict in nls97list:
    resp = rp.Respondent(respdict)
    newdict = {
        'originalid': respdict['originalid'],
        'childnum': resp.childnum(),
        'avgweeksworked': resp.avgweeksworked(),
        'age': resp.ageby('20201015'),
        'baenrollment': resp.baenrollment(),
    }
    analysislist.append(newdict)

# Build a pandas data frame from the collected dictionaries.
len(analysislist)
resp.respondentcnt
pprint.pprint(analysislist[0:2])
analysis = pd.DataFrame(analysislist)
analysis.head(2)

# Json_清理

In [None]:
import pandas as pd
import json
import os
import sys
import pprint
import requests

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 8)

# Import the collection items module.
sys.path.append(os.getcwd() + "/helperfunctions")
import collectionitem as ci

# Load the art museum's JSON data. A timeout keeps the cell from hanging
# indefinitely if the API does not respond, and raise_for_status() stops
# with a clear HTTP error instead of a confusing JSON or key error later.
response = requests.get(
  "https://openaccess-api.clevelandart.org/api/artworks/?african_american_artists",
  timeout=30)
response.raise_for_status()
camcollections = json.loads(response.text)
camcollections = camcollections['data']

# Loop over the list, creating one Collectionitem instance per item and
# collecting the values computed by its methods.
analysislist = []
for colldict in camcollections:
  coll = ci.Collectionitem(colldict)
  newdict = dict(id=colldict['id'],
    title=colldict['title'],
    type=colldict['type'],
    creationdate=colldict['creation_date'],
    ncreators=coll.ncreators(),
    ncitations=coll.ncitations(),
    birthyearsall=coll.birthyearsall(),
    birthyear=coll.birthyearcreator1())
  analysislist.append(newdict)

# Build a pandas data frame from the collected dictionaries.
len(camcollections)
len(analysislist)
pprint.pprint(analysislist[0:1])
analysis = pd.DataFrame(analysislist)
analysis.birthyearsall.value_counts().head()
analysis.head(2).T

# 数据检查

In [None]:
import pandas as pd
import numpy as np
import os
import sys
# Load the NLS data and the data-checking targets table; the targets are
# indexed by variable name.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
dc = pd.read_csv("data/datacheckingtargets.csv")
dc.set_index('varname', inplace=True)

pd.set_option('display.width', 150)
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 200)

# Make the local helperfunctions folder importable and load runchecks.
sys.path.append(os.getcwd() + "/helperfunctions")
import runchecks as rc

# Alter originalid: rows with value 2 are set to 1, and rows with values
# 3 through 7 are set to missing. The head(7) calls show the column before
# and after the change.
# NOTE(review): presumably this deliberately plants a duplicate and some
# missing ids so the checks below have something to report -- confirm.
nls97.originalid.head(7)
nls97.loc[nls97.originalid==2,"originalid"] = 1
nls97.loc[nls97.originalid.between(3,7), "originalid"] = np.nan
nls97.originalid.head(7)
# Treat the value 95 in highestgradecompleted as missing.
# NOTE(review): 95 looks like a special/placeholder code -- confirm
# against the codebook.
nls97["highestgradecompleted"] = nls97.highestgradecompleted.replace(95, np.nan)

# Keep only the targets flagged for inclusion, then split the variable
# names by the type recorded in the targets table.
dc = dc.loc[dc.include=="Y"]
numvars = dc.loc[dc.type=="numeric"].index.to_list()
catvars = dc.loc[dc.type=="categorical"].index.to_list()
idvars = dc.loc[dc.type=="unique"].index.to_list()

# Run the checks with the data, the targets and the three variable lists.
rc.runchecks(nls97,dc,numvars,catvars,idvars)

# 流水线_处理_简单

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

pd.set_option('display.width', 150)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.3f}'.format

# Load the land temperature data.
landtemps = pd.read_csv("data/landtemps2023avgs.csv")

feature_cols = ['latabs','elevation']

# Hold out 10% of the rows as a test set.
X_train, X_test, y_train, y_test =  \
  train_test_split(landtemps[feature_cols],\
  landtemps[['avgtemp']], test_size=0.1, random_state=0)

# Shuffled 5-fold splitter used for cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
type(kf)

# Scale, impute missing values with the column mean, then fit a linear
# regression.
pipeline = \
  make_pipeline(StandardScaler(),
  SimpleImputer(strategy="mean"),LinearRegression(), memory=None)

scores = \
  cross_validate(pipeline, X=X_train, y=y_train.values,
  cv=kf, scoring=['r2','neg_mean_absolute_error'], 
  n_jobs=1)

# The scorer returns the negated mean absolute error (higher is better),
# so flip the sign back before reporting it as an error magnitude.
print("Mean Absolute Error: %.2f, R-squared: %.2f" % 
  (-scores['test_neg_mean_absolute_error'].mean(),
  scores['test_r2'].mean()))

# 流水线_预处理_复杂

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from feature_engine.encoding import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate, KFold
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor

import os
import sys
sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS wage data, indexed by personid.
nls97wages = pd.read_csv("data/nls97wages.csv", low_memory=False)
nls97wages.set_index("personid", inplace=True)

# Set up the features and the target.
num_cols = ['gpascience','gpaenglish','gpamath',
  'gpaoverall','motherhighgrade','fatherhighgrade',
  'parentincome','weeksworked20']
cat_cols = ['gender']
bin_cols = ['completedba']

target = nls97wages[['wageincome20']]
features = nls97wages[num_cols + cat_cols + bin_cols]

# Create the training and testing data frames (20% held out for testing).
X_train, X_test, y_train, y_test =  \
  train_test_split(features,\
  target, test_size=0.2, random_state=0)

# Set up one pipeline per column type for the column transformer.
standtrans = make_pipeline(OutlierTrans(2),
  StandardScaler())
cattrans = \
  make_pipeline(SimpleImputer(strategy=\
  "most_frequent"),OneHotEncoder(drop_last=True))
bintrans = \
  make_pipeline(SimpleImputer(strategy=\
  "most_frequent"))

# Reuse the column lists defined above so the transformer cannot drift
# out of sync with the selected features.
coltrans = ColumnTransformer(
  transformers=[
    ("stand", standtrans, num_cols),
    ("cat", cattrans, cat_cols),
    ("bin", bintrans, bin_cols)
  ]
)

# Add KNN imputation and a linear model after the column transformations.
lr = LinearRegression()

pipe1 = make_pipeline(coltrans,
  KNNImputer(n_neighbors=5), lr)

# Standardize the target as well, wrapping the whole pipeline.
ttr=TransformedTargetRegressor(regressor=pipe1,
  transformer=StandardScaler())

# Run k-fold cross-validation.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

scores = cross_validate(ttr, X=X_train, y=y_train,
  cv=kf, scoring=('r2', 'neg_mean_absolute_error'),
  n_jobs=1)

# The scorer returns the negated mean absolute error (higher is better),
# so flip the sign back before reporting it as an error magnitude.
print("Mean Absolute Error: %.2f, R-squared: %.2f" % 
  (-scores['test_neg_mean_absolute_error'].mean(),
  scores['test_r2'].mean()))