Identifying and Fixing Missing Values

# 识别缺失值

In [None]:
import pandas as pd

# Load the data.
df = pd.read_csv('data.csv')

# Count the missing values in each column.
print(df.isnull().sum())

# Drop every row that contains at least one missing value.
df_cleaned = df.dropna()

# Fill missing values with the column mean. Only numeric columns have a
# mean; numeric_only=True keeps DataFrame.mean() from raising on text
# columns (pandas >= 2.0), so non-numeric columns are left unfilled.
df_filled = df.fillna(df.mean(numeric_only=True))

## 空值检测

In [None]:

import pandas as pd
import numpy as np

# Display settings for this cell.
pd.set_option('display.width', 70)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:,.0f}'.format)

# File readers such as read_csv() treat empty strings and the strings
# 'NaN' and 'null' as missing values by default.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")
covidtotals = pd.read_csv(
    "data/covidtotalswithmissings.csv", low_memory=False
).set_index("iso_code")

# Missing values per column, then a per-row flag for any missing value.
print(covidtotals.isnull().sum())
covidtotals.isnull().any(axis=1)

# Demographic columns: missing counts per column and per row, plus a
# sample of rows where at least four of them are missing.
covidtotals.shape
demovars = [
    'pop_density', 'aged_65_older', 'gdp_per_capita',
    'life_expectancy', 'hum_dev_ind',
]
demomissing = covidtotals[demovars].isnull()
demomissing.sum(axis=0)
demovarsmisscnt = demomissing.sum(axis=1)
demovarsmisscnt.value_counts().sort_index()
(
    covidtotals
    .loc[demovarsmisscnt >= 4, ['location'] + demovars]
    .sample(5, random_state=1)
    .T
)

# Cumulative columns: missing counts per column and per row.
totvars = ['location', 'total_cases_pm', 'total_deaths_pm']
totmissing = covidtotals[totvars].isnull()
totmissing.sum(axis=0)
totvarsmisscnt = totmissing.sum(axis=1)
totvarsmisscnt.value_counts().sort_index()

# Turn logical missing codes (values from -5 to -1) into real missing values.
# The last four columns of nls97 are copied so the replacement below works on
# an independent frame; replacing in place on a slice of nls97 triggers
# SettingWithCopyWarning.
nlsparents = nls97.iloc[:, -4:].copy()

# How often each code occurs in motherhighgrade.
nlsparents.loc[
    nlsparents.motherhighgrade.between(-5, -1), 'motherhighgrade'
].value_counts()

# Rows holding a code in at least one column, then the code count per column.
iscode = nlsparents.transform(lambda x: x.between(-5, -1))
nlsparents.loc[iscode.any(axis=1)]

iscode.sum()

# Reassign instead of inplace=True so the result does not depend on whether
# nlsparents is a view or a copy.
nlsparents = nlsparents.replace(list(range(-5, 0)), np.nan)

print(nlsparents.isnull().sum())

# 清洗缺失

In [None]:
import pandas as pd

# Display settings for this cell.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 12)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.0f}'.format)

nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

# Build the school record frame from the NLS data.
schoolrecordlist = [
    'satverbal', 'satmath', 'gpaoverall', 'gpaenglish',
    'gpamath', 'gpascience', 'highestdegree', 'highestgradecompleted',
]
schoolrecord = nls97[schoolrecordlist]
schoolrecord.shape

# Missing values per column, then the distribution of missing counts per row
# and a look at rows with seven or more missing values.
schoolmissing = schoolrecord.isnull()
schoolmissing.sum(axis=0)
misscnt = schoolmissing.sum(axis=1)
misscnt.value_counts().sort_index()
schoolrecord.loc[misscnt >= 7].head(4).T

# Keep only rows that have at least two non-missing values.
schoolrecord = schoolrecord.dropna(thresh=2)
schoolrecord.shape
schoolrecord.isnull().sum(axis=1).value_counts().sort_index()

# Assign the mean to missing gpaoverall values.
# fillna() returns a new frame that is bound back to schoolrecord; filling
# in place on a selection of nls97 raises SettingWithCopyWarning.
schoolrecord = nls97[schoolrecordlist]
print(schoolrecord.gpaoverall.isnull().sum())
print(schoolrecord.gpaoverall.agg(['mean', 'std', 'count']))

schoolrecord = schoolrecord.fillna(
    {"gpaoverall": schoolrecord.gpaoverall.mean()}
)

# After the fill: no missing values remain and the count has grown, while
# the mean is unchanged.
print(schoolrecord.gpaoverall.isnull().sum())
print(schoolrecord.gpaoverall.agg(['mean', 'std', 'count']))

# Forward fill on a private copy of the wage income column; the number of
# missing values is printed before and after.
wageincome20 = nls97['wageincome20'].copy()
print(wageincome20.isnull().sum())
wageincome20.head().T
wageincome20 = wageincome20.ffill()
wageincome20.head().T
print(wageincome20.isnull().sum())

# Backward fill, starting again from a fresh copy; the standard deviation is
# printed before and after.
wageincome20 = nls97['wageincome20'].copy()
wageincome20.head().T
print(wageincome20.std())
wageincome20 = wageincome20.bfill()
wageincome20.head().T
print(wageincome20.std())

# Fill missing weeksworked20 values with the mean of the respondent's
# highestdegree group. Rows without a highestdegree are excluded from the
# assignment, so weeksworked20imp is not set for them.

# Optional diagnostics:
# print(nls97.weeksworked20.mean())
# print(nls97.groupby(['highestdegree'])['weeksworked20'].mean())

hasdegree = nls97.highestdegree.notnull()
nls97.loc[hasdegree, 'weeksworked20imp'] = (
    nls97.loc[hasdegree]
    .groupby(['highestdegree'])['weeksworked20']
    .transform(lambda x: x.fillna(x.mean()))
)

# highestdegree is listed once; repeating the name selects the column twice.
print(nls97[['highestdegree', 'weeksworked20imp', 'weeksworked20']].head(10))
print(nls97[['weeksworked20imp', 'weeksworked20']].agg(['mean', 'std', 'count']))

In [None]:
import pandas as pd
import numpy as np

# Display settings for this cell.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 12)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

nls97 = pd.read_csv("data/nls97b.csv").set_index("personid")

# Build the school record frame from the NLS data.
schoolrecordlist = [
    'satverbal', 'satmath', 'gpaoverall', 'gpaenglish',
    'gpamath', 'gpascience', 'highestdegree', 'highestgradecompleted',
]
schoolrecord = nls97[schoolrecordlist]
schoolrecord.shape

# Missing values per column, then the distribution of missing counts per row
# and a look at rows with seven or more missing values.
schoolmissing = schoolrecord.isnull()
schoolmissing.sum(axis=0)
misscnt = schoolmissing.sum(axis=1)
misscnt.value_counts().sort_index()
schoolrecord.loc[misscnt >= 7].head(4).T

# Keep only rows that have at least two non-missing values.
schoolrecord = schoolrecord.dropna(thresh=2)
schoolrecord.shape
schoolrecord.isnull().sum(axis=1).value_counts().sort_index()

# Assign the mean to missing gpaoverall values.
# The filled frame is bound back to schoolrecord. Calling
# fillna(inplace=True) on schoolrecord.gpaoverall is a chained in-place call
# on an intermediate Series: pandas warns about it and, with copy-on-write,
# it no longer updates schoolrecord.
schoolrecord.gpaoverall.agg(['mean', 'std', 'count'])
schoolrecord = schoolrecord.fillna(
    {"gpaoverall": schoolrecord.gpaoverall.mean()}
)
schoolrecord.gpaoverall.isnull().sum()
schoolrecord.gpaoverall.agg(['mean', 'std', 'count'])

# Forward fill on a private copy of wageincome.
# Series.ffill()/bfill() replace fillna(method=...), which is deprecated in
# current pandas.
wageincome = nls97.wageincome.copy(deep=True)
wageincome.isnull().sum()
wageincome.agg(['mean', 'std', 'count'])
wageincome.head().T

wageincome = wageincome.ffill()
wageincome.head().T
wageincome.isnull().sum()
wageincome.agg(['mean', 'std', 'count'])

# Backward fill, starting again from a fresh copy.
wageincome = nls97.wageincome.copy(deep=True)
wageincome = wageincome.bfill()
wageincome.head().T
wageincome.agg(['mean', 'std', 'count'])

# Fill missings with the average by group.
nls97.weeksworked17.mean()
nls97.groupby(['highestdegree'])['weeksworked17'].mean()

# transform() returns a result indexed like its input, so it can be assigned
# straight back to the selected rows.
# NOTE(review): the earlier groupby(...).apply(...) version of this step had
# been commented out, presumably because newer pandas prepends the group key
# to the apply() result index -- confirm.
hasdegree = nls97.highestdegree.notnull()
nls97.loc[hasdegree, 'weeksworked17imp'] = (
    nls97.loc[hasdegree]
    .groupby(['highestdegree'])['weeksworked17']
    .transform(lambda x: x.fillna(x.mean()))
)

nls97[['weeksworked17imp', 'weeksworked17', 'highestdegree']].head(10)
nls97[['weeksworked17imp', 'weeksworked17']].agg(['mean', 'count'])

## 分组填充

In [None]:
#-*- encoding=utf8 -*-

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

# 分组填充：按指定列对目标数据分组，取分组中要填充属性值的平均值或中值来替换或生成中的属性值；

# 导入相应数据库客户端
import mysql.connector as mdb

# 引入并配置 pandas 显示参数
import pandas as pd
pd.set_option('display.width', 150)
pd.set_option('display.max_columns', 10)
pd.options.display.float_format = '{:,.2f}'.format

# Table that is read from and later written back to.
targettable = "name"

# Connection settings.
# NOTE(review): credentials are hard-coded placeholders; load real ones from
# the environment or a secrets store.
host = "pdccmysql"
user = "pdccuser"
password = "pdccpass"
database = "pdccschema"
connmysql = mdb.connect(host=host, database=database, user=user, password=password)

# The connection stays open until the write-back below has finished and is
# closed exactly once, even when a step fails.
try:
    # read_sql() runs a query on a DBAPI connection; read_sql_table() only
    # supports SQLAlchemy connectables.
    df = pd.read_sql(f"SELECT * FROM `{targettable}`", con=connmysql)

    # Use one or more columns of the frame as its index.
    df = df.set_index(["column1_name", 'column2_name'])

    # Group by column2 and fill the missing values of column3 with the group
    # mean (use x.median() for the median). Rows where column2 is missing
    # are left untouched. Column names are plain-ASCII placeholders.
    hasgroup = df['column2'].notnull()
    df.loc[hasgroup, 'column3'] = (
        df.loc[hasgroup]
        .groupby(['column2'])['column3']
        .transform(lambda x: x.fillna(x.mean()))
    )

    # Write the result back, replacing the table. method='multi' is not
    # passed; per the original note it needs support from the database.
    # NOTE(review): pandas documents to_sql() for SQLAlchemy connectables
    # and sqlite3 connections -- confirm it works with this connection type.
    df.to_sql(name=targettable, con=connmysql, if_exists='replace', chunksize=1000)
finally:
    connmysql.close()

## 分箱填充

In [None]:
#-*- encoding=utf8 -*-

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

# 分箱填充：根据等级或样本量化值将变量离散成大小相等的桶，取箱中属性值的平均值或中值来替换或生成“箱”中的属性值；

# 导入相应数据库客户端
import mysql.connector as mdb

# 引入并配置 pandas 显示参数
import pandas as pd
pd.set_option('display.width', 150)
pd.set_option('display.max_columns', 10)
pd.options.display.float_format = '{:,.2f}'.format

# Table that is read from and later written back to.
targettable = "name"

# Connection settings.
# NOTE(review): credentials are hard-coded placeholders; load real ones from
# the environment or a secrets store.
host = "pdccmysql"
user = "pdccuser"
password = "pdccpass"
database = "pdccschema"
connmysql = mdb.connect(host=host, database=database, user=user, password=password)

# The connection stays open until the write-back below has finished and is
# closed exactly once, even when a step fails.
try:
    # read_sql() runs a query on a DBAPI connection; read_sql_table() only
    # supports SQLAlchemy connectables.
    df = pd.read_sql(f"SELECT * FROM `{targettable}`", con=connmysql)

    # Use one or more columns of the frame as its index.
    df = df.set_index(["column1_name", 'column2_name'])

    # Bin column3 into quantile buckets and store the bucket in column3_qcut.
    # q is the requested number of buckets and should be chosen to suit the
    # data; duplicates='drop' merges buckets that share an edge.
    df['column3_qcut'] = pd.qcut(
        df['column3'], q=100, precision=0, duplicates='drop'
    )

    # Group by bucket and fill the missing values of column4 with the bucket
    # median. Rows without a bucket are left untouched.
    hasbin = df['column3_qcut'].notnull()
    df.loc[hasbin, 'column4'] = (
        df.loc[hasbin]
        .groupby(['column3_qcut'])['column4']
        .transform(lambda x: x.fillna(x.median()))
    )

    # Write the result back, replacing the table. method='multi' is not
    # passed; per the original note it needs support from the database.
    # NOTE(review): pandas documents to_sql() for SQLAlchemy connectables
    # and sqlite3 connections -- confirm it works with this connection type.
    df.to_sql(name=targettable, con=connmysql, if_exists='replace', chunksize=1000)
finally:
    connmysql.close()

# 回归计算

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

pd.set_option('display.width', 74)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
nls97.set_index("personid", inplace=True)

# Check correlations with wage income.
nls97[['wageincome20', 'highestdegree', 'weeksworked20', 'parentincome']].info()

# hdegnum: the first character of highestdegree converted to a float.
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('float')
nls97.groupby(['highestdegree', 'hdegnum']).size()

# Turn the codes -5..-1 in parentincome into real missing values. The result
# is assigned back to the column: replace(inplace=True) on nls97.parentincome
# is a chained in-place call that pandas warns about and that does not
# update nls97 under copy-on-write.
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)
nls97[['wageincome20', 'hdegnum', 'weeksworked20', 'parentincome']].corr()

# Check whether respondents with missing wage income differ from the rest.
# nls97.wageincome20.describe()
# nls97.loc[nls97.weeksworked20==0,'wageincome20'] = 0

# Work on an explicit copy of the rows with weeksworked20 > 0, so the new
# columns below are added to an independent frame and not to a slice of
# nls97 (which raises SettingWithCopyWarning).
nls97weeksworked = nls97.loc[nls97.weeksworked20 > 0].copy()
nls97weeksworked.shape
nls97weeksworked['missingwageincome'] = np.where(
    nls97weeksworked.wageincome20.isnull(), 1, 0
)
nls97weeksworked.groupby(['missingwageincome'])[
    ['hdegnum', 'parentincome', 'weeksworked20']
].agg(['mean', 'count'])

# Prepare the data for the regression.
# nls97.weeksworked20.fillna(nls97.weeksworked20.mean(), inplace=True)

# Mean-fill parentincome by assigning the filled column back; the chained
# fillna(inplace=True) form does not update the frame under copy-on-write.
nls97weeksworked['parentincome'] = nls97weeksworked.parentincome.fillna(
    nls97weeksworked.parentincome.mean()
)

# 0/1 indicator columns derived from hdegnum: <= 2, 3 to 4, and > 4.
nls97weeksworked['degltcol'] = np.where(nls97weeksworked.hdegnum <= 2, 1, 0)
nls97weeksworked['degcol'] = np.where(
    nls97weeksworked.hdegnum.between(3, 4), 1, 0
)
nls97weeksworked['degadv'] = np.where(nls97weeksworked.hdegnum > 4, 1, 0)

def getlm(df, ycolname, xcolnames):
  """Fit an OLS regression of ``ycolname`` on ``xcolnames``.

  Rows with a missing value in any of the selected columns are dropped
  before fitting, and ``sm.add_constant`` is applied to the predictors.

  :param df: frame holding the outcome and predictor columns.
  :param ycolname: name of the outcome column.
  :param xcolnames: list of predictor column names.
  :return: ``(coefficients, lm)``; ``coefficients`` is a DataFrame with the
    columns 'features', 'params' and 'pvalues' whose first row is labelled
    'constant', and ``lm`` is the fitted results object.
  """
  complete = df[[ycolname] + xcolnames].dropna()
  design = sm.add_constant(complete[xcolnames])
  lm = sm.OLS(complete[ycolname], design).fit()
  labels = ['constant'] + xcolnames
  rows = list(zip(labels, lm.params, lm.pvalues))
  coefficients = pd.DataFrame(rows, columns=['features', 'params', 'pvalues'])
  return coefficients, lm

# Fit the model on the respondents in nls97weeksworked.
xvars = ['weeksworked20', 'parentincome', 'degcol', 'degadv']
coefficients, lm = getlm(nls97weeksworked, 'wageincome20', xvars)
coefficients

nls97weeksworked.dtypes

# Generate predictions for every row and attach them as the column 'pred'.
design = sm.add_constant(nls97weeksworked[xvars])
pred = lm.predict(design).to_frame().rename(columns={0: 'pred'})
nls97weeksworked = nls97weeksworked.join(pred)

# wageincomeimp: the observed wage income where present, else the prediction.
wagemissing = nls97weeksworked.wageincome20.isnull()
nls97weeksworked['wageincomeimp'] = np.where(
    wagemissing, nls97weeksworked.pred, nls97weeksworked.wageincome20
)
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97weeksworked[['wageincomeimp', 'wageincome20'] + xvars].sample(
    10, random_state=7
)
nls97weeksworked[['wageincomeimp', 'wageincome20']].agg(
    ['count', 'mean', 'std']
)

# Add an error term: normal noise with the standard deviation of the model
# residuals, drawn with a fixed seed.
np.random.seed(0)
randomadd = np.random.normal(0, lm.resid.std(), nls97weeksworked.shape[0])
randomadddf = pd.DataFrame(
    randomadd, columns=['randomadd'], index=nls97weeksworked.index
)
nls97weeksworked = nls97weeksworked.join(randomadddf)
nls97weeksworked['stochasticpred'] = (
    nls97weeksworked.pred + nls97weeksworked.randomadd
)

# wageincomeimpstoc: observed wage income where present, else the noisy
# prediction.
nls97weeksworked['wageincomeimpstoc'] = np.where(
    nls97weeksworked.wageincome20.isnull(),
    nls97weeksworked.stochasticpred,
    nls97weeksworked.wageincome20,
)

nls97weeksworked[['wageincomeimpstoc', 'wageincome20']].agg(
    ['count', 'mean', 'std']
)

# To discard the helper columns afterwards:
# nls97weeksworked = nls97weeksworked.drop(columns=['randomadd','stochasticpred','wageincomeimpstoc'], axis=1)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format
nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

# check correlations with wageincome
nls97[['wageincome', 'highestdegree', 'weeksworked16', 'parentincome']].info()
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('float')
nls97.groupby(['highestdegree', 'hdegnum']).size()

# The replace/fillna results below are assigned back to their columns.
# Calling them with inplace=True on nls97.<column> is a chained in-place
# call: pandas warns about it and, with copy-on-write, nls97 is not updated.
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)
nls97[['wageincome', 'hdegnum', 'weeksworked16', 'parentincome']].corr()

# check to see if folks with missing wage income data are different
nls97['missingwageincome'] = np.where(nls97.wageincome.isnull(), 1, 0)
nls97.groupby(['missingwageincome'])[
    ['hdegnum', 'parentincome', 'weeksworked16']
].agg(['mean', 'count'])

# prepare data to run regression: mean-fill the two numeric predictors and
# derive 0/1 indicators from hdegnum (<= 2, 3 to 4, > 4)
nls97['weeksworked16'] = nls97.weeksworked16.fillna(nls97.weeksworked16.mean())
nls97['parentincome'] = nls97.parentincome.fillna(nls97.parentincome.mean())
nls97['degltcol'] = np.where(nls97.hdegnum <= 2, 1, 0)
nls97['degcol'] = np.where(nls97.hdegnum.between(3, 4), 1, 0)
nls97['degadv'] = np.where(nls97.hdegnum > 4, 1, 0)

def getlm(df, ycolname, xcolnames):
  """Fit an OLS regression of ``ycolname`` on ``xcolnames``.

  Rows with a missing value in any of the selected columns are dropped
  before fitting, and ``sm.add_constant`` is applied to the predictors.

  :param df: frame holding the outcome and predictor columns.
  :param ycolname: name of the outcome column.
  :param xcolnames: list of predictor column names.
  :return: ``(coefficients, lm)``; ``coefficients`` is a DataFrame with the
    columns 'features', 'params' and 'pvalues' whose first row is labelled
    'constant', and ``lm`` is the fitted results object.
  """
  complete = df[[ycolname] + xcolnames].dropna()
  design = sm.add_constant(complete[xcolnames])
  lm = sm.OLS(complete[ycolname], design).fit()
  labels = ['constant'] + xcolnames
  rows = list(zip(labels, lm.params, lm.pvalues))
  coefficients = pd.DataFrame(rows, columns=['features', 'params', 'pvalues'])
  return coefficients, lm

# Fit the model on nls97.
xvars = ['weeksworked16', 'parentincome', 'degcol', 'degadv']
coefficients, lm = getlm(nls97, 'wageincome', xvars)
coefficients

# generate predictions and attach them as the column 'pred'
design = sm.add_constant(nls97[xvars])
pred = lm.predict(design).to_frame().rename(columns={0: 'pred'})
nls97 = nls97.join(pred, how="left", on=None, validate="many_to_many")

# wageincomeimp: the observed wage income where present, else the prediction
wagemissing = nls97.wageincome.isnull()
nls97['wageincomeimp'] = np.where(wagemissing, nls97.pred, nls97.wageincome)
pd.set_option('display.float_format', '{:,.0f}'.format)
nls97[['wageincomeimp', 'wageincome'] + xvars].head(10)
nls97[['wageincomeimp', 'wageincome']].agg(['count', 'mean', 'std'])

# add an error term: normal noise with the standard deviation of the model
# residuals. The generator is seeded so the draws, and therefore the
# stochastic imputations, are the same on every run.
np.random.seed(0)
randomadd = np.random.normal(0, lm.resid.std(), nls97.shape[0])
randomadddf = pd.DataFrame(randomadd, columns=['randomadd'], index=nls97.index)
nls97 = nls97.join(randomadddf, how="left", on=None, validate="many_to_many")
nls97['stochasticpred'] = nls97.pred + nls97.randomadd

# wageincomeimpstoc: observed wage income where present, else the noisy
# prediction
nls97['wageincomeimpstoc'] = np.where(
    nls97.wageincome.isnull(), nls97.stochasticpred, nls97.wageincome
)

# 缺失值插补-KNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Display settings for this cell.
pd.set_option('display.width', 74)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.0f}'.format)

nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

# Prepare the NLS data: a numeric degree code taken from the first character
# of highestdegree, and parentincome with the codes -5..-1 set to missing.
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('float')
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)

wagedatalist = ['wageincome20', 'weeksworked20', 'parentincome', 'hdegnum']
wagedata = nls97.loc[nls97.weeksworked20 > 0, wagedatalist]
wagedata.shape

# Initialise the KNN imputer and fill the values; the result is wrapped in a
# frame that reuses wagedata's index and carries the *imp column names.
impKNN = KNNImputer(n_neighbors=38)
newvalues = impKNN.fit_transform(wagedata)
wagedatalistimp = [
    'wageincomeimp', 'weeksworked20imp', 'parentincomeimp', 'hdegnumimp',
]
wagedataimp = pd.DataFrame(
    newvalues, columns=wagedatalistimp, index=wagedata.index
)

# View the imputed values next to the originals.
wagedata = wagedata.join(wagedataimp[['wageincomeimp', 'weeksworked20imp']])
comparecols = [
    'wageincome20', 'wageincomeimp', 'weeksworked20', 'weeksworked20imp',
]
wagedata[comparecols].sample(10, random_state=7)

wagedata[['wageincome20', 'wageincomeimp']].agg(['count', 'mean', 'std'])

wagedata[comparecols].sample(10, random_state=1)


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.0f}'.format
nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

# prepare the NLS data: numeric degree code plus 0/1 indicators derived from
# it (<= 2, 3 to 4, > 4)
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('float')
nls97['degltcol'] = np.where(nls97.hdegnum <= 2, 1, 0)
nls97['degcol'] = np.where(nls97.hdegnum.between(3, 4), 1, 0)
nls97['degadv'] = np.where(nls97.hdegnum > 4, 1, 0)

# Set the codes -5..-1 in parentincome to missing. The result is assigned
# back to the column: replace(inplace=True) on nls97.parentincome is a
# chained in-place call that does not update nls97 under copy-on-write.
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)

wagedatalist = ['wageincome', 'weeksworked16',
   'parentincome', 'degltcol', 'degcol', 'degadv']
wagedata = nls97[wagedatalist]

# Initialise the KNN imputer and fill the values; the result is wrapped in a
# frame that reuses wagedata's index.
impKNN = KNNImputer(n_neighbors=47)
newvalues = impKNN.fit_transform(wagedata)
wagedatalistimp = [
    'wageincomeimp', 'weeksworked16imp', 'parentincomeimp',
    'degltcol', 'degcol', 'degadv',
]
wagedataimp = pd.DataFrame(
    newvalues, columns=wagedatalistimp, index=wagedata.index
)

# View the imputed values next to the originals.
wagedata = wagedata.join(wagedataimp[['wageincomeimp', 'weeksworked16imp']])
viewcols = [
    'wageincome', 'weeksworked16', 'parentincome',
    'degcol', 'degadv', 'wageincomeimp',
]
wagedata[viewcols].head(10)

wagedata[['wageincome', 'wageincomeimp']].agg(['count', 'mean', 'std'])

# 隐性缺失数据

利用FuzzyWuzzy，我们可以尝试识别那些由于打字错误或不一致格式而导致的“隐性”缺失数据。

例如，假设我们有一个客户信息的DataFrame，其中一个重要的字段是电话号码。电话号码由于格式不一或某些数字的缺失而可能被错误地标记为缺失。我们可以使用FuzzyWuzzy对这些疑似缺失的数据进行模糊匹配，尝试找到最接近的完整电话号码。

在这段代码中，我们首先创建了一个包含疑似缺失电话号码的DataFrame。然后定义了一个函数find_similar_phone_numbers，该函数将遍历DataFrame中的电话号码列，并与给定的电话号码进行比较。如果找到相似度超过阈值的电话号码，它们将被添加到结果列表中。

通过这种方式，我们可以有效地识别并处理那些看似缺失但实际上只是格式不一致或不完整的数据，从而提高数据集的准确性和完整性。

In [None]:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

# 字符串相似度-编辑距离：指定列数据(String)，识别相似度高的重复项并删除

# 导入相应数据库客户端
import mysql.connector as mdb
import pandas as pd
from fuzzywuzzy import process

# Table to read from.
targettable = "name"

# Connection settings.
# NOTE(review): credentials are hard-coded placeholders; load real ones from
# the environment or a secrets store.
host = "pdccmysql"
user = "pdccuser"
password = "pdccpass"
database = "pdccschema"
connmysql = mdb.connect(host=host, database=database, user=user, password=password)
try:
    # read_sql() runs a query on a DBAPI connection; read_sql_table() only
    # supports SQLAlchemy connectables.
    df = pd.read_sql(f"SELECT * FROM `{targettable}`", con=connmysql)
finally:
    connmysql.close()

# Use one or more columns of the frame as its index.
df.set_index(["column1_name"], inplace=True)

# Demo data: a frame with suspect phone numbers. It replaces the frame
# loaded from the database above.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Carol', 'Dave'],
    'Phone': ['(123) 456-7890', '(123) 456-780', None, '(123) 456-0000']
})

def find_similar_phone_numbers(dataframe, column_name, phone_number, threshold=80):
    """
    Find the phone numbers in a column that are similar to a given number.

    Every non-missing value of the column is scored once against
    ``phone_number`` with ``process.extract``; the matches whose score is
    strictly greater than ``threshold`` are returned, best score first.

    :param dataframe: DataFrame to search.
    :param column_name: name of the column holding the phone numbers.
    :param phone_number: the phone number to compare against.
    :param threshold: similarity threshold, 80 by default.
    :return: list of match tuples as produced by ``process.extract``; the
        matched value is element 0 and its score is element 1. The list is
        empty when nothing exceeds the threshold.
    """
    # Drop missing entries first so that None/NaN is never handed to the
    # matcher as a candidate.
    candidates = dataframe[column_name].dropna()
    # limit=None scores every candidate instead of keeping only the top few.
    scored = process.extract(phone_number, candidates, limit=None)
    return [match for match in scored if match[1] > threshold]

# Suppose one value in the phone column is suspected of having been wrongly
# recorded as missing.
suspected_missing_number = '1398888888'

# Look for similar phone numbers. The demo frame stores them in the 'Phone'
# column.
similar_numbers = find_similar_phone_numbers(df, 'Phone', suspected_missing_number)

# 缺失值插补-MissForest（随机森林）

macOS 需要先安装 OpenMP 运行库：`brew install libomp`

In [None]:
import pandas as pd
import numpy as np
from missforest.missforest import MissForest
import miceforest as mf
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

# Clean the NLS wage data: a numeric degree code taken from the first
# character of highestdegree, and parentincome with the codes -5..-1 set to
# missing.
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('float')
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)

# Load wage income and the related columns for rows with weeksworked20 > 0.
wagedatalist = ['wageincome20', 'weeksworked20', 'parentincome', 'hdegnum']
wagedata = nls97.loc[nls97.weeksworked20 > 0, wagedatalist]

# Impute with MissForest and give the imputed columns *imp names.
imputer = MissForest()
wagedataimp = imputer.fit_transform(wagedata)
# wagedatalistimp is defined here but not referenced again in this cell.
wagedatalistimp = ['wageincomeimp', 'weeksworked20imp', 'parentincomeimp']
wagedataimp = wagedataimp.rename(columns={
    'wageincome20': 'wageincome20imp',
    'weeksworked20': 'weeksworked20imp',
    'parentincome': 'parentincomeimp',
})

# View the imputed values next to the originals.
wagedata = wagedata.join(wagedataimp[['wageincome20imp', 'weeksworked20imp']])
comparecols = [
    'wageincome20', 'wageincome20imp', 'weeksworked20', 'weeksworked20imp',
]
wagedata[comparecols].sample(10, random_state=7)

wagedata[comparecols].agg(['count', 'mean', 'std'])

# Run multiple imputation (miceforest) instead.
# The kernel is given the data with a fresh 0..n-1 index; reset_index()
# turns the personid index into an ordinary column.
kernel = mf.ImputationKernel(
  data=wagedata[wagedatalist].reset_index(),
  save_all_iterations_data=True,
  random_state=1,
  num_datasets=3,
  mean_match_candidates=5
)
kernel.mice(3, verbose=True)

# Restore personid as the index of the completed data. Without this the join
# below would pair wagedata's personid labels with positional row numbers
# and attach the imputed values to the wrong rows, or to none.
wagedataimpmice = kernel.complete_data().set_index('personid')

wagedataimpmice = wagedataimpmice.rename(columns={
  'wageincome20': 'wageincome20impmice',
  'weeksworked20': 'weeksworked20impmice',
  'parentincome': 'parentincomeimpmice',
})

wagedata = wagedata[wagedatalist].join(
  wagedataimpmice[['wageincome20impmice', 'weeksworked20impmice']]
)

wagedata[['wageincome20', 'wageincome20impmice',
  'weeksworked20', 'weeksworked20impmice']].agg(['count', 'mean', 'std'])

In [None]:
# import pandas and scikit learn's KNNImputer module
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.1f}'.format
import os
import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
# Read from the relative data/ directory, as the other cells that load
# nls97b.csv do; the absolute path "/data/nls97b.csv" pointed at the
# filesystem root.
nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

# clean the NLS wage data
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('float')
# Assign the replaced column back: replace(inplace=True) on
# nls97.parentincome is a chained in-place call that does not update nls97
# under copy-on-write.
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)
nls97['degltcol'] = np.where(nls97.hdegnum <= 2, 1, 0)
nls97['degcol'] = np.where(nls97.hdegnum.between(3, 4), 1, 0)
nls97['degadv'] = np.where(nls97.hdegnum > 4, 1, 0)

# load the wage income and associated data
wagedatalist = ['wageincome', 'weeksworked16', 'parentincome',
  'degltcol', 'degcol', 'degadv']
wagedata = nls97[wagedatalist]

# Impute with MissForest; the result is wrapped in a frame that reuses
# wagedata's index.
imputer = MissForest()
newvalues = imputer.fit_transform(wagedata)
wagedatalistimp = [
    'wageincomeimp', 'weeksworked16imp', 'parentincomeimp',
    'degltcol', 'degcol', 'degadv',
]
wagedataimp = pd.DataFrame(
    newvalues, columns=wagedatalistimp, index=wagedata.index
)

# View the imputed values next to the originals.
wagedataimp
wagedata = wagedata.join(wagedataimp[['wageincomeimp', 'weeksworked16imp']])
viewcols = [
    'wageincome', 'weeksworked16', 'parentincome',
    'degcol', 'degadv', 'wageincomeimp',
]
wagedata[viewcols].head(10)

wagedata[
    ['wageincome', 'wageincomeimp', 'weeksworked16', 'weeksworked16imp']
].agg(['count', 'mean', 'std'])

# 缺失值插补-AI

In [None]:
import pandas as pd
import numpy as np
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
# LLM client handed to pandasai; the token string is a placeholder.
llm = OpenAI(api_token="Your API Key")

# Display settings for this cell.
pd.set_option('display.width', 72)
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 220)
pd.set_option('display.float_format', '{:,.0f}'.format)

# Load the data frame.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

# Set up the degree and parent income variables: hdegnum is the first
# character of highestdegree stored as a category, and the codes -5..-1 in
# parentincome become missing values.
nls97['hdegnum'] = nls97.highestdegree.str[0:1].astype('category')
nls97['parentincome'] = nls97.parentincome.replace(list(range(-5, 0)), np.nan)

wagedatalist = ['wageincome20', 'weeksworked20', 'parentincome', 'hdegnum']
wagedata = nls97[wagedatalist]

# Wrap the wage data in a SmartDataframe bound to the LLM.
wagedatasdf = SmartDataframe(wagedata, config={"llm": llm})

# Show summary statistics.
wagedatasdf.chat("Show the counts, means, and standard deviations as table")

# Impute missing values based on the average.
# wagedatasdf is rebound to whatever chat() returns, and the calls below are
# made on that returned object.
# NOTE(review): presumably chat() returns a SmartDataframe here -- confirm.
wagedatasdf = \
  wagedatasdf.chat("Impute missing values based on average.")
  
wagedatasdf.chat("Show the counts, means, and standard deviations as table")

# Frequency of hdegnum (missing values included) before and after imputing
# with the most frequent value.
wagedatasdf.hdegnum.value_counts(dropna=False).sort_index()
wagedatasdf = \
  wagedatasdf.chat("Impute missings based on most frequent value")
wagedatasdf.hdegnum.value_counts(dropna=False).sort_index()

# Use the impute_missing_values() method instead, starting from a fresh
# SmartDataframe built on the original wagedata.
wagedatasdf = SmartDataframe(wagedata, config={"llm": llm})

wagedatasdf = \
  wagedatasdf.impute_missing_values()
wagedatasdf.chat("Show the counts, means, and standard deviations as table")

# Impute with KNN, again starting from a fresh SmartDataframe.
wagedatasdf = SmartDataframe(wagedata, config={"llm": llm})
wagedatasdf = wagedatasdf.chat("Impute missings for float variables based on knn with 47 neighbors")
wagedatasdf.chat("Show the counts, means, and standard deviations as table")