Outliers Multivariate

# 单变量离群值

Univariate Outliers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as scistat
# Console display options for this section.
pd.set_option('display.width', 62)
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format

# Load the COVID totals and key the rows by ISO country code.
covidtotals = pd.read_csv("data/covidtotals.csv").set_index("iso_code")

# Column groups: cumulative totals and demographics.
totvars = [
  'location',
  'total_cases',
  'total_deaths',
  'total_cases_pm',
  'total_deaths_pm',
]
demovars = [
  'population',
  'pop_density',
  'median_age',
  'gdp_per_capita',
  'hosp_beds',
  'hum_dev_ind',
]

covidtotals.info()

# Descriptive statistics for the cumulative columns: summary, deciles,
# skewness and kurtosis.
covidtotalsonly = covidtotals[totvars]
covidtotalsonly.describe()
pd.options.display.float_format = '{:,.1f}'.format
covidtotalsonly.quantile(
  np.arange(0.0, 1.1, 0.1), numeric_only=True)
covidtotalsonly.skew(numeric_only=True)
covidtotalsonly.kurtosis(numeric_only=True)

# 正态性检验
def testnorm(var, df):
  stat, p = scistat.shapiro(df[var])
  return p

# Print the Shapiro-Wilk p-value of each cumulative measure.
for label, colname in [
    ("total cases", "total_cases"),
    ("total deaths", "total_deaths"),
    ("total cases pm", "total_cases_pm"),
    ("total deaths pm", "total_deaths_pm")]:
  print(f"{label}: {testnorm(colname, covidtotalsonly):.5f}")

# QQ plots of total cases and of total cases per million
sortedcases = covidtotalsonly[['total_cases']].sort_values(['total_cases'])
sm.qqplot(sortedcases, line='s')
plt.title("QQ Plot of Total Cases")

sortedcasespm = covidtotals[['total_cases_pm']].sort_values(['total_cases_pm'])
sm.qqplot(sortedcasespm, line='s')
plt.title("QQ Plot of Total Cases Per Million")
plt.show()

# Outlier thresholds for total cases: 1.5 * IQR beyond the quartiles
firstq = covidtotalsonly.total_cases.quantile(0.25)
thirdq = covidtotalsonly.total_cases.quantile(0.75)
# Despite its name this holds 1.5 times the interquartile range.
interquartilerange = 1.5*(thirdq-firstq)
outlierlow = firstq-interquartilerange
outlierhigh = interquartilerange+thirdq
print(outlierlow, outlierhigh, sep=" <--> ")

# Build a table of outliers (saved to Excel/CSV further below)
def getoutliers():
  """Collect the rows of ``covidtotals`` that are outliers on any total.

  For every column of ``covidtotalsonly`` after the first one, a row is an
  outlier when its value lies more than 1.5 * IQR above the third quartile
  or below the first quartile. A row that is an outlier on several columns
  appears once per column.

  Returns:
    DataFrame with the columns of ``covidtotals`` plus ``varname`` (the
    column the row is an outlier on), ``threshlow`` and ``threshhigh``.
  """
  pieces = []
  for col in covidtotalsonly.columns[1:]:
    firstq = covidtotalsonly[col].quantile(0.25)
    thirdq = covidtotalsonly[col].quantile(0.75)
    # Holds 1.5 times the interquartile range, i.e. the fence distance.
    interquartilerange = 1.5*(thirdq-firstq)
    outlierhigh = interquartilerange+thirdq
    outlierlow = firstq-interquartilerange
    isoutlier = (covidtotals[col] > outlierhigh) | \
      (covidtotals[col] < outlierlow)
    pieces.append(covidtotals.loc[isoutlier].assign(
      varname=col,
      threshlow=outlierlow,
      threshhigh=outlierhigh))
  if not pieces:
    # Nothing to scan: return an empty frame with the expected columns.
    return pd.DataFrame(columns=list(covidtotals.columns) +
      ['varname', 'threshlow', 'threshhigh'])
  # Concatenate once; appending to an initially empty frame in the loop
  # relies on deprecated pandas behaviour and copies the data repeatedly.
  return pd.concat(pieces)

# Build the outlier table, count outliers per variable and export it.
outliers = getoutliers()
outliers.varname.value_counts()
outliers.to_excel("views/outlierscases.xlsx")
outliers.to_csv("views/outlierscases.csv")

# Take a closer look at the outliers for deaths per million
(
  outliers.loc[
    outliers.varname=="total_deaths_pm",
    ['location','total_deaths_pm','total_cases_pm',
     'median_age','hum_dev_ind']]
  .sort_values(['total_deaths_pm'], ascending=False)
)

# Quartiles of the same measures over all countries, for comparison.
(
  covidtotals[['total_deaths_pm','median_age','hum_dev_ind']]
  .quantile([0.25,0.5,0.75])
)

def _showcasehist(values, title):
  """Draw and show a 7-bin histogram of ``values`` titled ``title``."""
  plt.hist(values, bins=7)
  plt.title(title)
  plt.xlabel('Cases')
  plt.ylabel("Number of Countries")
  plt.show()

# Histogram of total cases, in thousands
_showcasehist(covidtotalsonly['total_cases']/1000,
  "Total Covid Cases (thousands)")

# Log-transform the covid data: every column after the first gets log1p
covidlogs = covidtotalsonly.copy()
for logcol in covidlogs.columns[1:]:
  covidlogs[logcol] = np.log1p(covidlogs[logcol])

_showcasehist(covidlogs['total_cases'], "Total Covid Cases (log)")

# 二维变量离群值

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Console display options for this section.
pd.set_option('display.width', 65)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 20)
pd.options.display.float_format = '{:,.2f}'.format

# Load the COVID totals and key the rows by ISO country code.
covidtotals = pd.read_csv("data/covidtotals.csv").set_index("iso_code")

# Pearson correlation matrix of the numeric cumulative and demographic
# columns
covidtotals.corr(numeric_only=True, method="pearson")

# Check whether some countries have unexpectedly low or high deaths given
# their number of cases.

# Binning: discretize a variable into equal-sized buckets based on rank or
# sample quantiles. Here total cases are cut into quintiles.
covidtotals['total_cases_q'] = pd.qcut(
  covidtotals['total_cases'],
  q=5, precision=0,
  labels=['very low','low','medium','high','very high'])

# Fill missing hospital-bed values with the mean of the country's case
# quintile. The same column ('hosp_beds') is read, filled and printed; rows
# without a quintile (missing total_cases) are left untouched.
hasquintile = covidtotals.total_cases_q.notnull()
covidtotals.loc[hasquintile, 'hosp_beds'] = (
  covidtotals.loc[hasquintile]
  .groupby('total_cases_q', observed=False)['hosp_beds']
  .transform(lambda x: x.fillna(x.mean())))

print(covidtotals[['location','hosp_beds']].head(10))

# Quintiles of total deaths, using the same labels.
covidtotals['total_deaths_q'] = pd.qcut(
  covidtotals['total_deaths'],
  q=5, precision=0,
  labels=['very low','low','medium','high','very high'])

# Cross-tabulate case quintile against death quintile.
pd.crosstab(covidtotals.total_cases_q,
  covidtotals.total_deaths_q)

# Countries with high cases but low deaths.
covidtotals.loc[
  (covidtotals.total_cases_q=="high") &
  (covidtotals.total_deaths_q=="low")].T

# Scatter plot with regression line: total cases (thousands) vs total deaths
ax = sns.regplot(
  x=covidtotals.total_cases/1000,
  y=covidtotals.total_deaths)
ax.set(
  xlabel="Cases (thousands)",
  ylabel="Deaths",
  title="Total Covid Cases and Deaths by Country")
plt.show()

# Countries far above the line: under 40M cases but over 400K deaths.
covidtotals.loc[
  (covidtotals.total_cases<40000000) &
  (covidtotals.total_deaths>400000)].T
# Countries far below the line: over 30M cases but under 100K deaths.
covidtotals.loc[
  (covidtotals.total_cases>30000000) &
  (covidtotals.total_deaths<100000)].T

# Scatter plot with regression line: cases per million vs deaths per million.
# The per-million columns are named total_cases_pm / total_deaths_pm
# everywhere else in this file, so the same names are used here.
ax = sns.regplot(x="total_cases_pm", y="total_deaths_pm", data=covidtotals)
ax.set(xlabel="Cases Per Million", ylabel="Deaths Per Million", title="Total Covid Cases per Million and Deaths per Million by Country")
plt.show()

In [None]:
#-*- encoding=utf8 -*-

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

# 分箱填充：根据等级或样本量化值将变量离散成大小相等的桶，取箱中属性值的平均值或中值来替换或生成“箱”中的属性值；

# 导入相应数据库客户端
import mysql.connector as mdb

# 引入并配置 pandas 显示参数
import pandas as pd
pd.set_option('display.width', 150)
pd.set_option('display.max_columns', 10)
pd.options.display.float_format = '{:,.2f}'.format

# Name of the table that is read and, after filling, written back.
targettable = "name"

# Parameterized INSERT for the wash log table (not executed in this script).
insertsql = 'insert into t_ds_wash_log(wash_type,wash_result,wash_state,process_definition_id,\
    process_instance_id,task_instance_id,create_time)values (%s,%s,%s,%s,%s,%s,%s)'

# Connect with the mysql API and load the data with read_sql()
host = "pdccmysql"
user = "pdccuser"
password = "pdccpass"
database = "pdccschema"
connmysql = mdb.connect(host=host,database=database,user=user,password=password)
try:
  # read_sql_table() only accepts SQLAlchemy connectables; with a plain
  # DBAPI connection the table has to be read through read_sql() and an
  # explicit query.
  df = pd.read_sql("SELECT * FROM " + targettable, con=connmysql)    # build the DataFrame

  # Use one or more columns of the DataFrame as the index
  df.set_index(["column1_name",'column2_name'], inplace=True)

  # Bin the chosen column and store the bins in column3_qcut; the number of
  # bins q has to be chosen to suit the actual data
  df['column3_qcut'] = pd.qcut(
    df['column3'],
    q=100, precision=0, duplicates='drop')

  # Within each bin, fill missing column4 values with the bin's median.
  # Rows that did not get a bin are left untouched.
  hasbin = df.column3_qcut.notnull()
  df.loc[hasbin, 'column4'] = (
    df.loc[hasbin]
    .groupby('column3_qcut', observed=False)['column4']
    .transform(lambda x: x.fillna(x.median())))

  # Write back to the database. The connection is kept open until here; it
  # used to be closed right after the read.
  # NOTE(review): pandas documents to_sql() for SQLAlchemy connectables (and
  # sqlite3 connections) only, so this call presumably needs a SQLAlchemy
  # engine to work against MySQL — confirm. method='multi' would additionally
  # need database support.
  df.to_sql(name=targettable,con=connmysql,if_exists='replace',chunksize=1000)
finally:
  # Close exactly once, whether or not the steps above succeeded.
  connmysql.close()


# 条件选择

In [None]:
import pandas as pd
# Console display options for this section.
pd.set_option('display.width', 55)
pd.set_option('display.max_columns', 5)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data and key the rows by person id.
nls97 = pd.read_csv(
  "data/nls97f.csv", low_memory=False).set_index("personid")

# Look at some of the NLS data
nls97[[
  'wageincome20',
  'highestgradecompleted',
  'highestdegree',
]].head(3).T

nls97.dtypes

# Weeks-worked and college-enrollment column ranges, transposed for reading.
nls97.loc[:, "weeksworked18":"weeksworked22"].head(3).T
nls97.loc[:, "colenroct15":"colenrfeb22"].head(2).T

# People with wage income but zero weeks worked
paidnowork = (nls97.weeksworked20==0) & (nls97.wageincome20>0)
nls97.loc[paidnowork, ['weeksworked20','wageincome20']]

# Enrollment codes are compared on their first character: '3' and '4' are
# the two codes tested below.
colenr = nls97.filter(like="colenr")
code3 = colenr.apply(lambda col: col.str.slice(0, 1).eq('3'))
code4 = colenr.apply(lambda col: col.str.slice(0, 1).eq('4'))

# Check whether a person was ever enrolled in a four-year college
code3.head(2).T
ever3 = code3.any(axis=1)
ever3.head(2)

# People with a graduate enrollment but no bachelor enrollment
ever4 = code4.any(axis=1)
nobach = nls97.loc[ever4 & ~ever3, "colenrfeb97":"colenrfeb22"]
len(nobach)
nobach.head(2).T

# People with a bachelor's degree or higher but no four-year enrollment
nls97.highestdegree.value_counts().sort_index()
bachorhigher = nls97.highestdegree.str.slice(0, 1).isin(['4','5','6','7'])
no4yearenrollment = nls97.loc[
  bachorhigher & ~ever3, "colenrfeb97":"colenrfeb22"]
len(no4yearenrollment)
no4yearenrollment.head(2).T

# People whose wage income is more than three standard deviations above
# the mean
wagecutoff = nls97.wageincome20.mean() + (nls97.wageincome20.std()*3)
highwages = nls97.loc[nls97.wageincome20 > wagecutoff, ['wageincome20']]
highwages

# People whose most recent year of weeks worked changed a lot: the mean of
# the 2016-2020 columns is not between half and double the 2021 value
# (rows with a missing 2021 value are excluded)
priormean = nls97.loc[:, "weeksworked16":"weeksworked20"].mean(axis=1)
nearrecent = priormean.between(
  nls97.weeksworked21*0.5, nls97.weeksworked21*2)
workchanges = nls97.loc[
  ~nearrecent & nls97.weeksworked21.notnull(),
  "weeksworked16":"weeksworked21"]
len(workchanges)
workchanges.head(6).T

# Highest grade completed versus highest degree, for people below grade 12
belowgrade12 = nls97.highestgradecompleted<12
ltgrade12 = nls97.loc[
  belowgrade12, ['highestgradecompleted','highestdegree']]
pd.crosstab(ltgrade12.highestgradecompleted, ltgrade12.highestdegree)



# 回归影响

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Console display options for this section.
pd.set_option('display.width', 85)
pd.options.display.float_format = '{:,.2f}'.format

# Load the COVID totals and key the rows by ISO country code.
covidtotals = pd.read_csv("data/covidtotals.csv").set_index("iso_code")

# Build the analysis file: the target plus the predictors, complete rows only
xvars = [
  'pop_density',
  'median_age',
  'gdp_per_capita',
]
covidanalysis = covidtotals[['total_cases_pm'] + xvars].dropna()

covidanalysis.describe()

# Fit a linear regression model
def getlm(df, yvar='total_cases_pm',
    xvars=('pop_density','median_age','gdp_per_capita')):
  """Fit an OLS regression of ``yvar`` on ``xvars`` plus a constant.

  The defaults reproduce the original hard-coded model, so ``getlm(df)``
  behaves as before; other targets or predictors can now be passed in.

  Args:
    df: DataFrame holding the target and predictor columns.
    yvar: name of the dependent-variable column.
    xvars: names of the predictor columns.

  Returns:
    The fitted statsmodels OLS results object.
  """
  Y = df[yvar]
  # add_constant adds the intercept column to the design matrix.
  X = sm.add_constant(df[list(xvars)])
  return sm.OLS(Y, X).fit()

# Fit the model on the full analysis file.
lm = getlm(covidanalysis)
lm.summary()

# Identify the countries with an outsized influence on the model
# (Cook's distance above 0.5)
influence = lm.get_influence().summary_frame()
highcooks = influence.cooks_d>0.5
influence.loc[highcooks, ['cooks_d']]
covidanalysis.loc[highcooks]

# Draw the influence plot
# NOTE(review): alpha=5 is outside the usual 0-1 range for a significance
# level — confirm the intended value.
fig, ax = plt.subplots(figsize=(8,8))
sm.graphics.influence_plot(lm, ax = ax, alpha=5, criterion="cooks")
plt.show()

# Refit the model without the influential countries
lowcooks = influence.cooks_d<0.5
covidanalysisminusoutliers = covidanalysis.loc[lowcooks]

lm = getlm(covidanalysisminusoutliers)
lm.summary()


# KNN离群值

In [None]:
import pandas as pd
from pyod.models.knn import KNN
from sklearn.preprocessing import StandardScaler
# Console display options for this section.
pd.set_option('display.width', 53)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 20)
pd.options.display.float_format = '{:,.2f}'.format

# Load the COVID totals and key the rows by ISO country code.
covidtotals = pd.read_csv("data/covidtotals.csv").set_index("iso_code")

# Build a standardized dataset of the analysis variables

standardizer = StandardScaler()
analysisvars = [
  'location',
  'total_cases_pm',
  'total_deaths_pm',
  'pop_density',
  'median_age',
  'gdp_per_capita',
]
covidanalysis = covidtotals[analysisvars].dropna()
# The first column ('location') is a label, so it is left out of scaling.
covidanalysisstand = standardizer.fit_transform(
  covidanalysis.iloc[:, 1:])

# Run the KNN model and produce anomaly scores
clf_name = 'KNN'
clf = KNN(contamination=0.1)
clf.fit(covidanalysisstand)
y_pred, y_scores = clf.labels_, clf.decision_scores_

# Show the model's predictions: one row per country with label and score
pred = pd.DataFrame(
  zip(y_pred, y_scores),
  index=covidanalysis.index,
  columns=['outlier','scores'])
pred.sample(10, random_state=2)
pred.outlier.value_counts()
pred.groupby(['outlier'])[['scores']].agg(['min','median','max'])

# Show the covid data of the flagged outliers, highest scores first
(
  covidanalysis.join(pred)
  .loc[pred.outlier==1,
       ['location','total_cases_pm','total_deaths_pm','scores']]
  .sort_values(['scores'], ascending=False)
  .head(10)
)


# 孤立森林

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Console display options for this section.
pd.set_option('display.width', 58)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 6)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Load the COVID totals and key the rows by ISO country code.
covidtotals = pd.read_csv("data/covidtotals.csv").set_index("iso_code")

# Build the standardized analysis data frame
analysisvars = [
  'location',
  'total_cases_pm',
  'total_deaths_pm',
  'pop_density',
  'median_age',
  'gdp_per_capita',
]
standardizer = StandardScaler()
covidtotals.isnull().sum()
covidanalysis = covidtotals[analysisvars].dropna()
# The first column ('location') is a label, so it is left out of scaling.
covidanalysisstand = standardizer.fit_transform(
  covidanalysis.iloc[:, 1:])

# Run an Isolation Forest model to detect anomalies
clf = IsolationForest(
  n_estimators=100,
  max_samples='auto',
  contamination=.1,
  max_features=1.0,
  random_state=12345)
clf.fit(covidanalysisstand)
# Store the predicted label (1 / -1, split below) and the decision score.
covidanalysis['anomaly'] = clf.predict(covidanalysisstand)
covidanalysis['scores'] = clf.decision_function(covidanalysisstand)
covidanalysis.anomaly.value_counts()

# Split into inliers and outliers and view the outliers by score
inlier = covidanalysis.loc[covidanalysis.anomaly==1]
outlier = covidanalysis.loc[covidanalysis.anomaly==-1]
analysisvars.append('scores')
outlier[analysisvars].sort_values(['scores'])

# Plot inliers and outliers in 3D
ax = plt.axes(projection='3d')
ax.set_title('Isolation Forest Anomaly Detection')
ax.set_zlabel("Cases Per Million")
ax.set_xlabel("GDP Per Capita")
ax.set_ylabel("Median Age")
# Inliers are drawn first (blue), then outliers (red).
for points, pointlabel, pointcolor in (
    (inlier, "inliers", "blue"),
    (outlier, "outliers", "red")):
  ax.scatter3D(points.gdp_per_capita, points.median_age,
    points.total_cases_pm, label=pointlabel, c=pointcolor)
ax.legend()
plt.tight_layout()
plt.show()


## 识别异常值-统计分析-孤立森林

In [None]:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

# 离群值识别-孤立森林

# 导入相应数据库客户端
import mysql.connector as mdb
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Name of the table to load
targettable = "name"

# Connect with the mysql API and load the data with read_sql()
host = "pdccmysql"
user = "pdccuser"
password = "pdccpass"
database = "pdccschema"
connmysql = mdb.connect(host=host,database=database,user=user,password=password)
try:
  # read_sql_table() only accepts SQLAlchemy connectables; with a plain
  # DBAPI connection the table has to be read through read_sql() and an
  # explicit query.
  df = pd.read_sql("SELECT * FROM " + targettable, con=connmysql)    # build the DataFrame
finally:
  # Release the connection even when the query fails.
  connmysql.close()

# Use one or more columns of the DataFrame as the index
df.set_index("column1", inplace=True)

# Build the standardized analysis data frame from the columns chosen for
# the analysis
analysisvars = ['column2','column3','column4','column5','column6','column7']
# Columns shown on the 3D plot. Placeholders: pick any three of the analysis
# columns above.
plotx, ploty, plotz = 'column7', 'column6', 'column3'
# Remove the mean and scale to unit variance
standardizer = StandardScaler()
df.isnull().sum()
# Take the analysis columns from df (the table loaded above) and keep only
# complete rows.
dfanalysis = df.loc[:, analysisvars].dropna()
# Standardize the same rows that the results are attached to below. As in
# the original, the first analysis column is left out of the model.
# NOTE(review): assumes that first column is a label/identifier — confirm.
dfanalysisstand = standardizer.fit_transform(dfanalysis.iloc[:, 1:])

# Run an Isolation Forest model to detect anomalies
clf=IsolationForest(n_estimators=100,
  max_samples='auto', contamination=.1,
  max_features=1.0, random_state=12345)
clf.fit(dfanalysisstand)
# Store the predicted label (1 / -1, split below) and the decision score.
dfanalysis['anomaly'] = \
  clf.predict(dfanalysisstand)
dfanalysis['scores'] = \
  clf.decision_function(dfanalysisstand)
dfanalysis.anomaly.value_counts()

# Write the outliers to CSV
inlier, outlier = \
  dfanalysis.loc[dfanalysis.anomaly==1],\
  dfanalysis.loc[dfanalysis.anomaly==-1]
analysisvars.append('scores')
outlier[analysisvars].sort_values(['scores'])
outlier.to_csv('path/views/Outliers_name.csv')

# Plot inliers and outliers and save the figure. The axes use the plot
# columns chosen above, which exist in dfanalysis, and are labelled with
# those column names.
ax = plt.axes(projection='3d')
ax.set_title('Isolation Forest Anomaly Detection')
ax.set_zlabel(plotz)
ax.set_xlabel(plotx)
ax.set_ylabel(ploty)
ax.scatter3D(inlier[plotx], inlier[ploty], inlier[plotz], label="inliers", c="blue")
ax.scatter3D(outlier[plotx], outlier[ploty], outlier[plotz], label="outliers", c="red")
ax.legend()
plt.tight_layout()
plt.savefig("path/views/Outliers_name.png")

# AI

In [None]:
import pandas as pd
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
# LLM client used by the smart data frame; replace the placeholder token.
llm = OpenAI(api_token="Your API key")

# Console display options for this section.
pd.set_option('display.width', 70)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 220)
pd.options.display.float_format = '{:,.0f}'.format

# Load the data frame and wrap it in a smart data frame object
covidtotals = pd.read_csv("data/covidtotals.csv")

covidtotalssdf = SmartDataframe(covidtotals, config={"llm": llm})

# Run some queries, in order
queries = (
  "Plot histogram of total cases per million",
  "Show boxplot of total cases per million",
  "regplot total_deaths_pm on total_cases_pm",
  "Show total cases per million 7 highest values and 7 lowest values of total cases per million sorted by total cases per million",
  "Show total cases per million for locations with highest total cases per million in each region",
  "Show total cases per million and total deaths per million for locationss with high total_cases_pm and low total_deaths_pm",
  "What variables are highly correlated with total cases",
)
for query in queries:
  covidtotalssdf.chat(query)