# 异常检验与处理

In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
# Flag outlier samples from the residuals of a regression model.
def find_outliers(model, X, y, sigma=3):
    """Find samples whose standardized residual exceeds ``sigma``.

    The model's predictions for ``X`` are compared with ``y``; the residuals
    are converted to z-scores and every sample with ``|z| > sigma`` is
    reported as an outlier.

    Parameters
    ----------
    model
        Estimator exposing ``predict(X)``, ``fit(X, y)`` and ``score(X, y)``.
        If the first ``predict`` call raises, the model is fitted on
        ``(X, y)`` in place and prediction is retried.
    X
        Feature data passed to the model unchanged.
    y
        Target values; must expose ``.index`` (the predictions are aligned
        to it), so a pandas Series is expected.
    sigma
        Threshold on the absolute z-score of the residual.

    Returns
    -------
    An empty list when no sample exceeds the threshold, otherwise the index
    labels (taken from ``y.index``) of the outlying samples.

    Side effects
    ------------
    Prints a short report and, when outliers exist, draws a three-panel
    matplotlib figure (prediction scatter, residual scatter, z histogram).
    """
    # Predict with the model as given; if that fails (e.g. the estimator has
    # not been fitted yet), fit it first and try again. Only ``Exception`` is
    # caught so that KeyboardInterrupt / SystemExit still propagate instead of
    # silently triggering a refit.
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # Residuals between the true and the predicted values.
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # Standardize the residuals (z-score).
    z = (resid - mean_resid) / std_resid

    outliers = z[z.abs() > sigma].index

    if len(outliers) == 0:
        print("未发现异常值")
        return []  # callers can pass this straight to ``DataFrame.drop``

    # Model quality: R^2 and MSE.
    print(f"R2 = {model.score(X, y)}")
    print(f"MSE = {mean_squared_error(y, y_pred)}")
    print("----------------------------------------")

    print(f"mean_resid = {mean_resid}")
    print(f"std_resid = {std_resid}")
    print("----------------------------------------")

    print(f"outliers数量: {len(outliers)}")
    print(f"outliers: {outliers.tolist()}")
    print("----------------------------------------")

    plt.figure(figsize=(15, 5))

    # Panel 1: predicted vs. true values, outliers overlaid in red.
    plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    # Panel 2: residual vs. true value, outliers overlaid in red.
    plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y-y_pred')

    # Panel 3: histogram of z-scores, outliers overlaid in red.
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')

    return outliers

## 删除 y 异常值

In [4]:
from sklearn.linear_model import Ridge

# Flag samples whose ridge-regression residual z-score exceeds 3 in absolute
# value, then remove those rows from both the features and the target.
outliers = find_outliers(model=Ridge(), X=X_train, y=y_train, sigma=3)
X_train = X_train.drop(index=outliers)
y_train = y_train.drop(index=outliers)
# Cell output: shapes after the removal.
(X_train.shape, y_train.shape)

NameError: name 'X_train' is not defined

## 删除 特征 异常值

In [None]:
from sklearn.linear_model import Ridge
import tqdm

# Select the label of the first column of X_train.
f = X_train.columns[0]
# Bare expression: the notebook displays the selected label.
f

In [None]:
# Treat feature `f` as the regression target and all remaining columns as
# predictors; rows whose residual z-score exceeds 4 in absolute value are
# removed from both X_train and y_train.
y_ = X_train[f]
X_ = X_train.drop(columns=f)

outliers = find_outliers(model=Ridge(), X=X_, y=y_, sigma=4)
X_train = X_train.drop(index=outliers)
y_train = y_train.drop(index=outliers)
# Cell output: shapes after the removal.
(X_train.shape, y_train.shape)

In [None]:
# Repeat for every feature: regress it on the other columns and drop the rows
# whose residual z-score exceeds 4.5 in absolute value. Rows are removed as
# the loop proceeds, so later features are checked on the reduced data.
for f in tqdm.tqdm(X_train.columns):
    y_ = X_train[f]
    X_ = X_train.drop(columns=f)

    outliers = find_outliers(model=Ridge(), X=X_, y=y_, sigma=4.5)
    X_train = X_train.drop(index=outliers)
    y_train = y_train.drop(index=outliers)

In [None]:
# Cell output: current shapes of X_train and y_train.
X_train.shape, y_train.shape