In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the credit dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='credit_data.csv')

In [5]:
# 1. Data completeness audit
# Per-column count of missing entries (rubric: 2 pts).
missing_values = data.isna().sum()
# Number of rows flagged by DataFrame.duplicated() as repeats of an earlier row (rubric: 2 pts).
duplicate_values = data.duplicated().sum()
# Report both statistics, one item per line (headings stay in the original language
# because they are runtime output).
print("缺失值统计:", missing_values, "重复值统计:", duplicate_values, sep="\n")

缺失值统计:
CustomerID            0
Name                  0
Age                   1
Income                1
LoanAmount            0
LoanTerm              0
CreditScore           0
Default               0
TransactionHistory    0
dtype: int64
重复值统计:
0


In [6]:
# 2. Data plausibility audit
# Names of the per-rule flag columns, in the order they are appended to `data`.
validity_flag_columns = [
    'is_age_valid',
    'is_income_valid',
    'is_loan_amount_valid',
    'is_credit_score_valid',
]

# Age must lie in the closed interval [18, 70] (rubric: 2 pts).
data['is_age_valid'] = data['Age'].ge(18) & data['Age'].le(70)
# Income must be strictly greater than 2000 (rubric: 2 pts).
data['is_income_valid'] = data['Income'].gt(2000)
# Loan amount must be strictly less than five times the income (rubric: 2 pts).
data['is_loan_amount_valid'] = data['LoanAmount'].lt(data['Income'].mul(5))
# Credit score must lie in the closed interval [300, 850] (rubric: 2 pts).
data['is_credit_score_valid'] = data['CreditScore'].ge(300) & data['CreditScore'].le(850)

# A row is valid only when every individual rule holds.
validity_checks = data[validity_flag_columns].all(axis=1)
data['is_valid'] = validity_checks

# Summarise each flag column plus the combined verdict.
flag_summary = data[validity_flag_columns + ['is_valid']].describe()
print("数据合理性检查:")
print(flag_summary)

数据合理性检查:
       is_age_valid is_income_valid is_loan_amount_valid  \
count          1000            1000                 1000   
unique            2               2                    2   
top            True            True                 True   
freq            999             999                  796   

       is_credit_score_valid is_valid  
count                   1000     1000  
unique                     1        2  
top                     True     True  
freq                    1000      795  


In [7]:
# 3. Data cleaning and outlier handling
# Rows that failed at least one plausibility rule.
invalid_rows = data.loc[~data['is_valid']]
# Keep only the rows that passed every rule, then strip the helper flag columns
# so the saved file has the same columns as the input.
cleaned_data = data.loc[data['is_valid']].drop(
    columns=[
        'is_age_valid',
        'is_income_valid',
        'is_loan_amount_valid',
        'is_credit_score_valid',
        'is_valid',
    ]
)
# Persist the cleaned table without the index column.
cleaned_data.to_csv('cleaned_credit_data.csv', index=False)
print("数据清洗完成，已保存为 'cleaned_credit_data.csv'")

数据清洗完成，已保存为 'cleaned_credit_data.csv'
