In [12]:
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler

# Load the raw dataset from the notebook's working directory.
df = pd.read_csv('american_bankruptcy_dataset.csv')

# Check for missing values: count the nulls in every column and report them.
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)

# Encode the target as integers (alive -> 0, failed -> 1) and discard the
# columns that are not used as model features. Any label outside the mapping
# becomes NaN, exactly as Series.map does.
label_codes = {'alive': 0, 'failed': 1}
columns_to_drop = ['company_name','fyear','Division','MajorGroup']
df = (
    df
    .assign(status_label=df['status_label'].map(label_codes))
    .drop(columns_to_drop, axis=1)
)

# Separate the target column from the feature matrix.
y = df['status_label']
X = df.drop(columns='status_label')

# Class balance of the full dataset.
print("alive数量", int((y == 0).sum()))
print("fail数量", int((y == 1).sum()))

# Split into training and test sets BEFORE scaling. Fitting the scaler on the
# full table would let test-row minima/maxima influence the training features
# (data leakage). The split arguments (row count, test_size, random_state) are
# unchanged, so the same rows are assigned to each set as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max normalisation, fitted on the training rows only and then applied to
# the test rows. Test values may therefore fall slightly outside [0, 1] when a
# test row exceeds the range seen during fitting; that is expected.
scaler = MinMaxScaler()

# The scaler returns plain arrays, so rebuild DataFrames that keep the original
# feature names and row index. This keeps the features aligned with
# y_train / y_test and gives the exported CSVs meaningful column headers.
X_train = DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Full scaled feature table in the original row order, kept so that any later
# cell that still refers to X_normalized continues to work.
X_normalized = pd.concat([X_train, X_test]).sort_index()

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

# Recombine the training features with their labels so rows can be selected
# by class.
train_data = pd.concat([X_train, y_train], axis=1)

train_labels = train_data['status_label']
fail = train_data[train_labels == 1]
alive = train_data[train_labels == 0]

# Undersampling: randomly drop majority-class training rows until the
# minority/majority ratio is 0.5, i.e. two majority rows per minority row.
# Only the training split is resampled; the test split is left untouched.
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train, y_train)

# Export the test set (features followed by the label column).
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('test_dataset.csv', index=False)

# Export the undersampled training set in the same layout.
train_balanced_data = pd.concat([X_train_balanced, y_train_balanced], axis=1)
train_balanced_data.to_csv('balanced_train_dataset.csv', index=False)

# Read the exported file back and print the per-class row counts
# (label 0 first, then label 1) as a sanity check on what was written.
balanced_train_df = pd.read_csv('balanced_train_dataset.csv')
balanced_labels = balanced_train_df['status_label']
print(int((balanced_labels == 0).sum()))
print(int((balanced_labels == 1).sum()))

Missing values in each column:
 company_name    0
fyear           0
status_label    0
X1              0
X2              0
X3              0
X4              0
X5              0
X6              0
X7              0
X8              0
X9              0
X10             0
X11             0
X12             0
X13             0
X14             0
X15             0
X16             0
X17             0
X18             0
Division        0
MajorGroup      0
dtype: int64
alive数量 73462
fail数量 5220
Training set size: (62945, 18)
Test set size: (15737, 18)
8304
4152
