### 数据集处理

In [1]:
# Load the raw transactions and discard the columns that are not needed.

import pandas as pd

# Columns removed right after loading: the four balance columns and the
# isFlaggedFraud flag.
columns_to_drop = [
    'newbalanceOrig',
    'oldbalanceOrg',
    'oldbalanceDest',
    'newbalanceDest',
    'isFlaggedFraud',
]

# Read the CSV and drop the unused columns in a single chained expression.
df = pd.read_csv('originDataset.csv').drop(columns=columns_to_drop)

# Preview the first rows of the trimmed frame.
print(df.head())

# Remove the rows that are not needed. The row count is printed before the
# first filter and after every filter so the effect of each step is visible.

print(df.shape[0])
# Keep only CASH_OUT and TRANSFER transactions.
df = df[df['type'].isin(['CASH_OUT', 'TRANSFER'])]
print(df.shape[0])

# Drop the rows whose nameDest starts with 'M'.
# na=False makes a missing nameDest count as "does not start with 'M'", so
# the mask is strictly boolean. Without it the mask can contain missing
# values, which cannot be negated with `~` or used for boolean indexing.
df = df[~df['nameDest'].str.startswith('M', na=False)]
print(df.shape[0])

# Drop every row whose nameDest occurs more than once (all occurrences are
# removed, not only the extra ones).
# Despite its name, repeated_nameDest holds the occurrence count of every
# nameDest. value_counts() skips missing values by default, so rows with a
# missing nameDest are not in non_repeated_dest and are removed here as well.
repeated_nameDest = df['nameDest'].value_counts()
non_repeated_dest = repeated_nameDest[repeated_nameDest == 1].index
df = df[df['nameDest'].isin(non_repeated_dest)]
print(df.shape[0])

# The name columns carry no useful information from this point on, so both
# are dropped.
name_columns_to_drop = ['nameOrig', 'nameDest']
df = df.drop(columns=name_columns_to_drop)

# Remove TRANSFER rows whose amount exceeds 200,000.
# A row is kept when it is not a TRANSFER or when its amount is not above the
# limit (De Morgan form of "not (TRANSFER and above the limit)").
is_transfer = df['type'] == 'TRANSFER'
above_limit = df['amount'] > 200_000
df = df[~is_transfer | ~above_limit]

# Encode the transaction type: CASH_OUT -> 0, TRANSFER -> 1.
type_codes = {'CASH_OUT': 0, 'TRANSFER': 1}
df = df.assign(type=df['type'].map(type_codes))

# Report a preview, the total row count, and the number of rows with
# isFraud == 0 and isFraud == 1 (in that order).
print(df.head())
print(len(df))
fraud_flag = df['isFraud']
print(int((fraud_flag == 0).sum()))
print(int((fraud_flag == 1).sum()))

# Persist the cleaned data without the index column.
df.to_csv('cleaned_dataset.csv', index=False)

   step      type    amount     nameOrig     nameDest  isFraud
0     1   PAYMENT   9839.64  C1231006815  M1979787155        0
1     1   PAYMENT   1864.28  C1666544295  M2044282225        0
2     1  TRANSFER    181.00  C1305486145   C553264065        1
3     1  CASH_OUT    181.00   C840083671    C38997010        1
4     1   PAYMENT  11668.14  C2048537720  M1230701703        0
6362620
2770409
2770409
128331
        step  type     amount  isFraud
49023      9     0  257852.70        0
82631     10     0  215404.79        0
96693     10     0   55289.01        0
102296    10     0  139784.63        0
138199    11     0   41118.00        0
116522
115061
1461


剩余116522条交易记录，其中1461条fraud
#### 处理不平衡数据
    先划分出测试集，再对训练集进行欠采样处理



In [2]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Reload the cleaned data written by the preprocessing step.
df = pd.read_csv('cleaned_dataset.csv')

# Target is the isFraud column; every other column is a feature.
y = df['isFraud']
X = df.drop(columns='isFraud')

# Hold out 20% of the rows as the test set. stratify=y keeps the isFraud
# class proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Training features and target side by side in one frame.
train_data = pd.concat((X_train, y_train), axis='columns')

# Training rows split by class.
# NOTE(review): train_data, fraud and non_fraud are not referenced again in
# the visible part of the notebook; kept in case a later cell reads them.
fraud_rows = train_data['isFraud'] == 1
non_fraud_rows = train_data['isFraud'] == 0
fraud = train_data[fraud_rows]
non_fraud = train_data[non_fraud_rows]

# Undersample the training split with RandomUnderSampler; the test split is
# left untouched.
# sampling_strategy=0.33 requests a minority/majority ratio of 0.33 after
# resampling, i.e. approximately (not exactly) three isFraud == 0 rows for
# every isFraud == 1 row.
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.33)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train, y_train)

# Save the held-out test set (features plus target) without the index.
test_data = pd.concat((X_test, y_test), axis='columns')
test_data.to_csv('test_dataset.csv', index=False)

# Save the undersampled training set the same way.
train_balanced_data = pd.concat(
    (X_train_balanced, y_train_balanced),
    axis='columns',
)
train_balanced_data.to_csv('balanced_train_dataset.csv', index=False)

# Read the saved training file back and print the number of rows per class:
# first isFraud == 0, then isFraud == 1.
balanced_train_df = pd.read_csv('balanced_train_dataset.csv')
for label in (0, 1):
    print(int((balanced_train_df['isFraud'] == label).sum()))

3542
1169
