In [14]:
import pandas as pd
# Load the raw transactions and report how many rows are labelled as fraud.
df = pd.read_csv('originDataset.csv')
print(int(df['isFraud'].eq(1).sum()))

8213


### 数据集处理

In [10]:
# Drop the columns that are not needed, one-hot encode the transaction type,
# and save the resulting frame for the later train/test split.

import pandas as pd

df = pd.read_csv('originDataset.csv')

# Remove the four balance columns and 'isFlaggedFraud'.
columns_to_drop = ['newbalanceOrig', 'oldbalanceOrg', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']
df = df.drop(columns_to_drop, axis=1)

# Preview while the name columns are still present.
print(df.head())

# Remove the two account-name columns.
name_columns_to_drop = ['nameOrig', 'nameDest']
df = df.drop(name_columns_to_drop, axis=1)

# One-hot encode the 'type' column into integer columns prefixed with 'type_'.
one_hot_encoded = pd.get_dummies(df['type'], prefix='type', dtype=int)

# Append the encoded columns to the frame, then discard the original 'type' column.
df = pd.concat([df, one_hot_encoded], axis=1).drop(columns=['type'])

# Preview the encoded frame, then report total rows, non-fraud rows and fraud rows.
print(df.head())
print(len(df))
print(int(df['isFraud'].eq(0).sum()))
print(int(df['isFraud'].eq(1).sum()))

df.to_csv('uncleaned_dataset.csv', index=False)

   step      type    amount     nameOrig     nameDest  isFraud
0     1   PAYMENT   9839.64  C1231006815  M1979787155        0
1     1   PAYMENT   1864.28  C1666544295  M2044282225        0
2     1  TRANSFER    181.00  C1305486145   C553264065        1
3     1  CASH_OUT    181.00   C840083671    C38997010        1
4     1   PAYMENT  11668.14  C2048537720  M1230701703        0
   step    amount  isFraud  type_CASH_IN  type_CASH_OUT  type_DEBIT  \
0     1   9839.64        0             0              0           0   
1     1   1864.28        0             0              0           0   
2     1    181.00        1             0              0           0   
3     1    181.00        1             0              1           0   
4     1  11668.14        0             0              0           0   

   type_PAYMENT  type_TRANSFER  
0             1              0  
1             1              0  
2             0              1  
3             0              0  
4             1              0

处理后共6362620条交易记录，其中8213条fraud
#### 处理不平衡数据
    先划分出测试集，再对训练集进行欠采样处理



In [2]:
# Inspect the one-hot frame; `one_hot_encoded` is defined in the preprocessing cell,
# so that cell must have been run in the current kernel first.
# NOTE(review): the recorded output shows unprefixed boolean columns, which does not match
# the current get_dummies(prefix='type', dtype=int) call; presumably stale, re-run to refresh.
print(one_hot_encoded)

         CASH_IN  CASH_OUT  DEBIT  PAYMENT  TRANSFER
0          False     False  False     True     False
1          False     False  False     True     False
2          False     False  False    False      True
3          False      True  False    False     False
4          False     False  False     True     False
...          ...       ...    ...      ...       ...
6362615    False      True  False    False     False
6362616    False     False  False    False      True
6362617    False      True  False    False     False
6362618    False     False  False    False      True
6362619    False      True  False    False     False

[6362620 rows x 5 columns]


In [15]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Split into train/test first, then undersample only the training portion;
# the test split is written out without any resampling.
df = pd.read_csv('uncleaned_dataset.csv')

y = df['isFraud']
X = df.drop(columns=['isFraud'])

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Save the training split as-is (still imbalanced).
train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('uncleaned_imbalanced_train_dataset.csv', index=False)

# Save the held-out test split.
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('uncleaned_test_dataset.csv', index=False)

# Per-class views of the training split.
fraud = train_data.loc[train_data['isFraud'].eq(1)]
non_fraud = train_data.loc[train_data['isFraud'].eq(0)]

# Undersample with RandomUnderSampler.
# sampling_strategy=0.33 is the target minority/majority ratio, i.e. roughly 3:1
# non-fraud to fraud (slightly more than 3:1, since 0.33 < 1/3).
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.33)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train, y_train)

# Training split after undersampling.
train_balanced_data = pd.concat([X_train_balanced, y_train_balanced], axis=1)
train_balanced_data.to_csv('uncleaned_balanced_train_dataset.csv', index=False)

# Reload the file just written and print the non-fraud count, then the fraud count.
balanced_train_df = pd.read_csv('uncleaned_balanced_train_dataset.csv')
print(int(balanced_train_df['isFraud'].eq(0).sum()))
print(int(balanced_train_df['isFraud'].eq(1).sum()))

19909
6570


In [4]:
import pandas as pd
# Class counts of the imbalanced training split (non-fraud first, then fraud).
# NOTE(review): despite its name, `balanced_train_df` holds the *imbalanced* training
# data here; the name is kept so other cells that read it keep working.
balanced_train_df = pd.read_csv('uncleaned_imbalanced_train_dataset.csv')
print(len(balanced_train_df.loc[balanced_train_df['isFraud'].eq(0)]))
print(len(balanced_train_df.loc[balanced_train_df['isFraud'].eq(1)]))

5083526
6570


In [1]:
import pandas as pd
# Class counts of the test split (non-fraud first, then fraud).
# NOTE(review): despite its name, `balanced_train_df` holds the *test* data here;
# the name is kept so other cells that read it keep working.
balanced_train_df = pd.read_csv('uncleaned_test_dataset.csv')
print(int((balanced_train_df['isFraud'] == 0).sum()))
print(int((balanced_train_df['isFraud'] == 1).sum()))

1270881
1643
