In [21]:
!pip install ctgan
!pip install sdv
!pip install sdmetrics



In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import os
from sdv.single_table import CTGANSynthesizer

In [4]:
import tensorflow as tf

# Query the visible GPUs once and derive availability from that list;
# tf.test.is_gpu_available() is deprecated in favor of
# tf.config.list_physical_devices('GPU').
gpus = tf.config.list_physical_devices('GPU')

print("TensorFlow version:", tf.__version__)
print("Is GPU available:", bool(gpus))
print("GPUs:", gpus)

2024-03-09 10:26:33.426341: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-09 10:26:33.426446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-09 10:26:33.549751: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow version: 2.15.0
Is GPU available: True
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
# Select the GPU to use by exposing only device index 0.
# NOTE(review): this is set after the TensorFlow cell above has already listed
# the physical devices — confirm it is applied early enough to take effect.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [41]:
from ctgan import CTGAN

In [7]:
# Load the source table from the Kaggle input directory.
file_path = "/kaggle/input/fraud-detection/data/A08.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [8]:
# Drop every row that contains at least one NaN value (modifies `data` in place).
data.dropna(axis=0, how="any", inplace=True)

In [8]:
# Split the columns by dtype: integer columns are treated as "numeric" and
# floating-point columns as "continuous".
# NOTE(review): integer-coded flag/label columns would presumably fall into the
# integer group as well — confirm they are meant to be scaled.
numeric_data = data.select_dtypes(include='int64')
continuous_data = data.select_dtypes(include='float64')

In [9]:
# Standardize the integer-typed columns: fit the scaler, then transform the
# same data it was fitted on.
scaler_numeric = StandardScaler().fit(numeric_data)
numeric_data_scaled = scaler_numeric.transform(numeric_data)

# Min-max normalize the float-typed columns the same way.
scaler_continuous = MinMaxScaler().fit(continuous_data)
continuous_data_scaled = scaler_continuous.transform(continuous_data)


In [10]:
# Write the scaled arrays back over the matching columns of the original frame,
# integer-typed group first and float-typed group second.
# NOTE(review): the head() preview shown further down still displays unscaled
# values, so this cell may not have been executed in the saved run — confirm.
for source_frame, scaled_values in (
    (numeric_data, numeric_data_scaled),
    (continuous_data, continuous_data_scaled),
):
    data[source_frame.columns] = scaled_values

In [9]:
# Preview the first five rows of the working frame.
data.head(n=5)

Unnamed: 0,个人编码,一天去两家医院的天数,就诊的月数,月就诊天数_MAX,月就诊天数_AVG,月就诊医院数_MAX,月就诊医院数_AVG,就诊次数_SUM,月就诊次数_MAX,月就诊次数_AVG,...,药品在总金额中的占比,个人支付的药品占比,检查总费用在总金额占比,个人支付检查费用占比,治疗费用在总金额占比,个人支付治疗费用占比,BZ_民政救助,BZ_城乡优抚,是否挂号,RES
0,352120000000000.0,0,6,7,5.666667,3,2.166667,34,7,5.666667,...,0.939194,0.004262,0.050817,0.0,0.007434,0.0,0,0,1,0
1,352120000000000.0,0,6,4,2.5,2,1.333333,15,4,2.5,...,0.955626,0.002982,0.030815,0.0,0.013398,0.0,0,0,1,0
2,352120000000000.0,8,6,8,6.166667,3,2.166667,45,9,7.5,...,0.78361,0.000332,0.0,0.0,0.195087,0.0,0,0,0,0
3,352120000000000.0,0,6,6,3.666667,2,1.833333,23,6,3.833333,...,0.458649,0.000184,0.0,0.0,0.541351,0.0,0,0,0,0
4,352120000000000.0,0,6,5,4.333333,1,1.0,26,5,4.333333,...,0.983726,0.000316,0.0,0.0,0.016274,0.0,0,0,0,0


`log_frequency`：是否在条件采样中使用各分类取值频率的对数。默认为 True。该参数影响模型如何处理分类取值的频率，而这些分类取值会作为生成其余列时的条件。在某些情况下，将其改为 False 可能会带来更好的效果。下方的 `ctgan_config` 未显式设置该参数，因此使用默认值。

`primary_key`：元数据中用于指定主键列的字段；本笔记本在后面构建的元数据中将其设为 `个人编码`。

In [10]:
# Hyperparameters for the CTGAN model.
ctgan_config = dict(
    embedding_dim=128,                  # embedding dimension
    generator_dim=(256, 256, 256),      # generator layer sizes
    discriminator_dim=(256, 256, 256),  # discriminator layer sizes
    generator_lr=2e-4,                  # generator learning rate
    discriminator_lr=2e-4,              # discriminator learning rate
    batch_size=500,                     # batch size
    epochs=300,                         # number of training epochs
)

# Build the CTGAN instance with progress output enabled and CUDA requested.
ctgan = CTGAN(verbose=True, cuda=True, **ctgan_config)

In [11]:
# Columns passed to CTGAN as discrete (categorical) rather than continuous.
# NOTE(review): 'RES' shows only 0/1 in the displayed preview but is not listed
# here — confirm whether it should be treated as discrete too.
discrete_columns = [
    "就诊的月数",
    "医院编码_NN",
    "交易时间YYYY_NN",
    "交易时间YYYYMM_NN",
    "出院诊断病种名称_NN",
    "出院诊断LENTH_MAX",
    "BZ_民政救助",
    "BZ_城乡优抚",
    "是否挂号",
]

In [12]:
# Train the model on the full frame, declaring which columns are discrete.
ctgan.fit(data, discrete_columns=discrete_columns)

Gen. (-0.00) | Discrim. (-1.02): 100%|██████████| 300/300 [15:39<00:00,  3.13s/it]


In [13]:
# Generate as many synthetic rows as there are rows in the training frame.
synthetic_data = ctgan.sample(data.shape[0])

# Persist the generated rows to a CSV file, without the index column.
synthetic_data.to_csv(path_or_buf='/kaggle/working/add_data.csv', index=False)

In [14]:
# Save the fitted model object to the Kaggle working directory.
ctgan.save(path='/kaggle/working/my_ctgan_model.pkl')

In [15]:
# Read the saved synthetic table back from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/working/add_data.csv')
df.head(n=5)

Unnamed: 0,个人编码,一天去两家医院的天数,就诊的月数,月就诊天数_MAX,月就诊天数_AVG,月就诊医院数_MAX,月就诊医院数_AVG,就诊次数_SUM,月就诊次数_MAX,月就诊次数_AVG,...,药品在总金额中的占比,个人支付的药品占比,检查总费用在总金额占比,个人支付检查费用占比,治疗费用在总金额占比,个人支付治疗费用占比,BZ_民政救助,BZ_城乡优抚,是否挂号,RES
0,352120000000000.0,0,5,6,2.307725,3,1.975519,22,8,1.811951,...,0.888016,-0.001589,-0.00488,0.002766,-0.003645,0.001187,0,0,0,0
1,352120000000000.0,0,6,7,2.289094,1,1.039444,2,7,5.677205,...,0.654849,0.045839,-0.00506,-0.000358,-0.011408,0.000793,0,0,1,0
2,352121000000000.0,0,6,8,6.452612,2,1.408328,34,7,7.012288,...,0.850131,0.001813,0.033685,0.001141,0.025144,0.003592,0,0,0,0
3,352120000000000.0,0,6,20,10.024856,5,4.175493,51,11,12.329955,...,0.7703,0.000358,-0.000858,0.143936,0.442206,0.024842,0,1,1,1
4,352120000000000.0,0,3,6,2.439662,2,1.258084,8,5,5.192105,...,0.776277,0.000834,0.012422,0.002082,0.023363,-0.000894,0,0,0,0


In [45]:
## Build the metadata
import json

# One entry per column of `data`: columns listed in `discrete_columns` are
# described as categorical, every other column as a float numerical.
# NOTE(review): the head() preview shows the same 个人编码 value on several
# rows — confirm the column is really unique before using it as primary key.
metadata_json_corrected = {
    "primary_key": "个人编码",
    "columns": {
        column_name: (
            {"type": "categorical", "sdtype": "categorical"}
            if column_name in discrete_columns
            else {"type": "numerical", "subtype": "float", "sdtype": "numerical"}
        )
        for column_name in data.columns
    },
}

# The primary-key column is described as an id, replacing the generic entry above.
metadata_json_corrected["columns"]["个人编码"] = {"type": "id", "subtype": "integer", "sdtype": "id"}

# Write the metadata as indented JSON, keeping non-ASCII column names readable.
metadata_file_path_corrected = '/kaggle/working/A08_metadata_corrected.json'
with open(metadata_file_path_corrected, 'w') as metadata_out:
    json.dump(metadata_json_corrected, metadata_out, ensure_ascii=False, indent=4)


In [46]:
# Load the metadata dictionary back from the JSON file on disk.
with open('/kaggle/working/A08_metadata_corrected.json') as metadata_in:
    metadata = json.load(metadata_in)

In [47]:
# Display the loaded metadata dictionary as the cell output.
metadata

{'primary_key': '个人编码',
 'columns': {'个人编码': {'type': 'id', 'subtype': 'integer', 'sdtype': 'id'},
  '一天去两家医院的天数': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '就诊的月数': {'type': 'categorical', 'sdtype': 'categorical'},
  '月就诊天数_MAX': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '月就诊天数_AVG': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '月就诊医院数_MAX': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '月就诊医院数_AVG': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '就诊次数_SUM': {'type': 'numerical', 'subtype': 'float', 'sdtype': 'numerical'},
  '月就诊次数_MAX': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '月就诊次数_AVG': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '月统筹金额_MAX': {'type': 'numerical',
   'subtype': 'float',
   'sdtype': 'numerical'},
  '月统筹金额_AVG': {'type': 'numerical',
   'subtype': 'float',
 

In [48]:
# Quality report: compare the real frame with the synthetic frame.
from sdmetrics.reports.single_table import QualityReport

report = QualityReport()
report.generate(
    real_data=data,
    synthetic_data=df,
    metadata=metadata,
    verbose=True,
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 82/82 [00:00<00:00, 158.67it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 3321/3321 [01:48<00:00, 30.69it/s]

Overall Score: 72.59%

Properties:
- Column Shapes: 70.75%
- Column Pair Trends: 74.43%


In [49]:
# Diagnostic report on the synthetic frame against the real frame.
# Note: this rebinds `report`, replacing the object created by the previous cell.
from sdmetrics.reports.single_table import DiagnosticReport

report = DiagnosticReport()
report.generate(
    real_data=data,
    synthetic_data=df,
    metadata=metadata,
    verbose=True,
)

Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 82/82 [00:00<00:00, 423.08it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 295.96it/s]

Overall Score: 90.41%

Properties:
- Data Validity: 80.83%
- Data Structure: 100.0%
