In [None]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hyper-parameters and paths shared by the rest of the notebook.
config = dict(
    seed=5201314,                     # Seed used for the RNGs and for the train/validation split.
    select_all=False,                 # Whether select_feat() keeps every input column.
    valid_ratio=0.1,                  # validation_size = int(valid_ratio * number_of_training_rows)
    n_epochs=3000,                    # Maximum number of training epochs.
    batch_size=1024,                  # Mini-batch size of every DataLoader.
    learning_rate=1e-3,               # Optimizer step size (10^-3).
    early_stop=400,                   # Stop after this many consecutive epochs without improvement.
    save_path='./models/model.ckpt',  # The best model checkpoint is written here.
)

In [None]:
class Dataset(Dataset):
    '''
    Tensor-backed dataset of feature rows with optional targets.

    x: Features.
    y: Targets; pass None for prediction, in which case items are features only.

    NOTE: this class deliberately keeps the name of the imported torch
    `Dataset` base class it derives from, so it shadows that import.
    '''
    def __init__(self, x, y=None):
        # None is kept as the "no targets" marker; anything else becomes a float tensor.
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        # Prediction mode yields the features alone, otherwise a (features, target) pair.
        features = self.x[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

    def __len__(self):
        # Number of samples equals the number of feature rows.
        return len(self.x)

In [None]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Split the raw arrays into model inputs and targets.

    Targets are read from column 1 of `train_data` / `valid_data`. The inputs
    of those two arrays are column 0 followed by columns 2..81 (81 columns in
    total); `test_data` is used as-is (assumes it carries no target column and
    has the same 81-column layout -- TODO confirm).

    select_all: True keeps all 81 input columns; False keeps input columns
        1..80, i.e. drops the first input column.

    Returns (x_train, x_valid, x_test, y_train, y_valid).
    '''
    target_col = 1
    input_cols = [0, *range(2, 82)]

    # Targets of the labelled splits.
    y_train = train_data[:, target_col]
    y_valid = valid_data[:, target_col]

    # Candidate input columns; the test array has no target column to strip here.
    train_inputs = train_data[:, input_cols]
    valid_inputs = valid_data[:, input_cols]
    test_inputs = test_data

    # Positions (within the candidate inputs) that are handed to the model.
    if select_all:
        keep = list(range(train_inputs.shape[1]))
    else:
        keep = list(range(1, 81))

    return (
        train_inputs[:, keep],
        valid_inputs[:, keep],
        test_inputs[:, keep],
        y_train,
        y_valid,
    )

In [None]:
def same_seed(seed):
    '''Seed NumPy and PyTorch and switch cuDNN to its reproducible settings.'''
    # Seed the random number generators used by NumPy and PyTorch.
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Give the CUDA generators the same seed so GPU randomness is repeatable too.
        torch.cuda.manual_seed_all(seed)

    # Make cuDNN use only deterministic convolution algorithms ...
    torch.backends.cudnn.deterministic = True
    # ... and stop it from benchmarking several algorithms to pick the fastest one.
    torch.backends.cudnn.benchmark = False

# Example: train_valid_split(train_data, config['valid_ratio'], config['seed'])
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set.

    Args:
        data_set: Array-like holding one sample per row (converted with
            np.asarray).
        valid_ratio: Fraction of the rows assigned to the validation set; its
            size is int(valid_ratio * len(data_set)).
        seed: Seed of the torch.Generator that drives the random split, so the
            same seed always yields the same partition.

    Returns:
        (train_rows, valid_rows) as NumPy arrays. Both keep the trailing
        dimensions of `data_set`, even when one of the parts is empty.
    '''
    data = np.asarray(data_set)
    valid_set_size = int(valid_ratio * len(data))
    train_set_size = len(data) - valid_set_size

    # random_split only decides which row goes where; the rows are gathered below.
    generator = torch.Generator().manual_seed(seed)
    train_set, valid_set = random_split(data, [train_set_size, valid_set_size], generator=generator)

    # Index the array with each Subset's indices instead of calling
    # np.array(subset): that walks the subset element by element and collapses
    # an empty part to shape (0,), which breaks later 2-D column indexing.
    train_idx = np.asarray(train_set.indices, dtype=np.intp)
    valid_idx = np.asarray(valid_set.indices, dtype=np.intp)
    return data[train_idx], data[valid_idx]


In [None]:
# Seed every RNG first so the split and the training run are reproducible.
same_seed(config['seed'])

# Load the raw training and test tables as NumPy arrays.
train_data = pd.read_csv('../output/数据/train_data.csv').values
test_data = pd.read_csv('../output/数据/tt_data.csv').values

# Hold out part of the training rows for validation.
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

# Print out the data size.
print(f"""train_data size: {train_data.shape} 
valid_data size: {valid_data.shape} 
test_data size: {test_data.shape}""")

x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {x_train.shape[1]}')

# Wrap the arrays in the custom Dataset (implements __len__ and __getitem__);
# the test set has no targets.
train_dataset = Dataset(x_train, y_train)
valid_dataset = Dataset(x_valid, y_valid)
test_dataset = Dataset(x_test)

# Pytorch data loader loads pytorch dataset into batches.
# pin_memory=True makes the DataLoader copy each batch into pinned (page-locked)
# host memory before returning it, which speeds up host-to-GPU transfers. Pinned
# memory is a limited resource, so make sure the machine has enough of it.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
# Only the training data is shuffled: validation must see the same batches every
# epoch so that the validation loss used for model selection is not affected by
# random batch composition.
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

### Model

In [None]:
class My_Model(nn.Module):
    '''Fully connected classifier that outputs 2 values (class logits) per sample.

    Three hidden stages of Linear -> BatchNorm1d -> ReLU -> Dropout(p=0.2) with
    256, 128 and 16 units are followed by a final Linear(16, 2) layer.
    '''

    def __init__(self, input_dim):
        super().__init__()
        # TODO: modify model's structure, be aware of dimensions.
        stack = []
        width_in = input_dim
        for width_out in (256, 128, 16):
            stack += [
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),  # batch normalisation after every hidden Linear
                nn.ReLU(),
                nn.Dropout(p=0.2),
            ]
            width_in = width_out
        stack.append(nn.Linear(width_in, 2))
        # Modules are created and registered in the same order as before, so the
        # state_dict keys ('layers.0.weight', ...) are unchanged.
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        # Returns the raw outputs of the last Linear layer, shape (B, 2).
        return self.layers(x)

### trainer

In [None]:
def trainer(train_loader, valid_loader, model, config, device):
    '''Train `model` with cross-entropy loss, keeping the best checkpoint.

    After every epoch the model is evaluated on `valid_loader`; whenever the
    validation loss strictly improves, the model's state_dict is saved to
    config['save_path']. Training ends after config['n_epochs'] epochs or as
    soon as config['early_stop'] consecutive epochs brought no improvement.

    Args:
        train_loader: Iterable of (features, labels) training batches.
        valid_loader: Iterable of (features, labels) validation batches.
        model: nn.Module returning class logits; its parameters are updated in
            place. It is expected to already live on `device`.
        config: Dict with the keys 'n_epochs', 'learning_rate', 'early_stop'
            and 'save_path'.
        device: Device every batch is moved to before the forward pass.

    Returns:
        None. Progress is printed; the best weights are written to disk.

    Notes:
        TensorBoard logging is disabled. Earlier revisions still called
        `writer.close()` although no writer was ever created, which raised a
        NameError on early stopping and at the end of training.
    '''
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=0.001)

    # Create the directory the checkpoint is written to (derived from
    # 'save_path' rather than hard-coded, so custom paths work as well).
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs = config['n_epochs']
    best_loss = math.inf
    early_stop_count = 0

    for epoch in range(n_epochs):
        # ---- training pass ----
        model.train()
        loss_sum, n_samples = 0.0, 0
        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)
        for X, y in train_pbar:
            optimizer.zero_grad()               # Reset gradients of the previous step.
            X, y = X.to(device), y.to(device)
            pred = model(X)
            # CrossEntropyLoss expects integer class indices as targets.
            loss = criterion(pred, y.to(torch.int64))
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.

            batch_loss = loss.detach().item()
            # Weight every batch by its size so a smaller last batch does not
            # distort the epoch mean.
            loss_sum += batch_loss * len(y)
            n_samples += len(y)

            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': batch_loss})
        mean_train_loss = loss_sum / n_samples

        # ---- validation pass ----
        model.eval() # Set your model to evaluation mode.
        loss_sum, n_samples = 0.0, 0
        with torch.no_grad():
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                loss = criterion(model(x), y.to(torch.int64))
                loss_sum += loss.item() * len(y)
                n_samples += len(y)
        mean_valid_loss = loss_sum / n_samples

        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

### 开始训练

In [None]:
# Weight initialisation
def init_weights(m):
    '''Draw the weights of an nn.Linear module from a normal distribution (std 0.01).

    Modules of any other type are left untouched, and so is the bias of a
    Linear module. Intended to be passed to `Module.apply`.
    '''
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, std=0.01)
    
# Build the network and move it to the device the batches are sent to.
model = My_Model(input_dim=x_train.shape[1])
model = model.to(device)
# Re-initialise the weight of every nn.Linear layer, then start training.
model.apply(init_weights)
trainer(train_loader, valid_loader, model, config, device)

### 预测

In [None]:
def predict(test_loader, model, device, id_list):
    '''Run `model` over `test_loader` and pair each prediction with an id.

    The model outputs are turned into probabilities with a softmax over
    dimension 1. For the i-th entry of `id_list` the i-th probability row is
    used, so the ids must be in the same order as the samples in the loader.

    Returns:
        A list of [id, predicted_class, probability_of_predicted_class] rows,
        one per entry of `id_list`.
    '''
    model.eval() # Set your model to evaluation mode.

    batch_probs = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            logits = model(x.to(device))
            batch_probs.append(torch.softmax(logits, dim=1).cpu())
    preds = torch.cat(batch_probs, dim=0).numpy()

    def as_row(row_idx, id_no):
        # Most likely class and the probability assigned to it.
        winner = preds[row_idx].argmax()
        return [id_no, winner.item(), preds[row_idx, winner].item()]

    return [as_row(row_idx, id_no) for row_idx, id_no in enumerate(id_list)]


In [None]:
def format_decimal(num):
    '''Render `num` in fixed-point notation with 13 decimal places (never scientific notation).'''
    return f"{num:.13f}"

def save_pred(preds, file):
    ''' Save predictions to the specified file

    `preds` is an iterable of rows indexed as [id, predicted class,
    probability]. The CSV gets the header id_no, predtype, predprob. For rows
    whose predicted class is 0 the written probability is 1.0 minus the stored
    one; for every other class the stored probability is written unchanged.
    '''
    header = ['id_no', 'predtype', 'predprob']
    with open(file, 'w', newline='') as fp:
        out = csv.writer(fp)
        out.writerow(header)
        for pred in preds:
            id_no = pred[0]
            label = pred[1]
            # Class 0 rows store the complement of the probability in pred[2].
            prob = pred[2] if label != 0 else 1.0 - pred[2]
            out.writerow([id_no, label, prob])


# The ids written to the submission are the first column of the test CSV.
# NOTE(review): the test features above were loaded from a different file
# ('../output/数据/tt_data.csv') -- confirm both files list the rows in the same order.
df = pd.read_csv('./test_data.csv')
id_list = df.iloc[:, 0].values.tolist()

# Rebuild the architecture and restore the best weights saved during training.
model = My_Model(input_dim=x_train.shape[1]).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(config['save_path'], map_location=device))

preds = predict(test_loader, model, device, id_list)
save_pred(preds, 'pred.csv')