In [1]:
import json
import os
from typing import Dict, List, Union

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchvision
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer as EasyTrainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch import Tensor, nn
from torch.optim import SGD, Adam, Optimizer
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset

from k12libs.utils.nb_easy import k12ai_start_html
from k12libs.utils.nb_easy import MDURL

from k12libs.utils.nb_easy import k12ai_build_model

ImportError: cannot import name 'k12ai_build_model'

In [None]:
# Root directory that the dataset index files and images are read from.
DATA_ROOT = '/data/datasets'


def k12ai_start_viznet(height):
    """Open the visual network editor page.

    The page URL is built on top of ``MDURL`` and handed, together with
    ``height``, to ``k12ai_start_html``; that helper's return value is
    returned unchanged.
    """
    return k12ai_start_html(
        f'{MDURL}?jfile=simple&flask=http://116.85.5.40:8117/k12ai/notebook/message&tag=cls_custom_base_mnist',
        height=height,
    )

In [None]:
## Training takes only four lines of code.
# NOTE(review): k12ai_load_dataset, k12ai_load_model and k12ai_trainer are not
# imported anywhere in this notebook - presumably they live in k12libs; confirm
# and add them to the import cell.
dataset = k12ai_load_dataset('mnist', num_workers=4)
model   = k12ai_load_model('mnist', pretrained=True)
trainer = k12ai_trainer(max_epochs=100)
# The PyTorch loader class is spelled `DataLoader`; `Dataloader` does not exist.
trainer.fit(model, DataLoader(dataset, batch_size=32))

-----------------------------

# <div align="center"> Code-1 </div>

-----------------------------

## 预处理数据集

In [None]:
class EasyDataset(torch.utils.data.Dataset):
    """Image-classification dataset driven by a ``<phase>.json`` index file.

    The index is a JSON list whose entries carry an ``image_path`` (joined
    onto ``data_root``) and a ``label``.  Entries whose image file does not
    exist on disk are skipped while the index is loaded.

    NOTE(review): ``ImageHelper`` and ``DataContainer`` are used by
    ``__getitem__`` but are not defined or imported in this notebook -
    presumably they come from the project's helper library; confirm.
    """

    def __init__(self, data_root=None, phase=None, transforms=None):
        self.transforms = transforms  # optional callable applied to every loaded image
        self.img_list = []            # paths of the images that exist on disk
        self.label_list = []          # labels, aligned index-for-index with img_list

        # Load the JSON index that belongs to this phase.
        index_file = os.path.join(data_root, '{}.json'.format(phase))
        with open(index_file, 'r') as fp:
            for entry in json.load(fp):
                path = os.path.join(data_root, entry['image_path'])
                if os.path.exists(path):
                    self.img_list.append(path)
                    self.label_list.append(entry['label'])

    def __getitem__(self, index):
        """Return ``{'img': ..., 'label': ...}`` for sample ``index``, both wrapped in DataContainer."""
        image = ImageHelper.read_image(self.img_list[index])
        target = self.label_list[index]

        transform = self.transforms
        if transform is not None:
            image = transform(image)

        return {
            'img': DataContainer(image, stack=True),
            'label': DataContainer(target, stack=True),
        }

    def __len__(self):
        """Number of index entries whose image file was found."""
        return len(self.img_list)

## 数据变换

In [None]:
# Per-channel normalisation statistics: mean and standard deviation (not variance).
# NOTE(review): these values match the commonly used ImageNet statistics -
# confirm they suit the dataset stored under DATA_ROOT.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]
# Every image is resized to this (height, width) before being turned into a tensor.
IMAGE_SIZE = (256, 256)


def _build_transform(augment):
    """Build the preprocessing pipeline for one phase.

    Args:
        augment: when True a random horizontal flip is prepended.  Only the
            training phase should augment; validation and test must be
            deterministic so that their metrics are comparable between runs.

    Returns:
        A ``torchvision.transforms.Compose`` that resizes, converts the PIL
        image to a tensor and normalises it.
    """
    steps = []
    if augment:
        steps.append(torchvision.transforms.RandomHorizontalFlip())  # random horizontal flip
    steps.extend([
        torchvision.transforms.Resize(IMAGE_SIZE),                   # rescale the image
        torchvision.transforms.ToTensor(),                           # PIL image -> tensor
        torchvision.transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
    ])
    return torchvision.transforms.Compose(steps)


transforms = {
    'train': _build_transform(augment=True),   # training: with augmentation
    'valid': _build_transform(augment=False),  # validation: deterministic
    'test': _build_transform(augment=False),   # test: deterministic
}

## 配置超参数

In [None]:
### Number of training epochs (max_epoch)
max_epoch = 10

### Loss function: cross entropy (CE)
reduction = 'mean'  # reduce the per-sample losses by averaging them
criterion = nn.CrossEntropyLoss(reduction=reduction)

### Optimiser: stochastic gradient descent (SGD)
# NOTE(review): custom_model is only assigned in a later cell of this notebook
# (k12ai_build_model), so this cell fails on a fresh top-to-bottom run - confirm
# which model's parameters are meant to be optimised here.
optimizer = SGD(
    custom_model.parameters(),
    nesterov=True,      # use Nesterov momentum
    momentum=0.9,       # momentum factor
    weight_decay=1e-6,  # weight decay, keeps the parameters small to limit over-fitting
    lr=0.01,            # base learning rate
)

### Learning-rate schedule (optional): fixed-step decay (StepLR)
scheduler = StepLR(
    optimizer,
    gamma=0.6,    # multiply the LR by 0.6 ...
    step_size=2,  # ... every 2 epochs
)

## 定义模型

In [None]:
class EasyModel(torch.nn.Module):
    """Small CNN classifier: two conv layers, one pooling stage, two linear layers.

    ``forward`` takes a batch of 3-channel images and returns, for each image,
    the log-probabilities (``log_softmax`` over dim 1) of 10 classes.
    """

    # Spatial size of the feature map fed to ``fc1``: 64 channels * 12 * 12 = 9216.
    _POOLED_SIZE = (12, 12)

    def __init__(self):
        # Zero-argument super(): the original passed ``EasyNetwork``, a class that
        # does not exist, so constructing the model raised NameError.
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1)  # convolution layers extract image features
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)   # channel-wise dropout on the 4-D feature map
        # Element-wise dropout: the tensor is already flattened to (N, 128) here,
        # which is not a valid input for Dropout2d.
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)      # fully connected layers
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        # Force a 12x12 feature map so that fc1 always receives 9216 features.
        # This is the identity when the map is already 12x12 (e.g. 28x28 inputs)
        # and makes the network usable with other sizes such as the 256x256
        # images produced by this notebook's transforms.
        x = F.adaptive_max_pool2d(x, self._POOLED_SIZE)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)  # log-probability of every class

## 完整训练配置

In [None]:
### Datasets: one EasyDataset per phase, each paired with that phase's transform
datasets = {
    phase: EasyDataset(DATA_ROOT, phase, transforms[phase])
    for phase in ('train', 'valid', 'test')  # training / validation / test
}

### Hyper-parameters handed to the training module
hyparams = dict(
    optimizer=optimizer,  # gradient optimisation method
    scheduler=scheduler,  # learning-rate schedule
    criterion=criterion,  # loss function
)

In [None]:
class EasyModule(LightningModule):
    """LightningModule that trains a classifier from externally supplied parts.

    Args:
        datasets: mapping with the keys ``'train'``, ``'valid'`` and ``'test'``.
        hyparams: mapping with the keys ``'optimizer'``, ``'scheduler'`` and
            ``'criterion'``.
        model: the network to train; it is called as ``model(images)``.
    """

    def __init__(self, datasets, hyparams, model):
        # Zero-argument super(): the original named the unrelated EasyModel class.
        super().__init__()
        self.datasets = datasets
        self.hyparams = hyparams
        self.model = model

    def train_dataloader(self):
        # Shuffle so that every epoch sees the training samples in a new order.
        return DataLoader(self.datasets['train'], batch_size=64, shuffle=True, num_workers=4)

    def val_dataloader(self):
        # Lightning looks for the hook named ``val_dataloader``.
        return DataLoader(self.datasets['valid'], batch_size=64, num_workers=4)

    # Previous (non-hook) name, kept so existing callers keep working.
    valid_dataloader = val_dataloader

    def test_dataloader(self):
        return DataLoader(self.datasets['test'], batch_size=64)

    # Hand the optimiser and the LR schedule to Lightning before training starts.
    def configure_optimizers(self):
        # Two lists (optimisers, schedulers): a bare (optimizer, scheduler) tuple
        # is not one of the return formats Lightning documents.
        return [self.hyparams['optimizer']], [self.hyparams['scheduler']]

    def _compute_loss(self, batch):
        """Run the model on one batch and return the criterion's loss.

        Accepts either an ``(images, targets)`` pair or a dict with the keys
        ``'img'`` / ``'label'`` as produced by EasyDataset.
        NOTE(review): EasyDataset wraps both values in DataContainer; how those
        are collated/unwrapped is not visible in this notebook - confirm.
        """
        if isinstance(batch, dict):
            images, targets = batch['img'], batch['label']
        else:
            images, targets = batch
        outputs = self(images)
        return self.hyparams['criterion'](outputs, targets)

    # Called for every training iteration.
    def training_step(self, batch, batch_idx):
        return {'loss': self._compute_loss(batch)}

    # Called for every validation iteration.
    def validation_step(self, batch, batch_idx):
        return {'val_loss': self._compute_loss(batch)}

    def validation_epoch_end(self, outputs):
        # Aggregate the per-batch losses so that callbacks monitoring 'val_loss'
        # (early stopping, checkpointing) have a value to look at.
        if not outputs:
            return {}
        avg_loss = torch.stack([out['val_loss'] for out in outputs]).mean()
        return {'val_loss': avg_loss, 'log': {'val_loss': avg_loss}}

    # Called for every test iteration; required for trainer.test().
    def test_step(self, batch, batch_idx):
        return {'test_loss': self._compute_loss(batch)}

    def forward(self, x, *args, **kwargs):
        return self.model(x)

## 训练启动/测试

In [None]:
trainer = k12ai_trainer(
    max_epochs=100,        # maximum number of training epochs
    val_check_interval=10, # how often validation runs (every 10; NOTE(review): Lightning reads an int as training batches - confirm)
    early_stop_callback=EarlyStopping(monitor='val_loss', min_delta=1.0), # watch val_loss to stop training early
    checkpoint_callback=ModelCheckpoint('/cache') # model checkpoints are saved here
)

# EasyModule is the LightningModule wrapper and EasyModel the network it trains;
# the original call used `EasyModel(..., EasyNetwork())`, names that do not match
# the classes defined in this notebook.
model = EasyModule(datasets, hyparams, EasyModel())

# Start training
trainer.fit(model)

# Start testing
trainer.test(model)

-----------------------------

# <div align="center"> Code-2 </div>

-----------------------------

## 面罩探测数据集

In [None]:
class MaskDetectionDataset(torch.utils.data.Dataset):
    """Detection dataset built from a DataFrame of box annotations.

    The frame is read through the columns ``name`` (image file name),
    ``x1``/``y1``/``x2``/``y2`` (box corners) and ``classname``.  One sample
    is one image together with all of its boxes.

    Args:
        dataframe: annotation table, one row per box.
        phase: stored on the instance; not otherwise used by this class.
        transforms: optional callable applied to the image only.
        image_dir: directory the image files are read from; defaults to
            ``DATA_ROOT``.  (The original read an undefined global
            ``image_dir``, which raised NameError.)

    NOTE(review): ``cv2`` and the ``class_to_int`` mapping (class name -> int)
    are used below but are not imported/defined anywhere in this notebook -
    confirm where they are supposed to come from.
    """

    def __init__(self, dataframe, phase='train', transforms=None, image_dir=None):
        super().__init__()

        self.image_names = dataframe["name"].unique()
        self.df = dataframe
        self.image_dir = DATA_ROOT if image_dir is None else image_dir
        self.transforms = transforms
        self.phase = phase

    def __getitem__(self, index: int):
        """Return ``(image, target, image_name)`` for the ``index``-th distinct image."""
        # Image name and every annotation row that belongs to it.
        image_name = self.image_names[index]
        records = self.df[self.df["name"] == image_name]

        # Load the image as float32 RGB scaled to [0, 1].
        image_path = os.path.join(self.image_dir, image_name)
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(f'cannot read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        # Box coordinates of the masks and their integer class labels, as tensors.
        boxes = torch.as_tensor(records[['x1', 'y1', 'x2', 'y2']].values, dtype=torch.float32)
        labels = torch.as_tensor(
            [class_to_int[classname] for classname in records['classname']],
            dtype=torch.int64,
        )
        target = {'boxes': boxes, 'labels': labels}

        if self.transforms:
            image = self.transforms(image)

        return image, target, image_name

    def __len__(self):
        """Number of distinct image names in the annotation table."""
        return len(self.image_names)

## 采用预制的faster_rcnn模型

In [None]:
# Number of outputs requested from the replacement predictor head.
NUM_CLASSES = 2

# Load the pre-built detector and the predictor-head factory.
model = k12ai_load_model('fasterrcnn_resnet50', pretrained=True)
head  = k12ai_load_model('fasterrcnn_predictor')

# Swap the detector's box predictor for a fresh head that keeps the existing
# classifier's input feature size.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = head(in_features, NUM_CLASSES)

## FasterRCNN

In [None]:
class FasterRCNN(nn.Module):
    """Detector assembled from a backbone, an RPN, ROI generation/sampling and a bbox head.

    NOTE(review): VGGModel, NaiveRPN, RPNTargetAssigner, FRROIGenerator,
    FRROISampler, BBoxHead and BASE_LOSS_DICT are not defined or imported in
    this notebook - presumably they come from the detection library this class
    was taken from; confirm.
    """

    def __init__(self, configer):
        """Build all sub-modules from ``configer`` and read the loss-weight table."""
        super(FasterRCNN, self).__init__()
        self.configer = configer
        self.backbone, self.classifier = VGGModel(configer)()
        self.rpn = NaiveRPN(configer)
        self.rpn_target_assigner = RPNTargetAssigner(configer)
        self.roi_generator = FRROIGenerator(configer)
        self.roi_sampler = FRROISampler(configer)
        self.bbox_head = BBoxHead(configer, self.classifier)
        # Loss name -> weight; forward() only emits the losses whose name appears here.
        self.valid_loss_dict = configer.get('loss', 'loss_weights', configer.get('loss.loss_type'))

    def forward(self, data_dict):
        """Run the detector on one batch.

        Reads ``data_dict['img']`` and ``data_dict['meta']``, plus
        ``data_dict['bboxes']`` and ``data_dict['labels']`` when targets are built.

        Returns:
            ``out_dict`` alone when the configured phase is ``'test'``;
            otherwise the tuple ``(out_dict, loss_dict)``.  ``out_dict`` holds
            ``'test_group'`` when the module is in eval mode and
            ``'train_group'`` whenever the training branch runs.
        """
        out_dict = dict()
        x = self.backbone(data_dict['img'])
        feat_list, rpn_locs, rpn_scores = self.rpn(x)
        # Eval mode: generate ROIs with the test-time NMS settings and score them.
        if not self.training:
            indices_and_rois, test_rois_num = self.roi_generator(feat_list, rpn_locs, rpn_scores,
                                                                 self.configer.get('rpn', 'n_test_pre_nms'),
                                                                 self.configer.get('rpn', 'n_test_post_nms'),
                                                                 data_dict['meta'])
            roi_cls_locs, roi_scores = self.bbox_head(x, indices_and_rois, data_dict['meta'])
            out_dict['test_group'] = [indices_and_rois, roi_cls_locs, roi_scores, test_rois_num]

        # Pure inference: no targets or losses are built.
        if self.configer.get('phase') == 'test':
            return out_dict

        # Build RPN targets, generate ROIs with the train-time NMS settings and sample them.
        gt_rpn_locs, gt_rpn_labels = self.rpn_target_assigner(feat_list, data_dict['bboxes'], data_dict['meta'])
        train_indices_and_rois, _ = self.roi_generator(feat_list, rpn_locs, rpn_scores,
                                                       self.configer.get('rpn', 'n_train_pre_nms'),
                                                       self.configer.get('rpn', 'n_train_post_nms'),
                                                       data_dict['meta'])
        sample_rois, gt_roi_bboxes, gt_roi_labels = self.roi_sampler(train_indices_and_rois,
                                                                     data_dict['bboxes'],
                                                                     data_dict['labels'],
                                                                     data_dict['meta'])

        sample_roi_locs, sample_roi_scores = self.bbox_head(x, sample_rois, data_dict['meta'])
        # Reshape to (num_rois, num_classes, 4), then keep for every ROI only the
        # 4 values that belong to its ground-truth class.
        sample_roi_locs = sample_roi_locs.contiguous().view(-1, self.configer.get('data', 'num_classes'), 4)
        sample_roi_locs = sample_roi_locs[
            torch.arange(0, sample_roi_locs.size()[0]).long().to(sample_roi_locs.device),
            gt_roi_labels.long().to(sample_roi_locs.device)].contiguous().view(-1, 4)
        out_dict['train_group'] = [sample_rois, sample_roi_locs, sample_roi_scores]
        # Each enabled loss is described by its inputs ('params'), a loss id taken
        # from BASE_LOSS_DICT ('type') and its weight ('weight').
        # NOTE(review): the type/weight tensors are created with torch.cuda.*Tensor,
        # so this branch needs a CUDA device - confirm CPU runs are not expected.
        loss_dict = dict()
        if 'rpn_loc_loss' in self.valid_loss_dict:
            loss_dict['rpn_loc_loss'] = dict(
                params=[rpn_locs, gt_rpn_locs, gt_rpn_labels, self.configer.get('loss.params.rpn_sigma')],
                type=torch.cuda.LongTensor([BASE_LOSS_DICT['smooth_l1_loss']]),
                weight=torch.cuda.FloatTensor([self.valid_loss_dict['rpn_loc_loss']])
            )
        if 'rpn_cls_loss' in self.valid_loss_dict:
            loss_dict['rpn_cls_loss'] = dict(
                params=[rpn_scores, gt_rpn_labels],
                type=torch.cuda.LongTensor([BASE_LOSS_DICT['ce_loss']]),
                weight=torch.cuda.FloatTensor([self.valid_loss_dict['rpn_cls_loss']])
            )
        if 'roi_loc_loss' in self.valid_loss_dict:
            loss_dict['roi_loc_loss'] = dict(
                params=[sample_roi_locs, gt_roi_bboxes, gt_roi_labels, self.configer.get('loss.params.roi_sigma')],
                type=torch.cuda.LongTensor([BASE_LOSS_DICT['smooth_l1_loss']]),
                weight=torch.cuda.FloatTensor([self.valid_loss_dict['roi_loc_loss']])
            )
        if 'roi_cls_loss' in self.valid_loss_dict:
            loss_dict['roi_cls_loss'] = dict(
                params=[sample_roi_scores, gt_roi_labels],
                type=torch.cuda.LongTensor([BASE_LOSS_DICT['ce_loss']]),
                weight=torch.cuda.FloatTensor([self.valid_loss_dict['roi_cls_loss']])
            )
        return out_dict, loss_dict

## 训练回调函数

In [None]:
# Stop training early once the monitored metric stops improving:
#   monitor   - metric to watch
#   mode      - 'auto', 'min' (metric should fall) or 'max' (metric should rise)
#   patience  - tolerance: stop after 2 epochs without improvement
#   min_delta - smallest change that counts as an improvement
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=2, min_delta=0.2)

# Checkpoint settings:
#   filepath          - checkpoint directory
#   save_last         - also keep the model of the last epoch
#   save_weights_only - False keeps more than just the model weights
#   save_top_k        - 0: save nothing, -1: save every model, otherwise: regular top-k saving
#   period            - checkpoint saving interval
model_checkpoint = ModelCheckpoint(
    filepath='/cache/det', save_last=True, save_weights_only=False, save_top_k=2, period=10,
)

## 启动训练

In [None]:
trainer = k12ai_trainer(
    max_epochs=100,     # maximum number of outer training loops (epochs)
    max_steps=10000,    # maximum number of training iterations
    resume_from_checkpoint='/cache/det/best.pk', # resume training from this checkpoint
    early_stop_callback=early_stopping,
    checkpoint_callback=model_checkpoint,
)

# Read the annotation table from disk.  os.path.join inserts the separator that
# plain concatenation dropped (DATA_ROOT has no trailing slash).
df = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))

# NOTE(review): MaskDetectionDataset yields a variable number of boxes per image,
# which the default batch collation presumably cannot stack - a custom
# collate_fn is likely needed; confirm.
trainloader = DataLoader(  # the class is spelled `DataLoader`, not `Dataloader`
    MaskDetectionDataset(df),
    batch_size = 32,  # number of samples fed to the model at once
    shuffle = True,   # reshuffle the data every epoch
    num_workers = 4   # number of worker processes reading the data
)

trainer.fit(model, trainloader)

-----------------------------

# <div align="center"> Code-3 </div>

-----------------------------

## 用户自定义模型

In [None]:
# Open the visual network editor; 800 is passed through as the `height` of the embedded page.
k12ai_start_viznet(height=800)

In [None]:
# Text description of the user-defined network, consumed by k12ai_build_model in
# a later cell.  Each `layer` entry names the layer, its `layer_mode`, the layer
# it takes its input from, the next layer, and the layer's parameters.
# Layers in order: three 3x3 convolutions (3->30->30->30 channels), a max-pool,
# a convolution (30->50), a max-pool, a flatten and three linear layers
# (12800->1000->100->10).
# NOTE(review): the Flatten layer is wrapped in a `vulkan{...}` entry while the
# others use conv/pool/linear - confirm this is what the model builder expects.
custom_model_str = '''plain_net {
 name: "hproject_model_name"

layer{
  conv{
    name : "Conv2d_6831"
    layer_builder : "NNTorchLayer"
    layer_mode : CONV2D
    inputs : "x"
    outputs : "Conv2d_7510"
    layer_params {
      in_channels : "3"
      out_channels : "30"
      kernel_size : "3"
    }
  }
}
layer{
  conv{
    name : "Conv2d_7510"
    layer_builder : "NNTorchLayer"
    layer_mode : CONV2D
    inputs : "Conv2d_6831"
    outputs : "Conv2d_5900"
    layer_params {
      in_channels : "30"
      out_channels : "30"
      kernel_size : "3"
    }
  }
}
layer{
  conv{
    name : "Conv2d_5900"
    layer_builder : "NNTorchLayer"
    layer_mode : CONV2D
    inputs : "Conv2d_7510"
    outputs : "MaxPool2d_7843"
    layer_params {
      in_channels : "30"
      out_channels : "30"
      kernel_size : "3"
    }
  }
}
layer{
  pool{
    name : "MaxPool2d_7843"
    layer_builder : "NNTorchLayer"
    layer_mode : MAXPOOL2D
    inputs : "Conv2d_5900"
    outputs : "Conv2d_4074"
    layer_params {
      kernel_size : "3"
      stride : "1"
    }
  }
}
layer{
  conv{
    name : "Conv2d_4074"
    layer_builder : "NNTorchLayer"
    layer_mode : CONV2D
    inputs : "MaxPool2d_7843"
    outputs : "MaxPool2d_5315"
    layer_params {
      in_channels : "30"
      out_channels : "50"
      kernel_size : "3"
    }
  }
}
layer{
  pool{
    name : "MaxPool2d_5315"
    layer_builder : "NNTorchLayer"
    layer_mode : MAXPOOL2D
    inputs : "Conv2d_4074"
    outputs : "Flatten_5235"
    layer_params {
      kernel_size : "3"
      stride : "1"
    }
  }
}
layer{
  vulkan{
    name : "Flatten_5235"
    layer_builder : "NNTorchLayer"
    layer_mode : FLATTEN
    inputs : "MaxPool2d_5315"
    outputs : "Linear_2653"
    layer_params {
      start_dim : "1"
    }
  }
}
layer{
  linear{
    name : "Linear_2653"
    layer_builder : "NNTorchLayer"
    layer_mode : LINEAR
    inputs : "Flatten_5235"
    outputs : "Linear_5215"
    layer_params {
      in_features : "12800"
      out_features : "1000"
    }
  }
}
layer{
  linear{
    name : "Linear_5215"
    layer_builder : "NNTorchLayer"
    layer_mode : LINEAR
    inputs : "Linear_2653"
    outputs : "Linear_9970"
    layer_params {
      in_features : "1000"
      out_features : "100"
    }
  }
}
layer{
  linear{
    name : "Linear_9970"
    layer_builder : "NNTorchLayer"
    layer_mode : LINEAR
    inputs : "Linear_5215"
    outputs : "Output_6954"
    layer_params {
      in_features : "100"
      out_features : "10"
    }
  }
}
}'''

In [None]:
# Translate the user's model description (custom_model_str) into a model object.
custom_model = k12ai_build_model(custom_model_str)

In [None]:
# Adapt the dataset pre-processing to your own needs.
class MaskDataset(Dataset):
    """Rows of a DataFrame exposed as ``{'image': ..., 'mask': ...}`` samples.

    Label convention:
        0 = no mask
        1 = wearing a mask

    Args:
        dataFrame: table with an ``image`` and a ``mask`` column.
        transforms: callable applied to ``row['image']``; defaults to
            torchvision's ``ToTensor`` because the model needs tensor input.
    """
    def __init__(self, dataFrame, transforms=None):
        self.dataFrame = dataFrame

        if transforms is None:
            # Fully qualified names: bare `Compose` / `ToTensor` were never imported.
            transforms = torchvision.transforms.Compose([
                torchvision.transforms.ToTensor()  # input must be a Tensor before it reaches the model
            ])
        self.transforms = transforms

    def __getitem__(self, key):
        """Return the transformed image and its label as a 1-element int64 tensor."""
        row = self.dataFrame.iloc[key]
        return {
            'image': self.transforms(row['image']),
            # torch.tensor / torch.long: the bare names `tensor` and `long` do not
            # exist (`long` is not a Python 3 builtin).
            'mask': torch.tensor([row['mask']], dtype=torch.long),
        }

    def __len__(self):
        """Number of rows in the table."""
        return len(self.dataFrame.index)
    
# You can implement your own model framework here.
class MaskModelWrapper(LightningModule):
    """LightningModule that trains/evaluates ``model`` on the mask CSV of one phase.

    Args:
        model: network mapping an image batch to per-class scores.
        phase: ``'train'`` reads ``train.csv`` and splits it into train/validation
            sets; any other value reads ``<phase>.csv`` as the test set.
    """

    def __init__(self, model, phase='train'):
        super().__init__()
        # os.path.join inserts the separator that plain concatenation dropped
        # (DATA_ROOT has no trailing slash).
        self.df = pd.read_csv(os.path.join(DATA_ROOT, f"{phase}.csv"))
        self.model = model
        self.phase = phase
        self.trainDF, self.validDF, self.testDF = None, None, None
        # Loss used by every step; the original never defined this attribute.
        self.crossEntropyLoss = nn.CrossEntropyLoss()

    def prepare_data(self) -> None:
        """Build the datasets for the configured phase."""
        if self.phase == 'train':
            # Split the data into train / validation parts.
            # NOTE(review): train_test_split is not imported anywhere in this
            # notebook - presumably sklearn.model_selection; add it to the import cell.
            train, valid = train_test_split(
                self.df,
                test_size=0.3,           # 70% train vs 30% validation
                random_state=41,         # fixed seed for the split
                stratify=self.df['mask'] # the classes are imbalanced, so stratify on the label
            )
            self.trainDF = MaskDataset(train)
            self.validDF = MaskDataset(valid)
        else:
            self.testDF = MaskDataset(self.df)

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.trainDF, batch_size=32, shuffle=True, num_workers=4)

    def val_dataloader(self) -> DataLoader:
        # `validDF` is the attribute prepare_data fills; `validateDF` never existed.
        return DataLoader(self.validDF, batch_size=32, num_workers=4)

    def test_dataloader(self) -> DataLoader:
        # Testing must read the test set, not the validation set.
        return DataLoader(self.testDF, batch_size=32, num_workers=4)

    # Define your own optimisation method (this was defined twice in the original).
    def configure_optimizers(self) -> Optimizer:
        return Adam(self.parameters(), lr=0.00001)

    def _forward_loss(self, batch: dict):
        """Run the model on a batch and return ``(outputs, labels, loss)``."""
        inputs, labels = batch['image'], batch['mask']
        labels = labels.flatten()
        outputs = self.forward(inputs)
        loss = self.crossEntropyLoss(outputs, labels)
        return outputs, labels, loss

    @staticmethod
    def _accuracy(outputs: Tensor, labels: Tensor) -> Tensor:
        """Fraction of samples whose highest-scoring class equals the label."""
        predictions = torch.argmax(outputs, dim=1)
        return (predictions == labels).float().mean()

    # Training: every iteration feeds one batch to the model and computes the loss.
    def training_step(self, batch: dict, _batch_idx: int) -> Dict[str, Tensor]:
        _, _, loss = self._forward_loss(batch)

        tensorboardLogs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboardLogs}

    # Validation: every iteration computes the loss plus metrics such as accuracy.
    def validation_step(self, batch: dict, _batch_idx: int) -> Dict[str, Tensor]:
        outputs, labels, loss = self._forward_loss(batch)
        return {'val_loss': loss, 'val_acc': self._accuracy(outputs, labels)}

    # End of validation: `outputs` holds every validation_step result; custom
    # metrics can be aggregated here.
    def validation_epoch_end(self, outputs: List[Dict[str, Tensor]]) \
            -> Dict[str, Union[Tensor, Dict[str, Tensor]]]:
        if not outputs:
            return {}
        avgLoss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avgAcc = torch.stack([x['val_acc'] for x in outputs]).mean()
        tensorboardLogs = {'val_loss': avgLoss, 'val_acc': avgAcc}
        return {'val_loss': avgLoss, 'log': tensorboardLogs}

    # Testing mirrors validation; trainer.test() requires a test_step.
    def test_step(self, batch: dict, _batch_idx: int) -> Dict[str, Tensor]:
        outputs, labels, loss = self._forward_loss(batch)
        return {'test_loss': loss, 'test_acc': self._accuracy(outputs, labels)}

    def test_epoch_end(self, outputs: List[Dict[str, Tensor]]) \
            -> Dict[str, Union[Tensor, Dict[str, Tensor]]]:
        if not outputs:
            return {}
        avgLoss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avgAcc = torch.stack([x['test_acc'] for x in outputs]).mean()
        tensorboardLogs = {'test_loss': avgLoss, 'test_acc': avgAcc}
        return {'test_loss': avgLoss, 'log': tensorboardLogs}

    def forward(self, x, *args, **kwargs):
        return self.model(x)

In [None]:
## Training
tb_logger = TensorBoardLogger(save_dir='/cache/logdir')
trainer = k12ai_trainer(
    logger=[tb_logger], # loggers that receive the training logs
    gpus=1, # NOTE(review): in PyTorch Lightning an int here is the number of GPUs to use, not a device index - confirm k12ai_trainer forwards it unchanged
    max_epochs=100
)
trainer.fit(MaskModelWrapper(custom_model, 'train'))

In [None]:
## Testing: wrap the same custom_model with the 'test' phase and run the trainer's test loop.
trainer.test(MaskModelWrapper(custom_model, 'test'))