In [2]:
%matplotlib inline
import torch
import torchvision
from torch import nn
import torchvision.transforms as transforms
from PIL import Image
import d2l_torch as d2l
import cnn_base as base
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import shutil
from torchvision.datasets import ImageFolder
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader
import time
from matplotlib import pyplot as plt
import math
import torch.nn.functional as F



In [2]:
# Debug-oriented CUDA settings, applied in one step.
# An empty CUDA_VISIBLE_DEVICES exposes no GPU to this process.
# NOTE(review): these are set after `import torch`; presumably they still take
# effect because CUDA is initialised lazily -- confirm.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_USE_CUDA_DSA': '1',
    'CUDA_VISIBLE_DEVICES': '',
})

In [3]:
# Root directory of the leaf-classification dataset; CSV and image paths are joined onto it.
data_dir = '../data/kaggle_leaves/classify-leaves/'

In [4]:
# Read the two CSV tables that ship with the dataset.
train_csv_path = os.path.join(data_dir, 'train.csv')
test_csv_path = os.path.join(data_dir, 'test.csv')
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [5]:
# train_data.iloc[0:4]

In [6]:
# len(train_data)

In [7]:
# test_data.iloc[0:4]

In [8]:
# Count the rows per label to inspect how balanced the classes are.
train_data.groupby('label').count()

Unnamed: 0_level_0,image
label,Unnamed: 1_level_1
abies_concolor,176
abies_nordmanniana,93
acer_campestre,80
acer_ginnala,86
acer_griseum,64
...,...
ulmus_parvifolia,99
ulmus_procera,58
ulmus_pumila,189
ulmus_rubra,235


In [9]:
# train_data.describe()

In [10]:
# Hold out 30% of the labelled rows, stratified by label so both parts keep the
# same class proportions.
# The seed is fixed so that Restart & Run All reproduces the same split:
# reorg_train_valid() copies images into train_image/ and test_image/ without
# clearing them, so a different split on a re-run could leave the same image
# in both folders.
train_img_label, test_img_label = train_test_split(
    train_data,
    test_size=0.3,
    stratify=train_data['label'],
    random_state=42,
)

In [11]:
# Summarise the training split (row count, unique values, most frequent entry).
train_img_label.describe()

Unnamed: 0,image,label
count,12847,12847
unique,12847,176
top,images/2324.jpg,maclura_pomifera
freq,1,247


In [12]:
# Summarise the held-out split (row count, unique values, most frequent entry).
test_img_label.describe()

Unnamed: 0,image,label
count,5506,5506
unique,5506,176
top,images/6960.jpg,maclura_pomifera
freq,1,106


In [13]:
# Peek at the first rows of the training split.
train_img_label.head()

Unnamed: 0,image,label
13556,images/13556.jpg,quercus_nigra
13216,images/13216.jpg,castanea_dentata
198,images/198.jpg,ulmus_pumila
12090,images/12090.jpg,pinus_virginiana
119,images/119.jpg,broussonettia_papyrifera


In [14]:
# Write each split to its own CSV file under data_dir (row index omitted).
for split_frame, csv_name in ((train_img_label, 'split_train.csv'),
                              (test_img_label, 'split_test.csv')):
    split_frame.to_csv(os.path.join(data_dir, csv_name), index=False)

In [15]:
#@save
def reorg_train_valid(img_label, train=True, root=None):
    """Copy images into one sub-folder per label.

    The resulting layout is ``<root>/<train_image|test_image>/<label>/<file>``.

    Args:
        img_label: DataFrame whose first column holds the image path relative
            to ``root`` and whose second column holds the label.
        train: If True, copy into ``train_image``; otherwise into ``test_image``.
        root: Directory that contains the source images and receives the
            output folder. Defaults to the notebook-level ``data_dir``.

    Note:
        Existing files in the target folders are left in place, so images
        copied by an earlier call with a different split are not removed.
    """
    if root is None:
        root = data_dir  # keep the original behaviour for existing callers

    image_folder = 'train_image' if train else 'test_image'
    split_image_folder = os.path.join(root, image_folder)
    print('split_image_folder:', split_image_folder)
    os.makedirs(split_image_folder, exist_ok=True)

    # Select columns by position explicitly instead of relying on the tuple
    # offsets of itertuples() (where position 0 is the row index).
    image_paths = img_label.iloc[:, 0]
    labels = img_label.iloc[:, 1]

    # Create every class directory once, rather than checking on every row.
    for label in labels.unique():
        os.makedirs(os.path.join(split_image_folder, label), exist_ok=True)

    for image_path, label in zip(image_paths, labels):
        shutil.copy(os.path.join(root, image_path),
                    os.path.join(split_image_folder, label))

In [16]:
# Copy the training-split images into per-label folders under train_image/.
reorg_train_valid(train_img_label, train=True)

split_image_folder: ../data/kaggle_leaves/classify-leaves/train_image


In [25]:
# Copy the held-out-split images into per-label folders under test_image/.
reorg_train_valid(test_img_label, train=False)

split_image_folder: ../data/kaggle_leaves/classify-leaves/test_image


## 创建划分好的训练集和测试集、数据增强

In [18]:
# Preprocessing / augmentation pipelines for the training and held-out splits.
IMAGE_SIZE = (224, 224)
# Per-channel mean / std used for normalisation (these match the ImageNet
# statistics commonly used with torchvision models).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

h_flip = transforms.RandomHorizontalFlip(p=0.5)
v_flip = transforms.RandomVerticalFlip(p=0.5)
# NOTE: shape_aug and brightness_aug are defined here but are not part of
# train_augs below.
shape_aug = transforms.RandomResizedCrop(IMAGE_SIZE, scale=(0.1, 1), ratio=(0.5, 2))
brightness_aug = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0)
train_augs = transforms.Compose([h_flip, v_flip])  # image augmentation

# Resize to an explicit (height, width): Resize(224) would only fix the
# shorter edge, so a non-square image would produce a tensor of a different
# shape and could not be stacked into a batch.
train_data_trans = transforms.Compose([transforms.Resize(IMAGE_SIZE),
                                       train_augs,
                                       transforms.ToTensor(),
                                       transforms.Normalize(NORM_MEAN, NORM_STD)])
# The held-out pipeline applies no random augmentation.
test_data_trans = transforms.Compose([transforms.Resize(IMAGE_SIZE),
                                      transforms.ToTensor(),
                                      transforms.Normalize(NORM_MEAN, NORM_STD)])


In [26]:
# Build one ImageFolder dataset per split from the per-label directories.
# NOTE: this rebinds train_data / test_data, which previously held the
# DataFrames read from train.csv / test.csv.
train_image_root = os.path.join(data_dir, 'train_image')
test_image_root = os.path.join(data_dir, 'test_image')
train_data = ImageFolder(train_image_root, transform=train_data_trans, target_transform=None)
test_data = ImageFolder(test_image_root, transform=test_data_trans, target_transform=None)

In [20]:
# Save ImageFolder's label -> index mapping to a CSV file.
label_index_pairs = list(train_data.class_to_idx.items())
id_code = pd.DataFrame(label_index_pairs, columns=['label', 'id'])
id_code_path = os.path.join(data_dir, 'id_code.csv')
id_code.to_csv(id_code_path, index=False)

In [21]:
# Show the (label, index) pairs assigned by ImageFolder.
train_data.class_to_idx.items()

dict_items([('abies_concolor', 0), ('abies_nordmanniana', 1), ('acer_campestre', 2), ('acer_ginnala', 3), ('acer_griseum', 4), ('acer_negundo', 5), ('acer_palmatum', 6), ('acer_pensylvanicum', 7), ('acer_platanoides', 8), ('acer_pseudoplatanus', 9), ('acer_rubrum', 10), ('acer_saccharinum', 11), ('acer_saccharum', 12), ('aesculus_flava', 13), ('aesculus_glabra', 14), ('aesculus_hippocastamon', 15), ('aesculus_pavi', 16), ('ailanthus_altissima', 17), ('albizia_julibrissin', 18), ('amelanchier_arborea', 19), ('amelanchier_canadensis', 20), ('amelanchier_laevis', 21), ('asimina_triloba', 22), ('betula_alleghaniensis', 23), ('betula_jacqemontii', 24), ('betula_lenta', 25), ('betula_nigra', 26), ('betula_populifolia', 27), ('broussonettia_papyrifera', 28), ('carpinus_betulus', 29), ('carpinus_caroliniana', 30), ('carya_cordiformis', 31), ('carya_glabra', 32), ('carya_ovata', 33), ('carya_tomentosa', 34), ('castanea_dentata', 35), ('catalpa_bignonioides', 36), ('catalpa_speciosa', 37), ('ced

In [27]:
batch_size = 128
# Shuffle only the training batches. The held-out loader keeps the dataset
# order so evaluation outputs line up with the same samples on every run.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [28]:
# Report, for each split, the number of batches and the number of samples.
size_report = (
    ('Train_data:',),
    ('Number of train_dataloader:\t', len(train_dataloader)),  # number of batches
    ('Number of train_dataset:\t', len(train_dataloader.dataset)),  # number of training samples
    ('Test_data:',),
    ('Number of test_dataloader:\t', len(test_dataloader)),  # number of batches
    ('Number of test_dataset:\t', len(test_dataloader.dataset)),  # number of held-out samples
)
for fields in size_report:
    print(*fields)

# Draw a single training batch and report the tensor shapes.
train_X, train_y = next(iter(train_dataloader))
shape_report = (
    ('Shape:',),
    ('The shape of train_features in a batch run:\t', train_X.shape),
    ('The shape of train_labels in a batch run:\t', train_y.shape, '\n'),
)
for fields in shape_report:
    print(*fields)

Train_data:
Number of train_dataloader:	 101
Number of train_dataset:	 12847
Test_data:
Number of test_dataloader:	 44
Number of test_dataset:	 5506
Shape:
The shape of train_features in a batch run:	 torch.Size([128, 3, 224, 224])
The shape of train_labels in a batch run:	 torch.Size([128]) 



<font color='red'> 定义累加器类和记录多次运行时间的Timer类（来自d2l）</font>

In [6]:
# Scratch check: append the same row three times to an initially empty frame.
df = pd.DataFrame(columns=['name', 'number'])
for repeat in range(3):
    # len(df) is the next unused integer label, so .loc adds a new row.
    df.loc[len(df)] = [5, 6]
df

Unnamed: 0,name,number
0,5,6
1,5,6
2,5,6
