# 1. 모듈 로딩

In [None]:
#@title
# 0. 다운그레이드
!pip install -U efficientnet
!pip install tensorflow==2.1.0
!pip install keras==2.3.1
!pip uninstall h5py
!pip install h5py==2.10.0

In [None]:
#@title
# 1. 모듈 로딩
import os
import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

import seaborn as sns
sns.set_style('darkgrid')

from keras.preprocessing import image
from keras import (Input, Model, layers, losses, optimizers, metrics, utils, models)
from keras.applications.imagenet_utils import decode_predictions
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D,GlobalAveragePooling2D 
from keras.layers import Dense, Dropout, Flatten

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

import efficientnet.keras as efn
from efficientnet.keras import EfficientNetB0, EfficientNetB3
from efficientnet.keras import center_crop_and_resize, preprocess_input

Using TensorFlow backend.


# 2. 데이터 불러오기

In [None]:
# 2. 데이터 불러오기
# Kaggle dataset 다운로드
os.environ['KAGGLE_CONFIG_DIR'] = "/content" 
!kaggle datasets download -d gpiosenka/sports-classification

Downloading sports-classification.zip to /content
 94% 451M/477M [00:17<00:01, 21.6MB/s]
100% 477M/477M [00:17<00:00, 28.2MB/s]


In [None]:
# Extract the downloaded dataset archive
!unzip sports-classification.zip

In [None]:
# Dataset split directories and the .jpg files found one directory level below each.

train, valid, test = '/content/train', '/content/valid', '/content/test'

# glob.glob already returns a list. Without recursive=True the '**' segment
# behaves like '*', i.e. it matches exactly one directory level.
filepaths_train = glob.glob(f'{train}/**/*.jpg')
filepaths_valid = glob.glob(f'{valid}/**/*.jpg')
filepaths_test = glob.glob(f'{test}/**/*.jpg')

In [None]:
#@title
def proc_img(filepath, random_state=None):
    """Create a shuffled DataFrame of image file paths and their labels.

    The label of each image is the name of the directory that directly
    contains it (e.g. ``.../train/golf/001.jpg`` -> ``golf``).

    Args:
        filepath: Iterable of image file paths (list, generator, ...).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            unseeded shuffle.

    Returns:
        DataFrame with the columns ``Filepath`` and ``Label``, rows shuffled
        and the index reset to 0..n-1.
    """
    # Materialise once: a generator/iterator argument would otherwise be
    # exhausted by the label pass and leave the path column empty.
    paths = [str(p) for p in filepath]

    # Parent directory name == class label.
    labels = [os.path.basename(os.path.dirname(p)) for p in paths]

    path_col = pd.Series(paths, name='Filepath').astype(str)
    label_col = pd.Series(labels, name='Label')

    # Put paths and labels side by side.
    df = pd.concat([path_col, label_col], axis=1)

    # Shuffle the rows and renumber the index.
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build the shuffled path/label table for each split (train, valid, test in that order).
train_df, valid_df, test_df = (
    proc_img(paths)
    for paths in (filepaths_train, filepaths_valid, filepaths_test)
)


# Report how many images ended up in the training and test tables.
print(f'Number of training pictures: {len(train_df)}')
print(f'Number of test pictures: {len(test_df)}')
'''
Number of training pictures: 13572
Number of test pictures: 500
'''

In [None]:
#@title
# Summarise the training set: row count, number of distinct labels, and the labels themselves.
unique_labels = train_df.Label.unique()

print('-- Training set --\n')
print(f'Number of pictures: {train_df.shape[0]}\n')
print(f'Number of different labels: {len(unique_labels)}\n')
print(f'Labels: {unique_labels}')

'''
-- Training set --

Number of pictures: 13572

Number of different labels: 100

Labels: ['sidecar racing' 'polo' 'figure skating men' 'swimming' 'frisbee'
 'formula 1 racing' 'log rolling' 'hammer throw' 'snowmobile racing'
 'canoe slamon' 'bike polo' 'water polo' 'jai alai' 'horse jumping'
 'wingsuit flying' 'water cycling' 'rowing' 'hockey' 'chuckwagon racing'
 'croquet' 'judo' 'nascar racing' 'football' 'bobsled' 'parallel bar'
 'pommel horse' 'harness racing' 'hurdles' 'figure skating pairs'
 'pole vault' 'bungee jumping' 'roller derby' 'cricket' 'ice climbing'
 'curling' 'tennis' 'mushing' 'skydiving' 'shot put' 'luge' 'fencing'
 'horseshoe pitching' 'archery' 'ampute football' 'balance beam'
 'axe throwing' 'rings' 'rock climbing' 'sumo wrestling' 'field hockey'
 'wheelchair racing' 'baseball' 'uneven bars' 'volleyball' 'javelin'
 'speed skating' 'surfing' 'cheerleading' 'tug of war' 'high jump'
 'motorcycle racing' 'weightlifting' 'snow boarding' 'jousting'
 'billiards' 'pole climbing' 'shuffleboard' 'bmx' 'baton twirling'
 'bowling' 'boxing' 'barell racing' 'rollerblade racing' 'disc golf'
 'trapeze' 'ultimate' 'hang gliding' 'golf' 'pole dancing' 'fly fishing'
 'lacrosse' 'track bicycle' 'figure skating women' 'olympic wrestling'
 'rugby' 'giant slalom' 'air hockey' 'horse racing' 'table tennis'
 'ice yachting' 'sailboat racing' 'bull riding' 'ski jumping'
 'steer wrestling' 'gaga' 'basketball' 'sky surfing'
 'wheelchair basketball' 'hydroplane racing' 'arm wrestling']
 '''

In [None]:
# Inspect the training data: show the first 15 rows of train_df as a 3x5 image grid.

fig, axes = plt.subplots(
    nrows=3,
    ncols=5,
    figsize=(10, 10),
    subplot_kw={'xticks': [], 'yticks': []},  # hide the axis ticks on every panel
)

for idx, panel in enumerate(axes.flat):
    # idx is used as an index label of train_df.
    panel.imshow(plt.imread(train_df.loc[idx, 'Filepath']))
    panel.set_title(train_df.loc[idx, 'Label'])
plt.tight_layout()
plt.show()

<img src ="https://drive.google.com/uc?id=1f0bk74XtQNCjQTQPU7eduJr15t9-CwrQ" height = 600 width = 800>

# 3. 파라미터 지정

In [None]:
# Parameter settings

# Image resolution (height, width) and the matching 3-channel input shape.
img_height = 224
img_width = 224
image_size = (img_height, img_width)
input_shape = (img_height, img_width, 3)

num_classes = 100

epochs = 30
batch = 64
# Stop when val_loss has not improved for 10 epochs. restore_best_weights=True
# rolls the model back to the best-val_loss epoch when training is stopped
# early; without it the model keeps the weights of the last epoch, i.e. 10
# epochs past the best one.
callbacks = EarlyStopping(monitor='val_loss', patience=10,
                          restore_best_weights=True)

# 4. 이미지 제너레이터 : 이미지 증식

In [None]:
#@title
# 4. Image generators: data augmentation

# Training images are scaled by 1/255 and randomly augmented.
train_data_generator = image.ImageDataGenerator(
                                            rescale=1./255,
                                            rotation_range=20,
                                            width_shift_range=0.2,
                                            height_shift_range=0.2,
                                            shear_range=0.2,
                                            zoom_range=0.3,
                                            fill_mode='nearest',
                                            horizontal_flip=True,
                                            vertical_flip=False,
                                            brightness_range=(0.75,1.25)
                                            )
# Evaluation images (validation and test) are only scaled, never augmented.
test_data_generator = image.ImageDataGenerator(rescale=1./255)

# Options shared by every flow_from_dataframe call below.
flow_options = dict(x_col='Filepath',
                    y_col='Label',
                    target_size=image_size,
                    color_mode='rgb',
                    class_mode='categorical',
                    batch_size=batch)

train_generator = train_data_generator.flow_from_dataframe(train_df,
                                                           shuffle=True,
                                                           seed=0,
                                                           **flow_options)

# Validation data goes through the rescale-only generator. Feeding it through
# train_data_generator (as before) applied random augmentation to the
# validation images, which makes val_loss / val_accuracy noisy and not
# comparable with the test metrics, and it is val_loss that drives EarlyStopping.
val_generator = test_data_generator.flow_from_dataframe(valid_df,
                                                        shuffle=True,
                                                        seed=0,
                                                        **flow_options)

test_generator = test_data_generator.flow_from_dataframe(test_df,
                                                         **flow_options)


Found 13572 validated image filenames belonging to 100 classes.
Found 500 validated image filenames belonging to 100 classes.
Found 500 validated image filenames belonging to 100 classes.


# 5. 모델 생성

In [None]:
#@title
# Base model: EfficientNetB0 with ImageNet weights and no classification top

base_model = EfficientNetB0(include_top=False,
                            weights='imagenet',
                            input_shape=input_shape)
base_model.trainable = True

# Freeze the first 200 layers so only the layers after them are fine-tuned
# (the original note states the network has 230 layers in total).
for frozen_layer in base_model.layers[:200]:
    frozen_layer.trainable = False

base_model.summary()

In [None]:
#@title
# Classifier: base model -> global average pooling -> dropout -> softmax over num_classes.
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

model.summary()
'''
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
efficientnet-b0 (Model)      (None, 7, 7, 1280)        4049564   
_________________________________________________________________
global_average_pooling2d_1 ( (None, 1280)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1280)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               128100    
=================================================================
Total params: 4,177,664
Trainable params: 1,624,260
Non-trainable params: 2,553,404
_________________________________________________________________
'''


In [None]:
#@title
# Compile with Adam and categorical cross-entropy, tracking accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the training generator and validate on the validation generator.
history = model.fit(
    train_generator,
    epochs=epochs,                  # maximum number of epochs
    callbacks=[callbacks],          # the EarlyStopping callback
    validation_data=val_generator,
    verbose=1,
)
'''
Epoch 26/30 (EarlyStopping)
213/213 [==============================] - 184s 862ms/step - loss: 0.0844 - accuracy: 0.9759 - val_loss: 1.4326 - val_accuracy: 0.7520
'''

# 6. 히스토리 시각화

In [None]:
# 6. Visualise the training history

def _plot_history_curve(recorded, key, short, title, ylabel):
    """Plot the training and validation curves of one metric against the epoch number."""
    # Derive the x-axis from what was actually recorded: EarlyStopping can end
    # training at any epoch, so a hard-coded range(1, 27) mismatches the data.
    epoch_axis = range(1, len(recorded[key]) + 1)
    plt.plot(epoch_axis, recorded[key], 'b--', label=f'train {short}')
    plt.plot(epoch_axis, recorded[f'val_{key}'], 'r', label=f'val {short}')
    plt.title(title)
    plt.xlabel('epoch')
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    plt.show()


# The `epochs` hyper-parameter is deliberately not rebound here, so the
# training cell can still be re-run after plotting.

# acc vs val_acc
_plot_history_curve(history.history, 'accuracy', 'acc',
                    'Training vs validation accuracy', 'accuracy')

# loss vs val_loss
_plot_history_curve(history.history, 'loss', 'loss',
                    'Training vs validation loss', 'loss')

<img src ="https://drive.google.com/uc?id=1hee2zcu5Xh-v_hHviw8lX79CpUyQ6A8k" height = 600 width = 400>

# 7. 모델 평가

## - 테스트 데이터 예측 결과 평가

In [None]:
# 1) Evaluate the model on the test generator

score = model.evaluate(test_generator)
# score[0] is printed as the loss and score[1] as the accuracy.
test_loss, test_acc = score[0], score[1]
print('acc = ', test_acc, 'loss = ', test_loss)
'''
8/8 [==============================] - 2s 203ms/step
acc =  0.8759999871253967 loss =  0.5379654765129089
'''

## - 분류 결과 평가

In [None]:
# Test result: reload the test images from the directory in a fixed
# (unshuffled) order with integer ('sparse') labels.
test_generator = test_data_generator.flow_from_directory(
    directory=test,
    target_size=image_size,
    color_mode='rgb',
    class_mode='sparse',
    batch_size=500,   # NOTE(review): assumes the test set has at most 500 images, so one batch holds them all
    shuffle=False,
)
# Pull one batch and take the highest-scoring class index per image.
test_images, test_labels = next(test_generator)
predictions = np.argmax(model.predict(test_images), axis=1)

Found 500 images belonging to 100 classes.


In [None]:
## Convert numeric class indices to class-name strings
# Invert the generator's name -> index mapping into index -> name.
labels = {index: name for name, index in test_generator.class_indices.items()}

# Predicted labels
pred = [labels[index] for index in predictions]

# True labels
y_test = [labels[index] for index in test_labels]

# Evaluate the classification results
print(classification_report(y_test, pred))

<img src ="https://drive.google.com/uc?id=1ZATlOJ4CF493kHGhjOnTSl2EzeghpQaG" height = 600 width = 400>
<img src ="https://drive.google.com/uc?id=1gcKMCs0AAJ2E1MjMc94sMe2ZQA56SRDc" height = 600 width = 400>


## - 혼동행렬 시각화

In [None]:
# Confusion matrix heatmap (normalize='true')

# Sorted class names taken from the true labels; used for both axes.
class_names = sorted(set(y_test))

cf_matrix = confusion_matrix(y_test, pred, normalize='true')
plt.figure(figsize=(20, 15))
sns.heatmap(
    cf_matrix,
    annot=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Normalized Confusion Matrix')
plt.show()

<img src ="https://drive.google.com/uc?id=1FVpUgY2pIPa9c7LwaobSxsqhK22GYCVt" height = 800 width = 850>


## - 오분류 이미지 시각화

In [None]:
# Build a DataFrame pairing each test image path with its true label
Filepath = test_generator.filepaths  # test image paths
test_records = {'Filepath': Filepath, 'Label': y_test}
test_df = pd.DataFrame(test_records, columns=['Filepath', 'Label'])

In [None]:
# Collect the positions of the misclassified test images.
# Pairing true labels with predictions, instead of looping over a hard-coded
# range(0, 500), keeps this correct for any test-set size.
diff = [
    i
    for i, (actual, predicted) in enumerate(zip(test_df.Label, pred))
    if actual != predicted
]

print('오분류된 이미지 개수 :',len(diff))
# Recorded result of the original run: 62 of the 500 test images were misclassified.

In [None]:
fig, axes = plt.subplots(
    nrows=3,
    ncols=5,
    figsize=(15, 7),
    subplot_kw={'xticks': [], 'yticks': []},
)
diff_15 = diff[:15]  # first 15 misclassified positions

# Show each of those images with its true and predicted label.
for num, panel in zip(diff_15, axes.flat):
    row = test_df.iloc[num]
    panel.imshow(plt.imread(row['Filepath']))
    panel.set_title(f"True: {row['Label']}\nPredicted: {pred[num]}")
plt.tight_layout()
plt.show()

<img src ="https://drive.google.com/uc?id=1wsf_gDiyjglbFSTibnM7CMkZ7cOiu73U" height = 600 width = 1500>

