## 캐글 사용을 위한 설정

In [0]:
# Refresh the Kaggle API package... the version must be exactly 1.5.6

!ls -lha kaggle.json

!pip uninstall -y kaggle
!pip install --upgrade pip
!pip install kaggle==1.5.6

-rw-r--r-- 1 root root 69 Dec 10 06:31 kaggle.json
Uninstalling kaggle-1.5.6:
  Successfully uninstalled kaggle-1.5.6
Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (19.3.1)
Collecting kaggle==1.5.6
[?25l  Downloading https://files.pythonhosted.org/packages/62/ab/bb20f9b9e24f9a6250f95a432f8d9a7d745f8d24039d7a5a6eaadb7783ba/kaggle-1.5.6.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 9.2MB/s 
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.6-cp36-none-any.whl size=72859 sha256=fed6e900437ad2674763880ba4398a569022d0128d99ea29c7132759692161c5
  Stored in directory: /root/.cache/pip/wheels/57/4e/e8/bb28d035162fb8f17f8ca5d42c3230e284c6aa565b42b72674
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.6


In [0]:
# Install the API token used to connect to Kaggle
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json


# If the version is not 1.5.6, we cannot proceed
! kaggle -v

Kaggle API 1.5.6


## 데이터 셋 다운로드 후 압축 풀기

In [0]:
! kaggle competitions download -c 2019-ml-finalproject

import zipfile
import os

os.mkdir('/content/input2')

Downloading 2019-ml-finalproject.zip to /content
 75% 49.0M/65.0M [00:03<00:01, 9.14MB/s]
100% 65.0M/65.0M [00:03<00:00, 19.7MB/s]


In [0]:
# Unpack the downloaded archive into the input directory.
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile("/content/2019-ml-finalproject.zip", 'r') as zip_ref:
  zip_ref.extractall("/content/input2")

## SIFT 사용을 위한 설정

In [0]:
# Swap the preinstalled OpenCV packages for the pinned 3.4.2.16 builds.
# `yes |` answers pip's "Proceed (y/n)?" confirmation prompt automatically.
! yes | pip3 uninstall opencv-python
! yes | pip3 uninstall opencv-contrib-python
! yes | pip3 install opencv-python==3.4.2.16
! yes | pip3 install opencv-contrib-python==3.4.2.16

Uninstalling opencv-python-4.1.2.30:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/cv2/*
    /usr/local/lib/python3.6/dist-packages/opencv_python-4.1.2.30.dist-info/*
Proceed (y/n)?   Successfully uninstalled opencv-python-4.1.2.30
Uninstalling opencv-contrib-python-4.1.2.30:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/opencv_contrib_python-4.1.2.30.dist-info/*
Proceed (y/n)?   Successfully uninstalled opencv-contrib-python-4.1.2.30
Collecting opencv-python==3.4.2.16
[?25l  Downloading https://files.pythonhosted.org/packages/fa/7d/5042b668a8ed41d2a80b8c172f5efcd572e3c046c75ae029407e19b7fc68/opencv_python-3.4.2.16-cp36-cp36m-manylinux1_x86_64.whl (25.0MB)
[K     |████████████████████████████████| 25.0MB 1.6MB/s 
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
Installing collected packages: opencv-python
Successfully installed opencv-python-3.4.2.16
Collecting opencv-contrib-pyth

## 라이브러리 로드

In [0]:
from imutils import paths
import numpy as np
import imutils 
import cv2 
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm

from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

## 학습 데이터 준비

In [0]:
# Label lookup table. This load must actually run: the training loop below
# reads `df_train`, and leaving it commented out raises NameError on a fresh kernel.
# header=None gives the integer column names 0 and 1 that `df_train[1]` relies on.
df_train = pd.read_csv("/content/input2/Label2Names.csv", header=None)

In [0]:
# df_train

Unnamed: 0,0,1
0,1,Faces
1,2,Faces_easy
2,3,Leopards
3,4,Motorbikes
4,5,accordion
...,...,...
96,97,wheelchair
97,98,wild_cat
98,99,windsor_chair
99,100,wrench


In [0]:
# df_train.index[df_train[1]=='wrench'].tolist() # how to print the row index of a given class

### SIFT를 이용하여 feature 검출

In [0]:
def weak_sift_each(img_gray):
  """Detect SIFT keypoints and compute their descriptors for one image.

  Args:
    img_gray: image to analyse. Callers in this notebook pass the result of
      cv2.cvtColor(image, cv2.COLOR_BGR2GRAY).

  Returns:
    The (kp, des) pair exactly as returned by
    sift.detectAndCompute(img_gray, None).
  """
  sift = cv2.xfeatures2d.SIFT_create()

  # Use the argument itself. The previous body read a global named
  # `image_gray`, so it only worked while the caller's variable happened
  # to carry that exact name.
  kp, des = sift.detectAndCompute(img_gray, None)

  return kp, des

In [0]:
data_path_train = "/content/input2/train/"

# Accumulators filled while walking the training directory tree.
y = []         # one label per training image
images = []
labels = []    # one label per class directory
des_list = []  # one SIFT descriptor array per training image

for class_name in tqdm(os.listdir(data_path_train)):
  class_dir = data_path_train + class_name + "/"  # directory of this image class
  image_paths = [class_dir + file_name for file_name in os.listdir(class_dir)]  # training images of this class

  # BACKGROUND_Google is missing from Label2Names.csv, so it gets a fixed label;
  # every other class is labelled with (row index in df_train) + 1.
  if class_name != "BACKGROUND_Google":
    label = (df_train.index[df_train[1]==class_name] + 1).tolist()[0]
  else:
    label = 102
  labels.append(label)

  for image_path in image_paths:
    image = cv2.imread(image_path)
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # image = cv2.resize(image, (300,300))  # tried this out; probably not needed

    keypoints, descriptors = weak_sift_each(image_gray)
    des_list.append(descriptors)
    y.append(label)

HBox(children=(IntProgress(value=0, max=102), HTML(value='')))




In [0]:
# image_gray.shape

In [0]:
# Experimental code for inspecting what des_list.append(des) collected
# des_array = np.array(des_list)
# des_array[3059].shape

In [0]:
# Stack every image's descriptor array into one matrix for clustering.
# Pass the list itself: np.vstack expects a sequence of arrays, and handing it
# a generator is deprecated (newer NumPy releases warn about or reject it).
des_tot = np.vstack(des_list)

# Same result, but slower: grow the matrix one image at a time.
# des_tot = des_list[0]
# for i in des_list[1:]:
#   des_tot = np.vstack([des_tot, i])

  """Entry point for launching an IPython kernel.


In [0]:
# (total number of stacked descriptors, descriptor length)
des_tot.shape

(1382647, 128)

## K-Means Clustering을 이용하여 CodeBook 만들기


In [0]:
# Cluster all training descriptors into 16 groups; random_state=0 fixes the seed.
kmeans = KMeans(n_clusters=16, init='k-means++', random_state=0).fit(des_tot)

In [0]:
# The cluster centres act as the codebook: one row per cluster.
codeBook = kmeans.cluster_centers_
print(codeBook.shape)
print(codeBook)

(16, 128)
[[21.340075  42.56463   53.23497   ... 23.883245  31.679596  26.208311 ]
 [15.931325  16.578133  20.645033  ... 14.563525  16.107052  18.918446 ]
 [21.229584  19.666706  19.327774  ... 22.29497   15.845911  15.133426 ]
 ...
 [17.283901  12.462314  11.716347  ... 24.895805   5.6408787  6.7464075]
 [26.936361  16.99792   12.757876  ... 10.890846   8.945466  30.15292  ]
 [42.650185  24.570862  12.732935  ... 27.250633  13.077035  12.170553 ]]


## CodeBook을 통해 각각의 이미지마다 Histogram 작성

In [0]:
# Import only `vq`. The wildcard form also pulls in scipy's `kmeans` function,
# which silently replaced the fitted sklearn `kmeans` model defined earlier.
from scipy.cluster.vq import vq

des_hist = []  # 30 images per class, 3060 SIFT-extracted images in total,
               # each expressed as a histogram over the codeBook built above

hist_bins = range(codeBook.shape[0]+1)  # one integer-wide bin per codebook row
for des in des_list:
  code, _ = vq(des, codeBook)  # nearest codebook row for each descriptor
  code_hist, _ = np.histogram(code, bins=hist_bins)
  des_hist.append(code_hist)

# for i in range(len(des_list)):
#   code, _ = vq(des_list[i], codeBook)
#   code_hist, _ = np.histogram(code, bins=range(codeBook.shape[0]+1))
#   des_hist.append(code_hist)

In [0]:
# Feature matrix (one histogram row per image) and the matching label vector.
X, y = np.array(des_hist), np.array(y)
for arr in (X, y):
  print(arr.shape)

(3060, 16)
(3060,)


## 데이터셋 서브 샘플링 & Train-Validation 나누기 (일단 임시)

In [0]:
# Hold out 10% of the samples; random_state=0 makes the split repeatable.
# NOTE(review): X_valid / y_valid are not used in the cells visible here.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

## SVM 분류기 설계

In [0]:
# Pipeline: PCA followed by an RBF-kernel SVM with balanced class weights.
pca = PCA(n_components=None)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

# Grid over the SVM's C and gamma; the 'svc__' prefix addresses the SVC step of the pipeline.
param_grid = {'svc__C': [0.1, 0.5, 1, 5, 10],
              'svc__gamma': [0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]}

# Search every C/gamma combination on the training split and report the best one.
grid = GridSearchCV(model, param_grid)
%time grid.fit(X_train, y_train)
print(grid.best_params_)



CPU times: user 1min 16s, sys: 9.08 s, total: 1min 25s
Wall time: 1min 14s
{'svc__C': 5, 'svc__gamma': 0.0001}


## 테스트 데이터 준비

In [0]:
data_root_test = "/content/input2/testAll_v2/"

# Keep the bare file names around: they are paired with the predictions later.
img_list = os.listdir(data_root_test)
img_path_test = [data_root_test + file_name for file_name in img_list]

# One SIFT descriptor array per test image, in the same order as img_list.
des_list_test = []
for image_path in tqdm(img_path_test):
  image = cv2.imread(image_path)
  image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

  keypoints, descriptors = weak_sift_each(image_gray)
  des_list_test.append(descriptors)

HBox(children=(IntProgress(value=0, max=1692), HTML(value='')))




In [0]:
# Quantize each test image's descriptors against the training codeBook and
# count how often every codebook row is selected.
hist_bins = range(codeBook.shape[0]+1)
des_hist_test = [
  np.histogram(vq(descriptors, codeBook)[0], bins=hist_bins)[0]
  for descriptors in des_list_test
]

In [0]:
# Test feature matrix: one histogram row per test image.
X_test = np.array(des_hist_test)
X_test.shape

(1692, 16)

## 학습한 모델을 이용하여 테스트 데이터의 라벨 구하기

In [0]:
# Predict test labels with the best pipeline found by the grid search.
# Note that this rebinds `model`, which previously held the unfitted pipeline.
model = grid.best_estimator_
y_pred = model.predict(X_test)

In [0]:
# Turn predictions and file names into column vectors and place them side by side.
# Mixing strings with integers makes NumPy store the labels as strings too.
y_pred = y_pred.reshape(-1,1)
result_img_list = np.array(img_list).reshape(-1,1)
result = np.hstack([result_img_list,y_pred])

In [0]:
# Show the (file name, predicted label) pairs.
print(result)

[['image_0413.jpg' '32']
 ['image_0022.jpg' '47']
 ['image_0639.jpg' '45']
 ...
 ['image_0613.jpg' '70']
 ['image_1375.jpg' '100']
 ['image_0480.jpg' '8']]


## 결과 파일로 저장하여 Kaggle 제출 준비

In [0]:
# Write the predictions as a CSV with an 'id' and a 'Category' column, without the row index.
df = pd.DataFrame(result, columns=['id','Category'])
df.to_csv('result-jhhwang.csv', index=False, header=True)

In [0]:
# Upload the CSV to the competition; -m sets the submission message.
! kaggle competitions submit -c 2019-ml-finalproject -f result-jhhwang.csv -m "Junghyun Hwang"

100% 29.6k/29.6k [00:05<00:00, 5.24kB/s]
Successfully submitted to 2019.Fall.PatternRecognition 