사진 URL 정보 및 사진을 Bing 이미지 검색으로 수집

In [None]:
#사진 url 수집을 위해 bing_image_downloader 패키지 내 코드 수정
from pathlib import Path
import os
import sys
import urllib.request
import urllib
import imghdr
import posixpath
import re

'''
Python API to download images from Bing.
Author: Guru Prasad (g.gaurav541@gmail.com)
'''


class Bing:
    """Scrape Bing image-search results for a query and save them to disk.

    Every image URL that is attempted is also appended, one per line, to
    ``./photo_url_list.txt`` in the current working directory.

    Args:
        query: Search phrase; also the name of the sub-directory (under
            ``output_dir``) that the images are written to.
        limit: Number of images to download; must be an ``int``.
        output_dir: Directory, joined onto the current working directory,
            that holds one sub-directory per query.
        adult: Sent verbatim as the ``adlt`` parameter of the Bing request.
        timeout: Timeout in seconds for every HTTP request; must be an ``int``.
        filters: Sent verbatim as the ``qft`` parameter of the Bing request.
    """

    def __init__(self, query, limit, output_dir, adult, timeout, filters=''):
        self.download_count = 0
        self.query = query
        self.output_dir = output_dir
        self.adult = adult
        self.filters = filters

        # Kept as assertions so callers still see AssertionError on bad input.
        assert type(limit) is int, "limit must be integer"
        self.limit = limit
        assert type(timeout) is int, "timeout must be integer"
        self.timeout = timeout

        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
        self.page_counter = 0

    def save_image(self, link, file_path):
        """Fetch ``link`` and write its bytes to ``file_path``.

        Raises:
            ValueError: If the payload is not recognised as an image.
            OSError: If the request fails or the file cannot be written
                (``urllib.error.URLError`` is an ``OSError`` subclass).
        """
        request = urllib.request.Request(link, None, self.headers)
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(request, timeout=self.timeout) as response:
            image = response.read()
        # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13;
        # this check needs a replacement before moving to a newer interpreter.
        if not imghdr.what(None, image):
            print('[Error] Invalid image, not saving {}'.format(link))
            # A bare ``raise`` here had no active exception to re-raise and
            # only failed by accident; raise an explicit, descriptive error.
            raise ValueError('Invalid image, not saving {}'.format(link))
        with open(file_path, 'wb') as f:
            f.write(image)

    def download_image(self, link):
        """Record ``link`` in the URL list and try to download it.

        ``download_count`` is incremented up front because it is part of the
        file name, and rolled back again if anything goes wrong. Failures are
        reported on stdout and never propagate to the caller.
        """
        self.download_count += 1

        try:
            # Derive the file extension from the URL path, defaulting to jpg.
            path = urllib.parse.urlsplit(link).path
            filename = posixpath.basename(path).split('?')[0]
            file_type = filename.split(".")[-1]
            if file_type.lower() not in ["jpe", "jpeg", "jfif", "exif", "tiff", "gif", "bmp", "png", "webp", "jpg"]:
                file_type = "jpg"

            print("#{} 이미지 다운로드 ({})".format(self.query, link))

            # One URL per line; without the newline all URLs ran together.
            with open("./photo_url_list.txt", 'a', encoding="utf-8") as f:
                f.write(link + "\n")
            print('=' * 50)
            print('txt file added!')

            # Make sure the target directory exists before writing into it.
            target_dir = os.path.join(os.getcwd(), self.output_dir, self.query)
            os.makedirs(target_dir, exist_ok=True)
            file_name = "image_{}.{}".format(str(self.download_count), file_type)
            self.save_image(link, os.path.join(target_dir, file_name))
            print("#{} 파일 다운로드가 완료되었습니다.".format(self.download_count))
        except Exception as e:
            # Best effort: undo the counter so the slot is reused by the next link.
            self.download_count -= 1
            print("[Info] Issue getting: {}\n[Error] {}".format(link, e))

    def run(self):
        """Index result pages until ``limit`` images are saved or results run out."""
        # Consecutive pages overlap (the offset advances by one), so remember
        # which links were already attempted to avoid duplicate downloads.
        seen = set()

        while self.download_count < self.limit:
            print('[Info] Indexing page: {}'.format(self.page_counter + 1))
            # Parse the page source and download pics
            request_url = 'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(self.query) \
                          + '&first=' + str(self.page_counter) + '&count=' + str(self.limit) \
                          + '&adlt=' + self.adult + '&qft=' + self.filters
            request = urllib.request.Request(request_url, None, headers=self.headers)
            with urllib.request.urlopen(request, timeout=self.timeout) as response:
                html = response.read().decode('utf8')
            links = re.findall(r'murl&quot;:&quot;(.*?)&quot;', html)

            print("[Info] Indexed {} Images on Page {}.".format(len(links), self.page_counter + 1))
            print("===============================================")

            if not links:
                # Nothing left to index: stop instead of looping forever.
                print("[Info] No more images are available.")
                break

            for link in links:
                if self.download_count >= self.limit:
                    print("===============================================")
                    break
                if link in seen:
                    continue
                seen.add(link)
                self.download_image(link)

            self.page_counter += 1

        print("[Info] Done. Downloaded {} images.".format(self.download_count))


In [None]:
# 한글 폰트 설치하기 (꼭! 설치가 완료되면 [런타임 다시 시작]을 누르고 다시 실행하기)
!apt install fonts-nanum -y

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Configure a Korean font so Hangul labels render correctly in plots.
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=10)
# Register the freshly installed font file with the running font manager.
# The private ``_rebuild()`` helper no longer exists in current Matplotlib
# releases, so it is only used as a fallback when ``addfont`` is unavailable.
if hasattr(fm.fontManager, 'addfont'):
    fm.fontManager.addfont(fontpath)
else:
    matplotlib.font_manager._rebuild()
plt.rc('font', family='NanumBarunGothic')

In [None]:
# 필요한 라이브러리 설치하기
!git clone https://github.com/ndb796/bing_image_downloader

In [None]:
import os
import shutil
from bing_image_downloader import downloader


# Root folders of the train and test splits of the custom dataset.
directory_list = [
    './custom_dataset/' + split + '/'
    for split in ('train', 'test')
]

# Create the root folders up front; folders that already exist are left alone.
for split_root in directory_list:
    os.makedirs(split_root, exist_ok=True)

# Split the images collected for one query into training and test data.
def dataset_split(query, train_cnt, dataset_root='./custom_dataset'):
    """Move the files of directory ``query`` into train/test sub-folders.

    The first ``train_cnt`` files (in sorted name order) are moved to
    ``<dataset_root>/train/<query>/`` and the remaining ones to
    ``<dataset_root>/test/<query>/``. The emptied ``query`` directory is
    removed afterwards.

    Args:
        query: Name of the directory holding the downloaded files; also used
            as the name of the per-class sub-folder.
        train_cnt: Number of files that go to the training split.
        dataset_root: Folder that contains the ``train`` and ``test`` folders.

    Raises:
        FileNotFoundError: If the ``query`` directory does not exist.
    """
    train_dir = os.path.join(dataset_root, 'train', query)
    test_dir = os.path.join(dataset_root, 'test', query)
    for target_dir in (train_dir, test_dir):
        os.makedirs(target_dir, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sort them so the same
    # files end up in the same split on every run.
    for index, file_name in enumerate(sorted(os.listdir(query))):
        if index < train_cnt:
            print(f'[Train Dataset] {file_name}')
            target_dir = train_dir
        else:
            print(f'[Test Dataset] {file_name}')
            target_dir = test_dir
        shutil.move(os.path.join(query, file_name), os.path.join(target_dir, file_name))
    shutil.rmtree(query)

In [None]:
import pandas as pd
# Load the company table; the CSV file is stored in the cp949 encoding.
company = pd.read_csv(
    'company_info.csv',
    encoding='cp949',
)

In [None]:
# Reduce the table to a plain list of company names. The guard makes the cell
# safe to re-run: once `company` is already a list, indexing it by column name
# would raise TypeError. Rows without a name are dropped because each name is
# later concatenated with a string to build the search query.
if not isinstance(company, list):
    company = company['company'].dropna().tolist()

In [None]:
# Download one logo image per company and file it into the dataset folders.
for firm in company:
    query = firm + ' 로고 고화질'
    try:
        downloader.download(query, limit=1, output_dir='./', adult_filter_off=True, force_replace=False, timeout=60)
        # With limit=1 there is at most one image, so it lands in the train split.
        dataset_split(query, 30)
    except OSError as err:
        # urllib's URLError and file-system errors are OSError subclasses;
        # skip this company instead of aborting the whole batch.
        print(f'[Skip] {query}: {err}')