In [1]:
import os
import io
import re
import time
import requests
import threading
import pandas as pd
from urllib import request, error
from PIL import Image
from os.path import basename
from bs4 import BeautifulSoup
from collections import OrderedDict

## Class implementation
- **Labels**: Crawler that collects the article URLs of each label from its listing pages (14 pages per label).
     - Labels we need: one per interior room type ('livingroom', 'bedroom', 'bathroom', 'kitchen')
- **Resource**: Crawler that collects the image URLs and article texts from the 140 articles of each label.
- **Convert**: Converter that re-encodes images (webp -> jpeg) and turns the raw strings into a DataFrame.

In [2]:
class Labels():
    """Collect article URLs from the homify magazine listing pages of one label.

    Parameters
    ----------
    label : str
        Category slug appended to the magazine URL (e.g. 'bathroom').
    last : int, optional
        Number of listing pages to walk, starting from page 1 (default 14).
    timeout : float, optional
        Seconds to wait for each listing page before giving up on it
        (default 10). Without a timeout a stalled connection would block
        the calling thread forever.
    """
    def __init__(self, label, last=14, timeout=10):
        self.label = label
        #url attributes
        self.page_url = 'http://www.homify.co.kr/magazine/{0}'.format(self.label)
        self.last = last
        self.timeout = timeout
        self.urls = []


    def get_page_urls(self):
        """Fetch every listing page and return the absolute article URLs.

        The result is rebuilt from scratch on each call, so calling this
        method again (e.g. re-running a notebook cell) does not duplicate
        entries in ``self.urls``. A page that cannot be fetched is reported
        with ``print`` and skipped instead of aborting the whole crawl.

        Returns
        -------
        list of str
            The collected URLs (also stored in ``self.urls``).
        """
        collected = []
        for page_no in range(1, self.last + 1):
            page = '{0}?page={1}'.format(self.page_url, page_no)
            try:
                res = requests.get(page, timeout=self.timeout)
            except requests.exceptions.RequestException as e:
                print(e)
                continue

            soup = BeautifulSoup(res.content, 'html5lib')
            for article in soup.select('div.ideabook--items a.link'):
                link = article.get('href')
                #anchors without an href cannot be turned into an article url
                if not link:
                    continue
                collected.append('http://www.homify.co.kr{0}'.format(link))

        self.urls = collected
        return self.urls
  

In [3]:
class Resource():
    """Crawl article pages and collect resized-image URLs with their heading texts.

    Parameters
    ----------
    links : iterable of str
        Article page URLs to visit.
    timeout : float, optional
        Seconds to wait for each article page (default 10).
    """
    def __init__(self, links, timeout=10):
        self.links = links
        self.timeout = timeout
        #image urls for train which have paired texts
        self.train_src = []
        #image urls for test which doesn't have paired texts
        self.test_src = []
        #paired texts for each images (same index as train_src)
        self.articles = []

    @staticmethod
    def _as_number(value):
        """Return ``value`` as a float, or None when it is missing or not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def imgUrl_generator(self, url, src_type):
        """Build the 224px image URL for one ``img`` element and append it to ``src_type``.

        Parameters
        ----------
        url : mapping-like
            Object exposing ``get`` for the 'height', 'width' and 'src'
            attributes (a BeautifulSoup tag in the crawler).
        src_type : list
            Target list, ``self.train_src`` or ``self.test_src``.

        Returns
        -------
        bool
            True when a URL was appended, False when 'src' is missing or has
            no '/v...' tail to build the URL from.
        """
        tail_match = re.search(r'/v.*$', url.get('src') or '')
        if tail_match is None:
            return False

        #attribute values arrive as strings: compare them numerically,
        #otherwise '1000' > '999' would be evaluated lexicographically
        height = self._as_number(url.get('height'))
        width = self._as_number(url.get('width'))
        is_portrait = height is not None and width is not None and height > width

        #portrait images get their width fixed, everything else
        #(including images with unknown dimensions) gets its height fixed
        side = 'w' if is_portrait else 'h'
        srcUrl = 'https://images.homify.com/c_fill,f_auto,q_auto:eco,{0}_{1}{2}'.format(side, 224, tail_match.group())
        src_type.append(srcUrl)
        return True

    def _collect_from_item(self, tag):
        """Sort the images of one article item into train/test by its heading text."""
        image_urls = tag.select('div.photo img.js-photo-link')

        for h2 in tag.select('h2.text-container.subheading.-mt-line-.-centered-'):
            caption = h2.text.strip(' \t\n')
            for url in image_urls:
                if not caption:
                    #the case that doesn't have paired texts
                    self.imgUrl_generator(url, self.test_src)
                elif self.imgUrl_generator(url, self.train_src):
                    #the case we want! record the caption only when the image
                    #url was really added, so both lists stay index-aligned
                    self.articles.append(caption)

    #get image url and text of each paragraph
    def get_imgUrl_text(self):
        """Visit every link and fill ``train_src``, ``articles`` and ``test_src``.

        A page whose request fails is reported with ``print`` and skipped.
        The crawler pauses 0.1s after every page.
        """
        for link in self.links:
            try:
                parent_res = requests.get(link, timeout=self.timeout)
                parent_soup = BeautifulSoup(parent_res.content, 'html5lib', from_encoding='utf-8')

                for tag in parent_soup.select('div.ideabook--item.ideabook-photo'):
                    self._collect_from_item(tag)

            except requests.exceptions.RequestException as e:
                print(e)
                continue
            finally:
                time.sleep(0.1)

In [4]:
class Convert():
    """Download images, store them as JPEG files and pair them with article texts.

    Parameters
    ----------
    label : str
        Room label; must be a key of ``self.labels``.
    srcUrl : sequence of str
        Image URLs to download.
    srcType : str
        Sub-folder suffix of the target directory ('train' or 'test' in
        this notebook): files go to ``images_<srcType>/<label>/``.
    articles : sequence of str, optional
        Text for each entry of ``srcUrl`` (same index). Only used by
        ``converter_train``.
    delay : float, optional
        Seconds to sleep after every download attempt (default 0.3).
    timeout : float, optional
        Seconds to wait for each image download (default 30).
    """
    def __init__(self, label, srcUrl, srcType, articles=None, delay=0.3, timeout=30):
        #attributes for saving images
        self.srcUrl = srcUrl
        self.srcType = srcType
        self.labels = {'bathroom':0, 'bedroom':1, 'kitchen':2, 'livingroom':3}
        self.label = label
        self.delay = delay
        self.timeout = timeout
        #attributes for article texts
        self.articles = articles
        #always defined, so reading .df never raises AttributeError
        self.df = pd.DataFrame(columns=['pic_num', 'article'])

    #Original author's note (translated): a socket error occurred because
    #no lock was held around the part where the error is raised.
    #image download webp and convert to jpeg
    def dl_cvt_Image(self, url, index):
        """Download ``url`` and save it as ``images_<srcType>/<label>/<labelId>_<index>.jpg``."""
        #folder name by train or test / label by room type / index
        image_index = '{}_{}'.format(self.labels[self.label], index)
        out_dir = 'images_{0}/{1}'.format(self.srcType, self.label)
        os.makedirs(out_dir, exist_ok=True)

        with request.urlopen(url, timeout=self.timeout) as run_url:
            f = io.BytesIO(run_url.read())
        Image.open(f).convert('RGB').save('{0}/{1}.jpg'.format(out_dir, image_index), 'jpeg')

    def _try_download(self, url, index):
        """Download one image, report any failure with ``print``; return True on success."""
        try:
            self.dl_cvt_Image(url, index)
            return True
        #HTTPError is a subclass of URLError, so it has to be caught first
        except error.HTTPError as e:
            print(e.code)
        except error.URLError as e:
            print(e.args)
        #unreadable image data or a failed file write
        except OSError as e:
            print(e)
        finally:
            time.sleep(self.delay)
        return False

    def converter_train(self):
        """Download every image and build ``self.df`` with columns 'pic_num' and 'article'.

        Images that fail to download are skipped together with their text.
        The DataFrame is built once, from the rows collected so far, even if
        an unexpected error interrupts the loop.
        """
        records = []
        try:
            for index, url in enumerate(self.srcUrl):
                if not self._try_download(url, index):
                    continue
                txt_index = '{}_{}'.format(self.labels[self.label], index)
                records.append({'pic_num': txt_index, 'article': self.articles[index]})
        finally:
            self.df = pd.DataFrame(records, columns=['pic_num', 'article'])

    def converter_test(self):
        """Download every image; there are no paired texts to record."""
        for index, url in enumerate(self.srcUrl):
            self._try_download(url, index)

##  Get 4 label page urls with 4 Threads

In [5]:
# One Labels crawler per room type, keyed by the room's label.
labels = {
    room: Labels(room)
    for room in ('bathroom', 'bedroom', 'kitchen', 'livingroom')
}

In [6]:
def run_label(label):
    """Thread target: collect the article URLs of one Labels instance."""
    label.get_page_urls()


# Start one crawler thread per label, then wait until all of them are done.
threads = [
    threading.Thread(target=run_label, args=(label,))
    for label in labels.values()
]
for t in threads:
    t.start()

for t in threads:
    t.join()
                         

In [7]:
# Pull the collected article URLs out of each crawler.
bathroom_pages, bedroom_pages, kitchen_pages, living_pages = (
    labels[room].urls
    for room in ('bathroom', 'bedroom', 'kitchen', 'livingroom')
)

### Result

In [8]:
# Number of article URLs collected per label, one count per line.
for pages in (bathroom_pages, bedroom_pages, kitchen_pages, living_pages):
    print(len(pages))

140
140
140
140


## Get article texts and image URLs from the 140 article URLs of each label

In [9]:
# One Resource crawler per room type, fed with that room's article URLs.
resources = {
    'bathroom': Resource(bathroom_pages),
    'bedroom': Resource(bedroom_pages),
    'kitchen': Resource(kitchen_pages),
    'livingroom': Resource(living_pages),
}

In [10]:
def run_resource(resource):
    """Thread target: collect image URLs and texts for one Resource instance."""
    resource.get_imgUrl_text()


# Start one crawler thread per resource, then wait until all of them are done.
threads = [
    threading.Thread(target=run_resource, args=(resource,))
    for resource in resources.values()
]
for t in threads:
    t.start()

for t in threads:
    t.join()

### results

In [11]:
# Bathroom: paired texts, train image URLs, test image URLs.
bath_resource = resources['bathroom']
bath_articles, bath_train, bath_test = (
    bath_resource.articles, bath_resource.train_src, bath_resource.test_src
)
print(*map(len, (bath_articles, bath_train, bath_test)))

1336 1336 14


In [12]:
# Bedroom: paired texts, train image URLs, test image URLs.
bed_resource = resources['bedroom']
bed_articles, bed_train, bed_test = (
    bed_resource.articles, bed_resource.train_src, bed_resource.test_src
)
print(*map(len, (bed_articles, bed_train, bed_test)))

1259 1259 14


In [13]:
# Kitchen: paired texts, train image URLs, test image URLs.
kit_resource = resources['kitchen']
kit_articles, kit_train, kit_test = (
    kit_resource.articles, kit_resource.train_src, kit_resource.test_src
)
print(*map(len, (kit_articles, kit_train, kit_test)))

1478 1478 20


In [14]:
# Living room: paired texts, train image URLs, test image URLs.
liv_resource = resources['livingroom']
liv_articles, liv_train, liv_test = (
    liv_resource.articles, liv_resource.train_src, liv_resource.test_src
)
print(*map(len, (liv_articles, liv_train, liv_test)))

1078 1078 112


## Download images and convert texts to a DataFrame for each resource (over 1k items per label)

In [15]:
# Two converters per room: '<room>_train' (images with paired texts)
# and '<room>_test' (images only).
# Convert arguments: label, srcUrl, srcType, articles
converters = {}
for room, train_urls, test_urls, captions in (
    ('bathroom', bath_train, bath_test, bath_articles),
    ('bedroom', bed_train, bed_test, bed_articles),
    ('kitchen', kit_train, kit_test, kit_articles),
    ('livingroom', liv_train, liv_test, liv_articles),
):
    converters['{}_train'.format(room)] = Convert(room, train_urls, 'train', captions)
    converters['{}_test'.format(room)] = Convert(room, test_urls, 'test')

In [16]:
def run_train(converter):
    """Thread target: download the train images and build the text DataFrame."""
    converter.converter_train()


trains = ['bathroom_train', 'bedroom_train', 'kitchen_train', 'livingroom_train']
train_converters = [converters[train] for train in trains]

# One download thread per train converter.
threads = [
    threading.Thread(target=run_train, args=(converter,))
    for converter in train_converters
]
for t in threads:
    t.start()

# NOTE(review): short pause kept from the original cell; the joins below
# already wait for every thread, so it does not look required -- confirm.
time.sleep(0.3)

for t in threads:
    t.join()
    

In [17]:
def run_test(converter):
    """Thread target: download the test images of one converter."""
    converter.converter_test()


tests = ['bathroom_test', 'bedroom_test', 'kitchen_test', 'livingroom_test']

# One download thread per test converter; wait until all of them are done.
threads = [
    threading.Thread(target=run_test, args=(converters[test],))
    for test in tests
]
for t in threads:
    t.start()

for t in threads:
    t.join()

### result

In [18]:
# Bathroom texts: row count, then the last five rows.
bath_dataFrame = converters['bathroom_train'].df
print(bath_dataFrame.shape[0])
bath_dataFrame.tail(5)

1336


Unnamed: 0,pic_num,article
1331,0_1331,바닥소재2-라미네이트
1332,0_1332,창문
1333,0_1333,라디에이터
1334,0_1334,습기 제거 방법!
1335,0_1335,건식 욕실


In [19]:
# Write the bathroom texts to their per-category CSV file.
bath_dataFrame.to_csv(
    'text/by_category/bathroom.csv',
    columns=['pic_num', 'article'],
    mode='w',
)

In [20]:
# Bedroom texts: row count, then the last five rows.
bed_dataFrame = converters['bedroom_train'].df
print(bed_dataFrame.shape[0])
bed_dataFrame.tail(5)

1259


Unnamed: 0,pic_num,article
1254,1_1254,상큼한 침실
1255,1_1255,나무 안에서 자는 느낌
1256,1_1256,아이 방의 아늑한 수면 공간
1257,1_1257,자연 가까이에 둔 침실
1258,1_1258,지중해식 침실


In [21]:
# Write the bedroom texts to their per-category CSV file.
bed_dataFrame.to_csv(
    'text/by_category/bedroom.csv',
    columns=['pic_num', 'article'],
    mode='w',
)

In [22]:
# Kitchen texts: row count, then the last five rows.
kit_dataFrame = converters['kitchen_train'].df
print(kit_dataFrame.shape[0])
kit_dataFrame.tail(5)

1478


Unnamed: 0,pic_num,article
1473,2_1473,5. 낡은 주방 패브릭 제품
1474,2_1474,7. 사용하지 않는 주방 용품
1475,2_1475,8. 자리만 차지하는 낡은 주방 칼
1476,2_1476,9. 주방 미관을 해치는 빈 병 및 캔
1477,2_1477,10. 필요 이상의 타파웨어


In [23]:
# Write the kitchen texts to their per-category CSV file.
kit_dataFrame.to_csv(
    'text/by_category/kitchen.csv',
    columns=['pic_num', 'article'],
    mode='w',
)

In [24]:
# Living-room texts: row count, then the last five rows.
liv_dataFrame = converters['livingroom_train'].df
print(liv_dataFrame.shape[0])
liv_dataFrame.tail(5)

1078


Unnamed: 0,pic_num,article
1073,3_1073,개방적인 오픈 주방
1074,3_1074,기존의 틀을 벗어나는 아이템들
1075,3_1075,공간감을 넓혀주는 천장과 조명 디자인
1076,3_1076,간접 조명으로 생동감 있게
1077,3_1077,멋스러운 빈티지 소품들


In [25]:
# Write the living-room texts to their per-category CSV file.
liv_dataFrame.to_csv(
    'text/by_category/livingroom.csv',
    columns=['pic_num', 'article'],
    mode='w',
)

## Export dataFrame to csv file

In [26]:
# Stack the four per-room frames into one frame with a fresh 0..n-1 index.
category_frames = [bath_dataFrame, bed_dataFrame, kit_dataFrame, liv_dataFrame]
total_df = pd.concat(category_frames, ignore_index=True)

total_df.tail(5)

Unnamed: 0,pic_num,article
5146,3_1073,개방적인 오픈 주방
5147,3_1074,기존의 틀을 벗어나는 아이템들
5148,3_1075,공간감을 넓혀주는 천장과 조명 디자인
5149,3_1076,간접 조명으로 생동감 있게
5150,3_1077,멋스러운 빈티지 소품들


In [27]:
# Write the combined texts of all rooms to a single CSV file.
total_df.to_csv(
    'text/article.csv',
    columns=['pic_num', 'article'],
    mode='w',
)

# Data Preparation done!