-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider_image.py
126 lines (119 loc) · 6.8 KB
/
spider_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import requests
import json
import re
import random
import os
from login import LoginHuaBan
from urllib.parse import unquote
import time
class SpiderImage(LoginHuaBan):
"""
爬取图片的的地址
"""
def __init__(self):
super(SpiderImage, self).__init__()
# self.session = requests.session()
self.headers['User-Agent'] = '''Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'''
self.cookie = self.get_cookie() # 获取cookie
def get_home_frist_id(self):
"""获取第一轮的第一个图片id,为下一轮做准备"""
url = "https://huaban.com/"
html = self.session.get(url, cookies=self.cookie).content.decode('utf-8')
re_pins = re.compile(r'"pin_id":(.*?),') # 获取刚开始的图片id,为了下一轮的图片做准备
get_pins = re_pins.search(html)
return str(get_pins.groups()[0]) # 返回用户信息, 第一张图片的id
# 获取关注的作者发布的图片
def get_home_image(self):
def check_raw_text(text): # 部分作者的这一个值的空的
if text.strip() == "":
return "No raw_text"
else:
return text
last_img_pin_id = self.get_home_frist_id() # 得到第一轮的第一个id
num = 1
while True:
randoms = "".join([chr(random.randint(65, 90)).lower() for r in range(3)]) + str(random.randint(0, 9)) # 随机生成4个字母和1个数字
url = "http://huaban.com/?jd2{}&max={}&limit=20&wfl=1".format(randoms, last_img_pin_id)
get_json = json.loads(self.session.get(url, cookies=self.cookie).content.decode('utf-8'))
if get_json['pins'] == []: # 如果为空列表,则判断爬取完成~
print("爬完啦~共计: {} 张图片".format(num))
break
for img_date in get_json['pins']:
num += 1
img_username = img_date['user']['username'] # 发布图片的作者
pin_id = img_date['pin_id'] # 图片的id
img_key = img_date['file']['key'] # 图片的key,用于拼接图片的url地址
img_raw_text = check_raw_text(img_date['raw_text'].replace('\n', '')) # 图片介绍, 部分为空
img_description = img_date['board']['description'].replace('\n', '') # 描述
img_title = img_date['board']['title'].replace('\n', '') # 图片的标题
img_type = img_date['file']['type'].split('/')[-1] # 图片类型
floder = "home" # 默认爬取关注的采集
print("正在爬取 发布者: {} 的图片 id: {} Key: {} 标题: {} 描述: {}".format(img_username, pin_id, img_key, img_title, img_description))
self.download(pin_id, img_key, img_type, floder) # 下载图片
last_img_pin_id = get_json['pins'][-1]['pin_id']
def get_all_image(self, args):
if args == "all":
url = "https://huaban.com/{}".format(args)
folder = "all"
else:
folder = unquote(args)
url = "https://huaban.com/search/?q={}&jd5gz4om&page=1&per_page=20&wfl=1".format(args)
get_total = self.session.get(url, cookies=self.cookie).content.decode('utf-8')
re_search_total = re.compile(r'app.page\["pin_count"\] = (.*?);')
get_search_total = re_search_total.search(get_total)
search_total = int(get_search_total.groups()[0])
if search_total == 0:
print("没有搜索到关于 '{}' 的图片".format(args))
exit()
else:
print("搜索到的 '{}' 共计 {} 张".format(folder, search_total))
total = 1
snum = 2
while True:
html = self.session.get(url, cookies=self.cookie).content.decode('utf-8')
re_search_total = re.compile(r'app.page["pin_count"] = (.*?);')
re_pins = re.compile(r'app.page\["pins"\] = \[(.*?)\];') # 所有数据
get_pins = re_pins.findall(html)
re_pin_id = re.compile(r'"pin_id":(.*?),') # 所有的图片id
re_user_id = re.compile(r'"file":(.*?})')
re_username = re.compile(r'"username":"(.*?)",') # 用户名
re_title_description = re.compile(r'"title":"(.*?)", "description":"(.*?)", ') # 标题和描述
get_pin_id_key = [i for i in re_pin_id.findall(get_pins[0]) if '}' not in i] # 用户id和图片的key
if get_pin_id_key == []:
print("爬完啦~共计: {}".format(total-1))
break
get_title_description = re_title_description.findall(get_pins[0]) # 图片的标题和描述
get_username = re_username.findall(get_pins[0])
for i, t, pid, u in zip(re_user_id.findall(get_pins[0]), get_title_description, get_pin_id_key, get_username):
get_data_json = json.loads(i.replace('[{"color":', ''))
get_data_json['title'] = t[0]
get_data_json['description'] = t[1]
get_data_json['pin_id'] = pid
get_data_json['username'] = u
pin_id = get_data_json['pin_id']
username = get_data_json['username']
key = get_data_json['key']
title = get_data_json['title']
description = get_data_json['description']
types = get_data_json['type'].split('/')[-1]
print("正在爬取 第 {} 张 发布者: {} 的图片 id: {} Key: {} 标题: {} 描述: {}".format(total, username, pin_id, key, title, description))
self.download(pin_id, key, types, folder)
total += 1
randoms = "".join([chr(random.randint(65, 90)).lower() for i in range(3)]) + str(random.randint(0, 9)) # 随机生成4个字母和1个数字
last_id = get_data_json['pin_id']
if args == "all":
url = "https://huaban.com/all/?jd4{}&max={}&limit=20&wfl=1".format(randoms, last_id)
else:
url = "https://huaban.com/search/?q={}&jd5{}&page={}&per_page=20&wfl=1".format(args, randoms, snum)
snum+=1
time.sleep(1+random.random())
def download(self, pin_id, key, img_type, folder):
try:
os.mkdir("{}".format(folder))
except FileExistsError:
pass
url = "http://hbimg.b0.upaiyun.com/" + key
req = requests.get(url, cookies=self.cookie) # 下载图片需要登陆才行,所以需要cookie,
with open('{}/{}.{}'.format(folder, pin_id, img_type), 'wb') as fp:
fp.write(req.content)