-
Notifications
You must be signed in to change notification settings - Fork 18
/
userSpider.py
205 lines (191 loc) · 8.42 KB
/
userSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import re
def weibo_login(driver, login_path, username, password):
    ''' Open the login dialog, type the credentials and hand over to login_verifycode.

    driver is the Selenium WebDriver to operate, login_path is the XPath of
    the 'login' entry on the page, and username / password are the Weibo
    credentials typed into the form.
    '''
    account_input = '/html/body/div[4]/div[2]/div[3]/div[3]/div[1]/input'
    secret_input = '/html/body/div[4]/div[2]/div[3]/div[3]/div[2]/input'

    # Wait until the 'login' entry can be clicked, then open the dialog.
    login_locator = (By.XPATH, login_path)
    WebDriverWait(driver, 5, 0.5).until(EC.element_to_be_clickable(login_locator))
    driver.find_element_by_xpath(login_path).click()

    # The form is usable once the account input is present in the DOM.
    account_locator = (By.XPATH, account_input)
    WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(account_locator))

    # Clear each field before typing: account first, then password.
    for field_path, field_text in ((account_input, username), (secret_input, password)):
        driver.find_element_by_xpath(field_path).clear()
        driver.find_element_by_xpath(field_path).send_keys(field_text)

    login_verifycode(driver)
def login_verifycode(driver):
    ''' Submit the login form and run the verification-code prompt loop.

    Clicks the login button, waits for the verification-code input to be
    present, then keeps calling login_verifycode_input until it returns
    False or a step fails.
    '''
    verifycode_path = '/html/body/div[4]/div[2]/div[3]/div[3]/div[4]/input'
    login_path = '/html/body/div[4]/div[2]/div[3]/div[3]/div[6]/a'
    driver.find_element_by_xpath(login_path).click()
    WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located((By.XPATH, verifycode_path)))
    try:
        driver.find_element_by_xpath(verifycode_path).click()
        need_verifycode = True
        while need_verifycode:
            need_verifycode = login_verifycode_input(driver, login_path)
    except Exception:
        # Deliberate best-effort: any failure while clicking the field or
        # submitting a code simply ends the prompt loop (presumably the field
        # is not interactable when no code is required, or is gone once the
        # login succeeded - TODO confirm). Catching Exception instead of a
        # bare except lets Ctrl-C / SystemExit at the prompt still propagate.
        pass
def login_verifycode_input(driver, login_path):
    ''' Ask on the console for a verification code and submit it.

    Returns False when the answer is 'N' (nothing is typed into the page).
    Otherwise echoes the code, types it into the verification-code input,
    clicks the element at login_path and returns True.
    '''
    code_input = '/html/body/div[4]/div[2]/div[3]/div[3]/div[4]/input'
    answer = input('VerifyCode :')
    if answer == 'N':
        return False

    print(answer)
    driver.find_element_by_xpath(code_input).send_keys(answer)  # type the code
    driver.find_element_by_xpath(login_path).click()
    return True
def infoframe(each, frame=None, add=False):
    ''' Normalise one user's info dict into a fixed-width row.

    each  : dict mapping field name -> value for a single user; fields that
            are not in the fixed header are ignored, missing ones become ''.
    frame : existing table ([header, row, ...]) to extend when add is True.
    add   : when True, append the new row to frame (in place) and return it;
            when False, return a fresh table [header, row].

    If add is True but no frame was supplied, a fresh table is returned
    instead of failing on None.append.
    '''
    frame_key = ['昵称', '真实姓名', '所在地', '性别', '性取向', '感情状况',
                 '生日', '血型', '博客地址', '个性域名', '简介', '注册时间', 'href']
    row = [each.get(key, '') for key in frame_key]
    if add and frame is not None:
        frame.append(row)
        return frame
    return [frame_key, row]
def get_info(driver, href, frame=None, add=False):
    ''' Scrape the basic profile information of a single user.

    driver : Selenium WebDriver, already logged in.
    href   : URL of the user's home page; it is also stored in the result
             under the 'href' field.
    frame  : existing table to extend when add is True.
    add    : when True the new row is appended to frame, otherwise a fresh
             [header, row] table is built.

    Returns the table produced by infoframe. Returns False when the info
    page could not be entered, or when no info block was found and there is
    no frame to hand back; when a frame was supplied and nothing was found,
    the frame is returned unchanged.
    '''
    button_path = r'/html/body/div[1]/div/div[2]/div/div[2]/div[1]/div[4]/div/div/a/span'
    info_path = r'/html/body/div[1]/div/div[2]/div/div[2]/div[2]/div/div/div/div[2]/div/ul'

    # enter_infopage reports failure as False (or None); both mean "skip".
    if not enter_infopage(driver, button_path, href):
        return False

    text_list = info_spider(driver, info_path)
    if not text_list:
        # Never hand None back to callers: they only recognise False as the
        # "nothing scraped" signal.
        return frame if frame is not None else False

    # The block text alternates label / value lines; each label carries one
    # trailing character that is stripped. Pair them up before adding href so
    # an odd line count cannot shift href onto the wrong key.
    lines = text_list[0].split('\n')
    each = {label[:-1]: value for label, value in zip(lines[0::2], lines[1::2])}
    each['href'] = href  # include the user's home page in the output row

    if add:
        return infoframe(each, frame, True)
    return infoframe(each)
def userinfo(href, username, password, browser='Firefox', filepath=None, saved=False, newfile=False):
    '''
    Main entry point: log in to Weibo and scrape the basic information of
    one user or of a list of users.

    Parameters
    ----------
    href : str or list    home-page URL of a user, or a list of such URLs
    username : str        Weibo account used to log in
    password : str        Weibo password used to log in
    browser : str         'Firefox' (default) or 'Chrome'
    filepath : str        path of the output file (required when saved is True)
    saved : bool          save to file; if False the data is only returned
    newfile : bool        when saving: True creates/overwrites the file,
                          False appends to the existing file

    Returns
    -------
    The scraped table ([header, row, ...]) when saved is False. None when
    the data was saved instead, or when a single href turned out unusable.

    Raises
    ------
    ValueError            unsupported browser, or saved=True without filepath

    Examples
    ----------
    > myhref = 'https://weibo.com/weibokefu'
    > myusername = '123456'
    > mypassword = '123456'
    > myfilepath = 'C:/test.csv'
    > only_data = userinfo(myhref, myusername, mypassword)  # only get the data
    > userinfo(myhref, myusername, mypassword, filepath=myfilepath, saved=True, newfile=True)  # save as a new file
    '''
    url = 'https://weibo.com/login.php'  # Weibo landing page, used to log in
    login_path = '//div[@class="gn_login"]/ul[@class="gn_login_list"]/li[3]/a[@class="S_txt1"]'
    frame_key = ['昵称', '真实姓名', '所在地', '性别', '性取向', '感情状况',
                 '生日', '血型', '博客地址', '个性域名', '简介', '注册时间', 'href']

    # Validate the arguments before a browser is launched, and raise instead
    # of returning an exception object the caller would mistake for data.
    if browser not in ('Firefox', 'Chrome'):
        raise ValueError('目前仅支持Firefox与Chrome浏览器')
    if saved and filepath is None:
        # Fail fast rather than after the whole crawl has finished.
        raise ValueError('saved=True时必须提供filepath')

    if browser == 'Firefox':
        driver = webdriver.Firefox(executable_path='geckodriver')  # open the browser
    else:
        driver = webdriver.Chrome(executable_path='chromedriver')

    try:
        driver.get(url)
        weibo_login(driver, login_path, username, password)
        time.sleep(2)

        if isinstance(href, str):
            result = get_info(driver, href)
            if not result:
                print('这个href不可用 : %s'%href)
                return None
            frame = result
        else:
            result = get_info(driver, href[0])
            if not result:
                # Keep the table shape even when the first user failed.
                frame = [frame_key, [''] * len(frame_key)]
            else:
                frame = result
            length = len(href) - 1
            count = 1
            for i in range(1, len(href)):
                result = get_info(driver, href[i], frame=frame, add=True)
                if not result:
                    continue
                frame = result
                # If extra windows were opened, go back to the first one.
                windows = driver.window_handles
                if len(windows) > 1:
                    driver.switch_to.window(windows[0])
                print('距爬完还有%d条, 已爬取%d条'%(length, count))
                length -= 1
                count += 1
    finally:
        # The driver is local to this call; quit it so the browser process
        # is not leaked, whether the crawl succeeded or raised.
        driver.quit()

    if saved:
        save_userinfo(frame, filepath, newfile)
        return None
    return frame
def enter_infopage(driver, button_path, href, retries=1):
    ''' Navigate to a user's home page and click through to the info page.

    driver      : Selenium WebDriver.
    button_path : XPath of the element that opens the basic-info page.
    href        : URL of the user's home page.
    retries     : how many times to refresh the page and try again after the
                  button could not be clicked (default 1).

    Returns True once the button has been clicked. Returns False when the
    URL before navigation is contained in the URL after it (the page did not
    change), or when every attempt to click the button failed.
    '''
    previous_url = driver.current_url
    driver.get(href)
    if previous_url in driver.current_url:
        return False

    for attempt in range(retries + 1):
        if attempt:
            # Retry in place. Re-running the whole function after a refresh
            # would always trip the URL check above and never click anything.
            driver.refresh()
        try:
            driver.find_element_by_xpath(button_path).click()
            return True
        except Exception:
            # Button not ready yet: fall through to the explicit wait below.
            pass
        try:
            WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located((By.XPATH, button_path)))
            driver.find_element_by_xpath(button_path).click()
            return True
        except Exception:
            continue
    return False
def info_spider(driver, info_path, retries=6, interval=1):
    ''' Collect the text of the basic-info elements on the current page.

    driver    : Selenium WebDriver showing the info page.
    info_path : XPath matching the info block(s).
    retries   : how many extra polls to make while nothing matches
                (default 6).
    interval  : seconds to sleep before each extra poll (default 1).

    Returns the .text of every matched element, or [] when nothing matched
    after the last poll. Elements that only show up on the final poll are
    returned rather than discarded.
    '''
    elements = driver.find_elements_by_xpath(info_path)
    polls = 0
    while not elements and polls < retries:
        time.sleep(interval)
        elements = driver.find_elements_by_xpath(info_path)
        polls += 1
    return [element.text for element in elements]
def save_userinfo(data, filepath, newfile=False):
    '''
    Write the scraped data to a CSV file.

    Parameters
    ----------
    data : array/list     scraped user info; the first row is the header
    filepath : str        path of the output file, must end in '.csv'
    newfile : bool        True creates/overwrites the file and writes the
                          header; False appends the data rows (header row
                          dropped) to the existing file

    Raises
    ------
    ValueError            filepath does not have a .csv extension
    '''
    # Imported here because the module's import block does not provide csv.
    import csv

    # A plain suffix test: the dot must be a literal dot, so names such as
    # 'datacsv' are rejected.
    if not filepath.lower().endswith('.csv'):
        raise ValueError('目前只支持输出csv格式')

    if newfile:
        write_type = 'w'
    else:
        write_type = 'a'
        data = data[1:]  # drop the header row when appending

    # The with-block closes the file; no explicit close() is needed.
    with open(filepath, write_type, newline='', encoding='utf-8-sig') as file:
        csv.writer(file).writerows(data)