This repository has been archived by the owner on Jan 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
154 lines (139 loc) · 4.26 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import re
import time
import json
import requests
from urllib import parse
from urllib import request
from bs4 import BeautifulSoup
# ---- Module-level crawl state (reset per user in getUserStatus) ----
probDict={}       # problem id -> problem title (for the current user)
ACTimeDict={}     # problem id -> submit time of an AC on that problem
probSet=set([])   # distinct problem ids the current user has ACed
userDict={}       # loj user id -> display/remark name (from users.csv)
userList=[]       # loj user ids in file order, drives the crawl loop
resDict={}        # user id -> sorted list of ACed problem ids
resList=[]        # sorted AC list of the user crawled most recently
allAC=0           # distinct AC count for the current user
validAC=0         # ACs with problem id >= 100 ("valid" problems)
lastACTime="-"    # submit time of the newest AC ("-" if none found)
resJSONList=[]    # per-user summary dicts, serialised at the end
def writeFile(context):  # debug helper
    """Write *context* to test.txt (debugging aid only).

    Uses a context manager so the handle is closed even if the write
    fails, and an explicit UTF-8 encoding so Chinese text is written
    the same way on every platform (the original relied on the locale
    default encoding).
    """
    with open("test.txt", "w", encoding="utf-8") as f:
        f.write(context)
def readUser():
    """Load the users to crawl from users.csv into userDict/userList.

    Each data line is "loj_id,remark_name".  Lines starting with '#'
    are comments.  Fixes vs. the original: the file handle is closed
    via a context manager, and blank lines are skipped instead of
    raising IndexError on ``line[0]``.
    """
    with open("users.csv") as f:
        for line in f:
            # Strip the trailing newline (works for both \n and \r\n).
            line = line.strip('\r\n')
            if not line or line[0] == '#':
                continue
            now = line.split(',')
            userDict[now[0]] = now[1]
            userList.append(now[0])
    print("已读取到 "+format(len(userList))+" 个用户待爬取:")
    print(userList)
def getHTML(url):
    """GET *url* and return the response body as text.

    Retries forever with a short delay on any request failure (timeout,
    connection error, ...).  The original used a bare ``except:``, which
    also swallowed KeyboardInterrupt/SystemExit and made the retry loop
    impossible to abort with Ctrl-C; ``except Exception`` fixes that.
    The dead ``res = ""`` pre-assignment is gone: we return directly.
    """
    while True:
        try:
            return requests.get(url, timeout=5).text
        except Exception:
            print("请求超时/失败,正在重请求中..")
            time.sleep(0.2)
def getStatus(url,page):
    """Scrape one page of AC submissions and fold it into the globals.

    Updates probDict/ACTimeDict/probSet and the allAC/validAC counters,
    records lastACTime from the first record of page 1, and returns the
    absolute URL of the next page, or the string "0" when there is none.
    """
    html=getHTML(url)
    html=format(html)
    # Extract the embedded "itemList = [...]" JSON and walk each record.
    # NOTE(review): reg.findall(html)[0] raises IndexError if the page
    # layout changes and the pattern is absent -- no fallback here.
    reg=re.compile(r'itemList = (.*?);')
    jsonstr=reg.findall(html)[0]
    records=json.loads(jsonstr)
    for entry in records:
        name=entry['info']['problemName']
        proid=entry['info']['problemId']
        ACTime=entry['info']['submitTime']
        probDict[proid]=name
        ACTimeDict[proid]=ACTime
        # Count each distinct problem once; ids >= 100 count as "valid".
        if (proid not in probSet):
            global allAC,validAC
            allAC=allAC+1
            if (proid>=100):
                validAC=validAC+1
            probSet.add(proid)
    # Record the last AC time (first record on the first page is newest).
    if (page==1 and len(records)>0):
        global lastACTime
        lastACTime=records[0]['info']['submitTime']
    # Detect whether a next page exists via the pager's "next" button.
    dom=BeautifulSoup(html,"html.parser")
    nextbutton=dom.find(id="page_next")
    if (nextbutton==None):
        return "0"
    if ("disabled" in nextbutton['class']):
        return "0"
    return "https://loj.ac"+nextbutton['href']
def getUserStatus(user):
    """Crawl every AC record of *user*, page by page, into the globals.

    Resets the per-user accumulators, follows the pager until getStatus
    reports "0" (no next page), then stores the sorted problem list in
    resList/resDict and prints a summary.
    """
    global allAC, validAC, resList, lastACTime
    # Reset all per-user state before crawling.
    allAC = 0
    validAC = 0
    lastACTime = "-"
    probDict.clear()
    probSet.clear()
    ACTimeDict.clear()
    name = userDict[user]
    print("正在爬取 "+user+"("+name+") 的AC记录")
    url = "https://loj.ac/submissions?status=Accepted&submitter="+format(user)
    page = 0
    # Follow the pagination chain; getStatus returns "0" at the end.
    while url != "0":
        page += 1
        print("爬取第"+format(page)+"页...")
        url = getStatus(url, page)
    result = sorted(probSet)
    resList = result
    resDict[user] = result
    print("")
    print(user+"("+name+") 共AC了 "+format(allAC)+" 道题,有效AC "+format(validAC)+" 题,分别是:")
    print(result)
    print("最后 AC 时间:"+lastACTime)
    print("")
def getJSONDict(user):
    """Build a JSON-serialisable summary for *user* from the crawl globals.

    Returns a dict with the user's id, remark name, AC counters, last AC
    time, and a per-problem list (id / title / AC time) in sorted order.
    """
    return {
        'user': user,
        'name': userDict[user],
        'allAC': allAC,
        'validAC': validAC,
        'lastACTime': lastACTime,
        'probs': [
            {'id': pid, 'title': probDict[pid], 'time': ACTimeDict[pid]}
            for pid in resList
        ],
    }
# ---- Entry point: crawl every listed user, then export the results ----
readUser()
for user in userList:
    getUserStatus(user)
    resJSONList.append(getJSONDict(user))
JSON = json.dumps(resJSONList, ensure_ascii=False)
# Export HTML: inject the archive timestamp and result JSON ahead of the
# template in assets/res.  All files use ``with`` so handles are closed
# even if a write fails (the original closed them manually).
with open("assets/res", "r", encoding="utf-8") as f:
    html = ("<script>var archivetime='"
            + time.strftime("%Y-%m-%d %H:%M", time.localtime())
            + "';var json=" + JSON + ";</script>" + f.read())
with open("result.html", "w", encoding="utf-8") as f:
    f.write(html)
# Export CSV.  NOTE(review): "ansi" is a Windows-only codec alias (mbcs);
# on Linux/macOS this open() raises LookupError -- confirm the script is
# Windows-only, otherwise switch to "utf-8-sig" for Excel compatibility.
with open("result.csv", "w", encoding="ansi") as f:
    f.write("用户名,备注名,AC数,有效AC数,最后AC时间"+"\n")
    for entry in resJSONList:
        f.write(format(entry['user'])+","+format(entry['name'])+","+format(entry['allAC'])+","+format(entry['validAC'])+","+format(entry['lastACTime'])+"\n")
print("已导出为 HTML(result.html) 和 CSV(result.csv) 文件")