In [2]:
# %load app.py
import sys;
import json;

from crawler.community.bitcointalk import bitcointalk;
from crawler.community.ripplecoin import ripplecoin;
from crawler.community.ethereum import ethereum;
from crawler.community.litecointalk import litecointalk;

# Entry point: pick a crawler from a JSON config file and run the requested crawl.
#
# Config keys read below:
#   "community" - name of the crawler to use (see crawlerclasses)
#   "pages"     - optional, {"startpage": ..., "endpage": ...} for a range crawl
#   "page"      - optional, {"pageno": ...} for a single-page crawl
#
# NOTE(review): inside a Jupyter kernel sys.argv usually carries the kernel's
# own launch arguments, so the default path is unlikely to be selected there
# - confirm before relying on this cell outside of `python app.py`.

# Use the first command-line argument as the config path, else the default file.
configpath = sys.argv[1] if len(sys.argv) >= 2 else "./default-config.json"

crawler = None

# Parse the config first so the file handle is released before the
# (potentially long) crawl starts, instead of staying open throughout it.
with open(configpath, "r") as configfile :
	config = json.load(configfile)

print("community : " + config["community"])

# Supported community names mapped to their crawler classes.
crawlerclasses = {
	"bitcointalk": bitcointalk,
	"ripplecoin": ripplecoin,
	"ethereum": ethereum,
	"litecointalk": litecointalk,
}

crawlerclass = crawlerclasses.get(config["community"])
if crawlerclass is None :
	print("There is no crawler")
	sys.exit()

crawler = crawlerclass()

# Range crawl: both bounds are converted to int before being passed on.
if "pages" in config :
	pagerange = config["pages"]
	print("start crawling pages : " + str(pagerange["startpage"]) + " to : " + str(pagerange["endpage"]))
	crawler.crawlingPages(int(pagerange["startpage"]), int(pagerange["endpage"]))

# Single-page crawl; runs in addition to the range crawl when both keys exist.
if "page" in config :
	singlepage = config["page"]
	print("start crawling page : " + str(singlepage["pageno"]))
	crawler.crawlingPage(int(singlepage["pageno"]))

# crawler = bitcointalk();
# crawler.crawlingPage(1);
# crawler.crawlingPages(1,5);

# crawler = ripplecoin();
# crawler.crawlingPages(1,10);

# crawler = ethereum();
# crawler.crawlingPages(1,10);

# crawler = litecointalk();
# crawler.crawlingPages(1, 10);

ModuleNotFoundError: No module named 'crawler.community.bitcointalk'

In [6]:
# %load crawler.py
import requests;

class crawler(object) :
	"""Base class for the community crawlers: HTTP fetching plus debug logging.

	NOTE(review): every method is declared ``@classmethod``, so the parameter
	named ``self`` is really the class and ``__debug`` is class-level state.
	The decorators are kept because the visible subclass calls
	``super(..., self).__init__()`` with a class as ``self``.
	"""

	# Class-level default so debuglog() works even before __init__ has run.
	__debug = False

	@classmethod
	def __init__(self) :
		"""Reset the debug flag to False."""
		self.__debug = False

	@classmethod
	def getAddressFormat(self) :
		"""Return the URL format string; the base class has none and returns ""."""
		return ""

	@classmethod
	def getResponse(self, address, timeout=None) :
		"""Fetch ``address`` with an HTTP GET and return the ``requests`` response.

		Args:
			address: URL to request.
			timeout: optional timeout forwarded to ``requests.get``; the
				default ``None`` waits indefinitely, as before.

		Returns:
			The response object returned by ``requests.get``.
		"""
		# The browser identification string has to be sent under the
		# "User-Agent" header name; the previous "Header" key had no meaning
		# to servers, so requests still went out with the library default.
		headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"}

		return requests.get(address, headers=headers, timeout=timeout)

	@classmethod
	def debuglog(self, message) :
		"""Print ``message`` only when the debug flag is set."""
		if self.__debug :
			print(message)

In [None]:
# %load bitcointalk.py
# NOTE: leftover URL template from a different site (todayhumor), not used by this crawler:
# http://www.todayhumor.co.kr/board/view.php?table={tablename}&no={2}&s_no={2}

from crawler.crawler import crawler
from bs4 import BeautifulSoup;
import re;
from datetime import datetime;
import json;

class bitcointalk(crawler) :
	"""Crawler for the bitcointalk.org board listing (board=1) and its topics.

	NOTE(review): every method is declared ``@classmethod`` (as in the
	``crawler`` base class), so the parameter named ``self`` is really the
	class, and the address format and parsed soup are stored on the class and
	shared by all instances.
	"""

	# Offset step between consecutive board-listing pages
	# (presumably the number of topics per listing page - TODO confirm).
	_BOARD_OFFSET_STEP = 40
	# Offset step between consecutive pages of one topic
	# (presumably the number of posts per topic page - TODO confirm).
	_TOPIC_OFFSET_STEP = 20

	@classmethod
	def __init__(self) :
		"""Run the base initialiser and set the board URL format and empty soup."""
		super(bitcointalk,self).__init__()

		self.__addressformat = "https://bitcointalk.org/index.php?board=1.{0}"
		self.__soup = None

	@classmethod
	def __getAddressFormat(self) :
		"""Return the board-listing URL format string."""
		return self.__addressformat

	@classmethod
	def __soupFind(self, tag, attrs) :
		"""Return the first ``tag`` with ``attrs`` in the loaded listing page."""
		return self.__soup.find(tag, attrs)

	@classmethod
	def __soupFindAll(self, tag, attrs) :
		"""Return every ``tag`` with ``attrs`` in the loaded listing page."""
		return self.__soup.find_all(tag, attrs=attrs)

	@classmethod
	def __loadHtml(self, index) :
		"""Download board-listing page ``index`` (1-based) and parse it into the soup."""
		# The URL takes an offset rather than a page number: page 1 -> 0, page 2 -> 40, ...
		offset = str(self._BOARD_OFFSET_STEP*(index-1))
		address = self.__getAddressFormat().format(offset)

		response = super(bitcointalk,self).getResponse(address)

		self.__soup = BeautifulSoup(response.text, "html5lib")

	@classmethod
	def getHtml(self) :
		"""Return the prettified HTML of the loaded listing page, or "" if none is loaded."""
		if self.__soup is None :
			return ""
		return self.__soup.prettify()

	@classmethod
	def crawlingPage(self, pageno) :
		"""Crawl one board-listing page and every topic linked from it.

		Args:
			pageno: 1-based listing page number; values below 1 are treated as 1.

		Returns:
			``{"posts": [...]}`` where each post has the keys "topic",
			"content", "date", "replies" and "views".

		Side effects:
			Writes the result as UTF-8 JSON to ``bitcointalk_<pageno>.json``
			(relative path, so in the current working directory).
		"""
		if pageno < 1 :
			pageno = 1

		# pageno is an int: convert it, otherwise the concatenation raises TypeError.
		print("pageno : " + str(pageno))

		self.__loadHtml(pageno)

		postlist = []
		for postinfo in self.__parsePostsInfo(pageno) :
			post = self.__parsePost(postinfo["uri"], postinfo["reply"])
			post["views"] = postinfo["views"]
			postlist.append(post)

		result = {"posts": postlist}

		# The context manager closes the file even if serialising or writing fails.
		with open("bitcointalk"+"_"+str(pageno)+".json","wb") as outfile :
			outfile.write(json.dumps(result, ensure_ascii=False).encode('utf-8'))

		return result

	@classmethod
	def crawlingPages(self, startpage, endpage) :
		"""Crawl listing pages ``startpage``..``endpage`` (inclusive) and merge the posts.

		Non-positive bounds are replaced by 1. Each page is crawled with
		``crawlingPage`` (which also writes its own JSON file).

		Returns:
			``{"posts": [...]}`` with the posts of all crawled pages, in page order.
		"""
		if startpage <= 0 :
			startpage = 1

		if endpage <= 0 :
			endpage = 1

		pages = {"posts": []}

		for page in range(startpage, endpage+1) :
			pages["posts"].extend(self.crawlingPage(page)["posts"])

		return pages

	@classmethod
	def __parsePostsInfo(self, page) :
		"""Extract topic link, view count and reply count for each topic in the loaded listing.

		``page`` is accepted for interface compatibility but not used.

		Returns:
			A list of ``{"uri": str, "views": int, "reply": int}`` dicts.
		"""
		postinfolist = []

		# Each topic title sits in a <span id="msg_NNN"> wrapping the topic link.
		for span in self.__soupFindAll("span", {"id":re.compile(r"msg_[0-9]+")}) :
			uri = span.a["href"]
			td = span.parent

			# Sibling cells of the same row that share the title cell's class;
			# index 1 is read as the reply count and index 2 as the view count.
			tdlist = td.parent.find_all("td",{"class":td["class"]})

			postinfolist.append({"uri":uri,"views":int(tdlist[2].text),"reply":int(tdlist[1].text)})

		return postinfolist

	@classmethod
	def __parseMessages(self, address) :
		"""Download one topic page and return its messages in page order.

		Returns:
			A list of ``{"subject": str, "date": str, "content": str}`` dicts.
			Quote headers are removed from the content before tag stripping.
		"""
		response = self.getResponse(address)
		soup = BeautifulSoup(response.text, "html5lib")
		form = soup.find("form",{"id":"quickModForm"})

		# Message rows are the <tr> elements sharing the class of the form's first row.
		firstrow = form.find("tr")

		messages = []
		for row in form.find_all("tr",attrs={"class":firstrow["class"]}) :
			headerandpost = row.find("td",{"class":"td_headerandpost"})

			subject = headerandpost.find("div",{"id":re.compile(r"subject")})
			datestr = subject.parent.find("div",{"class":"smalltext"}).text
			postobj = headerandpost.find("div",{"class":"post"})

			for quote in postobj.find_all("div",{"class":"quoteheader"}) :
				quote.decompose()

			messages.append({
				"subject": subject.text,
				"date": self.__parseDate(datestr),
				"content": self.__removeTag(postobj.prettify().split("\n")),
			})

		return messages

	@classmethod
	def __parsePost(self, address, replycount) :
		"""Fetch a topic and all of its replies.

		Args:
			address: URL of the topic's first page.
			replycount: number of replies reported by the board listing; it
				decides how many follow-up pages are requested.

		Returns:
			``{"topic", "content", "date", "replies"}`` where ``replies`` is a
			list of ``{"date", "content"}`` dicts.
		"""
		post = {}
		replies = []

		messages = self.__parseMessages(address)
		if messages :
			# The first message on the first page is the topic itself.
			opening = messages[0]
			post["topic"] = opening["subject"]
			post["content"] = opening["content"]
			post["date"] = opening["date"]
			replies = [{"date": m["date"], "content": m["content"]} for m in messages[1:]]

		# Follow-up pages are addressed by appending the message offset to the
		# topic URL. NOTE(review): this relies on the URL ending in an offset
		# the site still parses after concatenation - confirm.
		for replypage in range(1, replycount // self._TOPIC_OFFSET_STEP + 1) :
			replies += self.__parseReply(address+str(replypage*self._TOPIC_OFFSET_STEP))

		post["replies"] = replies

		return post

	@classmethod
	def __parseReply(self, address) :
		"""Fetch one follow-up page of a topic and return its messages as replies.

		Returns:
			A list of ``{"date": str, "content": str}`` dicts.
		"""
		return [{"date": m["date"], "content": m["content"]} for m in self.__parseMessages(address)]

	@classmethod
	def __parseDate(self,datestr):
		"""Convert a post date string to ``"%Y-%m-%d %H:%M:%S"``.

		Two inputs are handled: text containing "Today" (the time is taken
		from after the word "at" and combined with the current local date) and
		full dates in the form ``"%B %d, %Y, %I:%M:%S %p"``.

		Raises:
			ValueError: if a non-"Today" string does not match the full format.
		"""
		# Surrounding whitespace would make strptime reject an otherwise valid date.
		datestr = datestr.strip()

		if "Today" in datestr :
			today = datetime.now().strftime("%Y-%m-%d")
			timestr = datestr.split("at")[1]
			try :
				clock = datetime.strptime(timestr.strip(), "%I:%M:%S %p")
			except ValueError :
				# Unrecognised time layout: keep the raw text, as before.
				return today + timestr
			# Same 24-hour layout as the full-date branch below.
			return today + " " + clock.strftime("%H:%M:%S")

		dateobj = datetime.strptime(datestr,"%B %d, %Y, %I:%M:%S %p")
		return dateobj.strftime("%Y-%m-%d %H:%M:%S")

	@classmethod
	def __removeTag(self, lines) :
		"""Strip markup from ``lines`` and return the non-empty ones, each ending in a newline.

		Args:
			lines: iterable of strings (one per line of prettified HTML).

		Returns:
			A single string; "" when every line is empty after stripping.
		"""
		kept = []

		for line in lines :
			# One pattern covers both opening and closing tags.
			line = re.sub('<[^>]*>','',line)
			line = re.sub('[\n\t]','',line)
			# Also drop literal backslash-n sequences left in the text.
			line = re.sub('\\\\n','',line)
			line = line.strip()

			if line :
				kept.append(line+'\n')

		return "".join(kept)

In [2]:
# Print the greeting three times, one per line.
for _ in range(3):
    print('hi')

hi
hi
hi
