This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

Commit 0835a05
Added ability to crawl only allowed domains in config file
rtrevinnoc committed Mar 7, 2021
1 parent 1d8810b commit 0835a05
Showing 3 changed files with 10 additions and 3 deletions.
README.md (2 changes: 2 additions & 0 deletions)
@@ -38,6 +38,8 @@ HOST_NAME = "my_public_future_instance" # THE NAMES 'private' and 'weare
 with open("tranco_JKGY.csv") as tranco:
     SEED_URLS = [x.strip() for x in tranco.readlines()]
 PEER_PORT = 3000
+LIMIT_DOMAINS = None
+ALLOWED_DOMAINS = ["*"]
 CONCURRENT_REQUESTS = 10
 CONCURRENT_REQUESTS_PER_DOMAIN = 2.0
 CONCURRENT_ITEMS = 100
docs/source/index.rst (2 changes: 2 additions & 0 deletions)
@@ -47,6 +47,8 @@ It is suggested to start with this configuration template, which is essentially
 with open("tranco_JKGY.csv") as tranco:
     SEED_URLS = [x.strip() for x in tranco.readlines()]
 PEER_PORT = 3000
+LIMIT_DOMAINS = None
+ALLOWED_DOMAINS = ["*"]
 CONCURRENT_REQUESTS = 10
 CONCURRENT_REQUESTS_PER_DOMAIN = 2.0
 CONCURRENT_ITEMS = 100
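Both templates ship with filtering effectively off: ALLOWED_DOMAINS = ["*"] permits everything, and, judging from how linkreaper.py wires LIMIT_DOMAINS into SPIDER_MIDDLEWARES below, LIMIT_DOMAINS = None keeps Scrapy's OffsiteMiddleware disabled, while an integer enables it at that middleware order (500 is Scrapy's default for it). A minimal sketch of a restricted config, with illustrative domain names that are not part of this commit:

# Illustrative config.py values only; the committed defaults
# (LIMIT_DOMAINS = None, ALLOWED_DOMAINS = ["*"]) leave filtering off.
LIMIT_DOMAINS = 500                                # enable OffsiteMiddleware at its default order
ALLOWED_DOMAINS = ["example.com", "example.org"]   # restrict the crawl to these domains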
linkreaper.py (9 changes: 6 additions & 3 deletions)
@@ -27,7 +27,7 @@
 from scrapy.crawler import CrawlerProcess
 from nltk.tokenize import word_tokenize
 from gensim.models import KeyedVectors
-from config import SEED_URLS, CONCURRENT_REQUESTS, CONCURRENT_REQUESTS_PER_DOMAIN, CONCURRENT_ITEMS, REACTOR_THREADPOOL_MAXSIZE, DOWNLOAD_MAXSIZE, LOG_LEVEL, AUTOTHROTTLE, DEPTH_PRIORITY, TARGET_CONCURRENCY, MAX_DELAY, START_DELAY
+from config import SEED_URLS, CONCURRENT_REQUESTS, CONCURRENT_REQUESTS_PER_DOMAIN, CONCURRENT_ITEMS, REACTOR_THREADPOOL_MAXSIZE, DOWNLOAD_MAXSIZE, LOG_LEVEL, AUTOTHROTTLE, DEPTH_PRIORITY, TARGET_CONCURRENCY, MAX_DELAY, START_DELAY, LIMIT_DOMAINS, ALLOWED_DOMAINS
 from Monad import *
 import numpy as np
 
@@ -123,7 +123,7 @@ def returnDataFromImageTags(url: str, someIterable: list) -> list:
 
 class Indexer(scrapy.Spider):
     name = "indexer"
-    allowed_urls = ["*"]
+    allowed_urls = ALLOWED_DOMAINS
     custom_settings = {
         "CONCURRENT_REQUESTS": CONCURRENT_REQUESTS,
         "CONCURRENT_REQUESTS_PER_DOMAIN": CONCURRENT_REQUESTS_PER_DOMAIN,
@@ -151,7 +151,10 @@ class Indexer(scrapy.Spider):
         "DEPTH_PRIORITY": DEPTH_PRIORITY,
         "SCHEDULER_DISK_QUEUE": 'scrapy.squeues.PickleFifoDiskQueue',
         "SCHEDULER_MEMORY_QUEUE": 'scrapy.squeues.FifoMemoryQueue',
-        "AJAXCRAWL_ENABLED": True
+        "AJAXCRAWL_ENABLED": True,
+        "SPIDER_MIDDLEWARES": {
+            'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': LIMIT_DOMAINS
+        }
     }
 
     start_urls = SEED_URLS
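Note that stock Scrapy filters off-site requests via a spider's allowed_domains attribute, not allowed_urls, so the assignment above only takes effect if the project reads allowed_urls itself; with the committed default LIMIT_DOMAINS = None, the None value in SPIDER_MIDDLEWARES disables OffsiteMiddleware outright either way. A minimal, self-contained sketch of the intended behaviour using the standard attribute name (the spider name, domains, and URLs are illustrative, not from this commit):

# Sketch only, not part of this commit: domain-limited crawling with
# Scrapy's stock OffsiteMiddleware, which checks spider.allowed_domains.
import scrapy
from scrapy.crawler import CrawlerProcess

LIMIT_DOMAINS = 500                  # middleware order; None would disable the filter
ALLOWED_DOMAINS = ["example.com"]    # illustrative domain list

class RestrictedSpider(scrapy.Spider):
    name = "restricted"
    allowed_domains = ALLOWED_DOMAINS   # the attribute OffsiteMiddleware consults
    start_urls = ["https://example.com/"]
    custom_settings = {
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": LIMIT_DOMAINS,
        }
    }

    def parse(self, response):
        # Off-site links yielded here are dropped by the middleware before download.
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(RestrictedSpider)
    process.start()   # blocks until the crawl finishes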
