Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FTPFileStore to FilesPipeline #3961

Merged
merged 20 commits into from Jan 24, 2020
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
58 changes: 55 additions & 3 deletions scrapy/pipelines/files.py
Expand Up @@ -10,6 +10,7 @@
import time
import logging
from email.utils import parsedate_tz, mktime_tz
from ftplib import FTP
from six.moves.urllib.parse import urlparse
from collections import defaultdict
import six
Expand All @@ -31,6 +32,7 @@
from scrapy.utils.request import referer_str
from scrapy.utils.boto import is_botocore
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.ftp import ftp_makedirs_cwd

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -248,6 +250,53 @@ def persist_file(self, path, buf, info, meta=None, headers=None):
)


class FTPFilesStore(object):
    """Files store that keeps downloaded files on an FTP server.

    The store URI has the form ``ftp://[user[:password]@]host[:port]/base/dir``.
    Credentials missing from the URI fall back to the ``FTP_USERNAME`` and
    ``FTP_PASSWORD`` class attributes.
    """

    # Fallback credentials, used when the store URI does not carry its own.
    FTP_USERNAME = None
    FTP_PASSWORD = None

    def __init__(self, uri):
        """Parse the ``ftp://`` store URI into connection parameters.

        :param uri: store location, e.g. ``ftp://user:pass@host:2121/dir``.
        :raises ValueError: if ``uri`` does not use the ``ftp`` scheme.
        """
        # Explicit check instead of ``assert``: asserts vanish under ``-O``.
        if not uri.startswith('ftp://'):
            raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
        u = urlparse(uri)
        self.host = u.hostname
        self.port = int(u.port or 21)
        self.username = u.username or self.FTP_USERNAME
        self.password = u.password or self.FTP_PASSWORD
        # No trailing slash, so remote paths can be joined with a single '/'.
        self.basedir = u.path.rstrip('/')

    def _connect(self):
        """Return a new, logged-in FTP connection to the configured server."""
        ftp = FTP()
        try:
            ftp.connect(self.host, self.port)
            ftp.login(self.username, self.password)
        except Exception:
            ftp.close()
            raise
        return ftp

    @staticmethod
    def _disconnect(ftp):
        """Close ``ftp`` politely, never raising.

        Errors are swallowed so that cleanup in a ``finally`` clause cannot
        mask the exception raised by the actual transfer.
        """
        try:
            ftp.quit()
        except Exception:
            ftp.close()

    def _remote_path(self, path):
        """Return the absolute remote path of the store-relative ``path``."""
        return '%s/%s' % (self.basedir, path.lstrip('/'))

    def _split_remote_path(self, path):
        """Split ``path`` into ``(absolute remote directory, file name)``.

        For ``'x/y/z.ext'`` the directory is ``basedir + '/x/y'`` and the
        file name is ``'z.ext'``; for a bare ``'z.ext'`` the directory is
        ``basedir`` itself.
        """
        head, sep, filename = path.rpartition('/')
        rel_path = '/' + head.lstrip('/') if sep else ''
        return self.basedir + rel_path, filename

    @staticmethod
    def _parse_mdtm(response):
        """Convert an ``MDTM`` reply into seconds since the epoch.

        The reply looks like ``'213 YYYYMMDDHHMMSS[.sss]'`` with the time
        expressed in UTC (RFC 3659), so it has to be decoded as a calendar
        timestamp rather than read as a plain number.

        :raises ValueError: if the reply does not carry a valid timestamp.
        """
        import calendar

        stamp = response[4:].strip()  # drop the '213 ' status prefix
        whole, _, fraction = stamp.partition('.')
        seconds = calendar.timegm(time.strptime(whole, '%Y%m%d%H%M%S'))
        return seconds + (float('0.' + fraction) if fraction else 0.0)

    def persist_file(self, path, buf, info, meta=None, headers=None):
        """Upload ``buf`` to ``path`` (relative to the store base directory).

        The upload runs in a worker thread; missing remote directories are
        created first.

        :param path: store-relative destination such as ``'full/abc.jpg'``.
        :param buf: seekable binary file-like object holding the content.
        :param info: unused by this store.
        :param meta: unused by this store.
        :param headers: unused by this store.
        :returns: the Deferred produced by ``threads.deferToThread``.
        """

        def _persist_file(path, buf):
            directory, filename = self._split_remote_path(path)
            ftp = self._connect()
            try:
                buf.seek(0)
                ftp_makedirs_cwd(ftp, directory)
                ftp.storbinary('STOR %s' % filename, buf)
            finally:
                self._disconnect(ftp)

        return threads.deferToThread(_persist_file, path, buf)

    def stat_file(self, path, info):
        """Look up modification time and MD5 checksum of a stored file.

        The lookup runs in a worker thread.

        :param path: store-relative path of the file.
        :param info: unused by this store.
        :returns: a Deferred firing with
            ``{'last_modified': <epoch seconds>, 'checksum': <md5 hex>}``,
            or with ``{}`` when the file cannot be inspected.
        """

        def _stat_file(path):
            try:
                ftp = self._connect()
                try:
                    remote = self._remote_path(path)
                    last_modified = self._parse_mdtm(
                        ftp.voidcmd('MDTM ' + remote))
                    m = hashlib.md5()
                    ftp.retrbinary('RETR ' + remote, m.update)
                finally:
                    self._disconnect(ftp)
            # Best effort: any failure (typically the file does not exist
            # yet) is reported as "nothing stored".
            except Exception:
                return {}
            return {'last_modified': last_modified, 'checksum': m.hexdigest()}

        return threads.deferToThread(_stat_file, path)


class FilesPipeline(MediaPipeline):
"""Abstract pipeline that implements the file downloading

Expand All @@ -274,6 +323,7 @@ class FilesPipeline(MediaPipeline):
'file': FSFilesStore,
's3': S3FilesStore,
'gs': GCSFilesStore,
'ftp': FTPFilesStore
}
DEFAULT_FILES_URLS_FIELD = 'file_urls'
DEFAULT_FILES_RESULT_FIELD = 'files'
Expand All @@ -284,7 +334,6 @@ def __init__(self, store_uri, download_func=None, settings=None):

if isinstance(settings, dict) or settings is None:
settings = Settings(settings)

OmarFarrag marked this conversation as resolved.
Show resolved Hide resolved
cls_name = "FilesPipeline"
self.store = self._get_store(store_uri)
resolve = functools.partial(self._key_for_pipe,
Expand All @@ -303,7 +352,6 @@ def __init__(self, store_uri, download_func=None, settings=None):
self.files_result_field = settings.get(
resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
)

super(FilesPipeline, self).__init__(download_func=download_func, settings=settings)

@classmethod
Expand All @@ -321,6 +369,10 @@ def from_settings(cls, settings):
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None

ftp_store = cls.STORE_SCHEMES['ftp']
ftp_store.FTP_USERNAME = settings['FTP_USER'] # Default is 'anonymous'
OmarFarrag marked this conversation as resolved.
Show resolved Hide resolved
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD'] # Default is `guest`

store_uri = settings['FILES_STORE']
return cls(store_uri, settings=settings)

Expand Down Expand Up @@ -460,4 +512,4 @@ def item_completed(self, results, item, info):
def file_path(self, request, response=None, info=None):
    """Return the store-relative path under which ``request`` is saved.

    The file name is the SHA-1 hex digest of the request URL followed by
    whatever extension ``os.path.splitext`` finds on that URL, placed in
    the ``full/`` directory.

    :param request: request whose ``url`` attribute identifies the file.
    :param response: unused here.
    :param info: unused here.
    """
    media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    media_ext = os.path.splitext(request.url)[1]
    return 'full/%s%s' % (media_guid, media_ext)