Skip to content

Commit

Permalink
Merge pull request #230 from samvera/no-more-parties
Browse files Browse the repository at this point in the history
Replace HTTParty with Typhoeus for improved large file streaming support
  • Loading branch information
jrgriffiniii committed Jul 24, 2018
2 parents 3f8d526 + 68f6020 commit 3bfdbb7
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 52 deletions.
2 changes: 1 addition & 1 deletion browse-everything.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
spec.add_dependency 'font-awesome-rails'
spec.add_dependency 'google-api-client', '~> 0.21'
spec.add_dependency 'google_drive', '~> 2.1'
spec.add_dependency 'httparty', '~> 0.15'
spec.add_dependency 'typhoeus'
spec.add_dependency 'rails', '>= 4.2'
spec.add_dependency 'ruby-box'
spec.add_dependency 'sass-rails'
Expand Down
30 changes: 18 additions & 12 deletions lib/browse_everything/retriever.rb
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
# frozen_string_literal: true

require 'addressable'
require 'httparty'
require 'tempfile'
require 'typhoeus'

module BrowseEverything
# Class for raising errors when a download is invalid
# @see HTTParty::Error
class DownloadError < HTTParty::Error
class DownloadError < StandardError
attr_reader :response

# Constructor
# @param msg [String]
# @param response [HTTParty::Response] response from the server
# @param response [Typhoeus::Response] response from the server
def initialize(msg, response)
@response = response
super(msg)
Expand All @@ -31,6 +30,12 @@ class Retriever

attr_accessor :chunk_size

class << self
  # Determines whether a remote resource can be downloaded by requesting
  # only its first byte instead of transferring the whole file.
  #
  # @param uri [String] the URL of the resource to probe
  # @param headers [Hash] optional extra request headers (for example an
  #   Authorization header); the Range header is always set to 'bytes=0-0'
  # @return [Boolean] whatever Typhoeus reports from #success? for the probe
  def can_retrieve?(uri, headers = {})
    # Merge last so the single-byte Range always applies to the probe,
    # and tolerate an explicit nil being passed for the headers.
    probe_headers = (headers || {}).merge(Range: 'bytes=0-0')
    Typhoeus.get(uri, headers: probe_headers).success?
  end
end

# Constructor
def initialize
@chunk_size = CHUNK_SIZE
Expand Down Expand Up @@ -126,14 +131,15 @@ def retrieve_http(options)
url = options.fetch(:url)
retrieved = 0

# Determine whether or not to stream the body by the size of the resource requested
stream_body = file_size > 500.megabytes

response = HTTParty.get(url.to_s, stream_body: stream_body, headers: headers) do |chunk|
retrieved += chunk.length
request = Typhoeus::Request.new(url.to_s)
request.on_headers do |response|
raise DownloadError.new("#{self.class}: Failed to download #{url}", response) unless response.code == 200
end
request.on_body do |chunk|
retrieved += chunk.bytesize
yield(chunk, retrieved, file_size)
end
raise DownloadError.new("#{self.class}: Failed to download #{url}", response) unless response.code == 200
request.run
end

# Retrieve the file size
Expand All @@ -148,8 +154,8 @@ def get_file_size(options)
when 'file'
File.size(url.path)
when /https?/
response = HTTParty.head(url.to_s, headers: headers)
length_value = response.content_length || file_size
response = Typhoeus.head(url.to_s, headers: headers)
length_value = response.headers['Content-Length'] || file_size
length_value.to_i
else
raise URI::BadURIError, "Unknown URI scheme: #{url.scheme}"
Expand Down
93 changes: 93 additions & 0 deletions spec/fixtures/vcr_cassettes/retriever.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,98 @@
---
http_interactions:
- request:
method: get
uri: https://retrieve.cloud.example.com/some/dir/can_retrieve.pdf
headers:
Range: bytes=0-0
response:
status:
code: '206'
message: 'Partial Content'
headers:
Content-Length:
- 1
Content-Type:
- application/pdf
Content-Range:
- 'bytes 0-0/64134'
body:
encoding: ASCII-8BIT
string: '%'
http_version: '1.1'
recorded_at: Tue, 24 Jul 2018 10:38:42 GMT
- request:
method: get
uri: https://retrieve.cloud.example.com/some/dir/cannot_retrieve.pdf
headers:
Range: bytes=0-0
response:
status:
code: '403'
message: 'Unauthorized'
http_version: '1.1'
recorded_at: Tue, 24 Jul 2018 10:38:42 GMT
- request:
method: get
uri: https://drive.google.com/uc?export=download&id=id
headers:
Authorization:
- Bearer access-token
response:
status:
code: '200'
message: OK
headers:
Content-Length: '1234'
body:
encoding: ASCII-8BIT
string: content
http_version: '1.1'
recorded_at: Mon, 23 Jul 2018 16:00:03 GMT
- request:
method: head
uri: https://drive.google.com/uc?export=download&id=id
headers:
Authorization:
- Bearer access-token
response:
status:
code: '200'
message: OK
headers:
Content-Length: '1234'
body:
encoding: ASCII-8BIT
string: content
http_version: '1.1'
recorded_at: Mon, 23 Jul 2018 16:00:03 GMT
- request:
method: get
uri: https://retrieve.cloud.example.com/some/dir/file_error.pdf
headers:
Accept: ! '*/*'
Authorization: Bearer ya29.kQCEAHj1bwFXr2AuGQJmSGRWQXpacmmYZs4kzCiXns3d6H1ZpIDWmdM8
response:
status:
code: '403'
message: Unauthorized
headers:
Server: nginx
Date: Wed, 01 Oct 2014 14:01:16 GMT
Content-Type: application/json
Transfer-Encoding: chunked
Connection: close
pragma: no-cache
cache-control: no-cache
body:
encoding: ASCII-8BIT
string: |-
{"error":{"errors":[{"domain":"usageLimits","reason":"dailyLimitExceededUnreg",
"message":"Daily Limit for Unauthenticated Use Exceeded. Continued use requires signup.",
"extendedHelp":"https://code.google.com/apis/console"}],"code":403,
"message":"Daily Limit for Unauthenticated Use Exceeded. Continued use requires signup."}}
http_version: '1.1'
recorded_at: Mon, 23 Jul 2018 15:27:31 GMT
- request:
method: get
uri: https://retrieve.cloud.example.com/some/dir/file.pdf
Expand Down
77 changes: 38 additions & 39 deletions spec/lib/browse_everything/retriever_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,12 @@

context "when retrieving a resource from a cloud storage provider" do
let(:url) { URI.parse("https://drive.google.com/uc?id=id&export=download") }
let(:response) { double }
let(:headers) do
{
'Authorization:' => 'Bearer access-token'
}
end

before do
WebMock.disable!
allow(response).to receive(:content).and_return('content')
allow(response).to receive(:content_length).and_return('1234')
allow(response).to receive(:code).and_return(200)
allow(HTTParty).to receive(:head).and_return(response)
allow(HTTParty).to receive(:get).and_return(response)
end

it 'calculates or retrieves the size of a file' do
retriever.retrieve(options) do |chunk, retrieved, total|
expect(total).to eq 1234
end
end

after do
WebMock.enable!
end
end

context "when retrieving a resource with an unsupported protocol" do
Expand Down Expand Up @@ -121,31 +102,19 @@
end

context 'when downloading content and a server error occurs' do
let(:download_options) { spec['0'] }
let(:response) { instance_double(HTTParty::Response) }
let(:error) do
let(:spec) do
{
'error' =>
{
'errors' => [
{
'domain' => 'usageLimits',
'reason' => 'dailyLimitExceededUnreg',
'message' => 'Daily Limit for Unauthenticated Use Exceeded. Continued use requires signup.',
'extendedHelp' => 'https://code.google.com/apis/console'
}
],
'code' => 403,
'message' => 'Daily Limit for Unauthenticated Use Exceeded. Continued use requires signup.'
'0' => {
'url' => 'https://retrieve.cloud.example.com/some/dir/file_error.pdf',
'auth_header' => { 'Authorization' => 'Bearer ya29.kQCEAHj1bwFXr2AuGQJmSGRWQXpacmmYZs4kzCiXns3d6H1ZpIDWmdM8' },
'expires' => expiry_time,
'file_name' => 'file.pdf',
'file_size' => size.to_s
}
}
end
let(:download_options) { spec['0'] }

before do
allow(response).to receive(:code).and_return(403)
allow(response).to receive(:body).and_return(error)
allow(HTTParty).to receive(:get).and_return(response)
end
it 'raises an exception' do
expect { retriever.download(download_options) }.to raise_error(BrowseEverything::DownloadError, /BrowseEverything::Retriever: Failed to download/)
end
Expand Down Expand Up @@ -198,4 +167,34 @@
end
end

# Exercises the class-level Retriever.can_retrieve? probe against two URLs:
# one expected to be retrievable and one expected to be refused.
# The matching recorded interactions live in
# spec/fixtures/vcr_cassettes/retriever.yml (a 206 response for
# can_retrieve.pdf and a 403 response for cannot_retrieve.pdf).
# NOTE(review): the cassette is presumably replayed by setup in the enclosing
# describe block, which is not visible here - confirm.
context '.can_retrieve?' do
# NOTE(review): `expiry_time` and `spec` are not referenced by the examples
# in this context (they only pass `url` to can_retrieve?); they appear to
# mirror the download-spec shape used by other contexts - confirm whether
# they are still needed.
let(:expiry_time) { (Time.current + 3600).xmlschema }
let(:spec) do
{
'0' => {
'url' => url,
'auth_header' => { 'Authorization' => 'Bearer ya29.kQCEAHj1bwFXr2AuGQJmSGRWQXpacmmYZs4kzCiXns3d6H1ZpIDWmdM8' },
'expires' => expiry_time,
'file_name' => 'file.pdf',
'file_size' => '64134'
}
}
end

# The recorded response for this URL is 206 Partial Content.
context 'can retrieve' do
let(:url) { 'https://retrieve.cloud.example.com/some/dir/can_retrieve.pdf' }

it 'says it can' do
expect(described_class.can_retrieve?(url)).to be_truthy
end
end

# The recorded response for this URL is a 403.
context 'cannot retrieve' do
let(:url) { 'https://retrieve.cloud.example.com/some/dir/cannot_retrieve.pdf' }

it 'says it cannot' do
expect(described_class.can_retrieve?(url)).to be_falsey
end
end
end
end

0 comments on commit 3bfdbb7

Please sign in to comment.