Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reading: Update GoogleSheetsFetcher to recur into folders and batch calls #2623

Merged
merged 4 commits into from Sep 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
115 changes: 89 additions & 26 deletions app/importers/reading/google_sheets_fetcher.rb
Expand Up @@ -27,18 +27,42 @@
require 'csv'

class GoogleSheetsFetcher
# Creates a fetcher.
#
# options - Hash of settings. The only key read here is :log; its value is
#           stored in @log and defaults to nil when the key is absent.
def initialize(options = {})
  log_sink = options.fetch(:log) { nil }
  @log = log_sink
end

# returns [Tab]
def get_tabs_from_folder(folder_id)
sheet_ids = get_sheet_ids(folder_id)
# Optionally recurses into sub-folders when called with `recursive: true`.
# Note that this may hit API quotas, and this class
# isn't optimized to batch HTTP requests.
def get_tabs_from_folder(folder_id, options = {})
log "get_tabs_from_folder(#{folder_id})"
sheets = list_sheets(folder_id)

tabs = []
sheets.files.each do |file|
tabs += get_tabs_from_sheet(file.id)
end

sheet_ids.files.flat_map do |file|
get_tabs_from_sheet(file.id)
# Google Drive isn't actually a folder system, so this
# recurs manually since the number of files is small.
# See https://stackoverflow.com/questions/41741520/how-do-i-search-sub-folders-and-sub-sub-folders-in-google-drive
# for more on how Drive works, or the `batch` method in
# the Ruby API for alternatives.
if options.fetch(:recursive, false)
sub_folders = list_folders(folder_id)
sub_folders.files.each do |sub_folder|
tabs += get_tabs_from_folder(sub_folder.id, options)
end
end

tabs
end

# returns [Tab]
# Fetches the tabs of a single spreadsheet. The value returned is whatever the
# final call, download_tab_csvs_batched(sheet_id), returns.
def get_tabs_from_sheet(sheet_id)
# NOTE(review): this text comes from a rendered diff without +/- markers. The
# next line looks like the pre-change call that the batched version replaced;
# its result is discarded here — confirm against the real file.
download_tab_csvs(sheet_id)
# Trace the call through the optional logger before fetching.
log "get_tabs_from_sheet(#{sheet_id})"
download_tab_csvs_batched(sheet_id)
end

private
Expand Down Expand Up @@ -67,34 +91,47 @@ def application_name
'Student Insights, GoogleSheetsFetcher'
end

# No real escaping for building this query
def get_sheet_ids(unsafe_folder_id)
# initialize drive API
drive_service = Google::Apis::DriveV3::DriveService.new
drive_service.client_options.application_name = @application_name
drive_service.authorization = check_authorization()
# See https://developers.google.com/drive/api/v3/search-files
# for info on how queries work.
# Lists the spreadsheets whose parent is the given Drive folder.
#
# unsafe_folder_id - folder id supplied by the caller. It is run through
#                    verify_safe_folder_id! before being interpolated into
#                    the search query, because the query is built by hand.
#
# Returns the result of drive_service.list_files, requesting only the `id`
# and `name` fields of each file.
def list_sheets(unsafe_folder_id)
  log " list_sheets(#{unsafe_folder_id})"
  safe_id = verify_safe_folder_id!(unsafe_folder_id)
  drive_service.list_files(
    q: "'#{safe_id}' in parents and mimeType = 'application/vnd.google-apps.spreadsheet'",
    fields: 'files(id, name)'
  )
end

# minimal check for query injection
raise "invalid unsafe_folder_id: #{unsafe_folder_id}" if /[^a-zA-Z0-9\-_]/.match(unsafe_folder_id).present?
q = "'#{unsafe_folder_id}' in parents"
# Lists the folders whose parent is the given Drive folder.
#
# unsafe_folder_id - folder id supplied by the caller. It is run through
#                    verify_safe_folder_id! before being interpolated into
#                    the search query, because the query is built by hand.
#
# Returns the result of drive_service.list_files, requesting only the `id`
# and `name` fields of each file.
def list_folders(unsafe_folder_id)
  log " list_folders(#{unsafe_folder_id})"
  safe_id = verify_safe_folder_id!(unsafe_folder_id)
  drive_service.list_files(
    q: "'#{safe_id}' in parents and mimeType = 'application/vnd.google-apps.folder'",
    fields: 'files(id, name)'
  )
end

def download_tab_csvs(sheet_id)
#Initialize sheets API
sheet_service = Google::Apis::SheetsV4::SheetsService.new
sheet_service.client_options.application_name = @application_name
sheet_service.authorization = check_authorization()
# minimal check for query injection
# Guards the hand-built Drive search query against injection by accepting
# only ids made of ASCII letters, digits, `-` and `_`.
#
# unsafe_folder_id - candidate Drive folder id.
#
# Returns the id unchanged when it is valid.
# Raises RuntimeError ("invalid unsafe_folder_id: ...") otherwise. nil and the
# empty string are rejected as well, since interpolating either would yield a
# query with an empty parent id.
def verify_safe_folder_id!(unsafe_folder_id)
  # Allowlist anchored with \A and \z so the whole value must match, and `+`
  # so at least one character is required. Regexp#match? returns false for
  # nil and needs no ActiveSupport helpers.
  unless /\A[a-zA-Z0-9\-_]+\z/.match?(unsafe_folder_id)
    raise "invalid unsafe_folder_id: #{unsafe_folder_id}"
  end

  unsafe_folder_id
end

# Returns [Tab] with CSV data for all sheets
def download_tab_csvs_batched(sheet_id)
# To manage quotas, do this in two API calls, one for the Spreadsheet to get all metadata,
# then a second batch request to get the contents of each tab.
# See https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets.values/batchGet
# and https://github.com/googleapis/google-api-ruby-client/blob/cb0b81f79451b8dee9df07eb248110b3e6045916/generated/google/apis/sheets_v4/service.rb
spreadsheet = sheets_service.get_spreadsheet(sheet_id)
sheet_titles = spreadsheet.sheets.map(&:properties).map(&:title)
batch_responses = sheets_service.batch_get_spreadsheet_values(sheet_id, ranges: sheet_titles)
log " download_tab_csvs_batched(#{sheet_id}), found #{sheet_titles.size} sheets"

# Get values from sheets indexed by sheet name
# iterate through to zip them together
tabs = []
spreadsheet = sheet_service.get_spreadsheet(sheet_id)
spreadsheet.sheets.each_with_object({}) do |sheet, hash| # each tab in the spreadsheet
spreadsheet.sheets.each_with_index do |sheet, sheet_index|
# If it's a new empty sheet, `#values` will return nil instead of [[]], so
# handle that as a special case.
sheet_values = batch_responses.value_ranges[sheet_index].values || [[]]
tab_csv = CSV.generate do |csv|
sheet_values = sheet_service.get_spreadsheet_values(sheet_id, sheet.properties.title).values
sheet_values.each do |row|
csv << row
end
sheet_values.each {|row| csv << row }
end
tabs << Tab.new({
spreadsheet_id: spreadsheet.spreadsheet_id,
Expand All @@ -108,6 +145,32 @@ def download_tab_csvs(sheet_id)
tabs
end

# Returns the Drive API client, building and caching it in @drive_service
# the first time it is needed.
#
# NOTE(review): the client is configured from @application_name, but the
# visible initialize never assigns that instance variable while an
# `application_name` method is defined nearby — confirm which was intended.
def drive_service
  return @drive_service unless @drive_service.nil?

  @drive_service = Google::Apis::DriveV3::DriveService.new.tap do |service|
    service.client_options.application_name = @application_name
    service.authorization = check_authorization()
  end
end

# Returns the Sheets API client, building and caching it in @sheets_service
# the first time it is needed.
#
# NOTE(review): the client is configured from @application_name, but the
# visible initialize never assigns that instance variable while an
# `application_name` method is defined nearby — confirm which was intended.
def sheets_service
  return @sheets_service unless @sheets_service.nil?

  @sheets_service = Google::Apis::SheetsV4::SheetsService.new.tap do |service|
    service.client_options.application_name = @application_name
    service.authorization = check_authorization()
  end
end

# Writes a progress message to @log, prefixed with "GoogleSheetsFetcher: ".
#
# msg - a String is written as-is; any other object is rendered with
#       JSON.pretty_generate first.
#
# Does nothing and returns nil when @log is nil.
def log(msg)
  return if @log.nil?

  rendered = msg.instance_of?(String) ? msg : JSON.pretty_generate(msg)
  @log.puts("GoogleSheetsFetcher: #{rendered}")
end

# A tab of a spreadsheet
# Value object for one tab: identifiers and names for the spreadsheet and the
# tab, plus the tab's contents as a CSV string in `tab_csv`. Built with
# keyword arguments only.
#
# Assigned directly from Struct.new rather than subclassing it, which avoids
# an extra anonymous superclass and the "superclass mismatch" error raised
# when a `class X < Struct.new(...)` definition is evaluated twice.
Tab = Struct.new(
  :spreadsheet_id,
  :spreadsheet_name,
  :spreadsheet_url,
  :tab_id,
  :tab_name,
  :tab_csv,
  keyword_init: true
)
Expand Down
4 changes: 2 additions & 2 deletions app/lib/reading_validator.rb
Expand Up @@ -2,7 +2,7 @@ class ReadingValidator
def self.debug_error_messages
checks = {}
validator = ReadingValidator.new
keys.each do |key|
ReadingBenchmarkDataPoint::VALID_BENCHMARK_ASSESSMENT_KEYS.each do |key|
ds = ReadingBenchmarkDataPoint.where(benchmark_assessment_key: key)
ds.each do |d|
msg = validator.validate_json_meaning(key, d.json['value'])
Expand All @@ -16,7 +16,7 @@ def self.debug_error_messages

def self.debug_float_range
ranges = {}
keys.map do |key|
ReadingBenchmarkDataPoint::VALID_BENCHMARK_ASSESSMENT_KEYS.map do |key|
ds = ReadingBenchmarkDataPoint.where(benchmark_assessment_key: key)
values = ds.map {|d| d.json['value'].try(:to_f) || nil }.compact
ranges[key] = { min: values.min, max: values.max }
Expand Down