From 7c00292214d2af651944fa867389c30aabe2538b Mon Sep 17 00:00:00 2001 From: Robert Schultheis Date: Fri, 14 Jun 2019 12:08:52 -0600 Subject: [PATCH 1/4] MVP script to sync github advisory data in --- Gemfile | 3 +- Gemfile.lock | 6 +- Rakefile | 6 + lib/github_advisory_sync.rb | 219 ++++++++++++++++++++++++++++++++++++ 4 files changed, 232 insertions(+), 2 deletions(-) create mode 100644 lib/github_advisory_sync.rb diff --git a/Gemfile b/Gemfile index 241af445bb..fc49da05db 100644 --- a/Gemfile +++ b/Gemfile @@ -1,7 +1,8 @@ source 'https://rubygems.org' -gem 'rspec' +gem 'faraday' gem 'rake' +gem 'rspec' group :development do gem 'pry' diff --git a/Gemfile.lock b/Gemfile.lock index 95982d9e63..77df35ba7d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -3,8 +3,11 @@ GEM specs: coderay (1.1.2) diff-lcs (1.3) + faraday (0.15.4) + multipart-post (>= 1.2, < 3) method_source (0.9.0) mini_portile2 (2.4.0) + multipart-post (2.1.1) nokogiri (1.10.3) mini_portile2 (~> 2.4.0) pry (0.11.3) @@ -29,10 +32,11 @@ PLATFORMS ruby DEPENDENCIES + faraday nokogiri pry rake rspec BUNDLED WITH - 1.17.1 + 1.17.3 diff --git a/Rakefile b/Rakefile index bc9aa3e980..1dc35990f3 100644 --- a/Rakefile +++ b/Rakefile @@ -22,5 +22,11 @@ namespace :lint do end end +# TODO add description +task :sync_github_advisories do + require_relative "lib/github_advisory_sync" + GitHub::GitHubAdvisorySync.sync +end + task :lint => ['lint:yaml', 'lint:cve'] task :default => :lint diff --git a/lib/github_advisory_sync.rb b/lib/github_advisory_sync.rb new file mode 100644 index 0000000000..af0353d482 --- /dev/null +++ b/lib/github_advisory_sync.rb @@ -0,0 +1,219 @@ +require "faraday" +require "json" +require "yaml" + +module GitHub + class GitHubAdvisorySync + def self.sync + gh_api_client = GraphQLAPIClient.new + gh_advisories = gh_api_client.retrieve_all_rubygem_publishable_advisories + + files_written = [] + gh_advisories.each do |advisory| + files_written += advisory.write_files + end + files_written + end + 
end + + class GraphQLAPIClient + GITHUB_API_URL = "https://api.github.com/graphql" + + GitHubApiTokenMissingError = Class.new(StandardError) + + # return a lazy initialized connection to github api + def github_api(adapter = :net_http) + @faraday_connection ||= begin + puts "Initializing GitHub API connection to URL: #{GITHUB_API_URL}" + Faraday.new do |conn_builder| + conn_builder.adapter adapter + conn_builder.headers = { + "User-Agent" => "rubysec/ruby-advisory-db rubysec sync script", + "Content-Type" => "application/json", + "Authorization" => "token #{github_api_token}" + } + end + end + @faraday_connection + end + + # An error class which gets raised when a GraphQL request fails + GitHubGraphQLAPIError = Class.new(StandardError) + + # all interactions with the API go through this method to standardize + # error checking and how queries and requests are formed + def github_graphql_query(graphql_query_name, graphql_variables = {}) + graphql_query_str = GraphQLQueries.const_get graphql_query_name + graphql_body = JSON.generate query: graphql_query_str, + variables: graphql_variables + puts "Executing GraphQL request: #{graphql_query_name}. Request variables:\n#{graphql_variables.to_yaml}\n" + faraday_response = github_api.post do |req| + req.url GITHUB_API_URL + req.body = graphql_body + end + puts "Got response code: #{faraday_response.status}" + # puts "Response body string:\n---#{faraday_response.body}\n---" + if faraday_response.status != 200 + raise(GitHubGraphQLAPIError, "GitHub GraphQL request to #{faraday_response.env.url} failed: #{faraday_response.body}") + end + body_obj = JSON.parse faraday_response.body + if body_obj["errors"] + raise(GitHubGraphQLAPIError, body_obj["errors"].map { |e| e["message"] }.join(", ")) + end + # puts "Query was successful. 
Response body:\n#{JSON.pretty_generate(body_obj)}\n" + body_obj + end + + def retrieve_all_github_advisories(max_pages = 50, page_size = 100) # up to 5K + all_advisories = [] + variables = { "first" => page_size } + max_pages.times do |page_num| + puts "Getting page #{page_num + 1} of GitHub Advisories" + page = github_graphql_query(:GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY, variables) + advisories_this_page = page["data"]["securityAdvisories"]["nodes"] + puts "found #{advisories_this_page.length} advisories on page #{page_num}" + all_advisories += advisories_this_page + break unless page["data"]["securityAdvisories"]["pageInfo"]["hasNextPage"] == true + variables["after"] = page["data"]["securityAdvisories"]["pageInfo"]["endCursor"] + end + puts "Retrieved #{all_advisories.length} Advisories from GitHub API" + + all_advisories.map do |advisory_graphql_obj| + GitHubAdvisory.new github_advisory_graphql_object: advisory_graphql_obj + end + end + + def retrieve_all_rubygem_publishable_advisories + all_advisories = retrieve_all_github_advisories + # remove withdrawn advisories, and remove those where there are no vulnerabilities. + all_advisories.reject { |advisory| advisory.withdrawn? } + .select { |advisory| advisory.has_ruby_vulnerabilities? } + end + + module GraphQLQueries + GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY = <<-GRAPHQL.freeze + query($first: Int, $after: String) { + securityAdvisories(first: $first, after: $after) { + pageInfo { + endCursor + hasNextPage + hasPreviousPage + startCursor + } + nodes { + identifiers { + type + value + } + summary + description + severity + references { + url + } + publishedAt + withdrawnAt + vulnerabilities(ecosystem:RUBYGEMS, first: 10) { + nodes { + package { + name + ecosystem + } + vulnerableVersionRange + firstPatchedVersion { + identifier + } + } + } + } + } + } + GRAPHQL + end + + private + + def github_api_token + unless ENV["GH_API_TOKEN"] + raise GitHubApiTokenMissingError, "Unable to make API requests. 
Must define 'GH_API_TOKEN' environment variable." + end + ENV["GH_API_TOKEN"] + end + end + + class GitHubAdvisory + attr_reader :github_advisory_graphql_object + def initialize(github_advisory_graphql_object:) + @github_advisory_graphql_object = github_advisory_graphql_object + end + + def cve_id + cve_id_obj = github_advisory_graphql_object["identifiers"].find{ |id| id["type"] == "CVE" } + return nil unless cve_id_obj + cve_id_obj["value"] + end + + def package_names + github_advisory_graphql_object["vulnerabilities"]["nodes"].map{|v| v["package"]["name"]}.uniq + end + + def rubysec_filenames + package_names.map do |package_name| + File.join("gems", package_name, "#{cve_id}.yml") + end + end + + def withdrawn? + !github_advisory_graphql_object["withdrawnAt"].nil? + end + + def external_reference + github_advisory_graphql_object["references"].first["url"] + end + + def vulnerabilities + github_advisory_graphql_object["vulnerabilities"]["nodes"] + end + + def has_ruby_vulnerabilities? + vulnerabilities.any? do |vuln| + vuln["package"]["ecosystem"] == "RUBYGEMS" + end + end + + def some_rubysec_files_do_not_exist? + rubysec_filenames.any?{|filename| !File.exist?(filename) } + end + + def write_files + return [] unless cve_id + return [] unless some_rubysec_files_do_not_exist? 
+ + files_written = [] + vulnerabilities.each do |vulnerability| + filename_to_write = File.join("gems", vulnerability["package"]["name"], "#{cve_id}.yml") + next if File.exist?(filename_to_write) + + data = { + gem: vulnerability["package"]["name"], + cve: cve_id[4..20], + date: github_advisory_graphql_object["publishedAt"], + url: external_reference, + title: github_advisory_graphql_object["summary"], + description: github_advisory_graphql_object["description"], + } + + dir_to_write = File.dirname(filename_to_write) + Dir.mkdir dir_to_write unless Dir.exist?(dir_to_write) + File.open(filename_to_write, "w") do |file| + file.write data.to_yaml + end + puts "Wrote: #{filename_to_write}" + files_written << filename_to_write + end + + files_written + end + + end +end From b9cb5629629eea870935d0e8964ab47ef1f7b314 Mon Sep 17 00:00:00 2001 From: Robert Schultheis Date: Tue, 18 Jun 2019 11:22:00 -0600 Subject: [PATCH 2/4] github sync script updates --- lib/github_advisory_sync.rb | 72 +++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 14 deletions(-) diff --git a/lib/github_advisory_sync.rb b/lib/github_advisory_sync.rb index af0353d482..ec6eb9548f 100644 --- a/lib/github_advisory_sync.rb +++ b/lib/github_advisory_sync.rb @@ -1,12 +1,26 @@ require "faraday" require "json" require "yaml" +require "open-uri" module GitHub class GitHubAdvisorySync - def self.sync - gh_api_client = GraphQLAPIClient.new - gh_advisories = gh_api_client.retrieve_all_rubygem_publishable_advisories + + # Sync makes sure there are rubysec advisories for all GitHub advisories + # It writes a set of yaml files, one for each GitHub Advisory that + # is not already present in this repo + # + # The min_year argument specifies the earliest year CVE to sync + # There are many old CVEs in the GitHub advisory dataset that are not in here + # It is more important to sync the newer ones, so this allows the user to + # control how old of CVEs the sync should pull over + def 
self.sync(min_year: 2018) + gh_advisories = GraphQLAPIClient.new.retrieve_all_rubygem_publishable_advisories + + gh_advisories.select! do |advisory| + _, cve_year = advisory.cve_id.match(/^CVE-(\d+)-\d+$/).to_a + cve_year.to_i >= min_year + end files_written = [] gh_advisories.each do |advisory| @@ -52,7 +66,6 @@ def github_graphql_query(graphql_query_name, graphql_variables = {}) req.body = graphql_body end puts "Got response code: #{faraday_response.status}" - # puts "Response body string:\n---#{faraday_response.body}\n---" if faraday_response.status != 200 raise(GitHubGraphQLAPIError, "GitHub GraphQL request to #{faraday_response.env.url} failed: #{faraday_response.body}") end @@ -60,11 +73,10 @@ def github_graphql_query(graphql_query_name, graphql_variables = {}) if body_obj["errors"] raise(GitHubGraphQLAPIError, body_obj["errors"].map { |e| e["message"] }.join(", ")) end - # puts "Query was successful. Response body:\n#{JSON.pretty_generate(body_obj)}\n" body_obj end - def retrieve_all_github_advisories(max_pages = 50, page_size = 100) # up to 5K + def retrieve_all_github_advisories(max_pages = 10, page_size = 100) all_advisories = [] variables = { "first" => page_size } max_pages.times do |page_num| @@ -85,7 +97,8 @@ def retrieve_all_github_advisories(max_pages = 50, page_size = 100) # up to 5K def retrieve_all_rubygem_publishable_advisories all_advisories = retrieve_all_github_advisories - # remove withdrawn advisories, and remove those where there are no vulnerabilities. + # remove withdrawn advisories, + # and remove those where there are no vulnerabilities for ruby all_advisories.reject { |advisory| advisory.withdrawn? } .select { |advisory| advisory.has_ruby_vulnerabilities? 
} end @@ -142,7 +155,9 @@ def github_api_token end class GitHubAdvisory + attr_reader :github_advisory_graphql_object + def initialize(github_advisory_graphql_object:) @github_advisory_graphql_object = github_advisory_graphql_object end @@ -185,6 +200,12 @@ def some_rubysec_files_do_not_exist? rubysec_filenames.any?{|filename| !File.exist?(filename) } end + + def cveproject_link + _, year, suffixnum = cve_id.match(/^CVE-(\d+)-(\d+)$/).to_a + "https://raw.githubusercontent.com/CVEProject/cvelist/master/#{year}/#{suffixnum.sub(/...$/, 'xxx')}/#{cve_id}.json" + end + def write_files return [] unless cve_id return [] unless some_rubysec_files_do_not_exist? @@ -195,18 +216,42 @@ def write_files next if File.exist?(filename_to_write) data = { - gem: vulnerability["package"]["name"], - cve: cve_id[4..20], - date: github_advisory_graphql_object["publishedAt"], - url: external_reference, - title: github_advisory_graphql_object["summary"], - description: github_advisory_graphql_object["description"], + "gem" => vulnerability["package"]["name"], + "cve" => cve_id[4..20], + "date" => github_advisory_graphql_object["publishedAt"], + "url" => external_reference, + "title" => github_advisory_graphql_object["summary"], + "description" => github_advisory_graphql_object["description"], + "cvss_v3" => "", + "patched_versions" => [ "" ], + "unaffected_versions" => [ "" ] } dir_to_write = File.dirname(filename_to_write) Dir.mkdir dir_to_write unless Dir.exist?(dir_to_write) File.open(filename_to_write, "w") do |file| + # create an automatically generated advisory yaml file file.write data.to_yaml + + # The data we just wrote is incomplete, + # and therefore should not be committed as is + # We can not directly translate from GitHub to rubysec advisory format + # + # The patched_versions field is not exactly available. 
+ # - GitHub has a first_patched_version field, + # but rubysec advisory needs a ruby version spec + # + # The unaffected_versions field is similarly not directly available + # This optional field must be inferred from the vulnerableVersionRange + # + # To help write those fields, we put all the github data below. + # + # The second block of yaml in a .yaml file is ignored (after the second "---" line) + # This effectively makes this data a large comment + # Still it should be removed before the data goes into rubysec + file.write "\n\n# GitHub advisory data below - **Remove this data before committing**\n" + file.write "# Use this data to write patched_versions (and potentially unaffected_versions) above\n" + file.write github_advisory_graphql_object.to_yaml end puts "Wrote: #{filename_to_write}" files_written << filename_to_write @@ -214,6 +259,5 @@ def write_files files_written end - end end From 6a0e1fdef590edebb48bd4d2de9218017f13ae16 Mon Sep 17 00:00:00 2001 From: Robert Schultheis Date: Wed, 19 Jun 2019 08:28:13 -0600 Subject: [PATCH 3/4] minor improvements to github sync script --- Rakefile | 2 +- lib/github_advisory_sync.rb | 36 +++++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Rakefile b/Rakefile index 1dc35990f3..68f5e50203 100644 --- a/Rakefile +++ b/Rakefile @@ -22,7 +22,7 @@ namespace :lint do end end -# TODO add description +desc "Sync GitHub RubyGem Advisories into this project" task :sync_github_advisories do require_relative "lib/github_advisory_sync" GitHub::GitHubAdvisorySync.sync diff --git a/lib/github_advisory_sync.rb b/lib/github_advisory_sync.rb index ec6eb9548f..c905c1174b 100644 --- a/lib/github_advisory_sync.rb +++ b/lib/github_advisory_sync.rb @@ -17,6 +17,11 @@ class GitHubAdvisorySync def self.sync(min_year: 2018) gh_advisories = GraphQLAPIClient.new.retrieve_all_rubygem_publishable_advisories + # filter out advisories with a CVE year that is before the min_year + # The script will 
write many files for years 2013, 2014 and other earlier years + # Since older CVEs are not as interesting, I am leaving it up to the caller to + # decide how far back to go. The script is really designed to keep data synced + # going forward gh_advisories.select! do |advisory| _, cve_year = advisory.cve_id.match(/^CVE-(\d+)-\d+$/).to_a cve_year.to_i >= min_year @@ -26,6 +31,14 @@ def self.sync(min_year: 2018) gh_advisories.each do |advisory| files_written += advisory.write_files end + + puts "\nSync completed" + if files_written.empty? + puts "Nothing to sync today! All CVEs after #{min_year} are already present" + else + puts "Wrote these files:\n#{files_written.to_yaml}" + end + files_written end end @@ -83,7 +96,6 @@ def retrieve_all_github_advisories(max_pages = 10, page_size = 100) puts "Getting page #{page_num + 1} of GitHub Advisories" page = github_graphql_query(:GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY, variables) advisories_this_page = page["data"]["securityAdvisories"]["nodes"] - puts "found #{advisories_this_page.length} advisories on page #{page_num}" all_advisories += advisories_this_page break unless page["data"]["securityAdvisories"]["pageInfo"]["hasNextPage"] == true variables["after"] = page["data"]["securityAdvisories"]["pageInfo"]["endCursor"] @@ -162,12 +174,24 @@ def initialize(github_advisory_graphql_object:) @github_advisory_graphql_object = github_advisory_graphql_object end + # extract the CVE identifier from the GitHub Advisory identifier list def cve_id - cve_id_obj = github_advisory_graphql_object["identifiers"].find{ |id| id["type"] == "CVE" } + identifier_list = github_advisory_graphql_object["identifiers"] + cve_id_obj = identifier_list.find { |id| id["type"] == "CVE" } return nil unless cve_id_obj + cve_id_obj["value"] end + # return the published date as a Date (e.g. 2019-03-21), or nil when absent. 
+ def published_day + return nil unless github_advisory_graphql_object["publishedAt"] + + pub_date = Date.parse(github_advisory_graphql_object["publishedAt"]) + # pub_date.strftime("%Y-%m-%d") + pub_date + end + def package_names github_advisory_graphql_object["vulnerabilities"]["nodes"].map{|v| v["package"]["name"]}.uniq end @@ -200,12 +224,6 @@ def some_rubysec_files_do_not_exist? rubysec_filenames.any?{|filename| !File.exist?(filename) } end - - def cveproject_link - _, year, suffixnum = cve_id.match(/^CVE-(\d+)-(\d+)$/).to_a - "https://raw.githubusercontent.com/CVEProject/cvelist/master/#{year}/#{suffixnum.sub(/...$/, 'xxx')}/#{cve_id}.json" - end - def write_files return [] unless cve_id return [] unless some_rubysec_files_do_not_exist? @@ -218,7 +236,7 @@ def write_files data = { "gem" => vulnerability["package"]["name"], "cve" => cve_id[4..20], - "date" => github_advisory_graphql_object["publishedAt"], + "date" => published_day, "url" => external_reference, "title" => github_advisory_graphql_object["summary"], "description" => github_advisory_graphql_object["description"], From 97a727e9b51235a65c7fa11caefe22d77583b189 Mon Sep 17 00:00:00 2001 From: Robert Schultheis Date: Wed, 19 Jun 2019 08:42:57 -0600 Subject: [PATCH 4/4] Readme explains how to run GitHub Advisory sync --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index f0d82f65dc..f021c5da00 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,35 @@ bundle install bundle exec rspec ``` +### GitHub Advisory Sync + +There is a script that will create initial yaml files for RubyGem advisories which +are in the [GitHub Security Advisory API](https://developer.github.com/v4/object/securityadvisory/), +but are not already in this dataset. This script can be periodically run to ensure +this repo has all the data that is present in the GitHub Advisory data. + +The GitHub Advisory API requires a token to access it. 
+- It can be a completely scopeless token (recommended); it does not require any permissions at all. +- Get yours at https://github.com/settings/tokens + +To run the GitHub Advisory sync, start by executing the rake task: +``` +GH_API_TOKEN=<your GitHub API token> bundle exec rake sync_github_advisories +``` + +- The rake task will write yaml files for any missing advisories. +- Those files must be further edited. + - Fill in the `cvss_v3` field by following the CVE link and getting it from the page + - Fill in the `patched_versions` field, using the comments at the bottom of the file + - Fill in `unaffected_versions` (optional) if there are unaffected versions + - Delete the GitHub data at the bottom of the yaml file + - Double check all the data, commit it, and make a PR + - *Unfortunately, the GitHub Advisory data is structured opposite to RubySec: + GitHub identifies version ranges which are vulnerable, RubySec identifies + version ranges which are not vulnerable. This is why some manual + work to translate is needed.* + + ## Credits Please see [CONTRIBUTORS.md].