diff --git a/Gemfile b/Gemfile index 241af445bb..fc49da05db 100644 --- a/Gemfile +++ b/Gemfile @@ -1,7 +1,8 @@ source 'https://rubygems.org' -gem 'rspec' +gem 'faraday' gem 'rake' +gem 'rspec' group :development do gem 'pry' diff --git a/Gemfile.lock b/Gemfile.lock index 95982d9e63..77df35ba7d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -3,8 +3,11 @@ GEM specs: coderay (1.1.2) diff-lcs (1.3) + faraday (0.15.4) + multipart-post (>= 1.2, < 3) method_source (0.9.0) mini_portile2 (2.4.0) + multipart-post (2.1.1) nokogiri (1.10.3) mini_portile2 (~> 2.4.0) pry (0.11.3) @@ -29,10 +32,11 @@ PLATFORMS ruby DEPENDENCIES + faraday nokogiri pry rake rspec BUNDLED WITH - 1.17.1 + 1.17.3 diff --git a/README.md b/README.md index f0d82f65dc..f021c5da00 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,35 @@ bundle install bundle exec rspec ``` +### GitHub Advisory Sync + +There is a script that will create initial yaml files for RubyGem advisories which +are in the [GitHub Security Advisory API](https://developer.github.com/v4/object/securityadvisory/), +but are not already in this dataset. This script can be periodically run to ensure +this repo has all the data that is present in the GitHub Advisory data. + +The GitHub Advisory API requires a token to access it. +- It can be a completely scopeless token (recommended); it does not require any permissions at all. +- Get yours at https://github.com/settings/tokens + +To run the GitHub Advisory sync, start by executing the rake task: +``` +GH_API_TOKEN=<your GitHub API token> bundle exec rake sync_github_advisories +``` + +- The rake task will write yaml files for any missing advisories. +- Those files must be further edited. 
require "date"
require "fileutils"
require "json"
require "open-uri"
require "yaml"

module GitHub
  # Coordinates the sync between the GitHub Security Advisory API and the
  # rubysec advisory files stored under gems/<gem name>/<CVE id>.yml.
  class GitHubAdvisorySync
    # Matches a complete CVE identifier and captures the year.
    # \A and \z anchor the whole string (^ and $ would only anchor a line).
    CVE_ID_PATTERN = /\ACVE-(\d+)-\d+\z/

    # Sync makes sure there are rubysec advisories for all GitHub advisories.
    # It writes a set of yaml files, one for each GitHub Advisory that
    # is not already present in this repo.
    #
    # min_year - the earliest CVE year to sync. There are many old CVEs in the
    #            GitHub advisory dataset that are not in this repo; syncing the
    #            newer ones matters most, so the caller decides how far back
    #            to go. The script is designed to keep data synced going
    #            forward.
    #
    # Returns the list of file names that were written (possibly empty).
    def self.sync(min_year: 2018)
      gh_advisories = GraphQLAPIClient.new.retrieve_all_rubygem_publishable_advisories

      # Drop advisories whose CVE year is before min_year. Advisories with no
      # (or an unparseable) CVE id count as year 0 and are dropped as well:
      # they cannot be written anyway, because the file name is the CVE id.
      recent_advisories = gh_advisories.select do |advisory|
        cve_year(advisory) >= min_year
      end

      files_written = recent_advisories.flat_map(&:write_files)

      puts "\nSync completed"
      if files_written.empty?
        puts "Nothing to sync today! All CVEs after #{min_year} are already present"
      else
        puts "Wrote these files:\n#{files_written.to_yaml}"
      end

      files_written
    end

    # Year component of the advisory's CVE id, or 0 when the advisory has no
    # CVE id or the id does not look like CVE-<year>-<number>.
    def self.cve_year(advisory)
      match = CVE_ID_PATTERN.match(advisory.cve_id.to_s)
      match ? match[1].to_i : 0
    end
    private_class_method :cve_year
  end

  # Thin client for the GitHub GraphQL API, limited to what the sync needs.
  class GraphQLAPIClient
    GITHUB_API_URL = "https://api.github.com/graphql"

    # Raised when the GH_API_TOKEN environment variable is missing or blank.
    GitHubApiTokenMissingError = Class.new(StandardError)

    # An error class which gets raised when a GraphQL request fails
    GitHubGraphQLAPIError = Class.new(StandardError)

    # Return a lazily initialized (and memoized) connection to the GitHub API.
    # The adapter argument only has an effect on the first call.
    #
    # Raises GitHubApiTokenMissingError when no API token is configured.
    def github_api(adapter = :net_http)
      @faraday_connection ||= begin
        # Loaded lazily on purpose: faraday is only needed once a request is
        # actually made, so the rest of this file works without the gem.
        require "faraday"
        puts "Initializing GitHub API connection to URL: #{GITHUB_API_URL}"
        Faraday.new do |conn_builder|
          conn_builder.adapter adapter
          conn_builder.headers = {
            "User-Agent" => "rubysec/ruby-advisory-db rubysec sync script",
            "Content-Type" => "application/json",
            "Authorization" => "token #{github_api_token}"
          }
        end
      end
    end

    # All interactions with the API go through this method to standardize
    # error checking and how queries and requests are formed.
    #
    # graphql_query_name - name of a constant in GraphQLQueries
    # graphql_variables  - hash of variables sent along with the query
    #
    # Returns the parsed JSON response body.
    # Raises GitHubGraphQLAPIError on a non-200 response or when the response
    # body carries GraphQL "errors".
    def github_graphql_query(graphql_query_name, graphql_variables = {})
      graphql_query_str = GraphQLQueries.const_get graphql_query_name
      graphql_body = JSON.generate query: graphql_query_str,
                                   variables: graphql_variables
      puts "Executing GraphQL request: #{graphql_query_name}. Request variables:\n#{graphql_variables.to_yaml}\n"
      faraday_response = github_api.post do |req|
        req.url GITHUB_API_URL
        req.body = graphql_body
      end
      puts "Got response code: #{faraday_response.status}"
      if faraday_response.status != 200
        raise(GitHubGraphQLAPIError, "GitHub GraphQL request to #{faraday_response.env.url} failed: #{faraday_response.body}")
      end
      body_obj = JSON.parse faraday_response.body
      if body_obj["errors"]
        raise(GitHubGraphQLAPIError, body_obj["errors"].map { |e| e["message"] }.join(", "))
      end
      body_obj
    end

    # Page through the securityAdvisories connection and wrap every node in a
    # GitHubAdvisory.
    #
    # max_pages - upper bound on the number of requests
    # page_size - number of advisories requested per page
    #
    # When max_pages is reached while the API still reports more pages, the
    # result is incomplete; a warning is printed to stderr in that case.
    def retrieve_all_github_advisories(max_pages = 10, page_size = 100)
      all_advisories = []
      variables = { "first" => page_size }
      more_pages = false
      max_pages.times do |page_num|
        puts "Getting page #{page_num + 1} of GitHub Advisories"
        page = github_graphql_query(:GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY, variables)
        connection = page["data"]["securityAdvisories"]
        all_advisories.concat(connection["nodes"])
        page_info = connection["pageInfo"]
        more_pages = page_info["hasNextPage"] == true
        break unless more_pages
        variables["after"] = page_info["endCursor"]
      end
      if more_pages
        warn "Stopped after #{max_pages} pages, but GitHub reports more advisories; " \
             "increase max_pages to retrieve them all"
      end
      puts "Retrieved #{all_advisories.length} Advisories from GitHub API"

      all_advisories.map do |advisory_graphql_obj|
        GitHubAdvisory.new github_advisory_graphql_object: advisory_graphql_obj
      end
    end

    # All advisories that are not withdrawn and that have at least one
    # vulnerability in the RUBYGEMS ecosystem.
    def retrieve_all_rubygem_publishable_advisories
      retrieve_all_github_advisories
        .reject { |advisory| advisory.withdrawn? }
        .select { |advisory| advisory.has_ruby_vulnerabilities? }
    end

    module GraphQLQueries
      GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY = <<-GRAPHQL.freeze
        query($first: Int, $after: String) {
          securityAdvisories(first: $first, after: $after) {
            pageInfo {
              endCursor
              hasNextPage
              hasPreviousPage
              startCursor
            }
            nodes {
              identifiers {
                type
                value
              }
              summary
              description
              severity
              references {
                url
              }
              publishedAt
              withdrawnAt
              vulnerabilities(ecosystem:RUBYGEMS, first: 10) {
                nodes {
                  package {
                    name
                    ecosystem
                  }
                  vulnerableVersionRange
                  firstPatchedVersion {
                    identifier
                  }
                }
              }
            }
          }
        }
      GRAPHQL
    end

    private

    # The API token from the GH_API_TOKEN environment variable.
    # An empty or whitespace-only value counts as missing: an empty string is
    # truthy in Ruby and would otherwise be sent as a useless "token " header.
    def github_api_token
      token = ENV["GH_API_TOKEN"].to_s.strip
      if token.empty?
        raise GitHubApiTokenMissingError, "Unable to make API requests. Must define 'GH_API_TOKEN' environment variable."
      end
      token
    end
  end

  # Wraps one securityAdvisories node as returned by the GraphQL query and
  # knows how to turn it into rubysec advisory yaml files.
  class GitHubAdvisory

    attr_reader :github_advisory_graphql_object

    def initialize(github_advisory_graphql_object:)
      @github_advisory_graphql_object = github_advisory_graphql_object
    end

    # Extract the CVE identifier from the GitHub Advisory identifier list.
    # Returns nil when the advisory has no identifier of type "CVE".
    def cve_id
      identifier_list = github_advisory_graphql_object["identifiers"]
      cve_id_obj = identifier_list.find { |id| id["type"] == "CVE" }
      return nil unless cve_id_obj

      cve_id_obj["value"]
    end

    # The publication day as a Date (the time of day is dropped), or nil when
    # the advisory has no publishedAt value. A Date is returned rather than a
    # String so that to_yaml emits it as a plain yaml date (2019-03-21).
    def published_day
      published_at = github_advisory_graphql_object["publishedAt"]
      return nil unless published_at

      Date.parse(published_at)
    end

    # Unique names of the packages named by this advisory's vulnerabilities.
    def package_names
      vulnerabilities.map { |v| v["package"]["name"] }.uniq
    end

    # The rubysec file names (one per package) this advisory maps to.
    def rubysec_filenames
      package_names.map { |package_name| rubysec_filename_for(package_name) }
    end

    def withdrawn?
      !github_advisory_graphql_object["withdrawnAt"].nil?
    end

    # URL of the first reference, or nil when the advisory has no references.
    def external_reference
      first_reference = Array(github_advisory_graphql_object["references"]).first
      first_reference && first_reference["url"]
    end

    def vulnerabilities
      github_advisory_graphql_object["vulnerabilities"]["nodes"]
    end

    def has_ruby_vulnerabilities?
      vulnerabilities.any? do |vuln|
        vuln["package"]["ecosystem"] == "RUBYGEMS"
      end
    end

    def some_rubysec_files_do_not_exist?
      rubysec_filenames.any? { |filename| !File.exist?(filename) }
    end

    # Write one rubysec yaml file per vulnerable package that does not have a
    # file for this CVE yet. Paths are relative to the current directory.
    #
    # Returns the list of files written; empty when the advisory has no CVE id
    # or when every file already exists.
    def write_files
      return [] unless cve_id
      return [] unless some_rubysec_files_do_not_exist?

      files_written = []
      vulnerabilities.each do |vulnerability|
        filename_to_write = rubysec_filename_for(vulnerability["package"]["name"])
        # Also skips a package that appears in more than one vulnerability
        # node: its file exists after the first node has been handled.
        next if File.exist?(filename_to_write)

        # mkdir_p also creates missing parents (e.g. a missing "gems" dir)
        FileUtils.mkdir_p(File.dirname(filename_to_write))
        File.open(filename_to_write, "w") do |file|
          # create an automatically generated advisory yaml file
          file.write rubysec_advisory_data(vulnerability).to_yaml

          # The data we just wrote is incomplete,
          # and therefore should not be committed as is
          # We can not directly translate from GitHub to rubysec advisory format
          #
          # The patched_versions field is not exactly available.
          # - GitHub has a first_patched_version field,
          #   but rubysec advisory needs a ruby version spec
          #
          # The unaffected_versions field is similarly not directly available
          # This optional field must be inferred from the vulnerableVersionRange
          #
          # To help write those fields, we put all the github data below.
          #
          # The second block of yaml in a .yaml file is ignored (after the second "---" line)
          # This effectively makes this data a large comment
          # Still it should be removed before the data goes into rubysec
          file.write "\n\n# GitHub advisory data below - **Remove this data before committing**\n"
          file.write "# Use this data to write patched_versions (and potentially unaffected_versions) above\n"
          file.write github_advisory_graphql_object.to_yaml
        end
        puts "Wrote: #{filename_to_write}"
        files_written << filename_to_write
      end

      files_written
    end

    private

    # Path of the rubysec advisory file for the given package and this CVE.
    def rubysec_filename_for(package_name)
      File.join("gems", package_name, "#{cve_id}.yml")
    end

    # The initial (incomplete) rubysec advisory hash for one vulnerability.
    def rubysec_advisory_data(vulnerability)
      {
        "gem" => vulnerability["package"]["name"],
        # rubysec stores the id without its "CVE-" prefix
        "cve" => cve_id.sub(/\ACVE-/, ""),
        "date" => published_day,
        "url" => external_reference,
        "title" => github_advisory_graphql_object["summary"],
        "description" => github_advisory_graphql_object["description"],
        "cvss_v3" => "",
        "patched_versions" => [ "" ],
        "unaffected_versions" => [ "" ]
      }
    end
  end
end