diff --git a/svc/Cargo.lock b/svc/Cargo.lock index ca06393be2..324c8def62 100644 --- a/svc/Cargo.lock +++ b/svc/Cargo.lock @@ -2459,22 +2459,18 @@ dependencies = [ [[package]] name = "cloudflare" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0778f99ea7ad39b49b758eb418da7117b93232a5f6a09f9b79a094b77ac88cc2" +version = "0.12.0" +source = "git+https://github.com/cloudflare/cloudflare-rs.git?rev=f14720e42184ee176a97676e85ef2d2d85bc3aae#f14720e42184ee176a97676e85ef2d2d85bc3aae" dependencies = [ - "anyhow", - "async-trait", - "base64 0.13.1", - "cfg-if", "chrono", "http 0.2.12", "percent-encoding", "reqwest 0.11.27", "serde", "serde_json", - "serde_qs", + "serde_urlencoded", "serde_with", + "thiserror", "url", "uuid", ] @@ -2498,6 +2494,7 @@ dependencies = [ "nomad-util", "nomad_client", "rand", + "reqwest 0.11.27", "rivet-metrics", "rivet-operation", "rivet-runtime", @@ -2505,7 +2502,6 @@ dependencies = [ "serde", "sqlx 0.7.4 (git+https://github.com/rivet-gg/sqlx?rev=08d6e61aa0572e7ec557abbedb72cebb96e1ac5b)", "ssh2", - "thiserror", "token-create", "tokio", "trust-dns-resolver", @@ -8151,17 +8147,6 @@ dependencies = [ "serde", ] -[[package]] -name = "serde_qs" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cac3f1e2ca2fe333923a1ae72caca910b98ed0630bb35ef6f8c8517d6e81afa" -dependencies = [ - "percent-encoding", - "serde", - "thiserror", -] - [[package]] name = "serde_repr" version = "0.1.19" diff --git a/svc/pkg/cluster/Cargo.toml b/svc/pkg/cluster/Cargo.toml index 2927d751e2..5cf84a0115 100644 --- a/svc/pkg/cluster/Cargo.toml +++ b/svc/pkg/cluster/Cargo.toml @@ -9,20 +9,20 @@ license = "Apache-2.0" acme-lib = "0.9" anyhow = "1.0" chirp-workflow = { path = "../../../lib/chirp-workflow/core" } -cloudflare = "0.10.1" +cloudflare = { git = "https://github.com/cloudflare/cloudflare-rs.git", rev = "f14720e42184ee176a97676e85ef2d2d85bc3aae" } http = "0.2" include_dir = "0.7.3" indoc = "1.0" lazy_static = "1.4" nomad-util = { path = "../../../lib/nomad-util" } rand = "0.8" +reqwest = { version = "0.11", features = ["json"] } rivet-metrics = { path = "../../../lib/metrics" } rivet-operation = { path = "../../../lib/operation/core" } rivet-runtime = { path = "../../../lib/runtime" } s3-util = { path = "../../../lib/s3-util" } serde = { version = "1.0.198", features = ["derive"] } ssh2 = "0.9.4" -thiserror = "1.0" trust-dns-resolver = { version = "0.23.2", features = ["dns-over-native-tls"] } ip-info = { path = "../ip/ops/info" } diff --git a/svc/pkg/cluster/src/util/mod.rs b/svc/pkg/cluster/src/util/mod.rs index 5256436cd5..6b9b8b2291 100644 --- a/svc/pkg/cluster/src/util/mod.rs +++ b/svc/pkg/cluster/src/util/mod.rs @@ -1,5 +1,5 @@ use chirp_workflow::prelude::*; -use cloudflare::framework as cf_framework; +use cloudflare::{endpoints as cf, framework as cf_framework}; use crate::types::PoolType; @@ -13,13 +13,6 @@ pub const INSTALL_SCRIPT_HASH: &str = include_str!(concat!(env!("OUT_DIR"), "/ha // TTL of the token written to prebake images. Prebake images are renewed before the token would expire pub const SERVER_TOKEN_TTL: i64 = util::duration::days(30 * 6); -#[derive(thiserror::Error, Debug)] -#[error("cloudflare: {source}")] -struct CloudflareError { - #[from] - source: anyhow::Error, -} - // Cluster id for provisioning servers pub fn default_cluster_id() -> Uuid { Uuid::nil() @@ -36,15 +29,145 @@ pub fn server_name(provider_datacenter_id: &str, pool_type: PoolType, server_id: format!("{ns}-{provider_datacenter_id}-{pool_type_str}-{server_id}",) } -pub(crate) async fn cf_client() -> GlobalResult { +pub(crate) async fn cf_client( + cf_token: Option<&str>, +) -> GlobalResult { // Create CF client - let cf_token = util::env::read_secret(&["cloudflare", "terraform", "auth_token"]).await?; + let cf_token = if let Some(cf_token) = cf_token { + cf_token.to_string() + } else { + util::env::read_secret(&["cloudflare", "terraform", "auth_token"]).await? + }; let client = cf_framework::async_api::Client::new( cf_framework::auth::Credentials::UserAuthToken { token: cf_token }, Default::default(), cf_framework::Environment::Production, - ) - .map_err(CloudflareError::from)?; + )?; Ok(client) } + +/// Tries to create a DNS record. If a 400 error is received, it deletes the existing record and tries again. +pub(crate) async fn create_dns_record( + client: &cf_framework::async_api::Client, + cf_token: &str, + zone_id: &str, + record_name: &str, + content: cf::dns::DnsContent, +) -> GlobalResult { + tracing::info!(%record_name, "creating dns record"); + + let create_record_res = client + .request(&cf::dns::CreateDnsRecord { + zone_identifier: zone_id, + params: cf::dns::CreateDnsRecordParams { + name: record_name, + content: content.clone(), + proxied: Some(false), + ttl: Some(60), + priority: None, + }, + }) + .await; + + match create_record_res { + Ok(create_record_res) => Ok(create_record_res.result.id), + // Try to delete record on error + Err(err) => { + if let cf_framework::response::ApiFailure::Error( + http::status::StatusCode::BAD_REQUEST, + _, + ) = err + { + tracing::warn!(%record_name, "failed to create dns record, trying to delete"); + + let dns_type = match content { + cf::dns::DnsContent::A { .. } => "A", + cf::dns::DnsContent::AAAA { .. } => "AAAA", + cf::dns::DnsContent::CNAME { .. } => "CNAME", + cf::dns::DnsContent::NS { .. } => "NS", + cf::dns::DnsContent::MX { .. } => "MX", + cf::dns::DnsContent::TXT { .. } => "TXT", + cf::dns::DnsContent::SRV { .. } => "SRV", + }; + let list_records_res = get_dns_record(cf_token, record_name, dns_type).await?; + + if let Some(record) = list_records_res { + delete_dns_record(client, zone_id, &record.id).await?; + tracing::info!(%record_name, "deleted dns record, trying again"); + + // Second try + let create_record_res2 = client + .request(&cf::dns::CreateDnsRecord { + zone_identifier: zone_id, + params: cf::dns::CreateDnsRecordParams { + name: record_name, + content, + proxied: Some(false), + ttl: Some(60), + priority: None, + }, + }) + .await?; + + return Ok(create_record_res2.result.id); + } else { + tracing::warn!(%record_name, "failed to get matching dns record"); + } + } + + // Throw original error + Err(err.into()) + } + } +} + +pub(crate) async fn delete_dns_record( + client: &cf_framework::async_api::Client, + zone_id: &str, + record_id: &str, +) -> GlobalResult<()> { + tracing::info!(%record_id, "deleting dns record"); + + client + .request(&cf::dns::DeleteDnsRecord { + zone_identifier: zone_id, + identifier: record_id, + }) + .await?; + + Ok(()) +} + +/// Fetches the dns record by name. +async fn get_dns_record( + cf_token: &str, + record_name: &str, + dns_type: &str, +) -> GlobalResult> { + let list_records_res = reqwest::Client::new() + .get("https://api.cloudflare.com/client/v4/zones/{zone_id}/dns_records") + .bearer_auth(cf_token) + .query(&("name", &record_name)) + .query(&("type", dns_type)) + .send() + .await? + .to_global_error() + .await?; + + let status = list_records_res.status(); + if status.is_success() { + match list_records_res + .json::>>() + .await + { + Ok(api_resp) => Ok(api_resp.result.into_iter().next()), + Err(e) => Err(cf_framework::response::ApiFailure::Invalid(e).into()), + } + } else { + let parsed: Result = + list_records_res.json().await; + let errors = parsed.unwrap_or_default(); + Err(cf_framework::response::ApiFailure::Error(status, errors).into()) + } +} diff --git a/svc/pkg/cluster/src/workflows/datacenter/tls_issue.rs b/svc/pkg/cluster/src/workflows/datacenter/tls_issue.rs index 5fd1f73515..8bb41510ee 100644 --- a/svc/pkg/cluster/src/workflows/datacenter/tls_issue.rs +++ b/svc/pkg/cluster/src/workflows/datacenter/tls_issue.rs @@ -3,8 +3,9 @@ use acme_lib::{ persist::{MemoryPersist, Persist, PersistKey, PersistKind}, Account, Directory, DirectoryUrl, }; +use cloudflare::endpoints as cf; + use chirp_workflow::prelude::*; -use cloudflare::{endpoints as cf, framework as cf_framework, framework::async_api::ApiClient}; use futures_util::StreamExt; use tokio::task; use trust_dns_resolver::{ @@ -13,7 +14,10 @@ use trust_dns_resolver::{ TokioAsyncResolver, }; -use crate::{types::TlsState, util::cf_client}; +use crate::{ + types::TlsState, + util::{cf_client, create_dns_record, delete_dns_record}, +}; const ENCRYPT_EMAIL: &str = "letsencrypt@rivet.gg"; @@ -89,7 +93,8 @@ struct OrderOutput { #[activity(Order)] #[timeout = 130] async fn order(ctx: &ActivityCtx, input: &OrderInput) -> GlobalResult { - let client = cf_client().await?; + let cf_token = util::env::read_secret(&["cloudflare", "terraform", "auth_token"]).await?; + let client = cf_client(Some(&cf_token)).await?; // Fetch ACME account registration let account = acme_account().await?; @@ -115,6 +120,7 @@ async fn order(ctx: &ActivityCtx, input: &OrderInput) -> GlobalResult GlobalResult GlobalResult> { Ok(acc) } -async fn create_dns_record( - client: &cf_framework::async_api::Client, - zone_id: &str, - record_name: &str, - content: &str, -) -> GlobalResult { - tracing::info!(%record_name, "creating dns record"); - - let create_record_res = client - .request(&cf::dns::CreateDnsRecord { - zone_identifier: zone_id, - params: cf::dns::CreateDnsRecordParams { - name: record_name, - content: cf::dns::DnsContent::TXT { - content: content.to_string(), - }, - proxied: Some(false), - ttl: Some(60), - priority: None, - }, - }) - .await; - - match create_record_res { - Ok(create_record_res) => Ok(create_record_res.result.id), - // Try to delete record on error - Err(err) => { - if let cf_framework::response::ApiFailure::Error( - http::status::StatusCode::BAD_REQUEST, - _, - ) = err - { - tracing::info!(%record_name, "failed to create dns record, trying to delete"); - - let list_records_res = client - .request(&cf::dns::ListDnsRecords { - zone_identifier: zone_id, - params: cf::dns::ListDnsRecordsParams { - name: Some(record_name.to_string()), - record_type: Some(cf::dns::DnsContent::TXT { - // We aren't filtering by content - content: "".to_string(), - }), - per_page: Some(1), - ..Default::default() - }, - }) - .await?; - - if let Some(record) = list_records_res.result.first() { - delete_dns_record(client, zone_id, &record.id).await?; - tracing::info!(%record_name, "deleted dns record"); - } else { - tracing::warn!(%record_name, "failed to get matching dns record"); - } - } - - // Throw error - Err(err.into()) - } - } -} - -async fn delete_dns_record( - client: &cf_framework::async_api::Client, - zone_id: &str, - record_id: &str, -) -> GlobalResult<()> { - tracing::info!(%record_id, "deleting dns record"); - - client - .request(&cf::dns::DeleteDnsRecord { - zone_identifier: zone_id, - identifier: record_id, - }) - .await?; - - Ok(()) -} - async fn poll_txt_dns(hostname: &str, content: &str) -> GlobalResult<()> { // Because the dns resolver has its own internal cache, we create a new one for each poll function call // so that clearing cache does not affect other concurrent txt lookup calls diff --git a/svc/pkg/cluster/src/workflows/server/dns_create.rs b/svc/pkg/cluster/src/workflows/server/dns_create.rs index bc07210ded..9fd45bbaee 100644 --- a/svc/pkg/cluster/src/workflows/server/dns_create.rs +++ b/svc/pkg/cluster/src/workflows/server/dns_create.rs @@ -1,9 +1,9 @@ use std::net::{IpAddr, Ipv4Addr}; use chirp_workflow::prelude::*; -use cloudflare::{endpoints as cf, framework::async_api::ApiClient}; +use cloudflare::endpoints as cf; -use crate::util::cf_client; +use crate::util::{cf_client, create_dns_record}; #[derive(Debug, Serialize, Deserialize)] pub struct Input { @@ -99,7 +99,19 @@ async fn create_dns_record( ctx: &ActivityCtx, input: &CreateDnsRecordInput, ) -> GlobalResult { - let client = cf_client().await?; + let cf_token = util::env::read_secret(&["cloudflare", "terraform", "auth_token"]).await?; + let client = cf_client(Some(&cf_token)).await?; + + create_dns_record( + &client, + &cf_token, + &input.zone_id, + &input.record_name, + cf::dns::DnsContent::A { + content: input.public_ip, + }, + ) + .await?; let create_record_res = client .request(&cf::dns::CreateDnsRecord { diff --git a/svc/pkg/cluster/src/workflows/server/dns_delete.rs b/svc/pkg/cluster/src/workflows/server/dns_delete.rs index af82b89760..0c560a73f1 100644 --- a/svc/pkg/cluster/src/workflows/server/dns_delete.rs +++ b/svc/pkg/cluster/src/workflows/server/dns_delete.rs @@ -1,5 +1,5 @@ use chirp_workflow::prelude::*; -use cloudflare::{endpoints as cf, framework as cf_framework, framework::async_api::ApiClient}; +use cloudflare::{endpoints as cf, framework as cf_framework}; use crate::util::cf_client; @@ -84,7 +84,7 @@ struct DeleteDnsRecordInput { #[activity(DeleteDnsRecord)] async fn delete_dns_record(ctx: &ActivityCtx, input: &DeleteDnsRecordInput) -> GlobalResult<()> { - let client = cf_client().await?; + let client = cf_client(None).await?; let res = client .request(&cf::dns::DeleteDnsRecord { @@ -93,6 +93,7 @@ async fn delete_dns_record(ctx: &ActivityCtx, input: &DeleteDnsRecordInput) -> G }) .await; + // Gracefully fail if not found if let Err(cf_framework::response::ApiFailure::Error(http::status::StatusCode::NOT_FOUND, _)) = res { diff --git a/svc/pkg/cluster/standalone/workflow-backfill/src/lib.rs b/svc/pkg/cluster/standalone/workflow-backfill/src/lib.rs index 6433312627..a48678ad24 100644 --- a/svc/pkg/cluster/standalone/workflow-backfill/src/lib.rs +++ b/svc/pkg/cluster/standalone/workflow-backfill/src/lib.rs @@ -24,6 +24,16 @@ pub async fn run_from_env() -> GlobalResult<()> { let crdb = ctx.crdb().await?; let mut tx = crdb.begin().await?; + // Delete invalid dns record rows + sql_execute!( + [ctx, @tx &mut tx] + " + DELETE FROM db_cluster.servers_cloudflare + WHERE dns_record_id IS NULL OR secondary_dns_record_id IS NULL + ", + ) + .await?; + let mut bctx = BackfillCtx::new(); #[derive(sqlx::FromRow)]