Remove docs matching hack and add more correct strategies for docs di…

…scovery. (#1882) This pull request is a draft. It includes changes to docs discovery that ensure we are matching entityDocs to the correct property field descriptions. - adds number pattern to regex so we can match "S3" - uses full parent path as docs path so we can properly map different "key" field descriptions to the correct parent field - adds new regex to nested header discovery - allows for dot-spaced keys in entityDocs for more precise lookup Fixes #1789 Fixes #1876 Fixes #1875
pulumi · Apr 22, 2024 · 64c7a9b · 64c7a9b
2 parents 94c1de4 + 26fd8df
commit 64c7a9b
Show file tree

Hide file tree

Showing 4 changed files with 422 additions and 96 deletions.
diff --git a/pkg/tfgen/docs.go b/pkg/tfgen/docs.go
@@ -597,7 +597,6 @@ func (p *tfMarkdownParser) parse(tfMarkdown []byte) (entityDocs, error) {
 	footerLinks := getFooterLinks(markdown)
 
 	doc, _ := cleanupDoc(p.rawname, p.sink, p.infoCtx, p.ret, footerLinks)
-
 	return doc, nil
 }
 
@@ -820,84 +819,187 @@ func (p *tfMarkdownParser) parseSchemaWithNestedSections(subsection []string) {
 	parseTopLevelSchemaIntoDocs(&p.ret, topLevelSchema, p.sink.warn)
 }
 
+type markdownLineInfo struct {
+	name, desc          string
+	isFound, isIndented bool
+}
+
 // parseArgFromMarkdownLine takes a line of Markdown and attempts to parse it for a Terraform argument and its
-// description
-func parseArgFromMarkdownLine(line string) (string, string, bool) {
+// description. It returns a struct containing the name and description of the arg, whether an arg was found,
+// and whether it has indented space, denoting it as a subproperty of a previously found property.
+func parseArgFromMarkdownLine(line string) markdownLineInfo {
 	matches := argumentBulletRegexp.FindStringSubmatch(line)
-
+	var parsed markdownLineInfo
 	if len(matches) > 4 {
-		return matches[1], matches[4], true
-	}
+		if strings.HasPrefix(matches[0], "  ") {
+			parsed.isIndented = true
+		}
+		parsed.name = matches[1]
+		parsed.desc = matches[4]
+		parsed.isFound = true
 
-	return "", "", false
+	}
+	return parsed
 }
 
 var genericNestedRegexp = regexp.MustCompile("supports? the following:")
 
 var nestedObjectRegexps = []*regexp.Regexp{
 	// For example:
 	// s3_bucket.html.markdown: "The `website` object supports the following:"
+	// appmesh_gateway_route.html.markdown:
+	//		"The `grpc_route`, `http_route` and `http2_route` objects supports the following:"
 	// ami.html.markdown: "When `virtualization_type` is "hvm" the following additional arguments apply:"
-	regexp.MustCompile("`([a-z_]+)`.*following"),
+	regexp.MustCompile("`([a-z_0-9]+)`.*following"),
 
 	// For example:
 	// athena_workgroup.html.markdown: "#### result_configuration Argument Reference"
-	regexp.MustCompile("(?i)## ([a-z_]+).* argument reference"),
+	regexp.MustCompile("(?i)## ([a-z_0-9]+).* argument reference"),
+
+	// For example:
+	// codebuild_project.html.markdown: "#### build_batch_config: restrictions"
+	// codebuild_project.html.markdown: "#### logs_config: s3_logs"
+	regexp.MustCompile("###+ ([a-zA-Z_0-9]+: [a-zA-Z_0-9]+).*"),
 
 	// For example:
 	// elasticsearch_domain.html.markdown: "### advanced_security_options"
-	regexp.MustCompile("###+ ([a-z_]+).*"),
+	regexp.MustCompile("###+ ([a-z_0-9]+).*"),
 
 	// For example:
 	// dynamodb_table.html.markdown: "### `server_side_encryption`"
-	regexp.MustCompile("###+ `([a-z_]+).*`"),
+	regexp.MustCompile("###+ `([a-z_0-9]+).*`"),
 
 	// For example:
 	// route53_record.html.markdown: "### Failover Routing Policy"
-	regexp.MustCompile("###+ ([a-zA-Z_ ]+).*"),
+	// appflow_flow.html.markdown: "###### S3 Input Format Config"
+	regexp.MustCompile("###+ ([a-zA-Z_ 0-9]+).*"),
 
 	// For example:
 	// sql_database_instance.html.markdown:
-	// "The optional `settings.ip_configuration.authorized_networks[]`` sublist supports:"
+	// 		"The optional `settings.ip_configuration.authorized_networks[]`` sublist supports:"
 	regexp.MustCompile("`([a-zA-Z_.\\[\\]]+)`.*supports:"),
+
+	// For example when blocks/subblocks/sublists are defined across more than one line
+	// sql_database_instance.html.markdown:
+	//		"The optional `settings.maintenance_window` subblock for instances declares a one-hour"
+	regexp.MustCompile("The .* `([a-zA-Z_.\\[\\]]+)` sublist .*"),
+	regexp.MustCompile("The .* `([a-zA-Z_.\\[\\]]+)` subblock .*"),
+	regexp.MustCompile("The .* `([a-zA-Z_.\\[\\]]+)` block .*"),
+}
+
+// getMultipleNestedBlockNames is called when we detect that a resource matches the "`([a-z_0-9]+)`.*following" regex.
+// We check if more than one nested block name is listed on this line, and we additionally check if there's an
+// indication of an extra nested object, denoted by "'s", as in:
+// "The `grpc_route`, `http_route` and `http2_route` 's `action` object supports the following:".
+func getMultipleNestedBlockNames(match string) []string {
+	subNest := ""
+	var nestedBlockNames []string
+	// First we check for the presence of a possible nested property via 's
+	if strings.Contains(match, "'s ") {
+		// split the match along the 's
+		part1, part2, _ := strings.Cut(match, "'s")
+		match = part1
+		// Extract our subheading. It should be the second item in part2 of the match.
+		part2Slice := strings.Split(part2, "`")
+		if len(part2Slice) >= 2 {
+			subNest = part2Slice[1]
+		}
+	}
+	// As per previous regex match, the resource names will be surrounded by backticks, so we extract them
+	tokenInBackticks := regexp.MustCompile("`[^`]+`")
+	blockNames := tokenInBackticks.FindAllString(match, -1)
+	for _, blockName := range blockNames {
+		if blockName != "" {
+			blockName = strings.Trim(blockName, "`")
+			blockName = strings.ToLower(blockName)
+			blockName = strings.Replace(blockName, " ", "_", -1)
+			blockName = strings.TrimSuffix(blockName, "[]")
+			if subNest != "" {
+				// For the format:
+				//
+				//		The `grpc_route`, `http_route` and `http2_route` 's `action` object supports the following:
+				//
+				// the result should be grpc_route.action
+				blockName = blockName + "." + subNest
+			}
+			nestedBlockNames = append(nestedBlockNames, blockName)
+		}
+	}
+	return nestedBlockNames
+}
+
+// getNestedNameWithColon handles cases where a property is shown as a subpoperty by means of a colon in the docs.
+// For example:
+//
+//	#### logs_config: s3_logs
+//
+// should return []string{"logs_config.s3_logs"}.
+func getNestedNameWithColon(match string) []string {
+	var nestedBlockNames []string
+	parts := strings.Split(match, ":")
+
+	blockName := strings.ToLower(parts[0])
+	blockName = strings.Replace(blockName, " ", "_", -1)
+
+	subNest := strings.ToLower(parts[1])
+	subNest = strings.TrimSpace(subNest)
+	subNest = strings.Replace(subNest, " ", "_", -1)
+
+	blockName = blockName + "." + subNest
+	nestedBlockNames = append(nestedBlockNames, blockName)
+	return nestedBlockNames
 }
 
-// getNestedBlockName take a line of a Terraform docs Markdown page and returns the name of the nested block it
+// getNestedBlockNames takes line of a Terraform docs Markdown page and returns the name(s) of the nested block it
 // describes. If the line does not describe a nested block, an empty string is returned.
 //
 // Examples of nested blocks include (but are not limited to):
 //
 // - "The `private_cluster_config` block supports:" -> "private_cluster_config"
 // - "The optional settings.backup_configuration subblock supports:" -> "settings.backup_configuration"
-func getNestedBlockName(line string) string {
+func getNestedBlockNames(line string) []string {
 	nested := ""
-	for _, match := range nestedObjectRegexps {
+	var nestedBlockNames []string
+
+	for i, match := range nestedObjectRegexps {
 		matches := match.FindStringSubmatch(line)
-		if len(matches) >= 2 {
+
+		// If we match with the first regex, we have to see if we've got many to many matching for resources going on.
+		if len(matches) >= 2 && i == 0 {
+			nestedBlockNames = getMultipleNestedBlockNames(matches[0])
+			break
+		} else if len(matches) >= 2 && i == 2 {
+			// there's a colon in the subheader; split the line
+			nestedBlockNames = getNestedNameWithColon(matches[1])
+			break
+		} else if len(matches) >= 2 {
 			nested = strings.ToLower(matches[1])
 			nested = strings.Replace(nested, " ", "_", -1)
 			nested = strings.TrimSuffix(nested, "[]")
-			parts := strings.Split(nested, ".")
-			nested = parts[len(parts)-1]
+			nestedBlockNames = append(nestedBlockNames, nested)
 			break
 		}
 	}
-
-	return nested
+	return nestedBlockNames
 }
 
 func parseArgReferenceSection(subsection []string, ret *entityDocs) {
+	// Variable to remember the last argument we found.
 	var lastMatch string
-	var nested docsPath
+	// Collection to hold all arguments that headline a nested description.
+	var nesteds []docsPath
 
 	addNewHeading := func(name, desc, line string) {
 		// found a property bullet, extract the name and description
-		if nested != "" {
-			// We found this line within a nested field. We should record it as such.
-			if ret.Arguments[nested] == nil {
-				totalArgumentsFromDocs++
+		if len(nesteds) > 0 {
+			for _, nested := range nesteds {
+				// We found this line within a nested field. We should record it as such.
+				if ret.Arguments[nested] == nil {
+					totalArgumentsFromDocs++
+				}
+				ret.Arguments[nested.join(name)] = &argumentDocs{desc}
 			}
-			ret.Arguments[nested.join(name)] = &argumentDocs{desc}
+
 		} else {
 			if genericNestedRegexp.MatchString(line) {
 				return
@@ -906,39 +1008,77 @@ func parseArgReferenceSection(subsection []string, ret *entityDocs) {
 			totalArgumentsFromDocs++
 		}
 	}
-
+	// This function adds the current line as a description to the last matched resource,
+	//in cases where there's no resource match found on this line.
+	//It represents a multi-line description for a field.
 	extendExistingHeading := func(line string) {
-		line = "\n" + strings.TrimSpace(line)
-		if nested != "" {
-			ret.Arguments[nested.join(lastMatch)].description += line
+		if len(nesteds) > 0 {
+			for _, nested := range nesteds {
+				line = "\n" + strings.TrimSpace(line)
+				ret.Arguments[nested.join(lastMatch)].description += line
+			}
 		} else {
 			if genericNestedRegexp.MatchString(line) {
 				lastMatch = ""
-				nested = ""
+				nesteds = []docsPath{}
 				return
 			}
+			line = "\n" + strings.TrimSpace(line)
 			ret.Arguments[docsPath(lastMatch)].description += line
 		}
 	}
 
+	// hadSpace tells us if the previous line was blank.
 	var hadSpace bool
+
+	// parentName tracks the name of the previous resource, in case we get indented child resources.
+	var parentName string
+
 	for _, line := range subsection {
-		if name, desc, matchFound := parseArgFromMarkdownLine(line); matchFound {
-			// We have found a new
-			addNewHeading(name, desc, line)
+		// We have found a new resource on this line.
+		parsedArg := parseArgFromMarkdownLine(line)
+		name := parsedArg.name
+		desc := parsedArg.desc
+		matchFound := parsedArg.isFound
+		isIndented := parsedArg.isIndented
+		if matchFound {
+			// We have found a new argument.
+			// If a bullet point is indented, we have most likely found a sub-field of the previous line.
+			// See: https://github.com/pulumi/pulumi-terraform-bridge/issues/1875
+			if isIndented {
+				name = parentName + "." + name
+			} else {
+				parentName = name
+			}
 			lastMatch = name
+			addNewHeading(name, desc, line)
+
 		} else if strings.TrimSpace(line) == "---" {
 			// --- is a markdown section break. This probably indicates the
 			// section is over, but we take it to mean that the current
 			// heading is over.
 			lastMatch = ""
-		} else if nestedBlockCurrentLine := getNestedBlockName(line); hadSpace && nestedBlockCurrentLine != "" {
-			nested = docsPath(nestedBlockCurrentLine)
+		} else if nestedBlockCurrentLine := getNestedBlockNames(line); hadSpace && len(nestedBlockCurrentLine) > 0 {
+			// This tells us if there's a resource that is about to have subfields (nesteds)
+			// in subsequent lines.
+			//empty nesteds
+			nesteds = []docsPath{}
+			for _, item := range nestedBlockCurrentLine {
+				nesteds = append(nesteds, docsPath(item))
+			}
 			lastMatch = ""
 		} else if !isBlank(line) && lastMatch != "" {
+			// This appends the current line to the previous match's description.
 			extendExistingHeading(line)
-		} else if nestedBlockCurrentLine := getNestedBlockName(line); nestedBlockCurrentLine != "" {
-			nested = docsPath(nestedBlockCurrentLine)
+
+		} else if nestedBlockCurrentLine := getNestedBlockNames(line); len(nestedBlockCurrentLine) > 0 {
+			// This tells us if there's a resource that is about to have subfields (nesteds)
+			// in subsequent lines.
+			//empty nesteds
+			nesteds = []docsPath{}
+			for _, item := range nestedBlockCurrentLine {
+				nesteds = append(nesteds, docsPath(item))
+			}
 			lastMatch = ""
 		} else if lastMatch != "" {
 			extendExistingHeading(line)