Skip to content

Commit

Permalink
Remove docs matching hack and add more correct strategies for docs di…
Browse files Browse the repository at this point in the history
…scovery. (#1882)

This pull request is a draft.

It includes changes to docs discovery that ensure we are matching
entityDocs to the correct property field descriptions.

- adds number pattern to regex so we can match "S3"
- uses full parent path as docs path so we can properly map different
"key" field descriptions to the correct parent field
- adds new regex to nested header discovery
- allows for dot-spaced keys in entityDocs for more precise lookup

Fixes #1789

Fixes #1876

Fixes #1875
  • Loading branch information
guineveresaenger committed Apr 22, 2024
2 parents 94c1de4 + 26fd8df commit 64c7a9b
Show file tree
Hide file tree
Showing 4 changed files with 422 additions and 96 deletions.
218 changes: 179 additions & 39 deletions pkg/tfgen/docs.go
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,6 @@ func (p *tfMarkdownParser) parse(tfMarkdown []byte) (entityDocs, error) {
footerLinks := getFooterLinks(markdown)

doc, _ := cleanupDoc(p.rawname, p.sink, p.infoCtx, p.ret, footerLinks)

return doc, nil
}

Expand Down Expand Up @@ -820,84 +819,187 @@ func (p *tfMarkdownParser) parseSchemaWithNestedSections(subsection []string) {
parseTopLevelSchemaIntoDocs(&p.ret, topLevelSchema, p.sink.warn)
}

type markdownLineInfo struct {
name, desc string
isFound, isIndented bool
}

// parseArgFromMarkdownLine takes a line of Markdown and attempts to parse it for a Terraform argument and its
// description
func parseArgFromMarkdownLine(line string) (string, string, bool) {
// description. It returns a struct containing the name and description of the arg, whether an arg was found,
// and whether it has indented space, denoting it as a subproperty of a previously found property.
func parseArgFromMarkdownLine(line string) markdownLineInfo {
matches := argumentBulletRegexp.FindStringSubmatch(line)

var parsed markdownLineInfo
if len(matches) > 4 {
return matches[1], matches[4], true
}
if strings.HasPrefix(matches[0], " ") {
parsed.isIndented = true
}
parsed.name = matches[1]
parsed.desc = matches[4]
parsed.isFound = true

return "", "", false
}
return parsed
}

var genericNestedRegexp = regexp.MustCompile("supports? the following:")

var nestedObjectRegexps = []*regexp.Regexp{
// For example:
// s3_bucket.html.markdown: "The `website` object supports the following:"
// appmesh_gateway_route.html.markdown:
// "The `grpc_route`, `http_route` and `http2_route` objects supports the following:"
// ami.html.markdown: "When `virtualization_type` is "hvm" the following additional arguments apply:"
regexp.MustCompile("`([a-z_]+)`.*following"),
regexp.MustCompile("`([a-z_0-9]+)`.*following"),

// For example:
// athena_workgroup.html.markdown: "#### result_configuration Argument Reference"
regexp.MustCompile("(?i)## ([a-z_]+).* argument reference"),
regexp.MustCompile("(?i)## ([a-z_0-9]+).* argument reference"),

// For example:
// codebuild_project.html.markdown: "#### build_batch_config: restrictions"
// codebuild_project.html.markdown: "#### logs_config: s3_logs"
regexp.MustCompile("###+ ([a-zA-Z_0-9]+: [a-zA-Z_0-9]+).*"),

// For example:
// elasticsearch_domain.html.markdown: "### advanced_security_options"
regexp.MustCompile("###+ ([a-z_]+).*"),
regexp.MustCompile("###+ ([a-z_0-9]+).*"),

// For example:
// dynamodb_table.html.markdown: "### `server_side_encryption`"
regexp.MustCompile("###+ `([a-z_]+).*`"),
regexp.MustCompile("###+ `([a-z_0-9]+).*`"),

// For example:
// route53_record.html.markdown: "### Failover Routing Policy"
regexp.MustCompile("###+ ([a-zA-Z_ ]+).*"),
// appflow_flow.html.markdown: "###### S3 Input Format Config"
regexp.MustCompile("###+ ([a-zA-Z_ 0-9]+).*"),

// For example:
// sql_database_instance.html.markdown:
// "The optional `settings.ip_configuration.authorized_networks[]`` sublist supports:"
// "The optional `settings.ip_configuration.authorized_networks[]`` sublist supports:"
regexp.MustCompile("`([a-zA-Z_.\\[\\]]+)`.*supports:"),

// For example when blocks/subblocks/sublists are defined across more than one line
// sql_database_instance.html.markdown:
// "The optional `settings.maintenance_window` subblock for instances declares a one-hour"
regexp.MustCompile("The .* `([a-zA-Z_.\\[\\]]+)` sublist .*"),
regexp.MustCompile("The .* `([a-zA-Z_.\\[\\]]+)` subblock .*"),
regexp.MustCompile("The .* `([a-zA-Z_.\\[\\]]+)` block .*"),
}

// getMultipleNestedBlockNames is called when we detect that a resource matches the "`([a-z_0-9]+)`.*following" regex.
// We check if more than one nested block name is listed on this line, and we additionally check if there's an
// indication of an extra nested object, denoted by "'s", as in:
// "The `grpc_route`, `http_route` and `http2_route` 's `action` object supports the following:".
func getMultipleNestedBlockNames(match string) []string {
subNest := ""
var nestedBlockNames []string
// First we check for the presence of a possible nested property via 's
if strings.Contains(match, "'s ") {
// split the match along the 's
part1, part2, _ := strings.Cut(match, "'s")
match = part1
// Extract our subheading. It should be the second item in part2 of the match.
part2Slice := strings.Split(part2, "`")
if len(part2Slice) >= 2 {
subNest = part2Slice[1]
}
}
// As per previous regex match, the resource names will be surrounded by backticks, so we extract them
tokenInBackticks := regexp.MustCompile("`[^`]+`")
blockNames := tokenInBackticks.FindAllString(match, -1)
for _, blockName := range blockNames {
if blockName != "" {
blockName = strings.Trim(blockName, "`")
blockName = strings.ToLower(blockName)
blockName = strings.Replace(blockName, " ", "_", -1)
blockName = strings.TrimSuffix(blockName, "[]")
if subNest != "" {
// For the format:
//
// The `grpc_route`, `http_route` and `http2_route` 's `action` object supports the following:
//
// the result should be grpc_route.action
blockName = blockName + "." + subNest
}
nestedBlockNames = append(nestedBlockNames, blockName)
}
}
return nestedBlockNames
}

// getNestedNameWithColon handles cases where a property is shown as a subpoperty by means of a colon in the docs.
// For example:
//
// #### logs_config: s3_logs
//
// should return []string{"logs_config.s3_logs"}.
func getNestedNameWithColon(match string) []string {
var nestedBlockNames []string
parts := strings.Split(match, ":")

blockName := strings.ToLower(parts[0])
blockName = strings.Replace(blockName, " ", "_", -1)

subNest := strings.ToLower(parts[1])
subNest = strings.TrimSpace(subNest)
subNest = strings.Replace(subNest, " ", "_", -1)

blockName = blockName + "." + subNest
nestedBlockNames = append(nestedBlockNames, blockName)
return nestedBlockNames
}

// getNestedBlockName take a line of a Terraform docs Markdown page and returns the name of the nested block it
// getNestedBlockNames takes line of a Terraform docs Markdown page and returns the name(s) of the nested block it
// describes. If the line does not describe a nested block, an empty string is returned.
//
// Examples of nested blocks include (but are not limited to):
//
// - "The `private_cluster_config` block supports:" -> "private_cluster_config"
// - "The optional settings.backup_configuration subblock supports:" -> "settings.backup_configuration"
func getNestedBlockName(line string) string {
func getNestedBlockNames(line string) []string {
nested := ""
for _, match := range nestedObjectRegexps {
var nestedBlockNames []string

for i, match := range nestedObjectRegexps {
matches := match.FindStringSubmatch(line)
if len(matches) >= 2 {

// If we match with the first regex, we have to see if we've got many to many matching for resources going on.
if len(matches) >= 2 && i == 0 {
nestedBlockNames = getMultipleNestedBlockNames(matches[0])
break
} else if len(matches) >= 2 && i == 2 {
// there's a colon in the subheader; split the line
nestedBlockNames = getNestedNameWithColon(matches[1])
break
} else if len(matches) >= 2 {
nested = strings.ToLower(matches[1])
nested = strings.Replace(nested, " ", "_", -1)
nested = strings.TrimSuffix(nested, "[]")
parts := strings.Split(nested, ".")
nested = parts[len(parts)-1]
nestedBlockNames = append(nestedBlockNames, nested)
break
}
}

return nested
return nestedBlockNames
}

func parseArgReferenceSection(subsection []string, ret *entityDocs) {
// Variable to remember the last argument we found.
var lastMatch string
var nested docsPath
// Collection to hold all arguments that headline a nested description.
var nesteds []docsPath

addNewHeading := func(name, desc, line string) {
// found a property bullet, extract the name and description
if nested != "" {
// We found this line within a nested field. We should record it as such.
if ret.Arguments[nested] == nil {
totalArgumentsFromDocs++
if len(nesteds) > 0 {
for _, nested := range nesteds {
// We found this line within a nested field. We should record it as such.
if ret.Arguments[nested] == nil {
totalArgumentsFromDocs++
}
ret.Arguments[nested.join(name)] = &argumentDocs{desc}
}
ret.Arguments[nested.join(name)] = &argumentDocs{desc}

} else {
if genericNestedRegexp.MatchString(line) {
return
Expand All @@ -906,39 +1008,77 @@ func parseArgReferenceSection(subsection []string, ret *entityDocs) {
totalArgumentsFromDocs++
}
}

// This function adds the current line as a description to the last matched resource,
//in cases where there's no resource match found on this line.
//It represents a multi-line description for a field.
extendExistingHeading := func(line string) {
line = "\n" + strings.TrimSpace(line)
if nested != "" {
ret.Arguments[nested.join(lastMatch)].description += line
if len(nesteds) > 0 {
for _, nested := range nesteds {
line = "\n" + strings.TrimSpace(line)
ret.Arguments[nested.join(lastMatch)].description += line
}
} else {
if genericNestedRegexp.MatchString(line) {
lastMatch = ""
nested = ""
nesteds = []docsPath{}
return
}
line = "\n" + strings.TrimSpace(line)
ret.Arguments[docsPath(lastMatch)].description += line
}
}

// hadSpace tells us if the previous line was blank.
var hadSpace bool

// parentName tracks the name of the previous resource, in case we get indented child resources.
var parentName string

for _, line := range subsection {
if name, desc, matchFound := parseArgFromMarkdownLine(line); matchFound {
// We have found a new
addNewHeading(name, desc, line)
// We have found a new resource on this line.
parsedArg := parseArgFromMarkdownLine(line)
name := parsedArg.name
desc := parsedArg.desc
matchFound := parsedArg.isFound
isIndented := parsedArg.isIndented
if matchFound {
// We have found a new argument.
// If a bullet point is indented, we have most likely found a sub-field of the previous line.
// See: https://github.com/pulumi/pulumi-terraform-bridge/issues/1875
if isIndented {
name = parentName + "." + name
} else {
parentName = name
}
lastMatch = name
addNewHeading(name, desc, line)

} else if strings.TrimSpace(line) == "---" {
// --- is a markdown section break. This probably indicates the
// section is over, but we take it to mean that the current
// heading is over.
lastMatch = ""
} else if nestedBlockCurrentLine := getNestedBlockName(line); hadSpace && nestedBlockCurrentLine != "" {
nested = docsPath(nestedBlockCurrentLine)
} else if nestedBlockCurrentLine := getNestedBlockNames(line); hadSpace && len(nestedBlockCurrentLine) > 0 {
// This tells us if there's a resource that is about to have subfields (nesteds)
// in subsequent lines.
//empty nesteds
nesteds = []docsPath{}
for _, item := range nestedBlockCurrentLine {
nesteds = append(nesteds, docsPath(item))
}
lastMatch = ""
} else if !isBlank(line) && lastMatch != "" {
// This appends the current line to the previous match's description.
extendExistingHeading(line)
} else if nestedBlockCurrentLine := getNestedBlockName(line); nestedBlockCurrentLine != "" {
nested = docsPath(nestedBlockCurrentLine)

} else if nestedBlockCurrentLine := getNestedBlockNames(line); len(nestedBlockCurrentLine) > 0 {
// This tells us if there's a resource that is about to have subfields (nesteds)
// in subsequent lines.
//empty nesteds
nesteds = []docsPath{}
for _, item := range nestedBlockCurrentLine {
nesteds = append(nesteds, docsPath(item))
}
lastMatch = ""
} else if lastMatch != "" {
extendExistingHeading(line)
Expand Down

0 comments on commit 64c7a9b

Please sign in to comment.