Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(warehouse): glue partitions #2899

Merged
merged 16 commits into from
Feb 1, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 79 additions & 3 deletions warehouse/integrations/datalake/schema-repository/glue.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package schemarepository

import (
"fmt"
"net/url"
"strings"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/glue"
Expand Down Expand Up @@ -113,12 +115,15 @@ func (gl *GlueSchemaRepository) CreateSchema() (err error) {
}

func (gl *GlueSchemaRepository) CreateTable(tableName string, columnMap map[string]string) (err error) {
tableInput := &glue.TableInput{
Name: aws.String(tableName),
}
tableInput.PartitionKeys = gl.getPartitionKeys()

// create table request
input := glue.CreateTableInput{
DatabaseName: aws.String(gl.Namespace),
TableInput: &glue.TableInput{
Name: aws.String(tableName),
},
TableInput: tableInput,
}

// add storage descriptor to create table request
Expand Down Expand Up @@ -161,6 +166,7 @@ func (gl *GlueSchemaRepository) AddColumns(tableName string, columnsInfo []wareh

// add storage descriptor to update table request
updateTableInput.TableInput.StorageDescriptor = gl.getStorageDescriptor(tableName, tableSchema)
updateTableInput.TableInput.PartitionKeys = gl.getPartitionKeys()

// update table
_, err = gl.glueClient.UpdateTable(&updateTableInput)
Expand Down Expand Up @@ -215,3 +221,73 @@ func (gl *GlueSchemaRepository) getS3LocationForTable(tableName string) string {
filePath += warehouseutils.GetTablePathInObjectStorage(gl.Namespace, tableName)
return fmt.Sprintf("%s/%s", bucketPath, filePath)
}

// RefreshPartitions takes a tableName and a list of loadFiles and creates in Glue
// every partition referenced by the folders of those loadFiles that does not exist
// yet. Folders are de-duplicated, so each partition is looked up at most once per call.
//
// The last path segment of every folder is expected to have the form "<key>=<value>";
// the value becomes the partition value. If a folder has no "=" in its last segment,
// the refresh is skipped for the whole batch and nil is returned.
//
// It returns an error when a load file location cannot be unescaped, when the
// partition lookup fails for any reason other than the partition not existing, or
// when Glue rejects the batch creation.
func (gl *GlueSchemaRepository) RefreshPartitions(tableName string, loadFiles []warehouseutils.LoadFileT) (err error) {
	pkgLogger.Infof("Refreshing partitions for table %s with a batch of %d files", tableName, len(loadFiles))

	locationToPartition := make(map[string]*glue.PartitionInput, len(loadFiles))
	for _, loadFile := range loadFiles {
		locationFolder, unescapeErr := url.QueryUnescape(warehouseutils.GetS3LocationFolder(loadFile.Location))
		if unescapeErr != nil {
			return fmt.Errorf("unescaping load file location folder: %w", unescapeErr)
		}
		if _, ok := locationToPartition[locationFolder]; ok {
			// Go to next file if we are already going to process this locationFolder
			continue
		}

		// Only the last path segment carries the partition; LastIndex yields -1 when
		// there is no "/", in which case the whole folder is the segment.
		lastSegment := locationFolder[strings.LastIndex(locationFolder, "/")+1:]
		// Cut splits on the first "=" only, so a value containing "=" is kept intact.
		_, partition, found := strings.Cut(lastSegment, "=")
		if !found {
			pkgLogger.Infof("Can not refresh partitions, timeWindowFormat setting has no = sign")
			return nil
		}

		locationToPartition[locationFolder] = &glue.PartitionInput{
			StorageDescriptor: &glue.StorageDescriptor{
				Location: aws.String(locationFolder),
				SerdeInfo: &glue.SerDeInfo{
					Name:                 aws.String(glueSerdeName),
					SerializationLibrary: aws.String(glueSerdeSerializationLib),
				},
				InputFormat:  aws.String(glueParquetInputFormat),
				OutputFormat: aws.String(glueParquetOutputFormat),
			},
			Values: []*string{aws.String(partition)},
		}
	}

	// Check for existing partitions. We do not want to generate unnecessary (for already existing
	// partitions) changes in Glue tables (since the number of versions of a Glue table
	// is limited)
	partitionInputs := make([]*glue.PartitionInput, 0, len(locationToPartition))
	for _, partition := range locationToPartition {
		_, getErr := gl.glueClient.GetPartition(&glue.GetPartitionInput{
			DatabaseName:    aws.String(gl.Namespace),
			PartitionValues: partition.Values,
			TableName:       aws.String(tableName),
		})
		if getErr == nil {
			pkgLogger.Debugf("Skipping: %s", partition)
			continue
		}

		// Only "entity not found" means the partition has to be created. Any other
		// failure (throttling, permissions, ...) says nothing about the partition and
		// must not trigger a create attempt.
		coded, ok := getErr.(interface{ Code() string })
		if !ok || coded.Code() != glue.ErrCodeEntityNotFoundException {
			return fmt.Errorf("get partition: %w", getErr)
		}
		partitionInputs = append(partitionInputs, partition)
	}
	if len(partitionInputs) == 0 {
		pkgLogger.Infof("No new partitions to refresh")
		return nil
	}

	pkgLogger.Infof("Refreshing %d partitions", len(partitionInputs))
	pkgLogger.Debugf("PartitionInputs: %s", partitionInputs)
	_, err = gl.glueClient.BatchCreatePartition(&glue.BatchCreatePartitionInput{
		DatabaseName:       aws.String(gl.Namespace),
		PartitionInputList: partitionInputs,
		TableName:          aws.String(tableName),
	})
	return err
}

func (_ *GlueSchemaRepository) getPartitionKeys() []*glue.Column {
columnName := strings.Split(warehouseutils.GlueTimeWindowFormat, "=")[0]
achettyiitr marked this conversation as resolved.
Show resolved Hide resolved
return []*glue.Column{{Name: aws.String(columnName), Type: aws.String("date")}}
}
4 changes: 4 additions & 0 deletions warehouse/integrations/datalake/schema-repository/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,7 @@ func (ls *LocalSchemaRepository) AlterColumn(tableName, columnName, columnType s
// update schema
return ls.uploader.UpdateLocalSchema(schema)
}

// RefreshPartitions is a no-op for the local schema repository: both arguments are
// ignored and nil is always returned.
func (*LocalSchemaRepository) RefreshPartitions(string, []warehouseutils.LoadFileT) error {
	return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,17 @@ type SchemaRepository interface {
CreateTable(tableName string, columnMap map[string]string) (err error)
AddColumns(tableName string, columnsInfo []warehouseutils.ColumnInfo) (err error)
AlterColumn(tableName, columnName, columnType string) (err error)
RefreshPartitions(tableName string, loadFiles []warehouseutils.LoadFileT) error
}

// UseGlue reports whether Glue should be used for the given warehouse. That is the
// case only when the UseGlueConfig setting reads "true" and the destination config
// has an AWS region.
func UseGlue(w *warehouseutils.Warehouse) bool {
	glueEnabled := warehouseutils.GetConfigValueBoolString(UseGlueConfig, *w) == "true"
	regionConfigured := misc.HasAWSRegionInConfig(w.Destination.Config)
	if !glueEnabled {
		return false
	}
	return regionConfigured
}

func NewSchemaRepository(wh warehouseutils.Warehouse, uploader warehouseutils.UploaderI) (SchemaRepository, error) {
if warehouseutils.GetConfigValueBoolString(UseGlueConfig, wh) == "true" && misc.HasAWSRegionInConfig(wh.Destination.Config) {
if UseGlue(&wh) {
return NewGlueSchemaRepository(wh)
}
return NewLocalSchemaRepository(wh, uploader)
Expand Down
33 changes: 32 additions & 1 deletion warehouse/internal/loadfiles/loadfiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
"context"
"fmt"
"strings"
"time"

schemarepository "github.com/rudderlabs/rudder-server/warehouse/integrations/datalake/schema-repository"

jsoniter "github.com/json-iterator/go"
"github.com/rudderlabs/rudder-server/config"
Expand Down Expand Up @@ -207,7 +210,7 @@ func (lf *LoadFileGenerator) createFromStaging(ctx context.Context, job model.Up
payload.StagingDestinationConfig = revisionConfig.Config
}
if slices.Contains(warehouseutils.TimeWindowDestinations, job.Warehouse.Type) {
payload.LoadFilePrefix = warehouseutils.GetLoadFilePrefix(stagingFile.TimeWindow, job.Warehouse)
payload.LoadFilePrefix = GetLoadFilePrefix(stagingFile.TimeWindow, job.Warehouse)
}

payloadJSON, err := json.Marshal(payload)
Expand Down Expand Up @@ -346,3 +349,31 @@ func (lf *LoadFileGenerator) destinationRevisionIDMap(ctx context.Context, job m
}
return
}

// GetLoadFilePrefix formats timeWindow into the load file prefix for the warehouse.
//
//   - GCS datalake: the "timeWindowLayout" config value is used as the layout
//     (warehouseutils.DatalakeTimeWindowFormat when empty), and a non-empty
//     "tableSuffix" config value is prepended as "<tableSuffix>/<formatted>".
//   - S3 datalake: warehouseutils.GlueTimeWindowFormat when schemarepository.UseGlue
//     reports true, otherwise warehouseutils.DatalakeTimeWindowFormat.
//   - Any other type: warehouseutils.DatalakeTimeWindowFormat.
func GetLoadFilePrefix(timeWindow time.Time, warehouse warehouseutils.Warehouse) (timeWindowFormat string) {
	if warehouse.Type == warehouseutils.GCS_DATALAKE {
		layout := warehouseutils.GetConfigValue("timeWindowLayout", warehouse)
		suffix := warehouseutils.GetConfigValue("tableSuffix", warehouse)
		if layout == "" {
			layout = warehouseutils.DatalakeTimeWindowFormat
		}

		prefix := timeWindow.Format(layout)
		if suffix == "" {
			return prefix
		}
		return fmt.Sprintf("%v/%v", suffix, prefix)
	}

	layout := warehouseutils.DatalakeTimeWindowFormat
	if warehouse.Type == warehouseutils.S3_DATALAKE && schemarepository.UseGlue(&warehouse) {
		layout = warehouseutils.GlueTimeWindowFormat
	}
	return timeWindow.Format(layout)
}
83 changes: 83 additions & 0 deletions warehouse/internal/loadfiles/loadfiles_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,86 @@ func TestCreateLoadFiles_DestinationHistory(t *testing.T) {
require.Zero(t, endID)
})
}

// TestGetLoadFilePrefix checks the prefix produced by loadfiles.GetLoadFilePrefix for
// a fixed time window (2022-08-06 14:10:30 UTC) across destination types and configs.
func TestGetLoadFilePrefix(t *testing.T) {
	testCases := []struct {
		name      string
		warehouse warehouseutils.Warehouse
		expected  string
	}{
		{
			name: "s3 datalake",
			warehouse: warehouseutils.Warehouse{
				Destination: backendconfig.DestinationT{
					Config: map[string]interface{}{
						"tableSuffix": "key=val",
					},
				},
				Type: warehouseutils.S3_DATALAKE,
			},
			expected: "2022/08/06/14",
		},
		{
			name: "s3 datalake with glue",
			warehouse: warehouseutils.Warehouse{
				Destination: backendconfig.DestinationT{
					Config: map[string]interface{}{
						"tableSuffix": "key=val",
						"region":      "test-region",
						"useGlue":     true,
					},
				},
				Type: warehouseutils.S3_DATALAKE,
			},
			expected: "dt=2022-08-06",
		},
		{
			name: "azure datalake",
			warehouse: warehouseutils.Warehouse{
				Destination: backendconfig.DestinationT{
					Config: map[string]interface{}{
						"tableSuffix": "key=val",
					},
				},
				Type: warehouseutils.AZURE_DATALAKE,
			},
			expected: "2022/08/06/14",
		},
		{
			name: "gcs datalake",
			warehouse: warehouseutils.Warehouse{
				Destination: backendconfig.DestinationT{
					Config: map[string]interface{}{
						"tableSuffix": "key=val",
					},
				},
				Type: warehouseutils.GCS_DATALAKE,
			},
			expected: "key=val/2022/08/06/14",
		},
		{
			name: "gcs datalake with suffix and layout",
			warehouse: warehouseutils.Warehouse{
				Destination: backendconfig.DestinationT{
					Config: map[string]interface{}{
						"tableSuffix":      "key=val",
						"timeWindowLayout": "year=2006/month=01/day=02/hour=15",
					},
				},
				Type: warehouseutils.GCS_DATALAKE,
			},
			expected: "key=val/year=2022/month=08/day=06/hour=14",
		},
	}
	for _, tc := range testCases {
		tc := tc

		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			timeWindow := time.Date(2022, time.Month(8), 6, 14, 10, 30, 0, time.UTC)
			got := loadfiles.GetLoadFilePrefix(timeWindow, tc.warehouse)
			// require.Equal takes (expected, actual); keeping that order makes the
			// failure output label the two values correctly.
			require.Equal(t, tc.expected, got)
		})
	}
}
45 changes: 45 additions & 0 deletions warehouse/upload.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"sync"
"time"

schemarepository "github.com/rudderlabs/rudder-server/warehouse/integrations/datalake/schema-repository"

"github.com/rudderlabs/rudder-server/warehouse/integrations/manager"

"golang.org/x/exp/slices"
Expand Down Expand Up @@ -482,6 +484,10 @@ func (job *UploadJobT) run() (err error) {
job.matchRowsInStagingAndLoadFiles()
job.recordLoadFileGenerationTimeStat(startLoadFileID, endLoadFileID)

if err = job.refreshPartitions(startLoadFileID, endLoadFileID); err != nil {
break
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we log the error? As a Warn possibly?

}

newStatus = nextUploadState.completed

case model.UpdatedTableUploadsCounts:
Expand Down Expand Up @@ -2077,3 +2083,42 @@ func (job *UploadJobT) GetLocalSchema() warehouseutils.SchemaT {
func (job *UploadJobT) UpdateLocalSchema(schema warehouseutils.SchemaT) error {
return job.schemaHandle.updateLocalSchema(schema)
}

func (job *UploadJobT) refreshPartitions(loadFileStartID, loadFileEndID int64) error {
if slices.Contains(warehouseutils.TimeWindowDestinations, job.upload.DestinationType) {
return nil
Jayachand marked this conversation as resolved.
Show resolved Hide resolved
}

var (
repository schemarepository.SchemaRepository
err error
)

if repository, err = schemarepository.NewSchemaRepository(job.warehouse, job); err != nil {
return fmt.Errorf("create schema repository: %w", err)
}

// Refresh partitions if exists
for tableName := range job.upload.UploadSchema {
loadFiles := job.GetLoadFilesMetadata(warehouseutils.GetLoadFilesOptionsT{
Table: tableName,
StartID: loadFileStartID,
EndID: loadFileEndID,
})

// This is best done every 100 files, since it's a batch request for updates in Glue
partitionBatchSize := 99
for i := 0; i < len(loadFiles); i += partitionBatchSize {
end := i + partitionBatchSize
if end > len(loadFiles) {
end = len(loadFiles)
}

if err = repository.RefreshPartitions(tableName, loadFiles[i:end]); err != nil {
return fmt.Errorf("refresh partitions: %w", err)
}
}
}

return nil
}
21 changes: 1 addition & 20 deletions warehouse/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ const (
BQLoadedAtFormat = "2006-01-02 15:04:05.999999 Z"
BQUuidTSFormat = "2006-01-02 15:04:05 Z"
DatalakeTimeWindowFormat = "2006/01/02/15"
GlueTimeWindowFormat = "dt=2006-01-02"
)

const (
Expand Down Expand Up @@ -980,26 +981,6 @@ func GetLoadFileFormat(whType string) string {
}
}

// GetLoadFilePrefix formats timeWindow into the load file prefix for the warehouse.
// For GCS_DATALAKE the "timeWindowLayout" config value is the layout
// (DatalakeTimeWindowFormat when empty) and a non-empty "tableSuffix" config value is
// prepended as "<tableSuffix>/<formatted>". Every other warehouse type uses
// DatalakeTimeWindowFormat.
func GetLoadFilePrefix(timeWindow time.Time, warehouse Warehouse) (timeWindowFormat string) {
	if warehouse.Type != GCS_DATALAKE {
		return timeWindow.Format(DatalakeTimeWindowFormat)
	}

	layout := GetConfigValue("timeWindowLayout", warehouse)
	if layout == "" {
		layout = DatalakeTimeWindowFormat
	}
	prefix := timeWindow.Format(layout)

	if suffix := GetConfigValue("tableSuffix", warehouse); suffix != "" {
		prefix = fmt.Sprintf("%v/%v", suffix, prefix)
	}
	return prefix
}

func GetRequestWithTimeout(ctx context.Context, url string, timeout time.Duration) ([]byte, error) {
req, err := http.NewRequestWithContext(ctx, "GET", url, http.NoBody)
if err != nil {
Expand Down
Loading