-
Notifications
You must be signed in to change notification settings - Fork 317
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(warehouse): glue partitions #2899
Changes from 5 commits
9ba80ee
7f2eb43
7b89363
2d8b6c4
727ce75
2d83d26
35fb007
57f5b15
faeb322
3b6c171
b6ee12a
ea841a9
ea23e35
b4b3762
a4f3388
63fcc30
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package schemarepository | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"os" | ||
"testing" | ||
|
||
"github.com/aws/aws-sdk-go/service/glue" | ||
backendconfig "github.com/rudderlabs/rudder-server/config/backend-config" | ||
"github.com/rudderlabs/rudder-server/services/filemanager" | ||
|
||
"github.com/rudderlabs/rudder-server/utils/misc" | ||
warehouseutils "github.com/rudderlabs/rudder-server/warehouse/utils" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestGlueSchemaRepositoryRoundTrip(t *testing.T) { | ||
type S3Credentials struct { | ||
AccessKeyID string | ||
AccessKey string | ||
Region string | ||
Bucket string | ||
} | ||
|
||
var ( | ||
credentialsStr string | ||
credentials S3Credentials | ||
err error | ||
credentialsEnv = "S3_DATALAKE_TEST_CREDENTIALS" | ||
testNamespace = fmt.Sprintf("test_namespace_%s", warehouseutils.RandHex()) | ||
testTable = fmt.Sprintf("test_table_%s", warehouseutils.RandHex()) | ||
testFile = "testdata/load.parquet" | ||
testColumns = map[string]string{ | ||
"id": "string", | ||
"received_at": "datetime", | ||
"test_array_bool": "array(boolean)", | ||
"test_array_datetime": "array(datetime)", | ||
"test_array_float": "array(float)", | ||
"test_array_int": "array(int)", | ||
"test_array_string": "array(string)", | ||
"test_bool": "boolean", | ||
"test_datetime": "datetime", | ||
"test_float": "float", | ||
"test_int": "int", | ||
"test_string": "string", | ||
} | ||
) | ||
|
||
if credentialsStr = os.Getenv(credentialsEnv); credentialsStr == "" { | ||
t.Skipf("Skipping %s as %s is not set", t.Name(), credentialsEnv) | ||
} | ||
|
||
err = json.Unmarshal([]byte(credentialsStr), &credentials) | ||
require.NoError(t, err) | ||
|
||
destination := backendconfig.DestinationT{ | ||
Config: map[string]interface{}{ | ||
"region": credentials.Region, | ||
"bucketName": credentials.Bucket, | ||
"accessKeyID": credentials.AccessKeyID, | ||
"accessKey": credentials.AccessKey, | ||
}, | ||
} | ||
warehouse := warehouseutils.Warehouse{ | ||
Destination: destination, | ||
Namespace: testNamespace, | ||
} | ||
|
||
misc.Init() | ||
warehouseutils.Init() | ||
|
||
g, err := NewGlueSchemaRepository(warehouse) | ||
require.NoError(t, err) | ||
|
||
t.Logf("Creating schema %s", testNamespace) | ||
err = g.CreateSchema() | ||
require.NoError(t, err) | ||
|
||
t.Log("Creating already existing schema should not fail") | ||
err = g.CreateSchema() | ||
require.NoError(t, err) | ||
|
||
t.Cleanup(func() { | ||
t.Log("Cleaning up") | ||
_, err = g.GlueClient.DeleteDatabase(&glue.DeleteDatabaseInput{ | ||
Name: &testNamespace, | ||
}) | ||
require.NoError(t, err) | ||
}) | ||
|
||
t.Logf("Creating table %s", testTable) | ||
err = g.CreateTable(testTable, testColumns) | ||
require.NoError(t, err) | ||
|
||
t.Log("Creating already existing table should not fail") | ||
err = g.CreateTable(testTable, testColumns) | ||
require.NoError(t, err) | ||
|
||
t.Log("Adding columns to table") | ||
err = g.AddColumns(testTable, []warehouseutils.ColumnInfo{ | ||
{Name: "alter_test_bool", Type: "boolean"}, | ||
{Name: "alter_test_string", Type: "string"}, | ||
{Name: "alter_test_int", Type: "int"}, | ||
{Name: "alter_test_float", Type: "float"}, | ||
{Name: "alter_test_datetime", Type: "datetime"}, | ||
}) | ||
|
||
t.Log("Preparing load files metadata") | ||
f, err := os.Open(testFile) | ||
require.NoError(t, err) | ||
|
||
t.Cleanup(func() { | ||
_ = f.Close() | ||
}) | ||
|
||
fmFactory := filemanager.FileManagerFactoryT{} | ||
fm, err := fmFactory.New(&filemanager.SettingsT{ | ||
Provider: warehouseutils.S3, | ||
Config: map[string]any{ | ||
"bucketName": credentials.Bucket, | ||
"accessKeyID": credentials.AccessKeyID, | ||
"secretAccessKey": credentials.AccessKey, | ||
"region": credentials.Region, | ||
}, | ||
}) | ||
require.NoError(t, err) | ||
|
||
uploadOutput, err := fm.Upload(context.TODO(), f, fmt.Sprintf("rudder-test-payload/s3-datalake/%s/dt=2006-01-02/", warehouseutils.RandHex())) | ||
require.NoError(t, err) | ||
|
||
err = g.RefreshPartitions(testTable, []warehouseutils.LoadFileT{ | ||
{ | ||
Location: uploadOutput.Location, | ||
}, | ||
}) | ||
require.NoError(t, err) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,10 +34,7 @@ func (*LocalSchemaRepository) CreateSchema() (err error) { | |
|
||
func (ls *LocalSchemaRepository) CreateTable(tableName string, columnMap map[string]string) (err error) { | ||
// fetch schema from local db | ||
schema, _, err := ls.FetchSchema(ls.warehouse) | ||
if err != nil { | ||
return err | ||
} | ||
schema, _, _ := ls.FetchSchema(ls.warehouse) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since FetchSchema always returns nil, ignoring it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I see the schema needs to satisfy an interface. One possible way to work around this: func (ls *LocalSchemaRepository) localFetchSchema() warehouseutils.SchemaT {
return ls.uploader.GetLocalSchema()
}
func (ls *LocalSchemaRepository) FetchSchema(_ warehouseutils.Warehouse) (warehouseutils.SchemaT, warehouseutils.SchemaT, error) {
s := ls.localFetchSchema()
return s, s, nil
}
func (ls *LocalSchemaRepository) CreateTable(tableName string, columnMap map[string]string) (err error) {
schema:= ls.localFetchSchema(ls.warehouse) |
||
|
||
if _, ok := schema[tableName]; ok { | ||
return fmt.Errorf("failed to create table: table %s already exists", tableName) | ||
|
@@ -52,10 +49,7 @@ func (ls *LocalSchemaRepository) CreateTable(tableName string, columnMap map[str | |
|
||
func (ls *LocalSchemaRepository) AddColumns(tableName string, columnsInfo []warehouseutils.ColumnInfo) (err error) { | ||
// fetch schema from local db | ||
schema, _, err := ls.FetchSchema(ls.warehouse) | ||
if err != nil { | ||
return err | ||
} | ||
schema, _, _ := ls.FetchSchema(ls.warehouse) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since FetchSchema always returns nil, ignoring it. |
||
|
||
// check if table exists | ||
if _, ok := schema[tableName]; !ok { | ||
|
@@ -72,10 +66,7 @@ func (ls *LocalSchemaRepository) AddColumns(tableName string, columnsInfo []ware | |
|
||
func (ls *LocalSchemaRepository) AlterColumn(tableName, columnName, columnType string) (err error) { | ||
// fetch schema from local db | ||
schema, _, err := ls.FetchSchema(ls.warehouse) | ||
if err != nil { | ||
return err | ||
} | ||
schema, _, _ := ls.FetchSchema(ls.warehouse) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since FetchSchema always returns nil, ignoring it. |
||
|
||
// check if table exists | ||
if _, ok := schema[tableName]; !ok { | ||
|
@@ -92,3 +83,7 @@ func (ls *LocalSchemaRepository) AlterColumn(tableName, columnName, columnType s | |
// update schema | ||
return ls.uploader.UpdateLocalSchema(schema) | ||
} | ||
|
||
func (*LocalSchemaRepository) RefreshPartitions(_ string, _ []warehouseutils.LoadFileT) error { | ||
return nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How are we checking the side effects of refresh partitions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, we have added only the round-trip flow for Glue. Since this requires an external service call to Glue. Should we mock the API call and tests this?