-
Notifications
You must be signed in to change notification settings - Fork 6
/
parquet.go
138 lines (115 loc) · 2.96 KB
/
parquet.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
package columnifier
import (
"io"
"io/ioutil"
"os"
"github.com/xitongsys/parquet-go/marshal"
"github.com/reproio/columnify/record"
"github.com/reproio/columnify/parquet"
"github.com/reproio/columnify/schema"
"github.com/xitongsys/parquet-go-source/local"
parquetSource "github.com/xitongsys/parquet-go/source"
"github.com/xitongsys/parquet-go/writer"
)
// parquetColumnifier is a parquet specific Columnifier implementation.
// It bundles the parquet writer with the values needed to build a record
// converter for each input.
type parquetColumnifier struct {
// w is the parquet writer that converted records are written to.
w *writer.ParquetWriter
// schema is the intermediate schema passed to the record converter.
schema *schema.IntermediateSchema
// rt is passed to the record converter (presumably the input record type — confirm).
rt string
}
// NewParquetColumnifier creates a new parquetColumnifier.
//
// sf is the path of the schema file; its content is parsed by
// schema.GetSchema together with st (presumably the schema type — confirm).
// rt is stored and later handed to the record converter. output is the
// destination path; when it is empty, parquet.NewStdioFile is used instead of
// a local file. config supplies the parquet page size, row group size and
// compression codec.
//
// If the parquet writer cannot be created, the local output file opened here
// is closed before returning so that its descriptor does not leak.
func NewParquetColumnifier(st string, sf string, rt string, output string, config Config) (*parquetColumnifier, error) {
	schemaContent, err := ioutil.ReadFile(sf)
	if err != nil {
		return nil, err
	}

	intermediateSchema, err := schema.GetSchema(schemaContent, st)
	if err != nil {
		return nil, err
	}

	sh, err := schema.NewSchemaHandlerFromArrow(*intermediateSchema)
	if err != nil {
		return nil, err
	}

	var fw parquetSource.ParquetFile
	if output != "" {
		fw, err = local.NewLocalFileWriter(output)
		if err != nil {
			return nil, err
		}
	} else {
		fw = parquet.NewStdioFile()
	}

	w, err := writer.NewParquetWriter(fw, nil, 1)
	if err != nil {
		if output != "" {
			// Best-effort cleanup of the file opened above; the writer error
			// is the one worth reporting, so a close failure is dropped.
			_ = fw.Close()
		}
		return nil, err
	}

	// The writer was created without an object to derive a schema from, so
	// the schema handler and footer schema are installed explicitly.
	w.SchemaHandler = sh
	w.Footer.Schema = append(w.Footer.Schema, sh.SchemaElements...)
	w.PageSize = config.Parquet.PageSize
	w.RowGroupSize = config.Parquet.RowGroupSize
	w.CompressionType = config.Parquet.CompressionCodec

	// Intermediate record type is string typed JSON values
	w.MarshalFunc = marshal.MarshalJSON

	return &parquetColumnifier{
		w:      w,
		schema: intermediateSchema,
		rt:     rt,
	}, nil
}
// WriteFromReader reads records from reader, converts each one to a JSON
// string and writes it to the parquet writer.
//
// It returns the difference in the writer's Size field between entry and the
// end of input, or -1 together with the error when conversion or writing
// fails.
//
// NOTE(review): if the writer resets Size while flushing, the returned
// difference may not reflect every byte written — confirm against parquet-go.
func (c *parquetColumnifier) WriteFromReader(reader io.Reader) (int, error) {
	converter, err := record.NewJsonStringConverter(reader, c.schema, c.rt)
	if err != nil {
		return -1, err
	}

	sizeAtStart := c.w.Size
	for {
		var row string
		switch convErr := converter.Convert(&row); {
		case convErr == io.EOF:
			// End of input: report how much the writer's size changed.
			return int(c.w.Size - sizeAtStart), nil
		case convErr != nil:
			return -1, convErr
		}

		if writeErr := c.w.Write(row); writeErr != nil {
			return -1, writeErr
		}
	}
}
// writeFromFile opens the file at path and feeds it to WriteFromReader.
// It returns the value reported by WriteFromReader, or -1 with the error when
// the file cannot be opened or the conversion fails.
func (c *parquetColumnifier) writeFromFile(path string) (int, error) {
	in, openErr := os.Open(path)
	if openErr != nil {
		return -1, openErr
	}
	defer in.Close()

	written, convErr := c.WriteFromReader(in)
	if convErr != nil {
		return -1, convErr
	}

	return written, nil
}
// WriteFromFiles converts every file listed in paths, in order, and returns
// the sum of the per-file results. It stops at the first failing file and
// returns -1 with that file's error.
func (c *parquetColumnifier) WriteFromFiles(paths []string) (int, error) {
	total := 0
	for i := range paths {
		written, err := c.writeFromFile(paths[i])
		if err != nil {
			return -1, err
		}
		total += written
	}
	return total, nil
}
// Close stops writing parquet files and finalizes this conversion.
//
// The underlying file is closed even when WriteStop fails, so the output
// handle is never leaked. The WriteStop error takes precedence; the close
// error is returned only when WriteStop succeeded.
func (c *parquetColumnifier) Close() error {
	stopErr := c.w.WriteStop()
	closeErr := c.w.PFile.Close()

	if stopErr != nil {
		return stopErr
	}
	return closeErr
}