Permalink
Browse files

first commit

  • Loading branch information...
rcongiu committed Jul 12, 2011
0 parents commit 9a58d32fa4ea2f925bb745fcd6a4db935b266977
Showing with 6,904 additions and 0 deletions.
  1. +35 −0 LICENSE
  2. +84 −0 README.txt
  3. +91 −0 pom.xml
  4. +312 −0 src/main/java/org/openx/data/jsonserde/JsonSerDe.java
  5. +279 −0 src/main/java/org/openx/data/jsonserde/json/CDL.java
  6. +169 −0 src/main/java/org/openx/data/jsonserde/json/Cookie.java
  7. +90 −0 src/main/java/org/openx/data/jsonserde/json/CookieList.java
  8. +163 −0 src/main/java/org/openx/data/jsonserde/json/HTTP.java
  9. +77 −0 src/main/java/org/openx/data/jsonserde/json/HTTPTokener.java
  10. +928 −0 src/main/java/org/openx/data/jsonserde/json/JSONArray.java
  11. +28 −0 src/main/java/org/openx/data/jsonserde/json/JSONException.java
  12. +456 −0 src/main/java/org/openx/data/jsonserde/json/JSONML.java
  13. +1,633 −0 src/main/java/org/openx/data/jsonserde/json/JSONObject.java
  14. +18 −0 src/main/java/org/openx/data/jsonserde/json/JSONString.java
  15. +78 −0 src/main/java/org/openx/data/jsonserde/json/JSONStringer.java
  16. +445 −0 src/main/java/org/openx/data/jsonserde/json/JSONTokener.java
  17. +323 −0 src/main/java/org/openx/data/jsonserde/json/JSONWriter.java
  18. +508 −0 src/main/java/org/openx/data/jsonserde/json/XML.java
  19. +365 −0 src/main/java/org/openx/data/jsonserde/json/XMLTokener.java
  20. +139 −0 src/main/java/org/openx/data/jsonserde/objectinspector/JSONObjectMapAdapter.java
  21. +62 −0 src/main/java/org/openx/data/jsonserde/objectinspector/JsonListObjectInspector.java
  22. +67 −0 src/main/java/org/openx/data/jsonserde/objectinspector/JsonMapObjectInspector.java
  23. +153 −0 src/main/java/org/openx/data/jsonserde/objectinspector/JsonObjectInspectorFactory.java
  24. +78 −0 src/main/java/org/openx/data/jsonserde/objectinspector/JsonStructObjectInspector.java
  25. +197 −0 src/test/java/org/openx/data/jsonserde/JsonSerDeTest.java
  26. +3 −0 src/test/resources/data1.txt
  27. +15 −0 src/test/scripts/complex_test.sql
  28. +1 −0 src/test/scripts/complexdata.txt
  29. +10 −0 src/test/scripts/create.sql
  30. +14 −0 src/test/scripts/create_dest.sql
  31. +12 −0 src/test/scripts/create_source.sql
  32. +3 −0 src/test/scripts/data.txt
  33. +20 −0 src/test/scripts/derby.log
  34. +3 −0 src/test/scripts/load.sql
  35. +15 −0 src/test/scripts/nested_test.sql
  36. +1 −0 src/test/scripts/nesteddata.txt
  37. +4 −0 src/test/scripts/query.sql
  38. +4 −0 src/test/scripts/query2.sql
  39. +4 −0 src/test/scripts/query3.sql
  40. +4 −0 src/test/scripts/query4.sql
  41. +4 −0 src/test/scripts/queryall.sql
  42. +5 −0 src/test/scripts/rerun_dest.sql
  43. +4 −0 src/test/scripts/text_data.txt
35 LICENSE
@@ -0,0 +1,35 @@
Software Copyright License Agreement (BSD License)
Copyright (c) 2011, OpenX Technologies, Inc.
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of OpenX Technologies, Inc. nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of OpenX Technologies, Inc.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,84 @@
JsonSerde - a read/write SerDe for JSON Data
Serialization/Deserialization module for Apache Hadoop Hive
This module allows hive to read and write in JSON format (see http://json.org for more info).
Features:
* Read data stored in JSON format
* Convert data to JSON format when you INSERT INTO a table
* arrays and maps are supported
* nested data structures are also supported.
COMPILE
Use maven to compile the serde.
EXAMPLES
Example scripts with simple sample data are in src/test/scripts. Here are some excerpts:
* Query with complex fields like arrays
CREATE TABLE json_test1 (
one boolean,
three array<string>,
two double,
four string )
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH 'data.txt' OVERWRITE INTO TABLE json_test1 ;
hive> select three[1] from json_test1;
gold
yellow
* Nested structures
You can also define nested structures:
add jar ../../../target/json-serde-1.0-SNAPSHOT-jar-with-dependencies.jar;
CREATE TABLE json_nested_test (
country string,
languages array<string>,
religions map<string,array<int>>)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;
-- data : {"country":"Switzerland","languages":["German","French","Italian"],"religions":{"catholic":[10,20],"protestant":[40,50]}}
LOAD DATA LOCAL INPATH 'nesteddata.txt' OVERWRITE INTO TABLE json_nested_test ;
select * from json_nested_test; -- result: Switzerland ["German","French","Italian"] {"catholic":[10,20],"protestant":[40,50]}
select languages[0] from json_nested_test; -- result: German
select religions['catholic'][0] from json_nested_test; -- result: 10
* ARCHITECTURE
For the JSON encoding/decoding, I am using a modified version of Douglas Crockford's JSON library:
https://github.com/douglascrockford/JSON-java
which is included in the distribution. I had to make some minor changes to it; for this reason
I included it in my distribution and moved it to another package (since the original is also included in Hive!)
The SerDe builds a series of wrappers around JSONObject. Since serialization and deserialization
are executed for every record (possibly billions of them), we want to minimize object creation, so
instead of serializing/deserializing to an ArrayList, I kept the JSONObject and built a cached
objectinspector around it. So when deserializing, hive gets a JSONObject, and a JSONStructObjectInspector
to read from it. Hive has Structs, Maps, Arrays and primitives while JSON has Objects, Arrays and primitives.
Hive Maps and Structs are both implemented as JSON objects, which are less restrictive than Hive maps:
a JSON Object could be a mix of keys and values of different types, while hive expects you to declare the
type of map (example: map<string,string>). The user is responsible for having the JSON data structure
match the Hive table declaration.
* THANKS
Thanks to Douglas Crockford for the liberal license for his JSON library, and thanks to
my employer OpenX and my boss Michael Lum for letting me open source the code.
91 pom.xml
@@ -0,0 +1,91 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the JSON SerDe: artifact org.openx.data:json-serde, packaged as a jar. -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>org.openx.data</groupId>
<artifactId>json-serde</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>openx-json-serde</name>
<!-- NOTE(review): this URL looks like the Maven archetype default rather than a project home page; confirm. -->
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8 regardless of the platform default encoding. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<!-- Deployment target used by "mvn deploy": a repository reached through the scpexe protocol. -->
<!-- NOTE(review): scpexe:// presumably needs an external-ssh wagon extension, which is not declared in this file; confirm it is configured elsewhere. -->
<distributionManagement>
<repository>
<id>openx-maven-repo-scp</id>
<name>OpenX testing repo</name>
<url>scpexe://codex/home/mavenrepo/maven2/</url>
</repository>
</distributionManagement>
<build>
<plugins>
<!-- Compiler Plugin: compile with Java 1.6 as both source and target level. -->
<!-- NOTE(review): no plugin version is pinned here, so the version Maven resolves may vary between builds. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
<!-- Assembly Plugin: bound to the package phase, runs the "single" goal with the predefined jar-with-dependencies descriptor to build an additional bundled jar next to the regular one. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<!-- The manifest element is empty: no custom manifest entries are configured. -->
<manifest>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<dependencies>
<!-- JUnit is only on the test classpath. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.2</version>
<scope>test</scope>
</dependency>
<!-- Hive and Hadoop libraries use the "provided" scope: available at compile time, presumably supplied by the Hive/Hadoop installation at run time. -->
<!-- NOTE(review): the hive 0.6.0_trunk artifacts look like a locally built version; confirm which repository serves them. -->
<dependency>
<groupId>hive</groupId>
<artifactId>hive-serde</artifactId>
<version>0.6.0_trunk</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>hive</groupId>
<artifactId>hive-exec</artifactId>
<version>0.6.0_trunk</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>0.20.2</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>
Oops, something went wrong.

0 comments on commit 9a58d32

Please sign in to comment.