Permalink
Browse files

Cleanup of readme/ch3 avro pig, made dotcloud pig example

  • Loading branch information...
1 parent 58ae01f commit c0de73b5b7278ea06454402eb166d275f6a210c0 @rjurney committed Feb 7, 2013
Showing with 45 additions and 4 deletions.
  1. +4 −0 README.md
  2. +0 −3 ch03/pig/avro_to_mongo.pig
  3. +1 −1 ch04/.dotcloud/config
  4. +40 −0 ch04/test_dotcloud_mongo.pig
View
4 README.md
@@ -52,6 +52,10 @@ cd gmail
./gmail.py -m automatic -u me@gmail.com -p 'my_password_' -s ./email.avro.schema -f '[Gmail]/All Mail' -o /tmp/test_mbox 2>&1 &
```
+Chapter 4: To the Cloud!
+
+[Chapter 4 tutorial](https://github.com/rjurney/Agile_Data_Code/tree/master/ch04)
+
Chapter 7: Collecting and Displaying Atomic Records
===================================================
View
3 ch03/pig/avro_to_mongo.pig
@@ -9,9 +9,6 @@ REGISTER /me/Software/mongo-hadoop/mongo-2.10.1.jar
REGISTER /me/Software/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
REGISTER /me/Software/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
-set mapred.map.tasks.speculative.execution false
-set mapred.reduce.tasks.speculative.execution false
-
/* Set speculative execution off so we don't have the chance of duplicate records in Mongo */
set mapred.map.tasks.speculative.execution false
set mapred.reduce.tasks.speculative.execution false
View
2 ch04/.dotcloud/config
@@ -1,6 +1,6 @@
{
"push_branch": null,
- "application": "agiledatabook",
+ "application": "testola",
"version": "0.9.4",
"push_protocol": "rsync"
}
View
40 ch04/test_dotcloud_mongo.pig
@@ -0,0 +1,40 @@
+REGISTER /me/Software/pig/build/ivy/lib/Pig/avro-1.5.3.jar
+REGISTER /me/Software/pig/build/ivy/lib/Pig/json-simple-1.1.jar
+REGISTER /me/Software/pig/contrib/piggybank/java/piggybank.jar
+
+DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
+
+/* MongoDB libraries and configuration (edit the jar paths to match your local mongo-hadoop build) */
+REGISTER /me/Software/mongo-hadoop/mongo-2.10.1.jar
+REGISTER /me/Software/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
+REGISTER /me/Software/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
+
+/* Turn speculative execution off for both map and reduce tasks, so we don't have the chance of duplicate records in Mongo */
+set mapred.map.tasks.speculative.execution false
+set mapred.reduce.tasks.speculative.execution false
+define MongoStorage com.mongodb.hadoop.pig.MongoStorage(); /* Short alias for the fully qualified MongoStorage class */
+
+rmf /tmp/sent_counts.txt
+
+/* Load the emails in Avro format (edit the path to match where you saved them) using the AvroStorage UDF from Piggybank */
+messages = LOAD '/me/Data/test_mbox' USING AvroStorage();
+
+/* Drop messages whose from or tos field is null; they cannot form a from/to pair */
+messages = FILTER messages BY (from IS NOT NULL) AND (tos IS NOT NULL);
+
+/* An email can have more than one recipient. FLATTEN() pairs the sender's address with each address in tos, one row per recipient. */
+addresses = FOREACH messages GENERATE from.address AS from, FLATTEN(tos.(address)) AS to;
+
+/* Lowercase the email addresses, so we don't count MiXed case of the same address as multiple addresses */
+lowers = FOREACH addresses GENERATE LOWER(from) AS from, LOWER(to) AS to;
+
+/* GROUP BY each from/to pair into a bag (named lowers, after the grouped relation), then count the bag's contents to get a total.
+ Same as SQL: SELECT from, to, COUNT(*) FROM lowers GROUP BY (from, to);
+ Note: COUNT_STAR differs from COUNT in that it counts nulls. */
+by_from_to = GROUP lowers BY (from, to);
+sent_counts = FOREACH by_from_to GENERATE FLATTEN(group) AS (from, to), COUNT_STAR(lowers) AS total;
+
+/* Sort the data, highest sent count first, then store it both as a file under /tmp and in MongoDB */
+sent_counts = ORDER sent_counts BY total DESC;
+STORE sent_counts INTO '/tmp/sent_counts.txt';
+STORE sent_counts INTO 'mongodb://jack:OpenSesame@testola-rjurney-data-0.azva.dotcloud.net:40961/agile_data.sent_counts' using MongoStorage(); /* NOTE(review): username and password are embedded in this URI; consider supplying it via Pig parameter substitution instead of committing it */

0 comments on commit c0de73b

Please sign in to comment.