Browse files

added article example

  • Loading branch information...
1 parent: 36149ce; commit: 162b61c29628467e214e6bbba65b9e1ec4bc4cc8; @ptarjan committed on Mar 10, 2010
Showing 1 changed file with 7 additions and 3 deletions.
  1. +7 −3 script.sh
View
10 script.sh
@@ -43,12 +43,16 @@ cat hamlet.txt | ./mapper.py | sort | ./reducer_numsort.py | sort -nrk 2 > out/h
diff out/hamlet_numsort.txt out/hamlet_numsort_local.txt
-# EXTRA (wikipedia)
-wget http://download.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz
-gunzip -c enwiki-latest-all-titles-in-ns0.gz | hadoop fs -put - count_example/wiki_titles
+# EXTRA (wikipedia titles)
+wget http://download.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz -O - | gunzip -c | hadoop fs -put - count_example/wiki_titles
hadoop jar $HADOOP_HOME/hadoop-streaming.jar $PARAMS -mapper mapper.py -reducer reducer_numsort.py -input count_example/wiki_titles -output count_example/wiki_titles_out -file mapper.py -file reducer_numsort.py
hadoop fs -cat count_example/wiki_titles_out/* | head
+# EXTRA x2 (wikipedia articles)
+wget http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -O - | bunzip2 -c | hadoop fs -put - count_example/wiki_articles
+hadoop jar $HADOOP_HOME/hadoop-streaming.jar $PARAMS -mapper mapper.py -reducer reducer_numsort.py -input count_example/wiki_articles -output count_example/wiki_articles_out -file mapper.py -file reducer_numsort.py
+hadoop fs -cat count_example/wiki_articles_out/* | head
+
# cleanup
hadoop fs -rmr count_example
cd ..

0 comments on commit 162b61c

Please sign in to comment.