-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Ngram, uuid and scoring base classes.
- Loading branch information
1 parent
5da6792
commit 9f75aa1
Showing
21 changed files
with
1,214 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
/** | ||
* Copyright 2011 Nube Technologies | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and limitations under the License. | ||
*/ | ||
package co.nubetech.hiho.similarity.ngram; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.conf.Configured; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.mapreduce.Job; | ||
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; | ||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; | ||
import org.apache.hadoop.util.Tool; | ||
import org.apache.hadoop.util.ToolRunner; | ||
import org.apache.log4j.Logger; | ||
|
||
import co.nubetech.hiho.common.HIHOException; | ||
|
||
public class NGramJob extends Configured implements Tool { | ||
|
||
final static Logger logger = Logger | ||
.getLogger(co.nubetech.hiho.similarity.ngram.NGramJob.class); | ||
|
||
private String inputPath = null; | ||
|
||
public void populateConfiguration(String[] args) { | ||
for (int i = 0; i < args.length - 1; i++) { | ||
if ("-inputPath".equals(args[i])) { | ||
inputPath = args[++i]; | ||
} | ||
} | ||
} | ||
|
||
public void checkMandatoryConfs() throws HIHOException { | ||
if (inputPath == null) { | ||
throw new HIHOException( | ||
"The provided input path is empty, please specify inputPath"); | ||
} | ||
} | ||
|
||
@Override | ||
public int run(String[] args) throws Exception { | ||
Configuration conf = getConf(); | ||
populateConfiguration(args); | ||
try { | ||
checkMandatoryConfs(); | ||
} catch (HIHOException e1) { | ||
e1.printStackTrace(); | ||
throw new Exception(e1); | ||
} | ||
Job job = new Job(conf); | ||
job.setJobName("NGram job"); | ||
job.setJarByClass(NGramJob.class); | ||
|
||
Class inputFormatClass = Class | ||
.forName("org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat"); | ||
Class outputFormatClass = Class | ||
.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"); | ||
// org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat | ||
// org.apache.hadoop.mapreduce.lib.output.TextOutputFormat | ||
Class inputKeyClass = Class.forName("org.apache.hadoop.io.Text"); | ||
Class inputValueClass = Class.forName("org.apache.hadoop.io.Text"); | ||
Class outputKeyClass = Class | ||
.forName("co.nubetech.hiho.similarity.ngram.ValuePair"); | ||
Class outputValueClass = Class | ||
.forName("org.apache.hadoop.io.IntWritable"); | ||
|
||
job.setMapperClass(NGramMapper.class); | ||
job.setReducerClass(NGramReducer.class); | ||
|
||
job.setInputFormatClass(inputFormatClass); | ||
job.setMapOutputKeyClass(inputKeyClass); | ||
job.setMapOutputValueClass(inputValueClass); | ||
|
||
job.setOutputKeyClass(outputKeyClass); | ||
job.setOutputValueClass(outputValueClass); | ||
job.setOutputFormatClass(outputFormatClass); | ||
|
||
FileInputFormat.setInputPaths(job, inputPath); | ||
FileOutputFormat.setOutputPath(job, new Path("outputOfNGramJob")); | ||
|
||
int ret = 0; | ||
try { | ||
ret = job.waitForCompletion(true) ? 0 : 1; | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
} | ||
return ret; | ||
} | ||
|
||
public static void main(String[] args) throws Exception { | ||
NGramJob job = new NGramJob(); | ||
int res = ToolRunner.run(new Configuration(), job, args); | ||
System.exit(res); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/** | ||
* Copyright 2011 Nube Technologies | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and limitations under the License. | ||
*/ | ||
package co.nubetech.hiho.similarity.ngram; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
|
||
import org.apache.hadoop.io.Text; | ||
import org.apache.hadoop.mapreduce.Mapper; | ||
import org.apache.log4j.Logger; | ||
|
||
public class NGramMapper extends Mapper<Text, Text, Text, Text> { | ||
|
||
final static Logger logger = Logger | ||
.getLogger(co.nubetech.hiho.similarity.ngram.NGramMapper.class); | ||
|
||
@Override | ||
public void map(Text key, Text val, Context context) throws IOException, | ||
InterruptedException { | ||
if (key == null) { | ||
throw new IOException("Key is null"); | ||
} | ||
HashSet<String> nGramList = new HashSet<String>(); | ||
int gramSize = 2; | ||
nGramList = getNGrams(key, gramSize); | ||
for (String nGrams : nGramList) { | ||
String value = key.toString() + "delimiterBetweenKeyAndValue" + val.toString(); | ||
context.write(new Text(nGrams), new Text(value)); | ||
logger.info("Key and Value in NGram Mapper is: " + new Text(nGrams) | ||
+ ", " + new Text(value)); | ||
} | ||
} | ||
|
||
public HashSet<String> getNGrams(Text line, int gramSize) { | ||
ArrayList<String> words = new ArrayList<String>(); | ||
HashSet<String> nGrams = new HashSet<String>(); | ||
String[] tokens = line.toString().split(" "); | ||
for (String t : tokens) { | ||
words.add(t); | ||
} | ||
for (int i = 0; i < words.size() - gramSize + 1; i++) { | ||
String key = ""; | ||
for (int j = i; j < i + gramSize; j++) { | ||
key += words.get(j); | ||
if(j != ( i + gramSize - 1)){ | ||
key += " "; | ||
} | ||
} | ||
nGrams.add(key); | ||
} | ||
return nGrams; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/** | ||
* Copyright 2011 Nube Technologies | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and limitations under the License. | ||
*/ | ||
package co.nubetech.hiho.similarity.ngram; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Iterator; | ||
|
||
import org.apache.hadoop.io.IntWritable; | ||
import org.apache.hadoop.io.Text; | ||
import org.apache.hadoop.mapreduce.Reducer; | ||
import org.apache.log4j.Logger; | ||
|
||
public class NGramReducer extends Reducer<Text, Text, ValuePair, IntWritable> { | ||
final static Logger logger = Logger | ||
.getLogger(co.nubetech.hiho.similarity.ngram.NGramReducer.class); | ||
|
||
@Override | ||
public void reduce(Text key, Iterable<Text> values, Context context) | ||
throws IOException, InterruptedException { | ||
|
||
if (key == null) { | ||
throw new IOException("Key is null"); | ||
} | ||
|
||
logger.info("Key in NGram Reducer is: " + key); | ||
|
||
ArrayList<Text> value = new ArrayList<Text>(); | ||
|
||
Iterator<Text> iterator = values.iterator(); | ||
while (iterator.hasNext()) { | ||
Text valueInIterator = iterator.next(); | ||
logger.info("Value in NGram Reducer is: " + valueInIterator); | ||
value.add(new Text(valueInIterator)); | ||
} | ||
|
||
for (Text valueInList : value) { | ||
logger.info("Value added in list is: " + valueInList); | ||
} | ||
|
||
for (int i = 0; i < value.size() - 1; i++) { | ||
for (int j = i + 1; j < value.size(); j++) { | ||
ValuePair valuePair = new ValuePair(); | ||
valuePair.setValue1(value.get(i)); | ||
valuePair.setValue2(value.get(j)); | ||
logger.info("Value set in ValuePair is: " + value.get(i) + ", " | ||
+ value.get(j)); | ||
context.write(valuePair, new IntWritable(1)); | ||
} | ||
|
||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
/** | ||
* Copyright 2011 Nube Technologies | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and limitations under the License. | ||
*/ | ||
package co.nubetech.hiho.similarity.ngram; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.conf.Configured; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.mapreduce.Job; | ||
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; | ||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; | ||
import org.apache.hadoop.util.Tool; | ||
import org.apache.hadoop.util.ToolRunner; | ||
import org.apache.log4j.Logger; | ||
|
||
public class ScoreJob extends Configured implements Tool { | ||
|
||
final static Logger logger = Logger | ||
.getLogger(co.nubetech.hiho.similarity.ngram.ScoreJob.class); | ||
|
||
@Override | ||
public int run(String[] arg0) throws Exception { | ||
Configuration conf = getConf(); | ||
Job job = new Job(conf); | ||
job.setJobName("Score job"); | ||
job.setJarByClass(ScoreJob.class); | ||
|
||
Class inputFormatClass = Class | ||
.forName("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat"); | ||
Class outputFormatClass = Class | ||
.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"); | ||
// org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat | ||
// org.apache.hadoop.mapreduce.lib.output.TextOutputFormat | ||
Class inputKeyClass = Class | ||
.forName("co.nubetech.hiho.similarity.ngram.ValuePair"); | ||
Class inputValueClass = Class | ||
.forName("org.apache.hadoop.io.IntWritable"); | ||
Class outputKeyClass = Class | ||
.forName("co.nubetech.hiho.similarity.ngram.ValuePair"); | ||
Class outputValueClass = Class | ||
.forName("org.apache.hadoop.io.LongWritable"); | ||
|
||
job.setMapperClass(ScoreMapper.class); | ||
job.setReducerClass(ScoreReducer.class); | ||
|
||
job.setInputFormatClass(inputFormatClass); | ||
job.setMapOutputKeyClass(inputKeyClass); | ||
job.setMapOutputValueClass(inputValueClass); | ||
|
||
job.setOutputKeyClass(outputKeyClass); | ||
job.setOutputValueClass(outputValueClass); | ||
job.setOutputFormatClass(outputFormatClass); | ||
|
||
FileInputFormat.setInputPaths(job, "outputOfNGramJob"); | ||
FileOutputFormat.setOutputPath(job, new Path("outputOfScoreJob")); | ||
|
||
int ret = 0; | ||
try { | ||
ret = job.waitForCompletion(true) ? 0 : 1; | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
} | ||
return ret; | ||
} | ||
|
||
public static void main(String[] args) throws Exception { | ||
ScoreJob job = new ScoreJob(); | ||
int res = ToolRunner.run(new Configuration(), job, args); | ||
System.exit(res); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/** | ||
* Copyright 2011 Nube Technologies | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and limitations under the License. | ||
*/ | ||
package co.nubetech.hiho.similarity.ngram; | ||
|
||
import java.io.IOException; | ||
|
||
import org.apache.hadoop.io.IntWritable; | ||
import org.apache.hadoop.mapreduce.Mapper; | ||
import org.apache.log4j.Logger; | ||
public class ScoreMapper extends | ||
Mapper<ValuePair, IntWritable, ValuePair, IntWritable> { | ||
|
||
final static Logger logger = Logger | ||
.getLogger(co.nubetech.hiho.similarity.ngram.ScoreMapper.class); | ||
|
||
@Override | ||
public void map(ValuePair key, IntWritable val, Context context) | ||
throws IOException, InterruptedException { | ||
if (key == null) { | ||
throw new IOException("Key is null"); | ||
} | ||
context.write(key, val); | ||
} | ||
} |
Oops, something went wrong.