Skip to content

Commit

Permalink
Ngram, uuid and scoring base classes.
Browse files Browse the repository at this point in the history
  • Loading branch information
sonalgoyal committed Apr 20, 2011
1 parent 5da6792 commit 9f75aa1
Show file tree
Hide file tree
Showing 21 changed files with 1,214 additions and 0 deletions.
Binary file added lib/hsqldb-1.8.0.10.jar
Binary file not shown.
Binary file added lib/pig-0.8.0.jar
Binary file not shown.
107 changes: 107 additions & 0 deletions src/co/nubetech/hiho/similarity/ngram/NGramJob.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.similarity.ngram;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import co.nubetech.hiho.common.HIHOException;

/**
 * Hadoop {@link Tool} that configures and runs the n-gram MapReduce job.
 *
 * <p>The job reads key/value text input from the path given with
 * {@code -inputPath}, runs {@link NGramMapper} and {@link NGramReducer}, and
 * writes a sequence file to the fixed relative path {@code outputOfNGramJob}.
 */
public class NGramJob extends Configured implements Tool {

    final static Logger logger = Logger
            .getLogger(co.nubetech.hiho.similarity.ngram.NGramJob.class);

    private String inputPath = null;

    /**
     * Reads the supported command line options into this instance.
     *
     * <p>Only {@code -inputPath <path>} is recognised. A trailing
     * {@code -inputPath} with no value after it is ignored.
     *
     * @param args command line arguments as handed to {@link #run(String[])}
     */
    public void populateConfiguration(String[] args) {
        // Stop one short of the end so that args[++i] is always in range.
        for (int i = 0; i < args.length - 1; i++) {
            if ("-inputPath".equals(args[i])) {
                inputPath = args[++i];
            }
        }
    }

    /**
     * Verifies that every mandatory option has been supplied.
     *
     * @throws HIHOException if no input path was given or it is blank
     */
    public void checkMandatoryConfs() throws HIHOException {
        if (inputPath == null || inputPath.trim().length() == 0) {
            throw new HIHOException(
                    "The provided input path is empty, please specify inputPath");
        }
    }

    /**
     * Configures and submits the n-gram job, blocking until it finishes.
     *
     * @param args command line arguments; must contain {@code -inputPath <path>}
     * @return 0 when the job succeeds, 1 when it fails or cannot be run
     * @throws Exception if the mandatory configuration is missing or the job
     *         cannot be set up
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        populateConfiguration(args);
        try {
            checkMandatoryConfs();
        } catch (HIHOException e1) {
            logger.error("Mandatory configuration is missing", e1);
            throw new Exception(e1);
        }
        Job job = new Job(conf);
        job.setJobName("NGram job");
        job.setJarByClass(NGramJob.class);

        Class inputFormatClass = Class
                .forName("org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
        // Alternative for human-readable output:
        // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
        Class outputFormatClass = Class
                .forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
        Class inputKeyClass = Class.forName("org.apache.hadoop.io.Text");
        Class inputValueClass = Class.forName("org.apache.hadoop.io.Text");
        Class outputKeyClass = Class
                .forName("co.nubetech.hiho.similarity.ngram.ValuePair");
        Class outputValueClass = Class
                .forName("org.apache.hadoop.io.IntWritable");

        job.setMapperClass(NGramMapper.class);
        job.setReducerClass(NGramReducer.class);

        job.setInputFormatClass(inputFormatClass);
        job.setMapOutputKeyClass(inputKeyClass);
        job.setMapOutputValueClass(inputValueClass);

        job.setOutputKeyClass(outputKeyClass);
        job.setOutputValueClass(outputValueClass);
        job.setOutputFormatClass(outputFormatClass);

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, new Path("outputOfNGramJob"));

        try {
            return job.waitForCompletion(true) ? 0 : 1;
        } catch (Exception e) {
            // A job that could not complete must not report success (exit code 0).
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            logger.error("NGram job failed", e);
            return 1;
        }
    }

    /**
     * Command line entry point; exits the JVM with the job's status code.
     *
     * @param args command line arguments, forwarded through {@link ToolRunner}
     * @throws Exception if the job cannot be configured
     */
    public static void main(String[] args) throws Exception {
        NGramJob job = new NGramJob();
        int res = ToolRunner.run(new Configuration(), job, args);
        System.exit(res);
    }

}
67 changes: 67 additions & 0 deletions src/co/nubetech/hiho/similarity/ngram/NGramMapper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.similarity.ngram;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

/**
 * Mapper that emits one record per distinct word bigram found in the input key.
 *
 * <p>For every bigram of the key, the output key is the bigram and the output
 * value is the original key and value joined by the literal marker
 * {@code delimiterBetweenKeyAndValue}.
 */
public class NGramMapper extends Mapper<Text, Text, Text, Text> {

    final static Logger logger = Logger
            .getLogger(co.nubetech.hiho.similarity.ngram.NGramMapper.class);

    /**
     * Emits {@code (bigram, key + marker + value)} for each distinct bigram of
     * {@code key}.
     *
     * @param key     input record key whose words are turned into bigrams
     * @param val     input record value, carried through in the output value
     * @param context Hadoop context used to emit the output records
     * @throws IOException          if {@code key} is null or a write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    public void map(Text key, Text val, Context context) throws IOException,
            InterruptedException {
        if (key == null) {
            throw new IOException("Key is null");
        }
        int gramSize = 2;
        HashSet<String> nGramList = getNGrams(key, gramSize);
        if (nGramList.isEmpty()) {
            // Nothing to emit; leave before touching val, as the emit loop would.
            return;
        }
        // The output value is the same for every n-gram of this record, so it is
        // built once instead of once per iteration.
        Text outValue = new Text(key.toString() + "delimiterBetweenKeyAndValue"
                + val.toString());
        for (String nGram : nGramList) {
            Text outKey = new Text(nGram);
            context.write(outKey, outValue);
            logger.info("Key and Value in NGram Mapper is: " + outKey
                    + ", " + outValue);
        }
    }

    /**
     * Builds the set of distinct word n-grams contained in a line.
     *
     * <p>The line is split on single space characters and each run of
     * {@code gramSize} consecutive tokens is joined back with one space. Lines
     * with fewer than {@code gramSize} tokens produce an empty set.
     *
     * @param line     text to tokenize
     * @param gramSize number of consecutive tokens per n-gram
     * @return the distinct n-grams of {@code line}; never null
     */
    public HashSet<String> getNGrams(Text line, int gramSize) {
        String[] words = line.toString().split(" ");
        HashSet<String> nGrams = new HashSet<String>();
        for (int start = 0; start < words.length - gramSize + 1; start++) {
            StringBuilder gram = new StringBuilder();
            for (int j = start; j < start + gramSize; j++) {
                if (j > start) {
                    gram.append(' ');
                }
                gram.append(words[j]);
            }
            nGrams.add(gram.toString());
        }
        return nGrams;
    }

}
65 changes: 65 additions & 0 deletions src/co/nubetech/hiho/similarity/ngram/NGramReducer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.similarity.ngram;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;

/**
 * Reducer that pairs up all values sharing the same n-gram key.
 *
 * <p>Every unordered pair of values received for a key is written out as a
 * {@link ValuePair} with a count of 1.
 */
public class NGramReducer extends Reducer<Text, Text, ValuePair, IntWritable> {
    final static Logger logger = Logger
            .getLogger(co.nubetech.hiho.similarity.ngram.NGramReducer.class);

    /**
     * Emits {@code (ValuePair(a, b), 1)} for each pair of values at positions
     * {@code i < j} in the order the values were received.
     *
     * @param key     the n-gram shared by all {@code values}
     * @param values  the values grouped under {@code key}
     * @param context Hadoop context used to emit the output records
     * @throws IOException          if {@code key} is null or a write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {

        if (key == null) {
            throw new IOException("Key is null");
        }

        logger.info("Key in NGram Reducer is: " + key);

        // Each value is copied into the buffer rather than stored directly
        // (presumably because Hadoop reuses the Text instance between
        // iterations - confirm against the Reducer documentation).
        ArrayList<Text> buffered = new ArrayList<Text>();
        for (Text incoming : values) {
            logger.info("Value in NGram Reducer is: " + incoming);
            buffered.add(new Text(incoming));
        }

        for (Text copy : buffered) {
            logger.info("Value added in list is: " + copy);
        }

        final int count = buffered.size();
        for (int first = 0; first < count - 1; first++) {
            Text left = buffered.get(first);
            for (int second = first + 1; second < count; second++) {
                Text right = buffered.get(second);
                ValuePair pair = new ValuePair();
                pair.setValue1(left);
                pair.setValue2(right);
                logger.info("Value set in ValuePair is: " + left + ", "
                        + right);
                context.write(pair, new IntWritable(1));
            }
        }
    }
}
83 changes: 83 additions & 0 deletions src/co/nubetech/hiho/similarity/ngram/ScoreJob.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.similarity.ngram;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * Hadoop {@link Tool} that configures and runs the scoring MapReduce job.
 *
 * <p>The job reads the sequence file at the fixed relative path
 * {@code outputOfNGramJob}, runs {@link ScoreMapper} and {@link ScoreReducer},
 * and writes a sequence file to {@code outputOfScoreJob}.
 */
public class ScoreJob extends Configured implements Tool {

    final static Logger logger = Logger
            .getLogger(co.nubetech.hiho.similarity.ngram.ScoreJob.class);

    /**
     * Configures and submits the score job, blocking until it finishes.
     *
     * @param arg0 command line arguments; not used by this job
     * @return 0 when the job succeeds, 1 when it fails or cannot be run
     * @throws Exception if the job cannot be set up
     */
    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf);
        job.setJobName("Score job");
        job.setJarByClass(ScoreJob.class);

        Class inputFormatClass = Class
                .forName("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat");
        // Alternative for human-readable output:
        // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
        Class outputFormatClass = Class
                .forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
        Class inputKeyClass = Class
                .forName("co.nubetech.hiho.similarity.ngram.ValuePair");
        Class inputValueClass = Class
                .forName("org.apache.hadoop.io.IntWritable");
        Class outputKeyClass = Class
                .forName("co.nubetech.hiho.similarity.ngram.ValuePair");
        Class outputValueClass = Class
                .forName("org.apache.hadoop.io.LongWritable");

        job.setMapperClass(ScoreMapper.class);
        job.setReducerClass(ScoreReducer.class);

        job.setInputFormatClass(inputFormatClass);
        job.setMapOutputKeyClass(inputKeyClass);
        job.setMapOutputValueClass(inputValueClass);

        job.setOutputKeyClass(outputKeyClass);
        job.setOutputValueClass(outputValueClass);
        job.setOutputFormatClass(outputFormatClass);

        FileInputFormat.setInputPaths(job, "outputOfNGramJob");
        FileOutputFormat.setOutputPath(job, new Path("outputOfScoreJob"));

        try {
            return job.waitForCompletion(true) ? 0 : 1;
        } catch (Exception e) {
            // A job that could not complete must not report success (exit code 0).
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            logger.error("Score job failed", e);
            return 1;
        }
    }

    /**
     * Command line entry point; exits the JVM with the job's status code.
     *
     * @param args command line arguments, forwarded through {@link ToolRunner}
     * @throws Exception if the job cannot be configured
     */
    public static void main(String[] args) throws Exception {
        ScoreJob job = new ScoreJob();
        int res = ToolRunner.run(new Configuration(), job, args);
        System.exit(res);
    }

}
36 changes: 36 additions & 0 deletions src/co/nubetech/hiho/similarity/ngram/ScoreMapper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.similarity.ngram;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
/**
 * Identity mapper for the score job: every non-null {@link ValuePair} key is
 * forwarded together with its count, unchanged.
 */
public class ScoreMapper extends
        Mapper<ValuePair, IntWritable, ValuePair, IntWritable> {

    final static Logger logger = Logger
            .getLogger(co.nubetech.hiho.similarity.ngram.ScoreMapper.class);

    /**
     * Writes the incoming record straight through to the output.
     *
     * @param key     pair of values read from the input
     * @param val     count associated with {@code key}
     * @param context Hadoop context used to emit the output record
     * @throws IOException          if {@code key} is null or the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    public void map(ValuePair key, IntWritable val, Context context)
            throws IOException, InterruptedException {
        if (key != null) {
            context.write(key, val);
            return;
        }
        throw new IOException("Key is null");
    }
}
Loading

0 comments on commit 9f75aa1

Please sign in to comment.