Permalink
Browse files

Make Deckard run faster on a multi-core machine by parallelized execution of various components in Deckard.
  • Loading branch information...
1 parent b7f0825 commit ac295f7b34e05f7ecb5a2da399dc60b5abab74b6 skyhover committed Jan 30, 2013
Showing with 893 additions and 512 deletions.
  1. +2 −2 LICENSE
  2. +22 −9 README
  3. +87 −25 samples/config
  4. +2 −2 scripts/bugdetect/bugcounting
  5. +2 −2 scripts/bugdetect/bugfiltering
  6. +2 −2 scripts/bugdetect/bugmerging
  7. +2 −2 scripts/bugdetect/bugordering
  8. +2 −2 scripts/bugdetect/bugtypecounting
  9. +35 −2 scripts/bugdetect/deckardd.sh
  10. +2 −2 scripts/bugdetect/mergecomments
  11. +2 −2 scripts/clonedetect/cd_coverage
  12. +9 −2 scripts/clonedetect/config-sample
  13. +9 −2 scripts/clonedetect/configure
  14. +5 −5 scripts/clonedetect/deckard.sh
  15. +2 −2 scripts/clonedetect/generateparam
  16. +2 −2 scripts/clonedetect/paramsetting
  17. +3 −2 scripts/clonedetect/post_process_groupfile
  18. +8 −11 scripts/clonedetect/vdbgen
  19. +34 −50 scripts/clonedetect/vertical-param-batch
  20. +59 −50 src/include/ptree.h
  21. +13 −4 src/lsh/Makefile
  22. +9 −3 src/lsh/sources/Makefile
  23. +1 −0 src/lsh/sources/enumBuckets.cpp
  24. +33 −24 src/main/Makefile
  25. +2 −2 src/main/bugmain.cc
  26. +50 −7 src/main/build.sh
  27. +2 −2 src/main/clean.sh
  28. +2 −2 src/main/main.cc
  29. +2 −2 src/main/out2html.C
  30. +2 −2 src/main/out2xml.C
  31. +4 −4 src/main/parseTreeMain.cc
  32. +197 −78 src/main/ptree.cc
  33. +2 −2 src/ptgen/Makefile
  34. +12 −5 src/ptgen/gcc/Makefile
  35. +18 −2 src/ptgen/gcc/c.l
  36. +14 −9 src/ptgen/gcc/c.y
  37. +2 −2 src/ptgen/gcc/catomicNodes.h
  38. +2 −2 src/ptgen/gcc/ccontextualNodes.h
  39. +2 −2 src/ptgen/gcc/cparentNodes.h
  40. +2 −2 src/ptgen/gcc/crelevantNodes.h
  41. +2 −2 src/ptgen/gcc/main.cc
  42. +2 −2 src/ptgen/gcc/mainc.py
  43. +11 −5 src/ptgen/java/Makefile
  44. +2 −2 src/ptgen/java/jatomicNodes.h
  45. +2 −2 src/ptgen/java/jcontextualNodes.h
  46. +2 −2 src/ptgen/java/jparentNodes.h
  47. +2 −2 src/ptgen/java/jrelevantNodes.h
  48. +2 −2 src/ptgen/java/main.cc
  49. +2 −2 src/ptgen/java/mainj.py
  50. +12 −5 src/ptgen/php5/Makefile
  51. +2 −2 src/ptgen/php5/mainphp.py
  52. +2 −2 src/ptgen/php5/phpatomicNodes.h
  53. +2 −2 src/ptgen/php5/phpcontextualNodes.h
  54. +2 −2 src/ptgen/php5/phpparentNodes.h
  55. +2 −2 src/ptgen/php5/phprelevantNodes.h
  56. +9 −4 src/ptgen/simple/Makefile
  57. +2 −2 src/ptgen/simple/main.cc
  58. +2 −2 src/ptgen/simple/tokid.h
  59. +2 −2 src/ptgen/yacc.g
  60. +2 −2 src/vgen/tra-gen-test.C
  61. +13 −7 src/vgen/treeTra/Makefile
  62. +4 −2 src/vgen/treeTra/clone-context-php.C
  63. +2 −2 src/vgen/treeTra/clone-context-php.h
  64. +2 −2 src/vgen/treeTra/node-vec-gen.C
  65. +2 −2 src/vgen/treeTra/node-vec-gen.h
  66. +3 −3 src/vgen/treeTra/sq-tree.C
  67. +2 −2 src/vgen/treeTra/sq-tree.h
  68. +2 −2 src/vgen/treeTra/token-counter.C
  69. +2 −2 src/vgen/treeTra/token-counter.h
  70. +4 −2 src/vgen/treeTra/token-tree-map.C
  71. +7 −7 src/vgen/treeTra/token-tree-map.h
  72. +7 −4 src/vgen/treeTra/tra-gen.C
  73. +5 −4 src/vgen/treeTra/tra-gen.h
  74. +2 −2 src/vgen/treeTra/tree-accessor.C
  75. +2 −2 src/vgen/treeTra/tree-accessor.h
  76. +2 −2 src/vgen/treeTra/tree-traversal.C
  77. +11 −11 src/vgen/treeTra/tree-vector.C
  78. +2 −2 src/vgen/treeTra/tree-vector.h
  79. +21 −12 src/vgen/treeTra/vector-merger.C
  80. +2 −2 src/vgen/treeTra/vector-merger.h
  81. +7 −4 src/vgen/treeTra/vector-output.C
  82. +2 −2 src/vgen/treeTra/vector-output.h
  83. +9 −12 src/vgen/treeTra/vgen-config.C
  84. +2 −2 src/vgen/treeTra/vgen-config.h
  85. +5 −5 src/vgen/treeTra/vgen-utils.c
  86. +2 −2 src/vgen/treeTra/vgen-utils.h
  87. +16 −5 src/vgen/vgrouping/Makefile
  88. +5 −5 src/vgen/vgrouping/computeranges.c
  89. +5 −4 src/vgen/vgrouping/dispatchvectors.c
  90. +2 −2 src/vgen/vgrouping/rundispatch
  91. +3 −3 src/vgen/vgrouping/rundispatchonefile
  92. +2 −2 src/vgen/vgrouping/runsplit
  93. +2 −2 src/vgen/vgrouping/runvectorsort
  94. +2 −2 src/vgen/vgrouping/split.c
  95. +4 −4 src/vgen/vgrouping/vectorsort.c
View
4 LICENSE
@@ -1,6 +1,6 @@
-Copyright (c) 2007-2012,
- Lingxiao Jiang <lxjiang@ucdavis.edu>
+Copyright (c) 2007-2013, University of California / Singapore Management University
+ Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
Ghassan Misherghi <ghassanm@ucdavis.edu>
Zhendong Su <su@ucdavis.edu>
Stephane Glondu <steph@glondu.net>
View
31 README
@@ -9,8 +9,8 @@ code clone detection tool. It is also capable of reporting clone-related bugs.
*
**********************************************************************
-Copyright (c) 2007-2012, University of California
- Lingxiao Jiang <lxjiang@ucdavis.edu>
+Copyright (c) 2007-2013, University of California / Singapore Management University
+ Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
Ghassan Misherghi <ghassanm@ucdavis.edu>
Zhendong Su <su@ucdavis.edu>
Stephane Glondu <steph@glondu.net>
@@ -20,23 +20,32 @@ Three-clause BSD licence
**********************************************************************
*
-* Version 1.2.3
-* January 22, 2012
+* Version 1.3
+* January 30, 2013
+*
+* What's new?
*
**********************************************************************
+- Faster clone detection by parallelized execution of various components in
+ DECKARD. Use "MAX_PROCS=<number>" in the "config" file to set the maximum
+ number of processes that may be used for executions of DECKARD.
**********************************************************************
*
* Installation
*
**********************************************************************
-In bash shell or cygwin, run the build script:
+In bash shell or cygwin, go into the folder:
+
+/path/to/src/main/
+
+and run the build script:
-/path/to/src/main/build.sh
+./build.sh
-For convenience, add "src/main" into $PATH.
+For convenience, you can add "/path/to/src/main" to $PATH.
NOTE: Deckard's built-in parser for Java cannot handle Java 1.5 or later
features, which means when Deckard processes a Java 1.5 file, it is very likely
@@ -49,9 +58,13 @@ cygwin shell with elevated privileges before invoking the above scripts. Also,
Deckard's performance may be tens of times slower when executed in cygwin than
on Linux due to slow I/O operations.
-To uninstall, simply
+To uninstall, go into the folder:
+
+/path/to/src/main/
+
+and simply run:
-/path/to/src/main/clean.sh
+./clean.sh
*********************************************************************
View
112 samples/config
@@ -1,70 +1,132 @@
-###################################################################
+#!/bin/sh
+#############################################################
# Configuration file for clone detection.
#
-###################################################################
+#############################################################################
# Often, need to change these common parameters:
-# - FILE_PATTERN : for source files in different languages
+# - FILE_PATTERN : what are the input file name patterns for Deckard
# - SRC_DIR : the root directory containing the source files
# - DECKARD_DIR : Where is the home directory of DECKARD
# - clone detection parameters: c.f. DECKARD's paper
# -- MIN_TOKENS
# -- STRIDE
# -- SIMILARITY
+# For Deckard2, also need to set
+# - PDG_DIR : the subdir name under $SRC_DIR that contains PDG dot files
+# - AST_DIR : the subdir name under $SRC_DIR that contains AST dot files
+# - TYPE_FILE : the file that defines AST node type names and IDs
+# - RELEVANT_NODEFILE : the file that defines relevant AST node type names
+# - LEAF_NODEFILE : the file that defines leaf AST node type names
+# - PARENT_NODEFILE : the file that defines parent AST node type names
+# The above 4 parameters are hard-coded in Deckard1 for different languages,
+# while Deckard2 extracts those out to make it configurable without the
+# need to recompile.
+
+# Since Deckard 1&2 use different parameters, please make sure
+# the parameters are set correctly for either Deckard1 or Deckard2.
+# TODO: make the check automatic.
#
-# java, c, or php?
-FILE_PATTERN='*.java' # used for the 'find' command
+# Deckard2 supports only dot; Deckard1 supports only java, c, php.
+FILE_PATTERN='*.java' # used in the 'find' command below
# where are the source files?
-SRC_DIR='src'
+SRC_DIR="src"
+PDG_DIR="ddgs" # used by Deckard2 for 'find $SRC_DIR -ipath "*/$PDG_DIR/$FILE_PATTERN"'
+AST_DIR="asts" # each pdg should have an ast with the same name in a different folder
+# where are node definition files? used by Deckard2
+TYPE_FILE='/home/lingxiao/projects/Deckard/testdata/deckard3/AstNodeTypeNamesIDs.txt'
+RELEVANT_NODEFILE='/home/lingxiao/projects/Deckard/testdata/deckard3/AstRelevantNodes.txt'
+LEAF_NODEFILE='/home/lingxiao/projects/Deckard/testdata/deckard3/AstLeafNodes.txt'
+PARENT_NODEFILE='/home/lingxiao/projects/Deckard/testdata/deckard3/AstParentNodes.txt'
+
# where is Deckard?
-DECKARD_DIR=".."
+DECKARD_DIR="/home/lingxiao/projects/Deckard/DeckardDebugging"
# clone parameters; refer to paper.
-MIN_TOKENS='50 100'
-STRIDE='2 0'
-#DISTANCE='2.236 0.70711 1.58114'
-SIMILARITY='1.0 0.95'
+MIN_TOKENS='30 50' # can be a sequence of integers
+STRIDE='2 0' # can be a sequence of integers
+SIMILARITY='1.0 0.95' # can be a sequence of values <= 1
+#DISTANCE='0 0.70711 1.58114 2.236'
-###################################################################
+###########################################################
# Where to store result files?
#
# where to output generated vectors?
-VECTOR_DIR='vectors'
+VECTOR_DIR="vectors"
# where to output detected clone clusters?
-CLUSTER_DIR='clusters'
+CLUSTER_DIR="clusters"
# where to output timing/debugging info?
-TIME_DIR='times'
+TIME_DIR="times"
-###################################################################
+##########################################################
# where are several programs we need?
#
# where is the vector generator?
-VGEN_EXEC="$DECKARD_DIR/src/main"
+VGEN_EXEC="$DECKARD_DIR/src"
case $FILE_PATTERN in
+ *.dot )
+ VGEN_EXEC="$VGEN_EXEC/dot2d/dotvgen" ;;
*.java )
- VGEN_EXEC="$VGEN_EXEC/jvecgen" ;;
+ VGEN_EXEC="$VGEN_EXEC/main/jvecgen" ;;
*.php )
- VGEN_EXEC="$VGEN_EXEC/phpvecgen" ;;
+ VGEN_EXEC="$VGEN_EXEC/main/phpvecgen" ;;
*.c | *.h )
- VGEN_EXEC="$VGEN_EXEC/cvecgen" ;;
+ VGEN_EXEC="$VGEN_EXEC/main/cvecgen" ;;
* )
echo "Error: invalid FILE_PATTERN: $FILE_PATTERN"
VGEN_EXEC="$VGEN_EXEC/invalidvecgen" ;;
esac
-# how to divide the vectors into groups? It's just the directory name that matters
+# how to divide the vectors into groups?
GROUPING_EXEC="$DECKARD_DIR/src/vgen/vgrouping/runvectorsort"
-# where is the lsh?
+# where is the lsh for vector clustering?
CLUSTER_EXEC="$DECKARD_DIR/src/lsh/bin/enumBuckets"
# how to post process clone groups?
POSTPRO_EXEC="$DECKARD_DIR/scripts/clonedetect/post_process_groupfile"
-# how to transform source code html?
+# how to transform source code html? Used by Deckard1 only
SRC2HTM_EXEC=source-highlight
SRC2HTM_OPTS=--line-number-ref
-###################################################################
+############################################################
+# For parallel processing
+#
+# the maximal number of processes to be used (by xargs)
+# - 0 means as many as possible (up to xargs)
+MAX_PROCS=8
+
+##################################################################
# Some additional, internal parameters; can be ignored
#
# the maximal vector size for the first group; not really useful
-GROUPING_S='50'
+GROUPING_S='30' # should be a single value
#GROUPING_D
#GROUPING_C
+export DECKARD_DIR
+export FILE_PATTERN
+export SRC_DIR
+export PDG_DIR
+export AST_DIR
+
+export TYPE_FILE
+export RELEVANT_NODEFILE
+export LEAF_NODEFILE
+export PARENT_NODEFILE
+
+export VECTOR_DIR
+export TIME_DIR
+export CLUSTER_DIR
+
+export VGEN_EXEC
+export GROUPING_EXEC
+export CLUSTER_EXEC
+export POSTPRO_EXEC
+export SRC2HTM_EXEC
+export SRC2HTM_OPTS
+
+export MIN_TOKENS
+export STRIDE
+#export DISTANCE
+export SIMILARITY
+export GROUPING_S
+export GROUPING_D
+export GROUPING_C
+
View
4 scripts/bugdetect/bugcounting
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 scripts/bugdetect/bugfiltering
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 scripts/bugdetect/bugmerging
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 scripts/bugdetect/bugordering
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 scripts/bugdetect/bugtypecounting
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
37 scripts/bugdetect/deckardd.sh
@@ -1,7 +1,40 @@
#!/bin/bash
-echo "DECKARD DEBUG--A Tree-Based Code Clone-Related Bug Detection Tool. Version 1.2.3"
-echo "Copyright (c) 2007-2012. University of California"
+#
+#
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
+# Ghassan Misherghi <ghassanm@ucdavis.edu>
+# Zhendong Su <su@ucdavis.edu>
+# Stephane Glondu <steph@glondu.net>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the University of California nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+echo "DECKARD DEBUG--A Tree-Based Code Clone-Related Bug Detection Tool. Version 1.3"
+echo "Copyright (c) 2007-2013. University of California / Singapore Management University"
echo "Distributed under the three-clause BSD license."
echo
View
4 scripts/bugdetect/mergecomments
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 scripts/clonedetect/cd_coverage
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
11 scripts/clonedetect/config-sample
@@ -1,7 +1,7 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -92,6 +92,13 @@ POSTPRO_EXEC="$DECKARD_DIR/scripts/clonedetect/post_process_groupfile"
SRC2HTM_EXEC=source-highlight
SRC2HTM_OPTS=--line-number-ref
+############################################################
+# For parallel processing
+#
+# the maximal number of processes to be used (by xargs)
+# - 0 means as many as possible (up to xargs)
+MAX_PROCS=8
+
##################################################################
# Some additional, internal parameters; can be ignored
#
View
11 scripts/clonedetect/configure
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -128,5 +128,12 @@ check_exe "head"
check_exe "tail"
#check_exe "time"
+# For parallel processing
+if [[ -z "${MAX_PROCS}" ]];
+then
+ echo "Warning: MAX_PROCS not set in config. Assume 0"
+ MAX_PROCS=0
+fi
+export MAX_PROCS
View
10 scripts/clonedetect/deckard.sh
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -33,8 +33,8 @@
#
#
-echo "DECKARD--A Tree-Based Code Clone Detection Toolkit. Version 1.2.3"
-echo "Copyright (c) 2007-2012. University of California"
+echo "DECKARD--A Tree-Based Code Clone Detection Toolkit. Version 1.3"
+echo "Copyright (c) 2007-2013. University of California / Singapore Management University"
echo "Distributed under the three-clause BSD license."
echo
@@ -106,7 +106,7 @@ if [[ $errcode -ne 0 ]]; then
fi
echo "Clone detection done. Logs in $TIME_DIR/*"
-echo "Clone reports in $VECTOR_DIR/post_cluster_*"
+echo "Clone reports in $CLUSTER_DIR/post_cluster_*"
echo
# Bug Finding:
View
4 scripts/clonedetect/generateparam
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 scripts/clonedetect/paramsetting
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
5 scripts/clonedetect/post_process_groupfile
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -99,6 +99,7 @@ def only_one_diff( cluster ):
return True
return False
+#TODO: consider tightening the criteria to reduce the amount of overlapping clone reports.
def remove_staggered( cluster ):
rs= map( vector_linerange, cluster )
keep= map( lambda x: True, cluster )
View
19 scripts/clonedetect/vdbgen
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -34,6 +34,7 @@
#
TOOVERWRITE=
+EXEC_DIR=`dirname $0`
clean()
{
@@ -74,15 +75,11 @@ vgen()
#NOTE: "for .. in .." has trouble when a filename contains spaces; use "while read .." instead:
( time \
- find "$SRC_DIR" -iname "$FILE_PATTERN" | while read file;
+# find "$SRC_DIR" -iname "$FILE_PATTERN" | while read file;
# for file in `find "$SRC_DIR" -iname "$FILE_PATTERN"`;
- do
- echo "Parsing $file" >> "$TIME_DIR/vgen_$1_$2"
- if [[ ! -s "${file}.vec" || "$TOOVERWRITE" = "true" ]];
- then
- "$VGEN_EXEC" -i "$file" -m $1 -t $2
- fi
- done \
+# do
+ find "$SRC_DIR" -iname "$FILE_PATTERN" -print0 | xargs -0 -n 1 -P $MAX_PROCS "$EXEC_DIR/vdbgenfile" $1 $2
+# done \
) 1>>"$TIME_DIR/vgen_$1_$2" 2>&1
find "$SRC_DIR" -iname '*.vec' -print0 | xargs -0 --max-args=100 cat > "$VECTOR_DIR/vdb_$1_$2"
@@ -116,7 +113,7 @@ tohtml()
fi
}
-. `dirname $0`/configure
+. "$EXEC_DIR/configure"
if [[ $# -ge 1 ]]; then
case "$1" in
View
84 scripts/clonedetect/vertical-param-batch
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -40,7 +40,8 @@
# Feel free to adjust the workflows in the scripts to fit your needs.
TOOVERWRITE=
-coverage=`dirname $0`/cd_coverage
+EXEC_DIR=`dirname $0`
+coverage=$EXEC_DIR/cd_coverage
GROUPING_H=100000
# Delete intermediate files, but leave times/*, cluster/coverage*, cluster/post_* files there:
@@ -127,7 +128,7 @@ paramsetting()
t=$1
s=$2
sim=$3
- echo -n "paramsetting: $t $s $sim ..." | tee -a "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
+ echo -n "paramsetting: $t $s $sim ..." | tee "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
# convert SIMILARITY to DISTANCE:
i=`echo "$sim ${GROUPING_S}" | awk '{printf( "%.7g\n", sqrt((1-$1)*$2) )}'`
# create the range file if not exist:
@@ -140,6 +141,11 @@ paramsetting()
fi
# use the largest group file for parameter tuning:
# "head -n 1" may break the pipe coming from "ls", causing signal 13 (no more reader, while writer's still writing), but should not matter here:
+ grptuningid=`find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' | wc -l`
+ if [[ $grptuningid -le 0 ]]; then
+ echo "Error: $FUNCNAME failure: no vector group found: $t $s $sim" | tee -a "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
+ exit 1
+ fi
groupfortuning=`find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' -print0 | xargs -0 ls -S | head -n 1`
grptuningid=`echo ${groupfortuning} | sed "s/.*vdb_${t}_${s}_g\([0-9]*\)_${i}_${GROUPING_S}.*/\1/"`
grpal=0
@@ -156,7 +162,7 @@ paramsetting()
grpdist=`echo "$sim ${grpal}" | awk '{printf( "%.7g\n", sqrt((1-$1)*2*$2) )}'`
fi
echo -n "Looking for optimal parameters by " | tee -a "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
- cluster $groupfortuning $grpdist -c | tee -a "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
+ cluster $groupfortuning $grpdist -c >> "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
errcode=$?
if [[ $errcode -ne 0 || ! -s "${groupfortuning}.param" ]]; then
@@ -167,25 +173,10 @@ paramsetting()
echo -n "Setting Parameters for all other groups (may take hours on cygwin but only minutes on Linux...why?)..." | tee -a "${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}"
# TODO: performance improvements to reduce file I/O; replace head/tail/awk/generateparam
( time \
- find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' | while read vdb;
- do
- echo "$FUNCNAME: ${vdb}.param"
- grpfileid=`echo "$vdb" | sed "s/.*vdb_${t}_${s}_g\([0-9]*\)_${i}_${GROUPING_S}.*/\1/"`
- grpal=0
- grpdist=0
- if [[ ${grptuningid} -eq $grpfileid ]]; then
- continue
- elif [[ $grpfileid -eq 1 ]]; then
- grpal=`head -n $(expr $grpfileid + 1) $(dirname "$vdb")/ranges_${i}_${GROUPING_S} | tail -n 1 | awk '{print $3}'`
- grpdist=`echo "$sim ${grpal}" | awk '{printf( "%.7g\n", sqrt((1-$1)*$2) )}'`
- else
- grpal=`head -n $(expr $grpfileid + 1) $(dirname "$vdb")/ranges_${i}_${GROUPING_S} | tail -n 1 | awk '{print $2}'`
- grpdist=`echo "$sim ${grpal}" | awk '{printf( "%.7g\n", sqrt((1-$1)*2*$2) )}'`
- fi
- lineno=`wc -l "$vdb" | awk '{print $1}'`
- lineno=$(($lineno / 2))
- "`dirname $0`/generateparam" "${groupfortuning}.param" $lineno $grpdist > "${vdb}.param"
- done \
+# find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' | while read vdb;
+# do
+ find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' -print0 | xargs -0 -t -n 1 -P $MAX_PROCS "$EXEC_DIR/paramsetting_groupfile" $t $s $sim $groupfortuning
+# done \
) 1>>"${TIME_DIR}/paramsetting_${t}_${s}_${sim}_${GROUPING_S}" 2>&1
errcode=$?
@@ -209,7 +200,7 @@ cluster()
flag=
fi
- echo -n "Clustering '${vdb}' ${grpdist} ..." | tee -a "$TIME_DIR/cluster_${vfile}"
+ echo "Clustering '${vdb}' ${grpdist} ..." | tee -a "$TIME_DIR/cluster_${vfile}"
if [[ "$TOOVERWRITE" != "true" &&
-s "$CLUSTER_DIR/cluster_${vfile}_$2" &&
@@ -252,14 +243,15 @@ cluster()
echo "Error: Parameter File missing for '$vdb' $dist...Exit" | tee -a "$TIME_DIR/cluster_${vfile}"
exit 65
fi
+ echo "$CLUSTER_EXEC" -R $dist -M $mem -b 2 -A -f "$vdb" ${flag} -p "${vdb}.param" " > " "$CLUSTER_DIR/cluster_${vfile}" | tee -a "$TIME_DIR/cluster_${vfile}"
( time \
"$CLUSTER_EXEC" -R $dist -M $mem -b 2 -A -f "$vdb" ${flag} -p "${vdb}.param" \
> "$CLUSTER_DIR/cluster_${vfile}" \
) 1>>"$TIME_DIR/cluster_${vfile}" 2>&1
errcode=$?
if [[ $errcode -ne 0 ]]; then
- echo "$FUNCNAME: Possible errors occurred. Check log: $TIME_DIR/cluster_${vfile}" | tee -a "$TIME_DIR/cluster_${vfile}"
+ echo "$FUNCNAME: Possible errors occurred with LSH. Check log: $TIME_DIR/cluster_${vfile}" | tee -a "$TIME_DIR/cluster_${vfile}"
else
echo "Done clustering '${vdb}' ${grpdist}. Log: $TIME_DIR/cluster_${vfile}" | tee -a "$TIME_DIR/cluster_${vfile}"
fi
@@ -312,7 +304,7 @@ pcluster()
}
-. `dirname $0`/configure
+. "$EXEC_DIR/configure"
if [[ $# -ge 1 ]]; then
case "$1" in
@@ -342,7 +334,8 @@ do
do
for sim in $SIMILARITY;
do
- echo "Vector clustering w/ MIN_TOKENS=$t, STRIDE=$s, SIMILARITY=$sim ..."
+ echo
+ echo "= Vector clustering w/ MIN_TOKENS=$t, STRIDE=$s, SIMILARITY=$sim ..."
echo
# convert SIMILARITY to DISTANCE used by LSH:
@@ -351,36 +344,27 @@ do
# Grouping:
grouping $t $s $i
errcode=$?
+ if [[ $errcode -ne 0 ]]; then
+ echo "Error: grouping $t $s $sim. There may be no vectors or permission denied for the setting."
+ continue
+ fi
# NOTE:
# range file name: ranges_${i}_${GROUPING_S}
# group file names: vdb_${t}_${s}_g[0-9]+_${i}_${GROUPING_S}
# Set parameter files for LSH:
paramsetting $t $s $sim
errcode=$?
+ if [[ $errcode -ne 0 ]]; then
+ echo "Error: paramsetting $t $s $sim. There may be no vectors or permission denied for the setting."
+ continue
+ fi
# Clustering:
echo "Cluster every vector groups..." | tee "$TIME_DIR/cluster_vdb_${t}_${s}_allg_${i}_${GROUPING_S}"
- find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' | while read vdb;
- do
- grpfileid=`echo "$vdb" | sed "s/.*vdb_${t}_${s}_g\([0-9]*\)_${i}_${GROUPING_S}.*/\1/"`
- grpal=0
- grpdist=0
- # TODO: what are better distance parameters for the groups?
- if [[ $grpfileid -le 0 ]]; then
- echo "Warning: invalid group id: $grpfileid in group $vdb...Skip." | tee -a "$TIME_DIR/cluster_vdb_${t}_${s}_allg_${i}_${GROUPING_S}"
- continue
- elif [[ $grpfileid -eq 1 ]]; then
- grpal=`head -n $(expr $grpfileid + 1) $(dirname "$vdb")/ranges_${i}_${GROUPING_S} | tail -n 1 | awk '{print $3}'`
- grpdist=`echo "$sim ${grpal}" | awk '{printf( "%.7g\n", sqrt((1-$1)*$2) )}'`
- else
- grpal=`head -n $(expr $grpfileid + 1) $(dirname "$vdb")/ranges_${i}_${GROUPING_S} | tail -n 1 | awk '{print $2}'`
- grpdist=`echo "$sim ${grpal}" | awk '{printf( "%.7g\n", sqrt((1-$1)*2*$2) )}'`
- fi
- cluster "$vdb" $grpdist 1>>"$TIME_DIR/cluster_vdb_${t}_${s}_allg_${i}_${GROUPING_S}" 2>&1
- errcode=$?
- if [[ $errcode -ne 0 ]]; then
- echo "Clustering: Possible errors for vector group: $vdb $grpdist. Check log: $TIME_DIR/cluster_$(basename "${vdb}")" | tee -a "$TIME_DIR/cluster_vdb_${t}_${s}_allg_${i}_${GROUPING_S}"
- fi
- done
+# find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' | while read vdb;
+# do
+ export -f cluster
+ find "$VECTOR_DIR" -type f -name "vdb_${t}_${s}_g[0-9]*_${i}_${GROUPING_S}*" -not -name '*.param' -print0 | xargs -0 -n 1 -P $MAX_PROCS "$EXEC_DIR/vdbclustergroup" $t $s $sim
+# done
echo "Done clustering. Check log: $TIME_DIR/cluster_vdb_${t}_${s}_allg_${i}_${GROUPING_S} and $TIME_DIR/cluster_vdb_${t}_${s}_g[0-9]+_${i}_${GROUPING_S}" | tee -a "$TIME_DIR/cluster_vdb_${t}_${s}_allg_${i}_${GROUPING_S}"
# Merging:
echo -n "Merging all clone reports into $CLUSTER_DIR/cluster_vdb_${t}_${s}_allg_${sim}_${GROUPING_S} ..." | tee -a "$TIME_DIR/merging_${t}_${s}_${sim}_${GROUPING_S}"
View
109 src/include/ptree.h
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
@@ -42,47 +42,49 @@
#include <fstream>
#include <sstream>
#include <list>
+#include <set>
#include <limits.h>
-/* better NOT use this in header files for the sake of large programs with lots of components. */
-using namespace std;
-
/** A Tree Node or a tree */
class Tree;
/** A whole parse tree */
class ParseTree {
+ const static int DEBUG_LEVEL = 1;
public:
- ParseTree(Tree *root, int nTypes, map<int,string> *typeNames, map<string,int> *typeIds);
+ ParseTree(Tree *root, int nTypes, std::map<int, std::string> *typeNames, std::map<std::string,int> *typeIds);
+ /** recursively delete all tree nodes */
~ParseTree();
Tree *getRoot();
int typeCount();
/** Valid type values range from 0 to typeCount-1 */
- const string & getTypeName(int);
+ const std::string & getTypeName(int);
- int getTypeID( const string& ); /* "IDENTIFER" for identifiers. */
+ int getTypeID( const std::string& ); /* "IDENTIFER" for identifiers. */
- string filename;
+ std::string filename;
/** relevantNodes are those that should be counted within the vector */
- vector<int> relevantNodes;
+ std::vector<int> relevantNodes;
/** leafNodes are the smallest nodes which are used to advance the
* sliding window */
- vector<int> leafNodes;
+ std::vector<int> leafNodes;
/** validParents are the nodes from which we will generate vectors if they
* have the required counts */
- vector<int> validParents;
+ std::vector<int> validParents;
/** this's something similar to relevantNodes??? just because of a different vector merging strategy. */
- vector<int> mergeableNodes;
+ std::vector<int> mergeableNodes;
/** dump the whole tree in a graph-like format; output filename is the 'filename'+'.grp' */
- bool dumpParseTree(bool toOveride);
+ bool dumpParseTree(const char* fn, bool toOveride);
+ /** dump the tree in the dot format; output filename is the 'filename'+'.dot' */
+ bool outputParseTree2Dot(const char* fn, bool toOveride);
/** return the smallest tree containing all elements from the line number */
Tree* line2Tree(int ln);
@@ -95,6 +97,9 @@ class ParseTree {
Tree* getContextualNode(Tree* node);
Tree* getContextualNode(long startTokenId, long endTokenId);
+ /** set node ids from a given set of node names */
+ static int setNodeIDs(std::vector<int>&, const std::set<std::string>&);
+
/** return the path from the root to the token: */
std::list<Tree*>* root2Token(long tid);
bool root2TokenAux(long tid, Tree* node, std::list<Tree*>& path);
@@ -113,18 +118,19 @@ class ParseTree {
Tree *root; /* the root tree node */
/** map node type ids to type names */
- map<int,string> *typeNames;
+ std::map<int, std::string> *typeNames;
/** map node type names to type ids */
- map<string,int> *typeIDs;
+ std::map<std::string, int> *typeIDs;
};
/* Valid type ids range from 0 to typeCount-1 */
int typeCount(std::map<int, std::string>& id2name);
int typeCount(std::map<std::string, int>& name2id);
-const string & getTypeName(std::map<int, std::string>& id2name, int id);
-int getTypeID(std::map<std::string, int>& name2id, const string& name); /* "IDENTIFER" for identifiers. */
+const std::string & getTypeName(std::map<int, std::string>& id2name, int id);
+int getTypeID(std::map<std::string, int>& name2id, const std::string& name); /* "IDENTIFER" for identifiers. */
bool isContextualNode(Tree* node); // language-dependent operation
+bool setContextualNodes(const std::set<std::string>& nodenames);
/** create a parse tree from a file: */
ParseTree* parseFile(const char * fn);
@@ -134,10 +140,10 @@ class NonTerminal;
/* enum type used as "subscripts" of tree attributes. */
typedef enum {
- NODE_VECTOR, /* the vector */
- NODE_ID, /* range of node IDs in the serialized tree */
- NODE_TOKEN_ID, /* range of token IDs */
- NODE_SERIALIZED_NEIGHBOR,
+ NODE_VECTOR, /** pointer to the tree vector */
+ NODE_ID, /** pointer to the min/max range of node IDs in the serialized tree */
+ NODE_TOKEN_ID, /** pointer to the min/max range of token IDs */
+ NODE_SERIALIZED_NEIGHBOR, /** pointer to pointers to the previous and next nodes in the serialized chain */
} NodeAttributeName_t;
class Tree {
@@ -146,7 +152,7 @@ class Tree {
int type; /* need to rely on getTypeName to get its type name */
/** the child nodes of this node */
- vector<Tree*> children;
+ std::vector<Tree*> children;
virtual bool isTerminal() { return false;}
virtual bool isNonTerminal() { return false;}
@@ -167,11 +173,11 @@ class Tree {
}
virtual void print() {
- cout << "[ " << type << " ";
+ std::cout << "[ " << type << " ";
for (int i= 0; i < children.size(); i++) {
children[i]->print();
}
- cout << "]";
+ std::cout << "]";
}
virtual void printTok() {
@@ -182,29 +188,15 @@ class Tree {
std::map<NodeAttributeName_t, void* > attributes;
- ~Tree()
- {
- /* tree nodes can not be shared: */
- for (int i= 0; i < children.size(); i++) {
- if ( children[i]!=NULL ) {
- delete children[i];
- children[i] = NULL;
- }
- }
- nextSibbling = NULL;
- parent = NULL;
-
- /* TODO: possible mem leak from the elements in attributes. */
- attributes.clear();
- }
+ virtual ~Tree();
Tree() {
- nextSibbling= NULL;
- parent= NULL;
- type= -1;
+ nextSibbling= NULL;
+ parent= NULL;
+ type= -1;
}
- /** calculate the range of line numbers for this tree node, store results in [min, max].
+ /** recursively calculate the range of line numbers for this tree node from bottom-up, and store results in [min, max].
* For performance concerns, this function should only be called once from the root node. */
int max, min;
virtual void lineRange() {
@@ -221,12 +213,29 @@ class Tree {
}
}
+ /** update the range of line numbers for this tree node based on its direct children (i.e., no recursive updates),
+ * assuming every node has previously set max/min correctly already. */
+ virtual void lineRangeUpdate()
+ {
+ for (int i= 0; i < children.size(); i++ ) {
+ // no recursion needed, assuming every node has previously set max/min
+ if (max < children[i]->max) {
+ max= children[i]->max;
+ }
+ if (min > children[i]->min) {
+ min= children[i]->min;
+ }
+ }
+ }
+
Tree *nextSibbling;
Tree *parent;
int terminal_number; /* The number of terminals under *this* node */
/** output the nodes and the edges under this tree: */
- long dumpTree(ofstream & out, long n);
+ long dumpTree(std::ofstream & out, long n);
+ /** output the nodes and the edges under this tree to a dot file: */
+ long outputTree2Dot(std::ofstream & out, long n);
/** get the order number of a tree node under this tree.
* The order number is based on depth-first traversal and starts with 'n'.
@@ -241,12 +250,12 @@ class Terminal : public Tree {
public:
Terminal( int type, char *s, int line ) {
this->type= type;
- value= new string(s);
+ value= new std::string(s);
this->line= line;
}
int line;
- ~Terminal() {
+ virtual ~Terminal() {
delete value;
}
@@ -259,15 +268,15 @@ class Terminal : public Tree {
virtual void print()
{
- cout << "<" << *value << ">";
+ std::cout << "<" << *value << ">";
}
virtual void printTok()
{
- cout << *value << ", " << line << endl;
+ std::cout << *value << ", " << line << std::endl;
}
- string *value;
+ std::string *value;
};
View
17 src/lsh/Makefile
@@ -1,3 +1,8 @@
+
+# '?=' gives environment variables (if any) higher priority than definitions in the file
+# So, it would be easier to re-define DEBUGFLAGS to be '-g -pg' to enable debugging
+DEBUGFLAGS?=-O3
+
SOURCES_DIR:=sources
OBJ_DIR:=bin
OUT_DIR:=bin
@@ -24,9 +29,13 @@ TEST_BUILDS:=exactNNs \
genPlantedDS
GCC:=g++
-OPTIONS:=-DREAL_FLOAT -O3 #-DDEBUG
-CFLAGS:=$(OPTIONS)
+OPTIONS:=-DREAL_FLOAT
# -march=athlon -msse -mfpmath=sse
+CFLAGS+=$(DEBUGFLAGS) $(OPTIONS)
+CXXFLAGS+=$(DEBUGFLAGS) $(OPTIONS)
+# NOTE: 'make' uses the command "$(CC) $(LDFLAGS) x.o $(LOADLIBES) $(LDLIBS)" when linking
+# pay attention to the ordering of the linked libraries. E.g., -lm should come after x.o
+# This behavior is compiler-dependent; some newer versions of g++ (not all versions of gcc) work well no matter what the ordering is.
LIBRARIES:=-lm
#-ldmalloc
@@ -48,13 +57,13 @@ c: compile
compile:
@mkdir -p $(OUT_DIR)
- $(GCC) -o $(OUT_DIR)/$(LSH_BUILD) $(OPTIONS) $(OBJ_SOURCES) $(SOURCES_DIR)/$(LSH_BUILD).cpp $(LIBRARIES)
+ $(GCC) $(CXXFLAGS) -o $(OUT_DIR)/$(LSH_BUILD) $(OPTIONS) $(OBJ_SOURCES) $(SOURCES_DIR)/$(LSH_BUILD).cpp $(LIBRARIES)
chmod g+rwx $(OUT_DIR)/$(LSH_BUILD)
ct:
@mkdir -p $(OUT_DIR)
(for i in $(TEST_BUILDS); do \
- $(GCC) -o $(OUT_DIR)/$$i $(OPTIONS) $(OBJ_SOURCES) $(TEST_DIR)/$${i}.cpp $(LIBRARIES); chmod g+rwx $(OUT_DIR)/$$i; done)
+ $(GCC) $(CXXFLAGS) -o $(OUT_DIR)/$$i $(OPTIONS) $(OBJ_SOURCES) $(TEST_DIR)/$${i}.cpp $(LIBRARIES); chmod g+rwx $(OUT_DIR)/$$i; done)
zip:
zip -r LSHarchive.zip Makefile sources bin Documentation
View
12 src/lsh/sources/Makefile
@@ -1,9 +1,15 @@
+
+# '?=' gives environment variables (if any) higher priority than definitions in the file
+# So, it would be easier to re-define DEBUGFLAGS to be '-g -pg' to enable debugging
+DEBUGFLAGS?=-O3
+
# DEFINE_FLOAT should be set by a configure script (using testFloat.cpp)
# if this doesn't work, try REAL_DOUBLE instead
DEFINE_FLOAT = REAL_FLOAT
-CPP_OPTS = -O3 -D$(DEFINE_FLOAT) # -Wunused-variable
-LINK_OPTS = -O3 -lm
+CPP_OPTS = $(DEBUGFLAGS) -D$(DEFINE_FLOAT) # -Wunused-variable
+LINK_OPTS = $(DEBUGFLAGS)
+LDLIBS+=-lm
OUT_DIR = ../bin
LSH_SRC = BucketHashing.cpp \
@@ -31,7 +37,7 @@ TEST_BUILDS = $(addprefix $(OUT_DIR)/,$(TEST_SRC:.cpp=))
all: $(TEST_BUILDS)
$(OUT_DIR)/%: %.o $(LSH_OBJS)
- g++ -o $@ $(LINK_OPTS) $^
+ g++ -o $@ $(LINK_OPTS) $^ $(LDLIBS)
clean: # remove intermediate files
-rm -f *~ *.o .depend
View
1 src/lsh/sources/enumBuckets.cpp
@@ -267,6 +267,7 @@ void readDataSetFromFile2(char *filename)
} // end of file
fclose(f);
+ FAILIFWR(nPoints<=0, "No data point in the input file?");
// put the points in the array and free the point list
FAILIF(NULL == (dataSetPoints = (PPointT*)MALLOC(nPoints * sizeof(PPointT))));
for(IntT i = 0; i < nPoints; i++) {
View
57 src/main/Makefile
@@ -1,7 +1,7 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -30,6 +30,11 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
+
+# '?=' gives environment variables (if any) higher priority than definitions in the file
+# So, it would be easier to re-define DEBUGFLAGS to be '-g -pg' to enable debugging
+DEBUGFLAGS?=-O3
+
TREEHEADER=../include/ptree.h
TREESRC=ptree.cc
@@ -51,8 +56,12 @@ CHEADERS=../ptgen/gcc/crelevantNodes.h ../ptgen/gcc/catomicNodes.h ../ptgen/gcc/
JHEADERS=../ptgen/java/jrelevantNodes.h ../ptgen/java/jatomicNodes.h ../ptgen/java/jparentNodes.h ../ptgen/java/jcontextualNodes.h
PHPHEADERS=../ptgen/php5/phprelevantNodes.h ../ptgen/php5/phpatomicNodes.h ../ptgen/php5/phpparentNodes.h ../ptgen/php5/phpcontextualNodes.h
-CXX= g++ -I../include -I../vgen/treeTra # -g
-CXXFLAGS= -O3
+CC=g++
+CXX=g++
+CPPFLAGS+=-I../include -I../vgen/treeTra
+CFLAGS+=$(DEBUGFLAGS)
+CXXFLAGS+=$(DEBUGFLAGS)
+LDFLAGS+=$(DEBUGFLAGS)
EXES=cvecgen jvecgen cbugfilters jbugfilters out2html \
phpvecgen phpbugfilters out2xml \
@@ -65,52 +74,52 @@ TARGET:${EXES}
# because intermediate files may be overwritten.
${CTREEOBJS}:${TREESRC} ${TREEHEADER}
- $(CXX) -o $@ $(CXXFLAGS) -c -DCLANG ${TREESRC}
+ $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -c -DCLANG ${TREESRC}
${JTREEOBJS}:${TREESRC} ${TREEHEADER}
- $(CXX) -o $@ $(CXXFLAGS) -c -DJAVA ${TREESRC}
+ $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -c -DJAVA ${TREESRC}
${PHPTREEOBJS}:${TREESRC} ${TREEHEADER}
- $(CXX) -o $@ $(CXXFLAGS) -c -DPHP ${TREESRC}
+ $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -c -DPHP ${TREESRC}
cvecgen:${COBJS} ${CHEADERS} main.cc
- $(CXX) $(CXXFLAGS) -c -DCLANG main.cc
- $(CXX) -o $@ $(CXXFLAGS) main.o ${COBJS}
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c -DCLANG main.cc
+ $(CXX) -o $@ $(LDFLAGS) main.o ${COBJS}
jvecgen:${JOBJS} ${JHEADERS} main.cc
- $(CXX) $(CXXFLAGS) -c -DJAVA main.cc
- $(CXX) -o $@ $(CXXFLAGS) main.o ${JOBJS}
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c -DJAVA main.cc
+ $(CXX) -o $@ $(LDFLAGS) main.o ${JOBJS}
phpvecgen:${PHPOBJS} ${PHPHEADERS} main.cc
- $(CXX) $(CXXFLAGS) -c -DPHP main.cc
- $(CXX) -o $@ $(CXXFLAGS) main.o ${PHPOBJS}
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c -DPHP main.cc
+ $(CXX) -o $@ $(LDFLAGS) main.o ${PHPOBJS}
cbugfilters:${COBJS} ${CHEADERS} bugmain.cc
- ${CXX} ${CXXFLAGS} -c -DCLANG bugmain.cc
- ${CXX} ${CXXFLAGS} -o $@ bugmain.o ${COBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -c -DCLANG bugmain.cc
+ ${CXX} ${LDFLAGS} -o $@ bugmain.o ${COBJS}
jbugfilters:${JOBJS} ${JHEADERS} bugmain.cc
- ${CXX} ${CXXFLAGS} -c -DJAVA bugmain.cc
- ${CXX} ${CXXFLAGS} -o $@ bugmain.o ${JOBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -c -DJAVA bugmain.cc
+ ${CXX} ${LDFLAGS} -o $@ bugmain.o ${JOBJS}
phpbugfilters:${PHPOBJS} ${PHPHEADERS} bugmain.cc
- ${CXX} ${CXXFLAGS} -c -DPHP bugmain.cc
- ${CXX} ${CXXFLAGS} -o $@ bugmain.o ${PHPOBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -c -DPHP bugmain.cc
+ ${CXX} ${LDFLAGS} -o $@ bugmain.o ${PHPOBJS}
out2html:${COBJS} out2html.C # don't matter to use COBJS or others
- ${CXX} ${CXXFLAGS} -o $@ out2html.C ${COBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -o $@ out2html.C ${COBJS}
out2xml:${COBJS} out2xml.C # don't matter to use COBJS or others
- ${CXX} ${CXXFLAGS} -o $@ out2xml.C ${COBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -o $@ out2xml.C ${COBJS}
cParseTreeMain:${COBJS} ${CHEADERS} parseTreeMain.cc
- ${CXX} ${CXXFLAGS} -DCLANG -o $@ parseTreeMain.cc ${COBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -DCLANG -o $@ parseTreeMain.cc ${COBJS}
jParseTreeMain:${JOBJS} ${JHEADERS} parseTreeMain.cc
- ${CXX} ${CXXFLAGS} -DJAVA -o $@ parseTreeMain.cc ${JOBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -DJAVA -o $@ parseTreeMain.cc ${JOBJS}
phpParseTreeMain:${PHPOBJS} ${PHPHEADERS} parseTreeMain.cc
- ${CXX} ${CXXFLAGS} -DPHP -o $@ parseTreeMain.cc ${PHPOBJS}
+ ${CXX} ${CXXFLAGS} $(CPPFLAGS) -DPHP -o $@ parseTreeMain.cc ${PHPOBJS}
.PHONY: clean
clean:
View
4 src/main/bugmain.cc
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
View
57 src/main/build.sh
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -33,11 +33,14 @@
#
#
-#export CXXFLAGS="-pg -g"
-#export CFLAGS="-pg -g"
-export CXXFLAGS="-O3"
-export CFLAGS="-O3"
+# Define the following DEBUGFLAGS to enable debug build
+# The default value for DEBUGFLAGS may be conditionally defined in the makefile for each module
+#export DEBUGFLAGS="-g -pg"
+# This is similar to enabling the following two variables (depending on how the invoked makefiles are defined):
+#export CFLAGS="-g -pg"
+#export CXXFLAGS="-g -pg"
+# re-compile parse tree generators
(
cd ../ptgen/ || exit 1
make clean
@@ -49,6 +52,7 @@ if [ $errcode -ne 0 ]; then
fi
)
+# re-compile vector generator and vector grouping code
(
cd ../vgen/treeTra/ || exit 1
make clean
@@ -68,6 +72,7 @@ if [ $errcode -ne 0 ]; then
fi
)
+# re-compile code for main entries
make clean
make
errcode=$?
@@ -76,13 +81,51 @@ if [ $errcode -ne 0 ]; then
exit $errcode
fi
+# re-compile LSH
(
cd ../lsh/ || exit 1
make clean_all
make
errcode=$?
if [ $errcode -ne 0 ]; then
- echo "Error: lsh make failed. Exit."
+ echo "error: lsh make failed. exit."
+ exit $errcode
+fi
+)
+
+# re-compile additional library code for trees and graphs
+(
+cd ../lib || exit 1
+make clean
+make
+errcode=$?
+if [ $errcode -ne 0 ]; then
+ echo "error: lib make failed. exit."
+ exit $errcode
+fi
+)
+
+# re-compile .dot parser generator
+(
+# assume antlr has been run; otherwise, please manually run antlrworks-1.4.3 in the repository first
+cd ../dot2d/grammars/output || exit 1
+make clean
+make
+errcode=$?
+if [ $errcode -ne 0 ]; then
+ echo "error: dot parser make failed. exit."
+ exit $errcode
+fi
+)
+
+# re-compile main entries for deckard 2.x
+(
+cd ../dot2d || exit 1
+make clean
+make
+errcode=$?
+if [ $errcode -ne 0 ]; then
+ echo "error: dot2d make failed. exit."
exit $errcode
fi
)
View
4 src/main/clean.sh
@@ -2,8 +2,8 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
4 src/main/main.cc
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
View
4 src/main/out2html.C
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
View
4 src/main/out2xml.C
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
View
8 src/main/parseTreeMain.cc
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
@@ -119,9 +119,9 @@ int main( int argc, char **argv )
}
if(argc>=6) {
- pt->dumpParseTree(true); // to overide existing file
+ pt->dumpParseTree(NULL, true); // to overide existing file
} else {
- pt->dumpParseTree(false);
+ pt->dumpParseTree(NULL, false);
}
Tree* node = pt->tokenRange2Tree(tbid, teid);
View
275 src/main/ptree.cc
@@ -1,7 +1,7 @@
/*
*
- * Copyright (c) 2007-2012,
- * Lingxiao Jiang <lxjiang@ucdavis.edu>
+ * Copyright (c) 2007-2013, University of California / Singapore Management University
+ * Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
* Ghassan Misherghi <ghassanm@ucdavis.edu>
* Zhendong Su <su@ucdavis.edu>
* Stephane Glondu <steph@glondu.net>
@@ -37,9 +37,11 @@
using namespace std;
+/*******************************
+ * class ParseTree
+ */
ParseTree::ParseTree(Tree *root, int nTypes,
- map<int,string> *typeNames,
- map<string,int> *typeIds )
+ map<int,string> *typeNames, map<string,int> *typeIds )
{
this->root= root;
this->nTypes= nTypes;
@@ -68,10 +70,10 @@ int ParseTree::typeCount()
const string & ParseTree::getTypeName(int id)
{
- assert(id<nTypes && id >=0);
+ assert(id < nTypes && id >= 0);
map<int,string>::iterator i= typeNames->find(id);
if (i== typeNames->end()) {
- throw "not found";
+ throw "node type id not found";
} else {
return i->second;
}
@@ -88,76 +90,112 @@ int ParseTree::getTypeID( const string &name)
}
}
-bool ParseTree::dumpParseTree(bool toOveride)
+bool ParseTree::dumpParseTree(const char* fn, bool toOveride)
{
- ifstream inp;
- ofstream out;
- string outputfn = filename + ".grp";
-
- // prepare the output file:
- if(!toOveride) {
- inp.open(outputfn.c_str(), ifstream::in);
- inp.close();
- if(!inp.fail()) {
- cerr << "Warning: parse tree dump file exists already: " << outputfn << " ...skip" << endl;
- return false;
- }
- inp.clear(ios::failbit);
- }
- out.open(outputfn.c_str(), ofstream::out);
- if(out.fail()) {
- cerr << "Error: cannot open dump file: " << outputfn << endl;
- return false;
- }
+ ifstream inp;
+ ofstream out;
+ string outputfn = (fn==NULL ? filename : string(fn)) + ".grp";
+
+ // prepare the output file:
+ if(!toOveride) {
+ inp.open(outputfn.c_str(), ifstream::in);
+ inp.close();
+ if(!inp.fail()) {
+ cerr << "Warning: parse tree dump file exists already: " << outputfn << " ...skip" << endl;
+ return false;
+ }
+ inp.clear(ios::failbit);
+ }
+ out.open(outputfn.c_str(), ofstream::out);
+ if(out.fail()) {
+ cerr << "Error: cannot open dump file: " << outputfn << endl;
+ return false;
+ }
- // dump the tree to the file:
- out << "# " << filename << endl;
- long ncount = 1;
- ncount = root->dumpTree(out, ncount);
+ // dump the tree to the file:
+ cerr << "# Dumping the parse tree to file: " << outputfn << endl;
+ out << "# " << filename << endl;
+ long ncount = 1;
+ ncount = root->dumpTree(out, ncount);
- // close the file:
- out.close();
- return true;
+ // close the file:
+ out.close();
+ return true;
+}
+
+bool ParseTree::outputParseTree2Dot(const char* fn, bool toOveride)
+{
+ bool flag = true;
+ ifstream inp;
+ ofstream out;
+ string outputfn = (fn==NULL ? filename : string(fn)) + ".dot";
+
+ // prepare the output file:
+ if(!toOveride) {
+ inp.open(outputfn.c_str(), ifstream::in);
+ inp.close();
+ if(!inp.fail()) {
+ cerr << "Warning: parse tree dot file exists already: " << outputfn << " ...skip" << endl;
+ return false;
+ }
+ inp.clear(ios::failbit);
+ }
+ out.open(outputfn.c_str(), ofstream::out);
+ if(out.fail()) {
+ cerr << "Error: cannot open dot file: " << outputfn << endl;
+ return false;
+ }
+
+ // Write the tree to the dot file:
+ cerr << "# Writing the parse tree to dot file: " << outputfn << endl;
+ out << "# This graph is supposed to be a parse tree." << endl;
+ out << "digraph " << filename << " {" << endl;
+ flag = flag && root->outputTree2Dot(out, 1);
+ out << "}" << endl;
+
+ // close the file:
+ out.close();
+ return flag;
}
Tree* ParseTree::line2Tree(int ln)
{
- // precondition: the line range of each node is set, and the line range of a parent node contains the line ranges of all of its children
- // Note that there may be more than one token in the same line
- return line2Tree(ln, ln);
+ // precondition: the line range of each node is set, and the line range of a parent node contains the line ranges of all of its children
+ // Note that there may be more than one token in the same line
+ return line2Tree(ln, ln);
}
Tree* ParseTree::line2Tree(int startln, int endln)
{
- // precondition: the line range of each node is set, and the line range of a parent node contains the line ranges of all of its children
- if(startln>endln || startln<0)
- return NULL;
- if(startln==0)
- return root;
- //else
- Tree* itr = root;
- while(itr!=NULL) {
+ // precondition: the line range of each node is set, and the line range of a parent node contains the line ranges of all of its children
+ if(startln>endln || startln<0)
+ return NULL;
+ if(startln==0)
+ return root;
+ //else
+ Tree* itr = root;
+ while(itr!=NULL) {
// cerr << "children size: " << itr->children.size() << ". Comparing with line range: " << itr->min << ":" << itr->max << endl;
- if(itr->min<=endln && startln<=itr->max) {
- int inRangeCount = 0;
- Tree* inRangeNode = NULL;
- for(int i=0; i<itr->children.size(); i++) {
-// cerr << "comparing with children " << i << "'s line range: " << itr->children[i]->min << ":" << itr->children[i]->max << endl;
- if(itr->children[i]->min<=endln && startln<=itr->children[i]->max) {
- inRangeCount++;
- if(inRangeCount>=2)
+ if(itr->min<=endln && startln<=itr->max) {
+ int inRangeCount = 0;
+ Tree* inRangeNode = NULL;
+ for(int i=0; i<itr->children.size(); i++) {
+// cerr << "comparing with children " << i << "'s line range: " << itr->children[i]->min << ":" << itr->children[i]->max << endl;
+ if(itr->children[i]->min<=endln && startln<=itr->children[i]->max) {
+ inRangeCount++;
+ if(inRangeCount>=2)
+ break;
+ inRangeNode = itr->children[i];
+ }
+ }
+ if(inRangeCount==1)
+ itr = inRangeNode;
+ else
break;
- inRangeNode = itr->children[i];
- }
- }
- if(inRangeCount==1)
- itr = inRangeNode;
- else
- break;
- } else
- itr = NULL;
- }
- return itr;
+ } else
+ itr = NULL;
+ }
+ return itr;
}
Tree* ParseTree::tokenRange2Tree(long startTokenId, long endTokenId)
@@ -206,18 +244,15 @@ Tree* ParseTree::getContextualNode(Tree* node)
if ( node==NULL )
return root;
- map<NodeAttributeName_t, void*>::iterator attr_itr = node->attributes.find(NODE_TOKEN_ID);
- assert ( attr_itr != node->attributes.end() );
- pair<long, long>* startrange = (pair<long, long>*)(*attr_itr).second;
if (node->parent==NULL)
return root;
Tree* startnode = node->parent;
while ( startnode!=NULL ) {
if ( isContextualNode(startnode) ) { // this condition is language-dependant
- break;
+ break;
} else
- startnode = startnode->parent;
+ startnode = startnode->parent;
}
if ( startnode==NULL )
return root;
@@ -230,6 +265,30 @@ Tree* ParseTree::getContextualNode(long startTokenId, long endTokenId)
return getContextualNode(node);
}
+int ParseTree::setNodeIDs(vector<int>& nids, const set<string>& nnames)
+{
+ int c = 0;
+ for (set<string>::const_iterator nitr = nnames.begin();
+ nitr!=nnames.end(); ++nitr) {
+ map<string, int>::iterator i= name2id.find(*nitr);
+ if (i == name2id.end()) {
+ cerr << "ERROR: ParseTree::setNodeIDs: unknown node type name: " << *nitr << endl;
+ if ( DEBUG_LEVEL>0 ) {
+ for(map<string, int>::const_iterator it = name2id.begin();
+ it!=name2id.end(); ++it) {
+ cerr << it->first << "\t" << it->second << endl;
+ }
+ assert(name2id.size()>0);
+ assert(name2id.size()==id2name.size());
+ }
+ continue;
+ }
+ nids.push_back(i->second);
+ c++;
+ }
+ return c;
+}
+
list<Tree*>* ParseTree::root2Token(long tid)
{
list<Tree*>* path = new list<Tree*>();
@@ -275,17 +334,78 @@ long ParseTree::tree2sn(Tree* nd)
}
+/*******************************************
+ * class Tree
+ */
+
+Tree::~Tree()
+{
+ /* tree nodes can not be shared: */
+ for (int i= 0; i < children.size(); i++) {
+ if ( children[i]!=NULL ) {
+ delete children[i];
+ children[i] = NULL;
+ }
+ }
+ nextSibbling = NULL;
+ parent = NULL;
+
+ /* clear up the attributes, but the types used may depend on other files, increasing chances of circular dependency.
+ * We need to include "tree vector" since we use its 'delete' operator; a simple forward declaration of the types used isn't enough.
+ * TODO: break the circular dependence in a better way.
+ */
+ std::map<NodeAttributeName_t, void*>::iterator attr_itr;
+ attr_itr = attributes.find(NODE_VECTOR);
+ if ( attr_itr!=attributes.end() ) {
+ TreeVector* attr = (TreeVector*)(*attr_itr).second;
+ if ( attr!=NULL )
+ delete attr;
+ }
+ attr_itr = attributes.find(NODE_ID);
+ if ( attr_itr!=attributes.end() ) {
+ std::pair<long, long>* attr = (std::pair<long, long>*)(*attr_itr).second;
+ if ( attr!=NULL )
+ delete attr;
+ }
+ attr_itr = attributes.find(NODE_TOKEN_ID);
+ if ( attr_itr!=attributes.end() ) {
+ std::pair<long, long>* attr = (std::pair<long, long>*)(*attr_itr).second;
+ if ( attr!=NULL )
+ delete attr;
+ }
+ attr_itr = attributes.find(NODE_SERIALIZED_NEIGHBOR);
+ if ( attr_itr!=attributes.end() ) {
+ std::pair<Tree*, Tree*>* attr = (std::pair<Tree*, Tree*>*)(*attr_itr).second;
+ if ( attr!=NULL )
+ delete attr;
+ }
+ attributes.clear();
+}
+
long Tree::dumpTree(ofstream & out, long n)
{
long c = n++;
- out << "n " << c << " " << getTypeName(id2name, type) << endl;
+ out << "n " << c << " " << getTypeName(id2name, type)
+ << " [min,max]=[" << min << "," << max << "] TC=" << terminal_number << endl;
for (int i= 0; i < children.size(); i++) {
out << "e " << c << " " << n << endl;
n = children[i]->dumpTree(out, n);
}
return n;
}
+long Tree::outputTree2Dot(ofstream & out, long n)
+{
+ long c = n++;
+ out << "nodeid" << c << " [ type=" << type << ", typeName=\"" << getTypeName(id2name, type)
+ << "\", min=" << min << ", max=" << max << ", tokenCount=" << terminal_number << " ] ;" << endl;
+ for (int i= 0; i < children.size(); i++) {
+ out << "nodeid" << c << " -> " << "nodeid" << n << endl;
+ n = children[i]->outputTree2Dot(out, n);
+ }
+ return n;
+}
+
long Tree::tree2sn(Tree* t, long& n)
{
long c = n++;
@@ -313,10 +433,10 @@ int typeCount(map<string, int>& name2id)
const string & getTypeName(map<int, string>& id2name, int id)
{
- assert( id<id2name.size() && id >=0 );
+ assert( id < typeCount(id2name) && id >= 0 );
map<int,string>::iterator i= id2name.find(id);
if (i == id2name.end()) {
- throw "not found";
+ throw "node type id not found";
} else {
return i->second;
}
@@ -361,9 +481,8 @@ ParseTree* parseFile(const char * fn)
if (!yyin) {
cerr << "Error: Can't open file for yyin: " << fn << endl;
}
- yyrestart(yyin); // This may be unnecessary because BISON's manual
- // says this is equivalent to (but I doubt it)
- // changing yyin directly.
+ yyrestart(yyin); /* This may be unnecessary because BISON's manual says
+ this is equivalent to (but I doubt it) changing yyin directly. */
yyparse();
fclose(yyin);
yyin = NULL;
@@ -399,12 +518,12 @@ static const char * contextualNodes[] = {
bool setContextualNodes() /* internal use only */
{
bool errflag = false;
- assert ( name2id.size() > 0 );
- ctxNodes = vector<bool>(id2name.size(), false);
+ assert ( ! name2id.empty() );
+ ctxNodes = vector<bool>(typeCount(id2name), false);
for (const char **s= contextualNodes; *s != NULL; s++) {
map<string,int>::iterator i= name2id.find(*s);
if (i == name2id.end()) {
- cerr << "unknown node type when setting contextual nodes: " << *s << endl;
+ cerr << "ERROR: setContextualNodes: unknown node type name when setting contextual nodes: " << *s << endl;
errflag = true;
continue;
}
@@ -415,7 +534,7 @@ bool setContextualNodes() /* internal use only */
bool isContextualNode(Tree* node)
{
- assert( node->type >= 0 && node->type < id2name.size() );
+ assert( node->type >= 0 && node->type < typeCount(id2name) );
if ( ctxNodes.empty() )
setContextualNodes();
return ctxNodes[node->type];
View
4 src/ptgen/Makefile
@@ -1,7 +1,7 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
View
17 src/ptgen/gcc/Makefile
@@ -1,7 +1,7 @@
#
#
-# Copyright (c) 2007-2012,
-# Lingxiao Jiang <lxjiang@ucdavis.edu>
+# Copyright (c) 2007-2013, University of California / Singapore Management University
+# Lingxiao Jiang <lxjiang@ucdavis.edu> <lxjiang@smu.edu.sg>
# Ghassan Misherghi <ghassanm@ucdavis.edu>
# Zhendong Su <su@ucdavis.edu>
# Stephane Glondu <steph@glondu.net>
@@ -30,18 +30,24 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
-CXX= g++ -I../../include
+
+DEBUGFLAGS?=-O3
+
+CPPFLAGS+=-I../../include
+CXX=g++
+CXXFLAGS+=$(DEBUGFLAGS)
+LDFLAGS+=$(DEBUGFLAGS)
OBJS= lex.yy.o pt_c.tab.o head.o
#TARGET=c_ptgen # this is just a test driver
TARGET=gccptgen.a
all: $(TARGET)
gccptgen.a: $(OBJS)
- ar -qcs $@ $(OBJS)
+ ar -csrv $@ $(OBJS)
c_ptgen: $(OBJS) main.o
- $(CXX) -o c_ptgen $(OBJS) main.o
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -o c_ptgen $(OBJS) main.o
#${TARGET}: ${OBJS} main.o
# $(CXX) -o ${TARGET} ${OBJS} main.o
@@ -58,3 +64,4 @@ head.cc pt_c.y: c.y c.y.head c.y.foot
.PHONY: clean
clean:
rm -f *.o lex.yy.cc pt_c.tab* pt_c.y head.cc $(TARGET)
+
View
20 src/ptgen/gcc/c.l
@@ -145,7 +145,22 @@ default {
count();
yylvalp->t = new Terminal(name2id["TYPENAME"],yytext,line);
return(TYPENAME); }
-"__init"|"__iomem"|"__user"|"__exit"|"__devexit"|"__devinit"|"__cpuinit"|"__cpuexit"|"__INLINE__"|"__kprobes"|"__inline__"|"__lockfunc"|"__force"|"__sched"|"__deprecated"|"__memzero"|"__pminit"|"__weak"|"__xipram"|"__attribute_used__"|"__apicdebuginit"|"__unused"|"__initdata" {count();}
+"__init"|"__iomem"|"__user"|"__exit"|"__devexit"|"__devinit"|"__devinitdata"|"__cpuinit"|"__cpuexit"|"__INLINE__"|"__kprobes"|"__inline__"|"__lockfunc"|"__force"|"__sched"|"__deprecated"|"__memzero"|"__pminit"|"__weak"|"__xipram"|"__attribute_used__"|"__apicdebuginit"|"__unused"|"__maybe_unused"|"__initdata" {count();}
+
+"__"{L}+({D})+ {
+ count();
+ yylvalp->t = new Terminal(name2id["TYPENAME"],yytext,line);
+ return(TYPENAME); }
+
+"u"({D})* {
+ count();
+ yylvalp->t = new Terminal(name2id["TYPENAME"],yytext,line);
+ return(TYPENAME); }
+
+"s"({D})* {
+ count();
+ yylvalp->t = new Terminal(name2id["TYPENAME"],yytext,line);
+ return(TYPENAME); }
{L}({L}|{D})* {
count();
@@ -379,7 +394,8 @@ void macro()
again:
last= 0;
while ((c = yyinput()) != '\n' && c != 0 && c!=EOF) {