From b4cd0b623c953979cd4f0419554dfb2fe8eb8ee2 Mon Sep 17 00:00:00 2001 From: Raul Martinez Jr Date: Fri, 5 Aug 2011 14:31:01 +0800 Subject: [PATCH 1/3] Adding MoreLikeThis Query and Unit Test --- lib/Elastica/Query/MoreLikeThis.php | 244 +++++++++++++++++++ test/lib/Elastica/Query/MoreLikeThisTest.php | 54 ++++ 2 files changed, 298 insertions(+) create mode 100644 lib/Elastica/Query/MoreLikeThis.php create mode 100644 test/lib/Elastica/Query/MoreLikeThisTest.php diff --git a/lib/Elastica/Query/MoreLikeThis.php b/lib/Elastica/Query/MoreLikeThis.php new file mode 100644 index 0000000000..80acfe5c24 --- /dev/null +++ b/lib/Elastica/Query/MoreLikeThis.php @@ -0,0 +1,244 @@ + + * @link http://www.elasticsearch.org/guide/reference/query-dsl/mlt-query.html + */ +class Elastica_Query_MoreLikeThis extends Elastica_Query_Abstract +{ + /** + * @var array + */ + protected $_fields = array(); + + /** + * @var string + */ + protected $_likeText = null; + + protected $_percTermsToMatch = 0.3; + protected $_minTermFreq = 2; + protected $_maxQueryTerms = 25; + protected $_minDocFreq = 5; + protected $_maxDocFreq = null; + protected $_minWordLen = 0; + protected $_maxWordLen = 0; + protected $_boostTerms = 1; + protected $_boost = 1.0; + + /** + * @var string + */ + protected $_analyzer = null; + + /** + * @var array + */ + protected $_stopWords = null; + + + /** + * Adds field to flt query + * + * @param array $fields Field names + * @return Elastica_Query_FuzzyLikeThis Current object + */ + public function addFields(Array $fields) { + $this->_fields = $fields; + return $this; + } + + /** + * Set the "like_text" value + * + * @param string $text + * @return Elastica_Query_FuzzyLikeThis This current object + */ + public function setLikeText($text) { + $text = trim($text); + $this->_likeText = $text; + return $this; + } + + /** + * @param float $value Boost value + * @return Elastica_Query_FuzzyLikeThis Query object + */ + public function setBoost($value) { + $this->_boost = (float) $value; + return $this; + } + + /** + * Set max_query_terms + * + * @param int $value Max query terms value + * @return Elastica_Query_FuzzyLikeThis + */ + public function setMaxQueryTerms($value) { + $this->_maxQueryTerms = (int)$value; + return $this; + } + + + /** + * @param float $perc percentage + * @return Elastica_Query_MoreLikeThis + */ + public function setPercentTermsToMatch( $perc ) { + $perc= (float)$perc; + + $this->_percTermsToMatch = $perc; + + return $this; + } + + /** + * @param int $value + * @return Elastica_Query_MoreLikeThis + */ + public function setMinTermFrequency( $value ) { + $value = (int)$value; + if ($value < 0) { + $value = 0; + } + + $this->_minTermFreq = $value; + + return $this; + } + + + /** + * @param int $value + * @return Elastica_Query_MoreLikeThis + */ + public function setMinDocFrequency( $value ) { + $value = (int)$value; + $value = ($value < 0) ? 1 : $value; + + $this->_minDocFreq = $value; + return $this; + } + + /** + * @param int $value + * @return Elastica_Query_MoreLikeThis + */ + public function setMaxDocFrequency($value) { + $value = (int)$value; + $value = ($value < 0) ? 1 : $value; + + $this->_maxDocFreq = $value; + return $this; + } + + + /** + * @param int $value + * @return Elastica_Query_MoreLikeThis + */ + public function setMinWordLength( $value ) { + $value = (int)$value; + $value = ($value < 0) ? 1 : $value; + + $this->_minWordLen = $value; + return $this; + } + + /** + * @param int $value + * @return Elastica_Query_MoreLikeThis + */ + public function setMaxWordLength($value) { + $value = (int)$value; + $value = ($value < 0) ? 1 : $value; + + $this->_maxWordLen = $value; + return $this; + } + + + /** + * + * @param int $value; + * @link http://www.elasticsearch.org/guide/reference/query-dsl/mlt-query.html + * @return void + */ + public function setBoostTerms($value) { + $value = (int)$value; + $value = ($value < 0) ? 1 : $value; + $this->_boostTerms = $value; + return $this; + } + + + /** + * @param string $value + * @return void + */ + public function setAnalyzer( $value ) { + $value = trim($value); + if (!empty($value)) { + $this->_analyzer = $value; + } + + return $this; + } + + + /** + * @param array $words + * @return Elastica_Query_MoreLikeThis + */ + public function setStopWords(Array $words) { + $this->_stopWords = $words; + return $this; + } + + /** + * Converts fuzzy like this query to array + * + * @return array Query array + * @see Elastica_Query_Abstract::toArray() + */ + public function toArray() { + + if (!empty($this->_fields)) { + $args['fields'] = $this->_fields; + } + + if (!empty($this->_boost)) { + $args['boost'] = $this->_boost; + } + + if (!empty($this->_likeText)) { + $args['like_text'] = $this->_likeText; + } + + $args['max_query_terms'] = $this->_maxQueryTerms; + + $args['percent_terms_to_match'] = $this->_percTermsToMatch; + $args['min_term_freq'] = $this->_minTermFreq; + + if (!empty($this->_stopWords)) { + $args['stop_words'] = $this->_stopWords; + } + + if (!empty($this->_analyzer)) { + $args['analyzer'] = $this->_analyzer; + } + + $args['min_doc_freq'] = $this->_minDocFreq; + $args['max_doc_freq'] = $this->_maxDocFreq; + $args['min_word_len'] = $this->_minWordLen; + $args['max_word_len'] = $this->_maxWordLen; + $args['boost_terms'] = $this->_boostTerms; + + return array('mlt' => $args); + } +} diff --git a/test/lib/Elastica/Query/MoreLikeThisTest.php b/test/lib/Elastica/Query/MoreLikeThisTest.php new file mode 100644 index 0000000000..c88f798d2b --- /dev/null +++ b/test/lib/Elastica/Query/MoreLikeThisTest.php @@ -0,0 +1,54 @@ +create(array(), true); + $index->getSettings()->setNumberOfReplicas(0); + //$index->getSettings()->setNumberOfShards(1); + + $type = new Elastica_Type($index, 'helloworldmlt'); + $mapping = new Elastica_Type_Mapping($type , array( + 'email' => array('store' => 'yes', 'type' => 'string', 'index' => 'analyzed'), + 'content' => array('store' => 'yes', 'type' => 'string', 'index' => 'analyzed'), + )); + + $mapping->setSource(array('enabled' => false)); + $type->setMapping($mapping); + + + $doc = new Elastica_Document(1000, array('email' => 'testemail@gmail.com', 'content' => 'This is a sample post. Hello World Fuzzy Like This!')); + $type->addDocument($doc); + + $doc = new Elastica_Document(1001, array('email' => 'nospam@gmail.com', 'content' => 'This is a fake nospam email address for gmail')); + $type->addDocument($doc); + + // Refresh index + $index->refresh(); + + $mltQuery = new Elastica_Query_MoreLikeThis(); + $mltQuery->setLikeText("fake gmail sample"); + $mltQuery->addFields(array("email","content")); + $mltQuery->setMaxQueryTerms(1); + $mltQuery->setMinDocFrequency(1); + $mltQuery->setMinTermFrequency(1); + + $query = new Elastica_Query(); + $query->setFields(array("email", "content")); + $query->setQuery($mltQuery); + + $resultSet = $type->search($query); + $resultSet->getResponse()->getData(); + $this->assertEquals(2, $resultSet->count()); + } +} From 29ec490e23899d2db5a53944e6765d717bf8d8d4 Mon Sep 17 00:00:00 2001 From: Raul Martinez Jr Date: Fri, 5 Aug 2011 22:31:48 +0800 Subject: [PATCH 2/3] Fixes based on the code review comments --- lib/Elastica/Query/MoreLikeThis.php | 38 +++++++++++++++++------------ 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/lib/Elastica/Query/MoreLikeThis.php b/lib/Elastica/Query/MoreLikeThis.php index 80acfe5c24..5d0010b56b 100644 --- a/lib/Elastica/Query/MoreLikeThis.php +++ b/lib/Elastica/Query/MoreLikeThis.php @@ -26,7 +26,7 @@ class Elastica_Query_MoreLikeThis extends Elastica_Query_Abstract protected $_minDocFreq = 5; protected $_maxDocFreq = null; protected $_minWordLen = 0; - protected $_maxWordLen = 0; + protected $_maxWordLen = null; protected $_boostTerms = 1; protected $_boost = 1.0; @@ -45,7 +45,7 @@ class Elastica_Query_MoreLikeThis extends Elastica_Query_Abstract * Adds field to flt query * * @param array $fields Field names - * @return Elastica_Query_FuzzyLikeThis Current object + * @return Elastica_Query_MoreLikeThis Current object */ public function addFields(Array $fields) { $this->_fields = $fields; @@ -56,7 +56,7 @@ public function addFields(Array $fields) { * Set the "like_text" value * * @param string $text - * @return Elastica_Query_FuzzyLikeThis This current object + * @return Elastica_Query_MoreLikeThis This current object */ public function setLikeText($text) { $text = trim($text); @@ -66,7 +66,7 @@ public function setLikeText($text) { /** * @param float $value Boost value - * @return Elastica_Query_FuzzyLikeThis Query object + * @return Elastica_Query_MoreLikeThis Query object */ public function setBoost($value) { $this->_boost = (float) $value; @@ -77,7 +77,7 @@ public function setBoost($value) { * Set max_query_terms * * @param int $value Max query terms value - * @return Elastica_Query_FuzzyLikeThis + * @return Elastica_Query_MoreLikeThis */ public function setMaxQueryTerms($value) { $this->_maxQueryTerms = (int)$value; @@ -119,7 +119,7 @@ public function setMinTermFrequency( $value ) { */ public function setMinDocFrequency( $value ) { $value = (int)$value; - $value = ($value < 0) ? 1 : $value; + $value = ($value < 0) ? 5 : $value; $this->_minDocFreq = $value; return $this; @@ -131,7 +131,7 @@ public function setMinDocFrequency( $value ) { */ public function setMaxDocFrequency($value) { $value = (int)$value; - $value = ($value < 0) ? 1 : $value; + $value = ($value < 0) ? null : $value; $this->_maxDocFreq = $value; return $this; @@ -144,7 +144,7 @@ public function setMaxDocFrequency($value) { */ public function setMinWordLength( $value ) { $value = (int)$value; - $value = ($value < 0) ? 1 : $value; + $value = ($value <= 0) ? 0 : $value; $this->_minWordLen = $value; return $this; @@ -156,7 +156,7 @@ public function setMinWordLength( $value ) { */ public function setMaxWordLength($value) { $value = (int)$value; - $value = ($value < 0) ? 1 : $value; + $value = ($value < 0) ? null : $value; $this->_maxWordLen = $value; return $this; @@ -167,7 +167,7 @@ public function setMaxWordLength($value) { * * @param int $value; * @link http://www.elasticsearch.org/guide/reference/query-dsl/mlt-query.html - * @return void + * @return Elastica_Query_MoreLikeThis */ public function setBoostTerms($value) { $value = (int)$value; @@ -179,7 +179,7 @@ public function setBoostTerms($value) { /** * @param string $value - * @return void + * @return Elastica_Query_MoreLikeThis */ public function setAnalyzer( $value ) { $value = trim($value); @@ -192,7 +192,7 @@ public function setAnalyzer( $value ) { /** - * @param array $words + * @param Array $words * @return Elastica_Query_MoreLikeThis */ public function setStopWords(Array $words) { @@ -201,9 +201,9 @@ public function setStopWords(Array $words) { } /** - * Converts fuzzy like this query to array + * Converts more_like_this query to array * - * @return array Query array + * @return Array Query array * @see Elastica_Query_Abstract::toArray() */ public function toArray() { @@ -234,9 +234,15 @@ public function toArray() { } $args['min_doc_freq'] = $this->_minDocFreq; - $args['max_doc_freq'] = $this->_maxDocFreq; + + if ($this->_maxDocFreq == null) { + $args['max_doc_freq'] = $this->_maxDocFreq; + } + $args['min_word_len'] = $this->_minWordLen; - $args['max_word_len'] = $this->_maxWordLen; + if ($this->_maxWordLen == null) { + $args['max_word_len'] = $this->_maxWordLen; + } $args['boost_terms'] = $this->_boostTerms; return array('mlt' => $args); From 0aeb7e3dfd0bf8d792b9d919c8a450391c6edc24 Mon Sep 17 00:00:00 2001 From: Raul Martinez Jr Date: Fri, 5 Aug 2011 23:28:17 +0800 Subject: [PATCH 3/3] Small fix to not include a parameter with null value --- lib/Elastica/Query/MoreLikeThis.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Elastica/Query/MoreLikeThis.php b/lib/Elastica/Query/MoreLikeThis.php index 5d0010b56b..600a4d540e 100644 --- a/lib/Elastica/Query/MoreLikeThis.php +++ b/lib/Elastica/Query/MoreLikeThis.php @@ -235,12 +235,12 @@ public function toArray() { $args['min_doc_freq'] = $this->_minDocFreq; - if ($this->_maxDocFreq == null) { + if ($this->_maxDocFreq != null) { $args['max_doc_freq'] = $this->_maxDocFreq; } $args['min_word_len'] = $this->_minWordLen; - if ($this->_maxWordLen == null) { + if ($this->_maxWordLen != null) { $args['max_word_len'] = $this->_maxWordLen; } $args['boost_terms'] = $this->_boostTerms;