Merge branch 'aae-3'

commit 7a32c6fce7b3d30de0f5f3f0d7a9a6ac29460f9f (2 parents: 7de5a0b + eda7f66)
Ryan Zezeski authored
23 include/yokozuna.hrl
@@ -96,6 +96,8 @@
-define(INT_TO_STR(I), integer_to_list(I)).
-define(PARTITION_BINARY(S), S#state.partition_binary).
+-define(DATA_DIR, application:get_env(riak_core, platform_data_dir)).
+
-define(YZ_DEFAULT_SOLR_PORT, "8983").
-define(YZ_DEFAULT_SOLR_STARTUP_WAIT, 15).
-define(YZ_DEFAULT_TICK_INTERVAL, 60000).
@@ -114,6 +116,22 @@
"Not enough nodes are up to service this request.").
%%%===================================================================
+%%% Anti Entropy
+%%%===================================================================
+
+-define(YZ_AE_DIR,
+ application:get_env(?YZ_APP_NAME, anti_entropy_data_dir)).
+-define(YZ_ENTROPY_TICK,
+ app_helper:get_env(?YZ_APP_NAME, entropy_tick, 60000)).
+
+-type hashtree() :: hashtree:hashtree().
+-type exchange() :: {p(), {p(), n()}}.
+-type exchange_mode() :: automatic | manual.
+-type tree() :: pid().
+-type trees() :: orddict(p(), tree()).
+
+
+%%%===================================================================
%%% Riak KV
%%%===================================================================
@@ -164,6 +182,9 @@
-type index_info() :: #index_info{}.
-type index_name() :: string().
+-define(YZ_DEFAULT_INDEX, "_yz_default").
+-define(YZ_INDEX_CONTENT, yz_index_content).
+
%%%===================================================================
%%% Schemas
%%%===================================================================
@@ -204,3 +225,5 @@
%% Riak key
-define(YZ_RK_FIELD, '_yz_rk').
+-define(YZ_RK_FIELD_S, "_yz_rk").
+-define(YZ_RK_FIELD_B, <<"_yz_rk">>).
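
The two new data-dir macros expand to application:get_env calls, so they
resolve at runtime and yield {ok, Value} or undefined rather than a bare
value. A minimal sketch of consuming ?YZ_AE_DIR (determine_data_root/0 in
yz_index_hashtree.erl below does the same {ok, Dir} / undefined handling;
ae_dir_or/1 here is purely illustrative):

    %% Sketch only: ?YZ_AE_DIR expands to application:get_env/2,
    %% yielding {ok, Dir} when configured and undefined otherwise.
    ae_dir_or(Default) ->
        case ?YZ_AE_DIR of
            {ok, Dir} -> Dir;
            undefined -> Default
        end.
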
81 priv/java/com/basho/yokozuna/handler/EntropyData.java
@@ -16,16 +16,18 @@
package com.basho.yokozuna.handler;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
+import java.io.IOException;
import org.apache.commons.codec.binary.Base64;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@@ -73,6 +75,12 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
throw new Exception("Parameter 'before' is required");
}
int n = req.getParams().getInt("n", DEFAULT_N);
+
+ String partition = req.getParams().get("partition");
+ if (partition == null) {
+ throw new Exception("Parameter 'partition' is required");
+ }
+
SolrDocumentList docs = new SolrDocumentList();
// Add docs here and modify object inline in code
@@ -83,6 +91,12 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
AtomicReader rdr = searcher.getAtomicReader();
BytesRef tmp = null;
Terms terms = rdr.terms(ENTROPY_DATA_FIELD);
+
+ if (terms == null) {
+ rsp.add("more", false);
+ return;
+ }
+
TermsEnum te = terms.iterator(null);
if (isContinue(cont)) {
@@ -111,32 +125,43 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
String text = null;
String[] vals = null;
String ts = null;
- String docId = null;
- String vectorClock = null;
+ String docPartition = null;
+ String riakBucket = null;
+ String riakKey = null;
+ String hash = null;
int count = 0;
BytesRef current = null;
+ DocsEnum de = null;
+ Bits liveDocs = rdr.getLiveDocs();
while(!endOfItr(tmp) && count < n) {
- current = BytesRef.deepCopyOf(tmp);
- text = tmp.utf8ToString();
- log.debug("text: " + text);
- vals = text.split(" ");
- ts = vals[0];
-
- // TODO: what if null?
- if (! (ts.compareTo(before) < 0)) {
- rsp.add("more", false);
- docs.setNumFound(count);
- return;
- }
+ if (isLive(liveDocs, te)) {
+ current = BytesRef.deepCopyOf(tmp);
+ text = tmp.utf8ToString();
+ log.debug("text: " + text);
+ vals = text.split(" ");
+ ts = vals[0];
+
+ // TODO: what if null?
+ if (! (ts.compareTo(before) < 0)) {
+ rsp.add("more", false);
+ docs.setNumFound(count);
+ return;
+ }
- docId = vals[1];
- vectorClock = vals[2];
- SolrDocument tmpDoc = new SolrDocument();
- tmpDoc.addField("doc_id", docId);
- tmpDoc.addField("base64_vclock", Base64.encodeBase64String(sha(vectorClock)));
- docs.add(tmpDoc);
- count++;
+ docPartition = vals[1];
+ riakBucket = vals[2];
+ riakKey = vals[3];
+ hash = vals[4];
+ if (partition.equals(docPartition)) {
+ SolrDocument tmpDoc = new SolrDocument();
+ tmpDoc.addField("riak_bucket", riakBucket);
+ tmpDoc.addField("riak_key", riakKey);
+ tmpDoc.addField("base64_hash", hash);
+ docs.add(tmpDoc);
+ count++;
+ }
+ }
tmp = te.next();
}
@@ -157,6 +182,10 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
}
}
+ static boolean isLive(Bits liveDocs, TermsEnum te) throws IOException {
+ DocsEnum de = te.docs(liveDocs, null);
+ return de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+ }
static BytesRef decodeCont(String cont) {
byte[] bytes = Base64.decodeBase64(cont);
@@ -167,12 +196,6 @@ static boolean endOfItr(BytesRef returnValue) {
return returnValue == null;
}
- static byte[] sha(String s) throws NoSuchAlgorithmException {
- MessageDigest md = MessageDigest.getInstance("SHA");
- md.update(s.getBytes());
- return md.digest();
- }
-
static boolean isContinue(BytesRef cont) {
return DEFAULT_CONT != cont;
}
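
On the Erlang side, the partition parameter this handler now requires is
threaded through yz_solr:get_vclocks/5 together with the paging
continuation (the call shape appears in yz_entropy.erl below). A hedged
sketch of fetching one page of entropy data (Index, Before and
LogicalPartition stand for real values):

    %% Sketch, reusing the call shape from this commit: fetch up to 100
    %% entropy-data entries, restricted to a single logical partition.
    Filter = [{partition, LogicalPartition}],
    SV = yz_solr:get_vclocks(Index, Before, Filter, none, 100).
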
2  rebar.config
@@ -4,8 +4,6 @@
{deps,
[
- %% {esolr, ".*",
- %% {git, "git://github.com/lennart/esolr.git", {branch, "master"}}},
{ibrowse, ".*",
{git, "git://github.com/cmullaparthi/ibrowse.git", {tag, "v3.0.4"}}},
{lager, ".*",
118 riak_test/yokozuna_essential.erl
@@ -7,22 +7,43 @@
-define(INDEX_B, <<"fruit">>).
-define(NUM_KEYS, 10000).
-define(SUCCESS, 0).
+-define(CFG,
+ [{riak_kv,
+ [
+ %% build/expire often
+ {anti_entropy_build_limit, {100, 1000}},
+ {anti_entropy_expire, 10000},
+ {anti_entropy_concurrency, 12}
+ ]},
+ {yokozuna,
+ [
+ {entropy_tick, 1000}
+ ]},
+ {lager,
+ [{handlers,
+ [{lager_file_backend,
+ [{"./log/error.log",error,10485760,"$D0",5},
+ {"./log/console.log",debug,104857600,"$D0",10}]}]}]}
+ ]).
confirm() ->
YZBenchDir = rt:get_os_env("YZ_BENCH_DIR"),
code:add_path(filename:join([YZBenchDir, "ebin"])),
random:seed(now()),
- Nodes = rt:deploy_nodes(4),
+ Nodes = rt:deploy_nodes(4, ?CFG),
Cluster = join_three(Nodes),
wait_for_joins(Cluster),
setup_indexing(Cluster, YZBenchDir),
- load_data(Cluster, YZBenchDir),
+ load_data(Cluster, "fruit", YZBenchDir),
Ref = async_query(Cluster, YZBenchDir),
+ %% Verify data exists before running join
+ timer:sleep(10000),
Cluster2 = join_rest(Cluster, Nodes),
check_status(wait_for(Ref)),
ok = test_tagging(Cluster),
KeysDeleted = delete_some_data(Cluster2, reap_sleep()),
verify_deletes(Cluster2, KeysDeleted, YZBenchDir),
+ ok = verify_aae(Cluster2, YZBenchDir),
ok = test_siblings(Cluster),
pass.
@@ -104,6 +125,60 @@ allow_mult(Cluster, Bucket) ->
%% end || N <- Cluster],
ok.
+verify_aae(Cluster, YZBenchDir) ->
+ lager:info("Verify AAE"),
+ load_data(Cluster, "fruit_aae", YZBenchDir),
+ Keys = random_keys(),
+ {DelKeys, _ChangeKeys} = lists:split(length(Keys) div 2, Keys),
+ [ok = delete_ids(Cluster, "fruit_aae", K) || K <- DelKeys],
+ %% wait for soft commit
+ timer:sleep(1000),
+ %% ok = change_random_ids(Cluster, ChangeKeys),
+ HP = hd(host_entries(rt:connection_info(Cluster))),
+ ok = wait_for_aae(HP, "fruit_aae", ?NUM_KEYS).
+
+wait_for_aae(HP, Index, ExpectedNumFound) ->
+ wait_for_aae(HP, Index, ExpectedNumFound, 0).
+
+wait_for_aae(_, Index, _, 24) ->
+ lager:error("Hit limit waiting for AAE to repair indexes for ~p", [Index]),
+ aae_failed;
+wait_for_aae(HP, Index, ExpectedNumFound, Tries) ->
+ case search(HP, "fruit_aae", "text", "apricot", ExpectedNumFound) of
+ true -> ok;
+ _ ->
+ timer:sleep(5000),
+ wait_for_aae(HP, Index, ExpectedNumFound, Tries + 1)
+ end.
+
+delete_ids(Cluster, Index, Key) ->
+ BKey = {list_to_binary(Index), list_to_binary(Key)},
+ Node = hd(Cluster),
+ Preflist = get_preflist(Node, BKey),
+ SolrIds = solr_ids(Node, Preflist, BKey),
+ ok = solr_delete(Cluster, SolrIds).
+
+get_preflist(Node, BKey) ->
+ Ring = rpc:call(Node, yz_misc, get_ring, [transformed]),
+ DocIdx = rpc:call(Node, riak_core_util, chash_std_keyfun, [BKey]),
+ Preflist = rpc:call(Node, riak_core_ring, preflist, [DocIdx, Ring]),
+ lists:sublist(Preflist, 3).
+
+solr_ids(Node, Preflist, {B,K}) ->
+ LPL = rpc:call(Node, yz_misc, convert_preflist, [Preflist, logical]),
+ [begin
+ Suffix = "_" ++ integer_to_list(P),
+ {binary_to_list(B), binary_to_list(K) ++ Suffix}
+ end
+ || {P,_} <- LPL].
+
+solr_delete(Cluster, SolrIds) ->
+ [begin
+ lager:info("Deleting solr id ~p/~p", [B, Id]),
+ rpc:multicall(Cluster, yz_solr, delete, [B, Id])
+ end|| {B, Id} <- SolrIds],
+ ok.
+
test_tagging(Cluster) ->
lager:info("Test tagging"),
HP = hd(host_entries(rt:connection_info(Cluster))),
@@ -113,7 +188,8 @@ test_tagging(Cluster) ->
R1 = search(HP, "tagging", "user_s", "rzezeski"),
verify_count(1, R1),
R2 = search(HP, "tagging", "desc_t", "description"),
- verify_count(1, R2).
+ verify_count(1, R2),
+ ok.
write_with_tag({Host, Port}) ->
lager:info("Tag the object tagging/test"),
@@ -128,6 +204,10 @@ write_with_tag({Host, Port}) ->
{ok, "204", _, _} = ibrowse:send_req(URL, Headers, put, Body, Opts),
ok.
+search(HP, Index, Name, Term, Expect) ->
+ R = search(HP, Index, Name, Term),
+ verify_count(Expect, R).
+
search({Host, Port}, Index, Name, Term) ->
URL = lists:flatten(io_lib:format("http://~s:~s/search/~s?q=~s:~s&wt=json",
[Host, integer_to_list(Port), Index, Name, Term])),
@@ -141,10 +221,12 @@ search({Host, Port}, Index, Name, Term) ->
{bad_response, Other}
end.
-verify_count(Expected, Resp) ->
+get_count(Resp) ->
Struct = mochijson2:decode(Resp),
- NumFound = yz_driver:get_path(Struct, [<<"response">>, <<"numFound">>]),
- ?assertEqual(Expected, NumFound).
+ yz_driver:get_path(Struct, [<<"response">>, <<"numFound">>]).
+
+verify_count(Expected, Resp) ->
+ Expected == get_count(Resp).
async_query(Cluster, YZBenchDir) ->
lager:info("Run async query against cluster ~p", [Cluster]),
@@ -184,10 +266,8 @@ delete_key(Cluster, Key) ->
C:delete(?INDEX_B, list_to_binary(Key)).
delete_some_data(Cluster, ReapSleep) ->
- Num = random:uniform(100),
- lager:info("Deleting ~p keys", [Num]),
- Keys = [integer_to_list(random:uniform(?NUM_KEYS))
- || _ <- lists:seq(1, Num)],
+ Keys = random_keys(),
+ lager:info("Deleting ~p keys", [length(Keys)]),
[delete_key(Cluster, K) || K <- Keys],
lager:info("Sleeping ~ps to allow for reap", [ReapSleep]),
timer:sleep(timer:seconds(ReapSleep)),
@@ -210,8 +290,8 @@ join_rest([NodeA|_]=Cluster, Nodes) ->
[begin rt:join(Node, NodeA) end || Node <- ToJoin],
Nodes.
-load_data(Cluster, YZBenchDir) ->
- lager:info("Load data onto cluster ~p", [Cluster]),
+load_data(Cluster, Index, YZBenchDir) ->
+ lager:info("Load data for index ~p onto cluster ~p", [Index, Cluster]),
Hosts = host_entries(rt:connection_info(Cluster)),
KeyGen = {function, yz_driver, fruit_key_val_gen, [?NUM_KEYS]},
Cfg = [{mode,max},
@@ -219,12 +299,12 @@ load_data(Cluster, YZBenchDir) ->
{concurrent, 3},
{code_paths, [YZBenchDir]},
{driver, yz_driver},
- {index_path, "/riak/fruit"},
+ {index_path, "/riak/" ++ Index},
{http_conns, Hosts},
{pb_conns, []},
{key_generator, KeyGen},
{operations, [{load_fruit, 1}]}],
- File = "bb-load-" ++ ?INDEX_S,
+ File = "bb-load-" ++ Index,
write_terms(File, Cfg),
run_bb(sync, File).
@@ -257,6 +337,8 @@ setup_indexing(Cluster, YZBenchDir) ->
ok = store_schema(Node, ?FRUIT_SCHEMA_NAME, RawSchema),
ok = create_index(Node, ?INDEX_S, ?FRUIT_SCHEMA_NAME),
ok = install_hook(Node, ?INDEX_B),
+ ok = create_index(Node, "fruit_aae", ?FRUIT_SCHEMA_NAME),
+ ok = install_hook(Node, <<"fruit_aae">>),
ok = create_index(Node, "tagging"),
ok = install_hook(Node, <<"tagging">>),
ok = create_index(Node, "siblings"),
@@ -293,6 +375,7 @@ wait_for(Ref) ->
rt:wait_for_cmd(Ref).
wait_for_joins(Cluster) ->
+ lager:info("Waiting for ownership handoff to finish"),
rt:wait_until_nodes_ready(Cluster),
rt:wait_until_no_pending_changes(Cluster).
@@ -300,3 +383,10 @@ write_terms(File, Terms) ->
{ok, IO} = file:open(File, [write]),
[io:fwrite(IO, "~p.~n", [T]) || T <- Terms],
file:close(IO).
+
+random_keys() ->
+ random_keys(random:uniform(100)).
+
+random_keys(Num) ->
+ lists:usort([integer_to_list(random:uniform(?NUM_KEYS))
+ || _ <- lists:seq(1, Num)]).
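
The Solr ids rebuilt by solr_ids/3 line up with doc_id/2 in yz_doc.erl:
each replica is indexed under the Riak key suffixed with its logical
partition. Illustrative (hypothetical) values:

    %% For key <<"1234">> on logical partition 42, doc_id/2 yields
    %% <<"1234_42">>; solr_ids/3 reconstructs "1234_42" so the test can
    %% delete that one replica's document directly, bypassing KV.
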
64 src/yokozuna.erl
@@ -27,11 +27,6 @@
%%% API
%%%===================================================================
-%% @doc Index the given object `O'.
--spec index(string(), riak_object:riak_object()) -> ok | {error, term()}.
-index(Index, O) ->
- yz_solr:index(Index, yz_doc:make_docs(O, <<"FPN">>, <<"Partition">>)).
-
%% @doc Return the set of unique logical partitions stored on this
%% node for the given `Index'.
-spec partition_list(string()) -> ordset(lp()).
@@ -51,62 +46,3 @@ search(Index, Query, Mapping) ->
solr_port(Node, Ports) ->
proplists:get_value(Node, Ports).
-
-%%%===================================================================
-%%% Private
-%%%===================================================================
-
-test_it(Index) ->
- B = <<"fruit">>,
- O1 = riak_object:new(B, <<"apples">>, <<"2">>),
- O2 = riak_object:new(B, <<"oranges">>, <<"1">>),
- O3 = riak_object:new(B, <<"strawberries">>, <<"6">>),
- O4 = riak_object:new(B, <<"lemons">>, <<"1">>),
- O5 = riak_object:new(B, <<"celery">>, <<"4">>),
- O6 = riak_object:new(B, <<"lime">>, <<"1">>),
- [index(Index, O) || O <- [O1, O2, O3, O4, O5, O6]],
- yz_solr:commit(Index).
-
-demo_write_objs(Index) ->
- ibrowse:start(),
- write_n_objs(Index, 1000),
- yz_solr:commit(Index).
-
-demo_build_tree(Index, Name) ->
- ibrowse:start(),
- TP = yz_entropy:new_tree_proc(Index, Name),
- Pid = element(3, TP),
- Ref = make_ref(),
- Pid ! {get_tree, self(), Ref},
- receive {tree, Ref, Tree} -> Tree end,
- %% returning TreeProc too in case want to play with it
- {Tree, TP}.
-
-demo_new_vclock(Index, N) ->
- %% the timestamp will change causing hash to change
- NS = list_to_binary(integer_to_list(N)),
- B = <<"test">>,
- K = <<"key_",NS/binary>>,
- V = <<"val_",NS/binary>>,
- O = riak_object:new(B, K, V),
- O2 = riak_object:increment_vclock(O, dummy_node),
- index(Index, O2),
- yz_solr:commit(Index).
-
-demo_delete(Index, N) ->
- NS = integer_to_list(N),
- K = "key_" ++ NS,
- ok = yz_solr:delete(Index, K),
- ok = yz_solr:commit(Index).
-
-write_n_objs(_, 0) ->
- ok;
-write_n_objs(Index, N) ->
- NS = list_to_binary(integer_to_list(N)),
- B = <<"test">>,
- K = <<"key_",NS/binary>>,
- V = <<"val_",NS/binary>>,
- O = riak_object:new(B, K, V),
- O2 = riak_object:increment_vclock(O, dummy_node),
- index(Index, O2),
- write_n_objs(Index, N-1).
12 src/yokozuna_sup.erl
@@ -45,4 +45,14 @@ init(_Args) ->
{yz_events, start_link, []},
permanent, 5000, worker, [yz_events]},
- {ok, {{one_for_one, 5, 10}, [SolrProc, Events]}}.
+ HashtreeSup = {yz_index_hashtree_sup,
+ {yz_index_hashtree_sup, start_link, []},
+ permanent, infinity, supervisor, [yz_index_hashtree_sup]},
+
+ EntropyMgr = {yz_entropy_mgr,
+ {yz_entropy_mgr, start_link, []},
+ permanent, 5000, worker, [yz_entropy_mgr]},
+
+ Children = [SolrProc, Events, HashtreeSup, EntropyMgr],
+
+ {ok, {{one_for_one, 5, 10}, Children}}.
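
For reference, the resulting supervision tree (a sketch; names are the
child-spec variables above, and yz_index_hashtree_sup starts its children
dynamically via supervisor:start_child/2, per yz_index_hashtree:start/2
below):

    yokozuna_sup (one_for_one)
    |-- SolrProc
    |-- Events        (yz_events)
    |-- HashtreeSup   (yz_index_hashtree_sup; one yz_index_hashtree
    |                  per owned partition)
    `-- EntropyMgr    (yz_entropy_mgr)
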
57 src/yz_doc.erl
@@ -33,9 +33,9 @@
add_to_doc({doc, Fields}, Field) ->
{doc, [Field|Fields]}.
--spec doc_id(riak_object:riak_object(), binary()) -> binary().
+-spec doc_id(obj(), binary()) -> binary().
doc_id(O, Partition) ->
- <<(riak_object:key(O))/binary,"_",Partition/binary>>.
+ <<(yz_kv:get_obj_key(O))/binary,"_",Partition/binary>>.
doc_id(O, Partition, none) ->
doc_id(O, Partition);
@@ -47,29 +47,36 @@ doc_id(O, Partition, Sibling) ->
has_siblings(O) -> riak_object:value_count(O) > 1.
%% @doc Given an object generate the doc to be indexed by Solr.
--spec make_docs(riak_object:riak_object(), binary(), binary()) -> [doc()].
-make_docs(O, FPN, Partition) ->
- [make_doc(O, Content, FPN, Partition) || Content <- riak_object:get_contents(O)].
+-spec make_docs(obj(), binary(), binary(), boolean()) -> [doc()].
+make_docs(O, FPN, Partition, IndexContent) ->
+ [make_doc(O, Content, FPN, Partition, IndexContent)
+ || Content <- riak_object:get_contents(O)].
--spec make_doc(riak_object:riak_object(), {dict(), dict()}, binary(), binary()) -> doc().
-make_doc(O, {MD, V}, FPN, Partition) ->
+-spec make_doc(obj(), {dict(), dict()}, binary(), binary(), boolean()) -> doc().
+make_doc(O, {MD, V}, FPN, Partition, IndexContent) ->
Vtag = get_vtag(O, MD),
DocId = doc_id(O, Partition, Vtag),
- Fields = make_fields({DocId, riak_key(O), gen_vc(O), FPN, Partition, Vtag}),
- ExtractedFields = extract_fields({MD, V}),
+ EntropyData = gen_ed(O, Partition),
+ Fields = make_fields({DocId, yz_kv:get_obj_key(O), FPN,
+ Partition, Vtag, EntropyData}),
+ ExtractedFields =
+ case IndexContent of
+ true -> extract_fields({MD, V});
+ false -> []
+ end,
Tags = extract_tags(MD),
{doc, lists:append([Tags, ExtractedFields, Fields])}.
-make_fields({DocId, Key, VC, FPN, Partition, none}) ->
+make_fields({DocId, Key, FPN, Partition, none, EntropyData}) ->
[{id, DocId},
- {?YZ_ED_FIELD, VC},
+ {?YZ_ED_FIELD, EntropyData},
{?YZ_FPN_FIELD, FPN},
{?YZ_NODE_FIELD, ?ATOM_TO_BIN(node())},
{?YZ_PN_FIELD, Partition},
{?YZ_RK_FIELD, Key}];
-make_fields({DocId, Key, VC, FPN, Partition, Vtag}) ->
- make_fields({DocId, Key, VC, FPN, Partition, none}) ++
+make_fields({DocId, Key, FPN, Partition, Vtag, EntropyData}) ->
+ make_fields({DocId, Key, FPN, Partition, none, EntropyData}) ++
[{?YZ_VTAG_FIELD, Vtag}].
%% @doc If this is a sibling, return its binary vtag
@@ -79,7 +86,7 @@ get_vtag(O, MD) ->
_ -> none
end.
-% -spec extract_fields(obj()) -> fields() | {error, any()}.
+-spec extract_fields({obj_metadata(), term()}) -> fields() | {error, any()}.
extract_fields({MD, V}) ->
case yz_kv:is_tombstone(MD) of
false ->
@@ -179,23 +186,19 @@ split_tag_names(TagNames) ->
doc_ts(MD) ->
dict:fetch(<<"X-Riak-Last-Modified">>, MD).
-doc_vclock(O) ->
- riak_object:vclock(O).
-
gen_ts() ->
{{Year, Month, Day},
{Hour, Min, Sec}} = calendar:now_to_universal_time(erlang:now()),
list_to_binary(io_lib:format("~4..0B~2..0B~2..0BT~2..0B~2..0B~2..0B",
[Year,Month,Day,Hour,Min,Sec])).
-gen_vc(O) ->
+%% NOTE: All of this data needs to be in one field to efficiently
+%% iterate. Otherwise the doc would have to be fetched for each
+%% entry.
+gen_ed(O, Partition) ->
TS = gen_ts(),
- RiakKey = riak_key(O),
- VClock = base64:encode(crypto:sha(term_to_binary(doc_vclock(O)))),
- <<TS/binary," ",RiakKey/binary," ",VClock/binary>>.
-
-riak_key(O) ->
- riak_object:key(O).
-
-value(O) ->
- riak_object:get_value(O).
+ RiakBucket = yz_kv:get_obj_bucket(O),
+ RiakKey = yz_kv:get_obj_key(O),
+ %% TODO: do this in KV vnode and pass to hook
+ Hash = base64:encode(yz_kv:hash_object(O)),
+ <<TS/binary," ",Partition/binary," ",RiakBucket/binary," ",RiakKey/binary," ",Hash/binary>>.
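
Per the NOTE above, gen_ed/2 packs five space-delimited values
(timestamp, partition, bucket, key, hash) into the single entropy-data
field; EntropyData.java splits them back apart with String.split(" ").
A sketch of the equivalent decode in Erlang:

    %% Sketch: recover the five components of the entropy-data field.
    [TS, Partition, Bucket, Key, Hash] =
        binary:split(EntropyData, <<" ">>, [global]).
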
65 src/yz_entropy.erl
@@ -30,21 +30,6 @@
%% TODO: proper supervision and probably make tree proc a gen_server
%%%===================================================================
-%%% API
-%%%===================================================================
-
--spec new_tree_proc(string(), tree_name()) -> tree_ref() | already_running.
-new_tree_proc(Index, Name) ->
- case whereis(Name) of
- undefined ->
- {Pid, Ref} = spawn_monitor(?MODULE, tree_loop, [Index]),
- register(Name, Pid),
- #tree_ref{index=Index, name=Name, pid=Pid, ref=Ref};
- Pid ->
- {already_running, Pid}
- end.
-
-%%%===================================================================
%%% Private
%%%===================================================================
@@ -52,25 +37,32 @@ gen_before() ->
DateTime = calendar:now_to_universal_time(os:timestamp()),
to_datetime(minus_period(DateTime, [{mins, 5}])).
-build_tree(Index) ->
- Before = gen_before(),
- T1 = hashtree:new(),
- SV = yz_solr:get_vclocks(Index, Before, none, 100),
- iterate_vclocks(Index, Before, T1, SV).
-
ht_insert({Key, VCHash}, Tree) ->
hashtree:insert(Key, VCHash, Tree).
-iterate_vclocks(Index, Before, Tree, #solr_vclocks{more=true,
- continuation=Cont,
- pairs=Pairs}) ->
- Tree2 = lists:foldl(fun ht_insert/2, Tree, Pairs),
- SV = yz_solr:get_vclocks(Index, Before, Cont, 100),
- iterate_vclocks(Index, Before, Tree2, SV);
-iterate_vclocks(_, _, Tree, #solr_vclocks{more=false,
- pairs=Pairs}) ->
- Tree2 = lists:foldl(fun ht_insert/2, Tree, Pairs),
- hashtree:update_tree(Tree2).
+%% @doc Iterate all the entropy data in `Index', calling `Fun' for
+%% each entry; entries are fetched from Solr 100 at a time.
+-spec iterate_entropy_data(index_name(), list(), function()) -> ok.
+iterate_entropy_data(Index, Filter, Fun) ->
+ case yz_solr:ping(Index) of
+ true ->
+ DateTime = calendar:now_to_universal_time(os:timestamp()),
+ Before = to_datetime(minus_period(DateTime, [{mins, 5}])),
+ SV = yz_solr:get_vclocks(Index, Before, Filter, none, 100),
+ iterate_entropy_data(Index, Before, Filter, Fun, SV);
+ false ->
+ ok
+ end.
+
+iterate_entropy_data(Index, Before, Filter, Fun, #solr_vclocks{more=true,
+ continuation=Cont,
+ pairs=Pairs}) ->
+ lists:foreach(Fun, Pairs),
+ SV = yz_solr:get_vclocks(Index, Before, Filter, Cont, 100),
+ iterate_entropy_data(Index, Before, Filter, Fun, SV);
+iterate_entropy_data(_, _, _, Fun, #solr_vclocks{more=false,
+ pairs=Pairs}) ->
+ lists:foreach(Fun, Pairs).
%% @doc Minus Period from DateTime.
%%
@@ -101,14 +93,3 @@ to_datetime({_Mega, _Secs, _Micro}=Now) ->
to_datetime({{Year, Month, Day}, {Hour, Min, Sec}}) ->
list_to_binary(io_lib:format("~4..0B~2..0B~2..0BT~2..0B~2..0B~2..0B",
[Year,Month,Day,Hour,Min,Sec])).
-
-tree_loop(Index) ->
- Tree = build_tree(Index),
- tree_cache_loop(Tree).
-
-tree_cache_loop(Tree) ->
- receive
- {get_tree, Pid, Ref} ->
- Pid ! {tree, Ref, Tree},
- tree_cache_loop(Tree)
- end.
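
fold_keys/2 in yz_index_hashtree.erl below is the in-tree consumer of
this iterator; a condensed sketch of the calling contract (do_something/2
is a placeholder):

    %% Sketch: Fun is applied to each {Key, Hash} pair; pages of 100
    %% entries are pulled from Solr until more=false.
    F = fun({BKey, Hash}) -> do_something(BKey, Hash) end,
    ok = yz_entropy:iterate_entropy_data(Index, [{partition, LP}], F).
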
570 src/yz_entropy_mgr.erl
@@ -0,0 +1,570 @@
+%% -------------------------------------------------------------------
+%%
+%% Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
+%%
+%% This file is provided to you under the Apache License,
+%% Version 2.0 (the "License"); you may not use this file
+%% except in compliance with the License. You may obtain
+%% a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied. See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+%%
+%% -------------------------------------------------------------------
+
+-module(yz_entropy_mgr).
+-compile(export_all).
+-behaviour(gen_server).
+-include("yokozuna.hrl").
+
+%% gen_server callbacks
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+ terminate/2, code_change/3]).
+
+-record(state, {mode :: exchange_mode(),
+ trees :: trees(),
+ tree_queue :: trees(),
+ locks :: [{pid(),reference()}],
+ build_tokens = 0 :: non_neg_integer(),
+ exchange_queue :: [exchange()],
+ exchanges :: [{p(),reference(), pid()}]}).
+-type state() :: #state{}.
+
+-define(DEFAULT_CONCURRENCY, 2).
+-define(DEFAULT_BUILD_LIMIT, {1, 3600000}). %% Once per hour
+
+%%%===================================================================
+%%% API
+%%%===================================================================
+
+start_link() ->
+ gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% @doc Acquire an exchange concurrency lock if available, and associate
+%% the lock with the calling process.
+-spec get_lock(term()) -> ok | max_concurrency.
+get_lock(Type) ->
+ get_lock(Type, self()).
+
+-spec get_lock(term(), pid()) -> ok | max_concurrency.
+get_lock(Type, Pid) ->
+ gen_server:call(?MODULE, {get_lock, Type, Pid}, infinity).
+
+-spec get_tree(p()) -> {ok, tree()} | not_registered.
+get_tree(Index) ->
+ %% NOTE: This is called by yz_kv:get_tree which is running on KV
+ %% vnode process. Think about putting tree register in ETS
+ %% table and making it public for read to avoid blocking
+ %% vnode when entropy mgr is backed up.
+ gen_server:call(?MODULE, {get_tree, Index}, infinity).
+
+%% @doc Used by {@link yz_index_hashtree} to requeue a poke on build
+%% failure.
+-spec requeue_poke(p()) -> ok.
+requeue_poke(Index) ->
+ gen_server:cast(?MODULE, {requeue_poke, Index}).
+
+%% @doc Used by {@link yz_exchange_fsm} to inform the entropy manager
+%% about the status of an exchange (ie. completed without issue,
+%% failed, etc)
+-spec exchange_status(p(), {p(), n()}, term()) -> ok.
+exchange_status(Index, IndexN, Status) ->
+ gen_server:cast(?MODULE, {exchange_status, self(), Index, IndexN, Status}).
+
+%% @doc Returns true if AAE is enabled, false otherwise.
+-spec enabled() -> boolean().
+enabled() ->
+ riak_kv_entropy_manager:enabled().
+
+%% @doc Set AAE to either `automatic' or `manual' mode. In automatic mode, the
+%% entropy manager triggers all necessary hashtree exchanges. In manual
+%% mode, exchanges must be triggered using {@link manual_exchange/1}.
+%% Regardless of exchange mode, the entropy manager will always ensure
+%% local hashtrees are built and rebuilt as necessary.
+-spec set_mode(automatic | manual) -> ok.
+set_mode(Mode=automatic) ->
+ ok = gen_server:call(?MODULE, {set_mode, Mode}, infinity);
+set_mode(Mode=manual) ->
+ ok = gen_server:call(?MODULE, {set_mode, Mode}, infinity).
+
+%% NOTE: Yokozuna only runs when KV AAE runs, but this API is needed
+%% so that the Yokozuna hashtrees may be stopped.
+disable() ->
+ gen_server:call(?MODULE, disable, infinity).
+
+%% @doc Manually trigger hashtree exchanges.
+%%
+%% -- If a partition is provided, trigger exchanges between Yokozuna
+%% and KV for all preflists stored by the partition.
+%%
+%% -- If both a partition and preflist are provided, trigger
+%% exchange between Yokozuna and KV for that index/preflist.
+-spec manual_exchange(p() | {p(), {p(), n()}}) -> ok.
+manual_exchange(Exchange) ->
+ gen_server:call(?MODULE, {manual_exchange, Exchange}, infinity).
+
+%% @doc Stop the exchange currently executing for `Index', if there
+%% is one.
+-spec cancel_exchange(p()) -> ok | undefined.
+cancel_exchange(Index) ->
+ gen_server:call(?MODULE, {cancel_exchange, Index}, infinity).
+
+%% @doc Stop all currently executing exchanges.
+-spec cancel_exchanges() -> [p()].
+cancel_exchanges() ->
+ gen_server:call(?MODULE, cancel_exchanges, infinity).
+
+%%%===================================================================
+%%% gen_server callbacks
+%%%===================================================================
+
+init([]) ->
+ Trees = get_trees_from_sup(),
+ schedule_tick(),
+ {_, Opts} = settings(),
+ Mode = case proplists:is_defined(manual, Opts) of
+ true -> manual;
+ false -> automatic
+ end,
+ S = #state{mode=Mode,
+ trees=Trees,
+ tree_queue=[],
+ locks=[],
+ exchanges=[],
+ exchange_queue=[]},
+ S2 = reset_build_tokens(S),
+ schedule_reset_build_tokens(),
+ {ok, S2}.
+
+handle_call({get_lock, Type, Pid}, _From, S) ->
+ {Reply, S2} = do_get_lock(Type, Pid, S),
+ {reply, Reply, S2};
+
+handle_call({get_tree, Index}, _From, S) ->
+ Resp = get_tree(Index, S),
+ {reply, Resp, S};
+
+handle_call({manual_exchange, Exchange}, _From, S) ->
+ S2 = enqueue_exchange(Exchange, S),
+ {reply, ok, S2};
+
+handle_call({cancel_exchange, Index}, _From, S) ->
+ case lists:keyfind(Index, 1, S#state.exchanges) of
+ false ->
+ {reply, undefined, S};
+ {Index, _Ref, Pid} ->
+ exit(Pid, kill),
+ {reply, ok, S}
+ end;
+
+handle_call(cancel_exchanges, _From, S=#state{exchanges=Exchanges}) ->
+ Indices = [begin
+ exit(Pid, kill),
+ Index
+ end || {Index, _Ref, Pid} <- Exchanges],
+ {reply, Indices, S};
+
+handle_call(disable, _From, S) ->
+ [yz_index_hashtree:stop(T) || {_,T} <- S#state.trees],
+ {reply, ok, S};
+
+handle_call({set_mode, Mode}, _From, S) ->
+ S2 = S#state{mode=Mode},
+ {reply, ok, S2};
+
+handle_call(Request, From, S) ->
+ lager:warning("Unexpected call: ~p from ~p", [Request, From]),
+ {reply, unexpected_call, S}.
+
+handle_cast({requeue_poke, Index}, S) ->
+ S2 = requeue_poke(Index, S),
+ {noreply, S2};
+
+handle_cast({exchange_status, Pid, Index, {StartIdx, N}, Status}, S) ->
+ S2 = do_exchange_status(Pid, Index, {StartIdx, N}, Status, S),
+ {noreply, S2};
+
+handle_cast(_Msg, S) ->
+ lager:warning("Unexpected cast: ~p", [_Msg]),
+ {noreply, S}.
+
+handle_info(tick, S) ->
+ S2 = maybe_tick(S),
+ {noreply, S2};
+
+handle_info(reset_build_tokens, S) ->
+ S2 = reset_build_tokens(S),
+ schedule_reset_build_tokens(),
+ {noreply, S2};
+
+handle_info({'DOWN', Ref, _, Obj, Status}, S) ->
+ %% NOTE: The down msg could be for exchange FSM or tree
+ S2 = maybe_release_lock(Ref, S),
+ S3 = maybe_clear_exchange(Ref, Status, S2),
+ S4 = maybe_clear_registered_tree(Obj, S3),
+ {noreply, S4};
+
+handle_info(_Msg, S) ->
+ lager:warning("Unexpected msg: ~p", [_Msg]),
+ {noreply, S}.
+
+terminate(_Reason, _S) ->
+ ok.
+
+code_change(_OldVsn, S, _Extra) ->
+ {ok, S}.
+
+%%%===================================================================
+%%% Internal functions
+%%%===================================================================
+
+schedule_reset_build_tokens() ->
+ {_, Reset} = app_helper:get_env(riak_kv, anti_entropy_build_limit,
+ ?DEFAULT_BUILD_LIMIT),
+ erlang:send_after(Reset, self(), reset_build_tokens).
+
+reset_build_tokens(S) ->
+ {Tokens, _} = app_helper:get_env(riak_kv, anti_entropy_build_limit,
+ ?DEFAULT_BUILD_LIMIT),
+ S#state{build_tokens=Tokens}.
+
+-spec settings() -> {boolean(), proplists:proplist()}.
+settings() ->
+ case app_helper:get_env(riak_kv, anti_entropy, {off, []}) of
+ {on, Opts} ->
+ {true, Opts};
+ {off, Opts} ->
+ {false, Opts};
+ X ->
+ lager:warning("Invalid setting for riak_kv/anti_entropy: ~p", [X]),
+ application:set_env(riak_kv, anti_entropy, {off, []}),
+ {false, []}
+ end.
+
+%% @private
+-spec get_tree(p(), state()) -> {ok, tree()} | not_registered.
+get_tree(Index, S) ->
+ case orddict:find(Index, S#state.trees) of
+ {ok, Tree} -> {ok, Tree};
+ error -> not_registered
+ end.
+
+%% @private
+%%
+%% @doc Generate a list of all the trees currently active. It enables
+%% the entropy manager to rediscover the trees in the case of a
+%% crash.
+-spec get_trees_from_sup() -> trees().
+get_trees_from_sup() ->
+ Trees = yz_index_hashtree_sup:trees(),
+ lists:foldl(fun get_index/2, [], Trees).
+
+%% @private
+%%
+%% @doc Get the index for `Tree', make a pair, and add it to `Trees'.
+-spec get_index(tree(), trees()) -> trees().
+get_index(Tree, Trees) ->
+ case yz_index_hashtree:get_index(Tree) of
+ {error, _} -> Trees;
+ Index -> [{Index,Tree}|Trees]
+ end.
+
+reload_hashtrees(Ring, S) ->
+ reload_hashtrees(enabled(), Ring, S).
+
+-spec reload_hashtrees(boolean(), ring(), state()) -> state().
+reload_hashtrees(true, Ring, S=#state{mode=Mode, trees=Trees}) ->
+ Indices = riak_core_ring:my_indices(Ring),
+ Existing = orddict:from_list(Trees),
+
+ MissingIdx = [Idx || Idx <- Indices, not orddict:is_key(Idx, Existing)],
+ L = lists:foldl(fun(Idx, NewTrees) ->
+ RPs = riak_kv_util:responsible_preflists(Idx),
+ {ok, Tree} = yz_index_hashtree:start(Idx, RPs),
+ [{Idx,Tree}|NewTrees]
+ end, [], MissingIdx),
+ Trees2 = orddict:from_list(Trees ++ L),
+
+ Moved = [E || E={Idx,_} <- Trees2, not lists:member(Idx, Indices)],
+ Trees3 = remove_trees(Trees2, Moved),
+
+ S2 = S#state{trees=Trees3},
+ S3 = lists:foldl(fun({Idx,Pid}, SAcc) ->
+ monitor(process, Pid),
+ case Mode of
+ manual -> SAcc;
+ automatic -> enqueue_exchange(Idx, SAcc)
+ end
+ end, S2, L),
+ S3;
+reload_hashtrees(false, _, S) ->
+ S.
+
+%% @private
+%%
+%% @doc Remove trees from `Trees' and destroy the hashtrees.
+-spec remove_trees(trees(), trees()) -> trees().
+remove_trees(Trees, ToRemove) ->
+ F = fun({Idx, Tree}, TreesAcc) ->
+ yz_index_hashtree:destroy(Tree),
+ orddict:erase(Idx, TreesAcc)
+ end,
+ lists:foldl(F, Trees, ToRemove).
+
+-spec do_get_lock(term(), pid(), state()) ->
+ {ok | max_concurrency | build_limit_reached, state()}.
+do_get_lock(Type, Pid, S=#state{locks=Locks}) ->
+ Concurrency = app_helper:get_env(riak_kv,
+ anti_entropy_concurrency,
+ ?DEFAULT_CONCURRENCY),
+ case length(Locks) >= Concurrency of
+ true ->
+ {max_concurrency, S};
+ false ->
+ case check_lock_type(Type, S) of
+ {ok, S2} ->
+ Ref = monitor(process, Pid),
+ S3 = S2#state{locks=[{Pid,Ref}|Locks]},
+ {ok, S3};
+ Error ->
+ {Error, S}
+ end
+ end.
+
+
+-spec check_lock_type(term(), state()) -> {ok, state()} | build_limit_reached.
+check_lock_type(build, S=#state{build_tokens=Tokens}) ->
+ if Tokens > 0 ->
+ {ok, S#state{build_tokens=Tokens-1}};
+ true ->
+ build_limit_reached
+ end;
+check_lock_type(_Type, S) ->
+ {ok, S}.
+
+-spec maybe_release_lock(reference(), state()) -> state().
+maybe_release_lock(Ref, S) ->
+ Locks = lists:keydelete(Ref, 2, S#state.locks),
+ S#state{locks=Locks}.
+
+-spec maybe_clear_exchange(reference(), term(), state()) -> state().
+maybe_clear_exchange(Ref, Status, S) ->
+ case lists:keytake(Ref, 2, S#state.exchanges) of
+ false ->
+ S;
+ {value, {Idx,Ref,_Pid}, Exchanges} ->
+ lager:debug("Untracking exchange: ~p :: ~p", [Idx, Status]),
+ S#state{exchanges=Exchanges}
+ end.
+
+-spec maybe_clear_registered_tree(pid(), state()) -> state().
+maybe_clear_registered_tree(Pid, S) when is_pid(Pid) ->
+ Trees = lists:keydelete(Pid, 2, S#state.trees),
+ S#state{trees=Trees};
+maybe_clear_registered_tree(_, S) ->
+ S.
+
+-spec next_tree(state()) -> {pid(), state()} | {none, state()}.
+next_tree(S=#state{tree_queue=Queue, trees=Trees}) ->
+ More = fun() -> Trees end,
+ case yz_misc:queue_pop(Queue, More) of
+ {{_,Pid}, Rest} ->
+ S2 = S#state{tree_queue=Rest},
+ {Pid, S2};
+ empty ->
+ {none, S}
+ end.
+
+-spec schedule_tick() -> reference().
+schedule_tick() ->
+ erlang:send_after(?YZ_ENTROPY_TICK, ?MODULE, tick).
+
+maybe_tick(S) ->
+ case enabled() of
+ true ->
+ case riak_core_capability:get({riak_kv, anti_entropy}, disabled) of
+ disabled -> S2 = S;
+ enabled_v1 -> S2 = tick(S)
+ end;
+ false ->
+ %% Ensure we do not have any running index_hashtrees, which can
+ %% happen when disabling anti-entropy on a live system.
+ [yz_index_hashtree:stop(T) || {_,T} <- S#state.trees],
+ S2 = S
+ end,
+ schedule_tick(),
+ S2.
+
+-spec tick(state()) -> state().
+tick(S) ->
+ Ring = yz_misc:get_ring(transformed),
+ S2 = reload_hashtrees(Ring, S),
+ S3 = lists:foldl(fun(_,SAcc) ->
+ maybe_poke_tree(SAcc)
+ end, S2, lists:seq(1,10)),
+ maybe_exchange(Ring, S3).
+
+-spec maybe_poke_tree(state()) -> state().
+maybe_poke_tree(S) ->
+ case next_tree(S) of
+ {none, S2} ->
+ S2;
+ {Tree, S2} ->
+ yz_index_hashtree:poke(Tree),
+ S2
+ end.
+
+%%%===================================================================
+%%% Exchanging
+%%%===================================================================
+
+-spec do_exchange_status(pid(), p(), {p(), n()}, any(), state()) -> state().
+do_exchange_status(_Pid, Index, {StartIdx, N}, Status, S) ->
+ case Status of
+ ok ->
+ lager:debug("Finished exhcange for partition ~p of preflist ~p",
+ [Index, {StartIdx, N}]),
+ S;
+ _ ->
+ lager:debug("Requeue exhcange for partition ~p of preflist ~p "
+ "for reason ~p",
+ [Index, {StartIdx, N}, Status]),
+ requeue_exchange(Index, {StartIdx, N}, S)
+ end.
+
+-spec start_exchange(p(), {p(),n()}, ring(), state()) -> {any(), state()}.
+start_exchange(Index, Preflist, Ring, S) ->
+ case riak_core_ring:index_owner(Ring, Index) == node() of
+ false ->
+ {not_responsible, S};
+ true ->
+ %% TODO: check for not_registered
+ {ok, YZTree} = get_tree(Index, S),
+ %% TODO: use async version in case vnode is backed up
+ %%
+ %% TODO: hashtree_pid can return {error, wrong_node}
+ %% during ownership transfer, handle that case
+ {ok, KVTree} = riak_kv_vnode:hashtree_pid(Index),
+ case yz_exchange_fsm:start(Index, Preflist, YZTree,
+ KVTree, self()) of
+ {ok, FsmPid} ->
+ Ref = monitor(process, FsmPid),
+ E = S#state.exchanges,
+ %% TODO: add timestamp so we know how long ago
+ %% exchange was started
+ {ok, S#state{exchanges=[{Index,Ref,FsmPid}|E]}};
+ {error, Reason} ->
+ {Reason, S}
+ end
+ end.
+
+-spec all_pairwise_exchanges(p(), ring()) -> [exchange()].
+all_pairwise_exchanges(Index, Ring) ->
+ RPs = riak_kv_util:responsible_preflists(Index, Ring),
+ [{Index, {StartIdx, N}} || {StartIdx, N} <- RPs].
+
+-spec all_exchanges(ring(), trees()) -> [exchange()].
+all_exchanges(Ring, Trees) ->
+ Indices = orddict:fetch_keys(Trees),
+ lists:flatmap(fun(Index) ->
+ all_pairwise_exchanges(Index, Ring)
+ end, Indices).
+
+-spec enqueue_exchange(p() | {p(), {p(),n()}}, state()) -> state().
+enqueue_exchange(E={_Index,_IndexN}, S) ->
+ case verify_exchange(E) of
+ true ->
+ enqueue_exchanges([E], S);
+ false ->
+ S
+ end;
+
+enqueue_exchange(Index, S) ->
+ Ring = yz_misc:get_ring(transformed),
+ Exchanges = all_pairwise_exchanges(Index, Ring),
+ enqueue_exchanges(Exchanges, S).
+
+enqueue_exchanges(Exchanges, S) ->
+ EQ = S#state.exchange_queue ++ Exchanges,
+ S#state{exchange_queue=EQ}.
+
+-spec verify_exchange(exchange()) -> boolean().
+verify_exchange(E={Index,_N}) ->
+ Ring = yz_misc:get_ring(transformed),
+ ValidExchanges = all_pairwise_exchanges(Index, Ring),
+ lists:member(E, ValidExchanges).
+
+-spec already_exchanging(p(), state()) -> boolean().
+already_exchanging(Index, #state{exchanges=E}) ->
+ case lists:keyfind(Index, 1, E) of
+ false -> false;
+ {Index,_,_} -> true
+ end.
+
+-spec maybe_exchange(ring(), state()) -> state().
+maybe_exchange(Ring, S) ->
+ case next_exchange(Ring, S) of
+ {none, S2} ->
+ S2;
+ {NextExchange, S2} ->
+ {Index, IndexN} = NextExchange,
+ case already_exchanging(Index, S) of
+ true ->
+ requeue_exchange(Index, IndexN, S2);
+ false ->
+ case start_exchange(Index, IndexN, Ring, S2) of
+ {ok, S3} -> S3;
+ {_, S3} -> S3
+ end
+ end
+ end.
+
+-spec init_next_exchange(state()) -> state().
+init_next_exchange(S) ->
+ {ok, Ring} = riak_core_ring_manager:get_my_ring(),
+ Trees = S#state.trees,
+ Exchanges = all_exchanges(Ring, Trees),
+ S#state{exchange_queue=Exchanges}.
+
+-spec next_exchange(ring(), state()) -> {exchange(), state()} | {none, state()}.
+next_exchange(Ring, S=#state{exchange_queue=Exchanges}) ->
+ More = fun() ->
+ case S#state.mode of
+ automatic -> all_exchanges(Ring, S#state.trees);
+ manual -> []
+ end
+ end,
+ case yz_misc:queue_pop(Exchanges, More) of
+ {Exchange, Rest} ->
+ S2 = S#state{exchange_queue=Rest},
+ {Exchange, S2};
+ empty ->
+ {none, S}
+ end.
+
+-spec requeue_poke(p(), state()) -> state().
+requeue_poke(Index, S=#state{trees=Trees}) ->
+ case orddict:find(Index, Trees) of
+ {ok, Tree} ->
+ Queue = S#state.tree_queue ++ [{Index,Tree}],
+ S#state{tree_queue=Queue};
+ _ ->
+ S
+ end.
+
+-spec requeue_exchange(p(), {p(), n()}, state()) -> state().
+requeue_exchange(Index, {StartIdx, N}, S) ->
+ Exchange = {Index, {StartIdx, N}},
+ case lists:member(Exchange, S#state.exchange_queue) of
+ true ->
+ S;
+ false ->
+ Exchanges = S#state.exchange_queue ++ [Exchange],
+ S#state{exchange_queue=Exchanges}
+ end.
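
Putting the manual-mode API together, a hedged console session (Index and
{StartIdx, N} stand for a real partition and preflist):

    %% Sketch: drive exchanges by hand instead of waiting on the tick.
    ok = yz_entropy_mgr:set_mode(manual),
    %% all preflists stored by the partition
    ok = yz_entropy_mgr:manual_exchange(Index),
    %% a single partition/preflist pair
    ok = yz_entropy_mgr:manual_exchange({Index, {StartIdx, N}}).
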
8 src/yz_events.erl
@@ -86,6 +86,14 @@ handle_info(tick, S) ->
Mapping2 = check_unkown(Mapping),
ok = set_mapping(Mapping2),
+ %% TODO: should default index go into ring?
+ case yz_index:exists(?YZ_DEFAULT_INDEX) of
+ false ->
+ ok = yz_index:create(?YZ_DEFAULT_INDEX, ?YZ_DEFAULT_SCHEMA_NAME);
+ true ->
+ ok
+ end,
+
ok = set_tick(),
{noreply, S}.
270 src/yz_exchange_fsm.erl
@@ -0,0 +1,270 @@
+%% -------------------------------------------------------------------
+%%
+%% Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
+%%
+%% This file is provided to you under the Apache License,
+%% Version 2.0 (the "License"); you may not use this file
+%% except in compliance with the License. You may obtain
+%% a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied. See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+%%
+%% -------------------------------------------------------------------
+
+-module(yz_exchange_fsm).
+-behaviour(gen_fsm).
+-include("yokozuna.hrl").
+-compile(export_all).
+
+%% gen_fsm callbacks
+-export([init/1, handle_event/3, handle_sync_event/4, handle_info/3,
+ terminate/3, code_change/4]).
+
+-record(state, {index :: p(),
+ index_n :: {p(),n()},
+ yz_tree :: tree(),
+ kv_tree :: tree(),
+ built :: integer(),
+ timeout :: pos_integer()}).
+
+%% Per state transition timeout used by certain transitions
+-define(DEFAULT_ACTION_TIMEOUT, 300000). %% 5 minutes
+
+%%%===================================================================
+%%% API
+%%%===================================================================
+
+%% @doc Initialize the exchange FSM to exchange between Yokozuna and
+%% KV for the `Preflist' replicas on `Index'.
+-spec start(p(), {p(),n()}, tree(), tree(), pid()) ->
+ {ok, pid()} | {error, any()}.
+start(Index, Preflist, YZTree, KVTree, Manager) ->
+ gen_fsm:start(?MODULE, [Index, Preflist, YZTree, KVTree, Manager], []).
+
+%%%===================================================================
+%%% gen_fsm callbacks
+%%%===================================================================
+
+init([Index, Preflist, YZTree, KVTree, Manager]) ->
+ Timeout = app_helper:get_env(riak_kv,
+ anti_entropy_timeout,
+ ?DEFAULT_ACTION_TIMEOUT),
+
+ monitor(process, Manager),
+ monitor(process, YZTree),
+ monitor(process, KVTree),
+
+ S = #state{index=Index,
+ index_n=Preflist,
+ yz_tree=YZTree,
+ kv_tree=KVTree,
+ built=0,
+ timeout=Timeout},
+ gen_fsm:send_event(self(), start_exchange),
+ lager:debug("Starting exchange between KV and Yokozuna: ~p", [Index]),
+ {ok, prepare_exchange, S}.
+
+handle_event(_Event, StateName, S) ->
+ {next_state, StateName, S}.
+
+handle_sync_event(_Event, _From, StateName, S) ->
+ {reply, ok, StateName, S}.
+
+handle_info({'DOWN', _, _, _, _}, _StateName, S) ->
+ %% Either the entropy manager, local hashtree, or remote hashtree has
+ %% exited. Stop exchange.
+ {stop, normal, S};
+
+handle_info(_Info, StateName, S) ->
+ {next_state, StateName, S}.
+
+terminate(_Reason, _StateName, _S) ->
+ ok.
+
+code_change(_OldVsn, StateName, S, _Extra) ->
+ {ok, StateName, S}.
+
+%%%===================================================================
+%%% States
+%%%===================================================================
+
+prepare_exchange(start_exchange, S) ->
+ YZTree = S#state.yz_tree,
+ KVTree = S#state.kv_tree,
+
+ case yz_entropy_mgr:get_lock(?MODULE) of
+ ok ->
+ case yz_index_hashtree:get_lock(YZTree, ?MODULE) of
+ ok ->
+ case riak_kv_entropy_manager:get_lock(?MODULE) of
+ ok ->
+ case riak_kv_index_hashtree:get_lock(KVTree,
+ ?MODULE) of
+ ok ->
+ update_trees(start_exchange, S);
+ _ ->
+ send_exchange_status(already_locked, S),
+ {stop, normal, S}
+ end;
+ Error ->
+ send_exchange_status(Error, S),
+ {stop, normal, S}
+ end;
+ _ ->
+ send_exchange_status(already_locked, S),
+ {stop, normal, S}
+ end;
+ Error ->
+ send_exchange_status(Error, S),
+ {stop, normal, S}
+ end;
+
+prepare_exchange(timeout, S) ->
+ do_timeout(S).
+
+update_trees(start_exchange, S=#state{yz_tree=YZTree,
+ kv_tree=KVTree,
+ index=Index,
+ index_n=IndexN}) ->
+
+ update_request(yz_index_hashtree, YZTree, Index, IndexN),
+ update_request(riak_kv_index_hashtree, KVTree, Index, IndexN),
+ {next_state, update_trees, S};
+
+update_trees({not_responsible, Index, IndexN}, S) ->
+ lager:debug("Index ~p does not cover preflist ~p", [Index, IndexN]),
+ send_exchange_status({not_responsible, Index, IndexN}, S),
+ {stop, normal, S};
+
+update_trees({tree_built, _, _}, S) ->
+ Built = S#state.built + 1,
+ case Built of
+ 2 ->
+ lager:debug("Moving to key exchange"),
+ {next_state, key_exchange, S, 0};
+ _ ->
+ {next_state, update_trees, S#state{built=Built}}
+ end.
+
+key_exchange(timeout, S=#state{index=Index,
+ yz_tree=YZTree,
+ kv_tree=KVTree,
+ index_n=IndexN}) ->
+ lager:debug("Starting key exchange for partition ~p preflist ~p",
+ [Index, IndexN]),
+
+ Remote = fun(get_bucket, {L, B}) ->
+ exchange_bucket_kv(KVTree, IndexN, L, B);
+ (key_hashes, Segment) ->
+ exchange_segment_kv(KVTree, IndexN, Segment)
+ end,
+
+ {ok, RC} = riak:local_client(),
+ AccFun = fun(KeyDiff, Acc) ->
+ lists:foldl(fun(Diff, Acc2) ->
+ read_repair_keydiff(RC, Diff),
+ case Acc2 of
+ [] -> [1];
+ [Count] -> [Count+1]
+ end
+ end, Acc, KeyDiff)
+ end,
+
+ case yz_index_hashtree:compare(IndexN, Remote, AccFun, YZTree) of
+ [] ->
+ ok;
+ [Count] ->
+ lager:info("Repaired ~b keys during active anti-entropy exchange "
+ "of ~p", [Count, IndexN])
+ end,
+ {stop, normal, S}.
+
+%%%===================================================================
+%%% Internal functions
+%%%===================================================================
+
+%% @private
+exchange_bucket_kv(Tree, IndexN, Level, Bucket) ->
+ riak_kv_index_hashtree:exchange_bucket(IndexN, Level, Bucket, Tree).
+
+%% @private
+exchange_segment_kv(Tree, IndexN, Segment) ->
+ riak_kv_index_hashtree:exchange_segment(IndexN, Segment, Tree).
+
+%% @private
+read_repair_keydiff(_RC, {remote_missing, KeyBin}) ->
+ %% Yokozuna has it but KV doesn't
+ BKey = {Bucket, Key} = binary_to_term(KeyBin),
+ Ring = yz_misc:get_ring(transformed),
+ BucketProps = riak_core_bucket:get_bucket(Bucket, Ring),
+ Idx = riak_core_util:chash_key(BKey),
+ N = proplists:get_value(n_val,BucketProps),
+ Preflist = lists:sublist(riak_core_ring:preflist(Idx, Ring), N),
+ FakeObj = fake_kv_object(Bucket, Key),
+
+ lists:foreach(fun({Partition, Node}) ->
+ FakeState = fake_kv_vnode_state(Partition),
+ rpc:call(Node, yz_kv, index, [FakeObj, delete, FakeState])
+ end, Preflist),
+
+ ok;
+
+read_repair_keydiff(RC, {Reason, KeyBin}) ->
+ BKey = {Bucket, Key} = binary_to_term(KeyBin),
+ lager:debug("Anti-entropy forced read repair and re-index: ~p/~p (~p)", [Bucket, Key, Reason]),
+ {ok, Obj} = RC:get(Bucket, Key),
+ Ring = yz_misc:get_ring(transformed),
+ BucketProps = riak_core_bucket:get_bucket(Bucket, Ring),
+ Idx = riak_core_util:chash_key(BKey),
+ N = proplists:get_value(n_val,BucketProps),
+ Preflist = lists:sublist(riak_core_ring:preflist(Idx, Ring), N),
+ lists:foreach(fun({Partition, Node}) ->
+ FakeState = fake_kv_vnode_state(Partition),
+ rpc:call(Node, yz_kv, index, [Obj, anti_entropy, FakeState])
+ end, Preflist),
+ ok.
+
+%% @private
+fake_kv_object(Bucket, Key) ->
+ riak_object:new(Bucket, Key, <<"fake object">>).
+
+%% @private
+fake_kv_vnode_state(Partition) ->
+ {state,Partition,fake,fake,fake,fake,fake,fake,fake,fake,fake,fake,fake}.
+
+%% @private
+update_request(Module, Tree, Index, IndexN) ->
+ as_event(fun() ->
+ case Module:update(IndexN, Tree) of
+ ok -> {tree_built, Index, IndexN};
+ not_responsible -> {not_responsible, Index, IndexN}
+ end
+ end).
+
+%% @private
+as_event(F) ->
+ Self = self(),
+ spawn_link(fun() ->
+ Result = F(),
+ gen_fsm:send_event(Self, Result)
+ end),
+ ok.
+
+%% @private
+do_timeout(S=#state{index=Index, index_n=Preflist}) ->
+ lager:info("Timeout during exchange of partition ~p for preflist ~p ",
+ [Index, Preflist]),
+ send_exchange_status({timeout, Index, Preflist}, S),
+ {stop, normal, S}.
+
+%% @private
+send_exchange_status(Status, #state{index=Index,
+ index_n=IndexN}) ->
+ yz_entropy_mgr:exchange_status(Index, IndexN, Status).
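
Note the locking ladder in prepare_exchange/2: four locks must be
acquired, in order, before any tree work starts, and any failure reports
status and stops the FSM. Condensed:

    %% Locking order in prepare_exchange/2 above (all must return ok):
    %% 1. yz_entropy_mgr:get_lock/1          - YZ exchange concurrency
    %% 2. yz_index_hashtree:get_lock/2       - the YZ tree itself
    %% 3. riak_kv_entropy_manager:get_lock/1 - KV exchange concurrency
    %% 4. riak_kv_index_hashtree:get_lock/2  - the KV tree itself
    %% On failure: send_exchange_status(...), then {stop, normal, S}.
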
35 src/yz_index.erl
@@ -83,21 +83,26 @@ local_create(Ring, Name) ->
DataDir = filename:join([IndexDir, "data"]),
Info = get_info_from_ring(Ring, Name),
SchemaName = schema_name(Info),
- RawSchema = yz_schema:get(SchemaName),
- SchemaFile = filename:join([ConfDir, yz_schema:filename(SchemaName)]),
-
- yz_misc:make_dirs([ConfDir, DataDir]),
- yz_misc:copy_files(ConfFiles, ConfDir),
- ok = file:write_file(SchemaFile, RawSchema),
-
- CoreProps = [
- {name, Name},
- {index_dir, IndexDir},
- {cfg_file, ?YZ_CORE_CFG_FILE},
- {schema_file, SchemaFile}
- ],
- {ok, _, _} = yz_solr:core(create, CoreProps),
- ok.
+ case yz_schema:get(SchemaName) of
+ {ok, RawSchema} ->
+ SchemaFile = filename:join([ConfDir, yz_schema:filename(SchemaName)]),
+
+ yz_misc:make_dirs([ConfDir, DataDir]),
+ yz_misc:copy_files(ConfFiles, ConfDir),
+ ok = file:write_file(SchemaFile, RawSchema),
+
+ CoreProps = [
+ {name, Name},
+ {index_dir, IndexDir},
+ {cfg_file, ?YZ_CORE_CFG_FILE},
+ {schema_file, SchemaFile}
+ ],
+ {ok, _, _} = yz_solr:core(create, CoreProps),
+ ok;
+ {error, _, Reason} ->
+ lager:warning("Couldn't create index ~s: ~p", [Name, Reason]),
+ ok
+ end.
%% @doc Remove the index `Name' locally.
-spec local_remove(string()) -> ok.
500 src/yz_index_hashtree.erl
@@ -0,0 +1,500 @@
+%% -------------------------------------------------------------------
+%%
+%% Copyright (c) 2012 Basho Technologies, Inc. All Rights Reserved.
+%%
+%% This file is provided to you under the Apache License,
+%% Version 2.0 (the "License"); you may not use this file
+%% except in compliance with the License. You may obtain
+%% a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied. See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+%%
+%% -------------------------------------------------------------------
+
+-module(yz_index_hashtree).
+-behaviour(gen_server).
+-include("yokozuna.hrl").
+
+%% gen_server callbacks
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+ terminate/2, code_change/3]).
+
+-record(state, {index,
+ built,
+ lock :: undefined | reference(),
+ path,
+ build_time,
+ trees}).
+-type state() :: #state{}.
+
+-compile(export_all).
+
+%% Time from build to expiration of tree, in milliseconds.
+-define(DEFAULT_EXPIRE, 604800000). %% 1 week
+
+%%%===================================================================
+%%% API
+%%%===================================================================
+
+%% @doc Spawn a hashtree process that manages a hashtree for each
+%% preflist `Index' is responsible for (`RPs').
+start(Index, RPs) ->
+ supervisor:start_child(yz_index_hashtree_sup, [Index, RPs]).
+
+start_link(Index, RPs) ->
+ gen_server:start_link(?MODULE, [Index, RPs], []).
+
+%% @doc Insert the given `Key' and `Hash' pair on `Tree' for the given `Id'
+-spec insert({p(),n()}, {binary(),binary()}, binary(), tree(), list()) -> ok.
+insert(Id, BKey, Hash, Tree, Options) ->
+ gen_server:cast(Tree, {insert, Id, BKey, Hash, Options}).
+
+%% @doc Delete the `BKey' from `Tree'. The id will be determined from `BKey'.
+-spec delete({p(),n()}, {binary(),binary()}, tree()) -> ok.
+delete(Id, BKey, Tree) ->
+ gen_server:cast(Tree, {delete, Id, BKey}).
+
+-spec update({p(),n()}, tree()) -> ok.
+update(Id, Tree) ->
+ gen_server:call(Tree, {update_tree, Id}, infinity).
+
+-spec compare({p(),n()}, hashtree:remote_fun(), tree()) ->
+ [hashtree:keydiff()].
+compare(Id, Remote, Tree) ->
+ compare(Id, Remote, undefined, Tree).
+
+-spec compare({p(),n()}, hashtree:remote_fun(),
+ undefined | hashtree:acc_fun(T), tree()) -> T.
+compare(Id, Remote, AccFun, Tree) ->
+ gen_server:call(Tree, {compare, Id, Remote, AccFun}, infinity).
+
+get_index(Tree) ->
+ gen_server:call(Tree, get_index, infinity).
+
+%% @doc Acquire the lock for the specified index_hashtree if not already
+%% locked, and associate the lock with the calling process.
+-spec get_lock(tree(), term()) -> ok | not_built | already_locked.
+get_lock(Tree, Type) ->
+ get_lock(Tree, Type, self()).
+
+%% @doc Acquire the lock for the specified index_hashtree if not already
+%% locked, and associate the lock with the provided pid.
+-spec get_lock(tree(), term(), pid()) -> ok | not_built | already_locked.
+get_lock(Tree, Type, Pid) ->
+ gen_server:call(Tree, {get_lock, Type, Pid}, infinity).
+
+%% @doc Poke the specified `Tree' to ensure it is built/rebuilt as
+%% needed. This is periodically called by the {@link
+%% yz_entropy_mgr}.
+-spec poke(tree()) -> ok.
+poke(Tree) ->
+ gen_server:cast(Tree, poke).
+
+%% @doc Terminate the `Tree'.
+stop(Tree) ->
+ gen_server:cast(Tree, stop).
+
+%% @doc Destroy the specified `Tree', which will destroy all
+%% associated hashtrees and terminate.
+-spec destroy(tree()) -> ok.
+destroy(Tree) ->
+ gen_server:call(Tree, destroy, infinity).
+
+
+%%%===================================================================
+%%% gen_server callbacks
+%%%===================================================================
+
+init([Index, RPs]) ->
+ case determine_data_root() of
+ undefined ->
+ case riak_kv_entropy_manager:enabled() of
+ true ->
+ lager:warning("Neither yokozuna/anti_entropy_data_dir or "
+ "riak_core/platform_data_dir are defined. "
+ "Disabling active anti-entropy."),
+ riak_kv_entropy_manager:disable();
+ false ->
+ ok
+ end,
+ ignore;
+ Root ->
+ Path = filename:join(Root, integer_to_list(Index)),
+ S = #state{index=Index,
+ trees=orddict:new(),
+ built=false,
+ path=Path},
+ S2 = init_trees(RPs, S),
+ {ok, S2}
+ end.
+
+handle_call(get_index, _From, S) ->
+ {reply, S#state.index, S};
+
+handle_call({get_lock, Type, Pid}, _From, S) ->
+ {Reply, S2} = do_get_lock(Type, Pid, S),
+ {reply, Reply, S2};
+
+handle_call({update_tree, Id}, From, S) ->
+ lager:debug("Updating tree for partition ~p preflist ~p",
+ [S#state.index, Id]),
+ apply_tree(Id,
+ fun(Tree) ->
+ {SnapTree, Tree2} = hashtree:update_snapshot(Tree),
+ spawn_link(fun() ->
+ hashtree:update_perform(SnapTree),
+ gen_server:reply(From, ok)
+ end),
+ {noreply, Tree2}
+ end,
+ S);
+
+handle_call({compare, Id, Remote, AccFun}, From, S) ->
+ do_compare(Id, Remote, AccFun, From, S),
+ {noreply, S};
+
+handle_call(destroy, _From, S) ->
+ {_,Tree0} = hd(S#state.trees),
+ hashtree:destroy(Tree0),
+ {stop, normal, ok, S};
+
+handle_call(_Request, _From, S) ->
+ Reply = ok,
+ {reply, Reply, S}.
+
+handle_cast(poke, S) ->
+ S2 = do_poke(S),
+ {noreply, S2};
+
+handle_cast(build_failed, S) ->
+ yz_entropy_mgr:requeue_poke(S#state.index),
+ S2 = S#state{built=false},
+ {noreply, S2};
+
+handle_cast(build_finished, S) ->
+ S2 = do_build_finished(S),
+ {noreply, S2};
+
+handle_cast({insert, Id, BKey, Hash, Options}, S) ->
+ S2 = do_insert(Id, term_to_binary(BKey), Hash, Options, S),
+ {noreply, S2};
+
+handle_cast({delete, IdxN, BKey}, S) ->
+ S2 = do_delete(IdxN, term_to_binary(BKey), S),
+ {noreply, S2};
+
+handle_cast(stop, S) ->
+ {_,Tree0} = hd(S#state.trees),
+ hashtree:close(Tree0),
+ {stop, normal, S};
+
+handle_cast(_Msg, S) ->
+ {noreply, S}.
+
+handle_info({'DOWN', Ref, _, _, _}, S) ->
+ S2 = maybe_release_lock(Ref, S),
+ {noreply, S2};
+
+handle_info(_Info, S) ->
+ {noreply, S}.
+
+terminate(_Reason, _) ->
+ ok.
+
+code_change(_OldVsn, S, _Extra) ->
+ {ok, S}.
+
+%%%===================================================================
+%%% Internal functions
+%%%===================================================================
+
+-spec determine_data_root() -> string() | undefined.
+determine_data_root() ->
+ case ?YZ_AE_DIR of
+ {ok, EntropyRoot} ->
+ EntropyRoot;
+ undefined ->
+ case ?DATA_DIR of
+ {ok, PlatformRoot} ->
+ Root = filename:join(PlatformRoot, "yz_anti_entropy"),
+ lager:warning("Config yokozuna/anti_entropy_data_dir is "
+ "missing. Defaulting to: ~p", [Root]),
+ application:set_env(?YZ_APP_NAME, anti_entropy_data_dir, Root),
+ Root;
+ undefined ->
+ undefined
+ end
+ end.
+
+-spec init_trees([{p(),n()}], state()) -> state().
+init_trees(RPs, S) ->
+ S2 = lists:foldl(fun(Id, SAcc) ->
+ do_new_tree(Id, SAcc)
+ end, S, RPs),
+ S2#state{built=false}.
+
+-spec load_built(state()) -> boolean().
+load_built(#state{trees=Trees}) ->
+ {_,Tree0} = hd(Trees),
+ case hashtree:read_meta(<<"built">>, Tree0) of
+ {ok, <<1>>} -> true;
+ _ -> false
+ end.
+
+-spec fold_keys(p(), tree()) -> ok.
+fold_keys(Partition, Tree) ->
+ LI = yz_cover:logical_index(yz_misc:get_ring(transformed)),
+ LogicalPartition = yz_cover:logical_partition(LI, Partition),
+ Indexes = yz_index:get_indexes_from_ring(yz_misc:get_ring(transformed)),
+ F = fun({BKey, Hash}) ->
+ %% TODO: return _yz_fp from iterator and use that for
+ %% more efficient get_index_N
+ IndexN = get_index_n(BKey),
+ insert(IndexN, BKey, Hash, Tree, [if_missing])
+ end,
+ Filter = [{partition, LogicalPartition}],
+ [yz_entropy:iterate_entropy_data(Name, Filter, F) || {Name,_} <- Indexes],
+ ok.
+
+-spec do_new_tree({p(),n()}, state()) -> state().
+do_new_tree(Id, S=#state{trees=Trees, path=Path}) ->
+ Index = S#state.index,
+ IdBin = tree_id(Id),
+ NewTree = case Trees of
+ [] ->
+ hashtree:new({Index,IdBin}, [{segment_path, Path}]);
+ [{_,Other}|_] ->
+ hashtree:new({Index,IdBin}, Other)
+ end,
+ Trees2 = orddict:store(Id, NewTree, Trees),
+ S#state{trees=Trees2}.
+
+-spec do_get_lock(term(), pid(), state()) ->
+    {ok | not_built | already_locked, state()}.
+do_get_lock(_, _, S) when S#state.built /= true ->
+ lager:debug("Not built: ~p", [S#state.index]),
+ {not_built, S};
+
+do_get_lock(_Type, Pid, S=#state{lock=undefined}) ->
+ Ref = monitor(process, Pid),
+ S2 = S#state{lock=Ref},
+ {ok, S2};
+
+do_get_lock(_, _, S) ->
+ lager:debug("Already locked: ~p", [S#state.index]),
+ {already_locked, S}.
+
+-spec maybe_release_lock(reference(), state()) -> state().
+maybe_release_lock(Ref, S) ->
+ case S#state.lock of
+ Ref -> S#state{lock=undefined};
+ _ -> S
+ end.
+
+%% @private
+%%
+%% @doc Utility function for passing a specific hashtree into a
+%% provided function and storing the possibly-modified hashtree
+%% back in the index_hashtree state.
+-spec apply_tree({p(),n()},
+ fun((hashtree()) -> {'noreply' | any(), hashtree()}),
+ state())
+ -> {'reply', 'not_responsible', state()} |
+ {'reply', any(), state()} |
+ {'noreply', state()}.
+apply_tree(Id, Fun, S=#state{trees=Trees}) ->
+ case orddict:find(Id, Trees) of
+ error ->
+ {reply, not_responsible, S};
+ {ok, Tree} ->
+ {Result, Tree2} = Fun(Tree),
+ Trees2 = orddict:store(Id, Tree2, Trees),
+ S2 = S#state{trees=Trees2},
+ case Result of
+ noreply -> {noreply, S2};
+ _ -> {reply, Result, S2}
+ end
+ end.
+
+-spec do_build_finished(state()) -> state().
+do_build_finished(S=#state{index=Index, built=_Pid}) ->
+ lager:debug("Finished build: ~p", [Index]),
+ {_,Tree0} = hd(S#state.trees),
+ hashtree:write_meta(<<"built">>, <<1>>, Tree0),
+ S#state{built=true, build_time=os:timestamp()}.
+
+-spec do_insert({p(),n()}, binary(), binary(), proplist(), state()) -> state().
+do_insert(Id, Key, Hash, Opts, S=#state{trees=Trees}) ->
+ case orddict:find(Id, Trees) of
+ {ok, Tree} ->
+ Tree2 = hashtree:insert(Key, Hash, Tree, Opts),
+ Trees2 = orddict:store(Id, Tree2, Trees),
+ S#state{trees=Trees2};
+ _ ->
+ handle_unexpected_key(Id, Key, S)
+ end.
+
+-spec do_delete({p(),n()}, binary(), state()) -> state().
+do_delete(Id, Key, S=#state{trees=Trees}) ->
+ case orddict:find(Id, Trees) of
+ {ok, Tree} ->
+ Tree2 = hashtree:delete(Key, Tree),
+ Trees2 = orddict:store(Id, Tree2, Trees),
+ S#state{trees=Trees2};
+ _ ->
+ handle_unexpected_key(Id, Key, S)
+ end.
+
+-spec handle_unexpected_key({p(),n()}, binary(), state()) -> state().
+handle_unexpected_key(Id, Key, S=#state{index=Partition}) ->
+ RP = riak_kv_util:responsible_preflists(Partition),
+ case lists:member(Id, RP) of
+ false ->
+            %% The encountered object does not belong to any preflists that
+ %% this partition is associated with. Under normal Riak operation,
+ %% this should only happen when the `n_val' for an object is
+ %% reduced. For example, write an object with N=3, then change N to
+ %% 2. There will be an extra replica of the object that is no
+ %% longer needed. We should probably just delete these objects, but
+ %% to be safe rather than sorry, the first version of AAE simply
+ %% ignores these objects.
+ %%
+ %% TODO: We should probably remove these warnings before final
+ %% release, as reducing N will result in a ton of log/console
+ %% spam.
+ lager:warning("Object ~p encountered during fold over partition "
+ "~p, but key does not hash to an index handled by "
+ "this partition", [Key, Partition]),
+ S;
+
+ true ->
+ %% The encountered object belongs to a preflist that is currently
+ %% associated with this partition, but was not when the
+ %% index_hashtree process was created. This occurs when increasing
+ %% the `n_val' for an object. For example, write an object with N=3
+ %% and it will map to the index/n preflist `{<index>, 3}'. Increase
+            %% N to 4, and the object now maps to preflist `{<index>, 4}' which
+ %% may not have an existing hashtree if there were previously no
+ %% objects with N=4.
+ lager:info("Partition/tree ~p/~p does not exist to hold object ~p",
+ [Partition, Id, Key]),
+ case S#state.built of
+ true ->
+ %% If the tree is already built, clear the tree to trigger
+ %% a rebuild that will re-distribute objects into the
+ %% proper hashtrees based on current N values.
+ lager:info("Clearing tree to trigger future rebuild"),
+ clear_tree(S);
+ _ ->
+ %% Initialize a new index_n tree to prevent future errors.
+ %% The various hashtrees will likely be inconsistent, with
+ %% some trees containing key/hash pairs that should be in
+ %% other trees (eg. due to a change in N value). This will
+ %% be resolved whenever trees are eventually rebuilt, either
+ %% after normal expiration or after a future unexpected value
+ %% triggers the alternate case clause above.
+ do_new_tree(Id, S)
+ end
+ end.
+
+tree_id({Index, N}) ->
+ %% hashtree is hardcoded for 22-byte (176-bit) tree id
+ <<Index:160/integer,N:16/integer>>;
+tree_id(_) ->
+ erlang:error(badarg).
+
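A quick shell check of the packing (arbitrary example values); the result
is always 22 bytes, matching the hardcoded hashtree tree-id width:

    1> byte_size(<<42:160/integer, 3:16/integer>>).
    22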
+%% TODO: handle non-existent tree
+do_compare(Id, Remote, AccFun, From, S) ->
+ case orddict:find(Id, S#state.trees) of
+ error ->
+ %% This case shouldn't happen, but might as well safely handle it.
+ lager:warning("Tried to compare nonexistent tree "
+ "(vnode)=~p (preflist)=~p", [S#state.index, Id]),
+ gen_server:reply(From, []);
+ {ok, Tree} ->
+ spawn_link(
+ fun() ->
+ Result = case AccFun of
+ undefined ->
+ hashtree:compare(Tree, Remote);
+ _ ->
+ hashtree:compare(Tree, Remote, AccFun)
+ end,
+ gen_server:reply(From, Result)
+ end)
+ end,
+ ok.
+
+%% TODO: OMG cache this with entry in proc dict, use `_yz_fp` as Index
+%% and keep an orddict(Bucket,N) in proc dict
+get_index_n(BKey) ->
+ {ok, Ring} = riak_core_ring_manager:get_my_ring(),
+ get_index_n(BKey, Ring).
+
+get_index_n({Bucket, Key}, Ring) ->
+ BucketProps = riak_core_bucket:get_bucket(Bucket, Ring),
+ N = proplists:get_value(n_val, BucketProps),
+ ChashKey = riak_core_util:chash_key({Bucket, Key}),
+ Index = riak_core_ring:responsible_index(ChashKey, Ring),
+ {Index, N}.
+
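Usage sketch on a live node (get_index_n/1 is internal and shown here as
if exported; the bucket/key are hypothetical):

    1> get_index_n({<<"maps">>, <<"citizens">>}).
    {Partition, 3}   %% Partition is an index in the ring's 2^160 keyspace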
+do_poke(S) ->
+ maybe_build(maybe_clear(S)).
+
+maybe_clear(S=#state{lock=undefined, built=true}) ->
+ Diff = timer:now_diff(os:timestamp(), S#state.build_time),
+ Expire = app_helper:get_env(riak_kv,
+ anti_entropy_expire,
+ ?DEFAULT_EXPIRE),
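+    %% now_diff/2 returns microseconds; Expire is milliseconds, hence * 1000.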
+ case Diff > (Expire * 1000) of
+ true -> clear_tree(S);
+ false -> S
+ end;
+
+maybe_clear(S) ->
+ S.
+
+clear_tree(S=#state{index=Index, trees=Trees}) ->
+ lager:debug("Clearing tree ~p", [S#state.index]),
+ {_,Tree0} = hd(Trees),
+ hashtree:destroy(Tree0),
+ IndexN = riak_kv_util:responsible_preflists(Index),
+ S2 = init_trees(IndexN, S#state{trees=orddict:new()}),
+ S2#state{built=false}.
+
+maybe_build(S=#state{built=false}) ->
+ Self = self(),
+ Pid = spawn_link(fun() -> build_or_rehash(Self, S) end),
+ S#state{built=Pid};
+
+maybe_build(S) ->
+ %% Already built or build in progress
+ S.
+
+build_or_rehash(Self, S=#state{index=Index, trees=Trees}) ->
+ Type = case load_built(S) of
+ false -> build;
+ true -> rehash
+ end,
+ Lock = yz_entropy_mgr:get_lock(Type),
+ case {Lock, Type} of
+ {ok, build} ->
+ lager:debug("Starting build: ~p", [Index]),
+ fold_keys(Index, Self),
+ lager:debug("Finished build: ~p", [Index]),
+ gen_server:cast(Self, build_finished);
+ {ok, rehash} ->
+ lager:debug("Starting rehash: ~p", [Index]),
+ _ = [hashtree:rehash_tree(T) || {_,T} <- Trees],
+ lager:debug("Finished rehash: ~p", [Index]),
+ gen_server:cast(Self, build_finished);
+ {_Error, _} ->
+ gen_server:cast(Self, build_failed)
+ end.
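Putting the pieces together, a single entropy-manager poke drives the
whole build cycle; roughly (message flow, not literal code):

    poke (cast) -> do_poke/1 -> maybe_clear/1 -> maybe_build/1
      -> spawn_link build_or_rehash/2 (takes the manager lock)
      -> cast build_finished on success, build_failed otherwise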
21 src/yz_index_hashtree_sup.erl
@@ -0,0 +1,21 @@
+-module(yz_index_hashtree_sup).
+-behavior(supervisor).
+-include("yokozuna.hrl").
+-compile(export_all).
+-export([init/1]).
+
+%% @doc Get the list of hashtree pids.
+-spec trees() -> [tree()].
+trees() ->
+ Children = supervisor:which_children(?MODULE),
+ [Pid || {_,Pid,_,_} <- Children].
+
+start_link() ->
+ supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+init(_Args) ->
+ %% TODO: should shutdown be longer to account for leveldb?
+ Spec = {ignored,
+ {yz_index_hashtree, start_link, []},
+ temporary, 5000, worker, [yz_index_hashtree]},
+ {ok, {{simple_one_for_one, 10, 1}, [Spec]}}.
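With simple_one_for_one, children are started on demand; a sketch (the
argument list is whatever yz_index_hashtree:start_link expects, which is
outside this hunk):

    {ok, TreePid} = supervisor:start_child(yz_index_hashtree_sup, Args).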
163 src/yz_kv.erl
@@ -52,11 +52,38 @@ get(C, Bucket, Key) ->
Other
end.
+-spec hash_object(obj()) -> binary().
+hash_object(Obj) ->
+ Vclock = riak_object:vclock(Obj),
+ Obj2 = riak_object:set_vclock(Obj, lists:sort(Vclock)),
+ Hash = erlang:phash2(term_to_binary(Obj2)),
+ term_to_binary(Hash).
+
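Sorting the vclock gives the hash a canonical input, so entry order does
not matter; schematically (VcA and VcB stand for any two vclock entries):

    O1 = riak_object:set_vclock(Obj, [VcA, VcB]),
    O2 = riak_object:set_vclock(Obj, [VcB, VcA]),
    true = (hash_object(O1) =:= hash_object(O2)).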
%% @doc Get the content-type of the object.
-spec get_obj_ct(obj_metadata()) -> binary().
get_obj_ct(MD) ->
dict:fetch(<<"content-type">>, MD).
+-spec get_obj_bucket(obj()) -> binary().
+get_obj_bucket(Obj) ->
+ riak_object:bucket(Obj).
+
+-spec get_obj_key(obj()) -> binary().
+get_obj_key(Obj) ->
+ riak_object:key(Obj).
+
+-spec get_obj_md(obj()) -> undefined | dict().
+get_obj_md(Obj) ->
+ riak_object:get_metadata(Obj).
+
+-spec get_obj_value(obj()) -> binary().
+get_obj_value(Obj) ->
+ riak_object:get_value(Obj).
+
+-spec index_content(proplist()) -> boolean().
+index_content(BProps) ->
+ proplists:get_value(?YZ_INDEX_CONTENT, BProps, false).
+
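Since ?YZ_INDEX_CONTENT expands to the atom yz_index_content, content
indexing is opt-in per bucket; for example, against a raw prop list:

    true  = index_content([{yz_index_content, true}, {n_val, 3}]),
    false = index_content([{n_val, 3}]).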
%% @doc Determine if the `Obj' is a tombstone.
-spec is_tombstone(obj_metadata()) -> boolean().
is_tombstone(MD) ->
@@ -71,68 +98,109 @@ get_md_entry(MD, Key) ->
%% @doc An object modified hook to create indexes as object data is
%% written or modified.
%%
-%% NOTE: This code runs on the vnode process.
+%% NOTE: For a normal update this hook runs on the vnode process.
+%% During active anti-entropy it runs on a spawned process.
%%
%% NOTE: Index is doing double duty of index and delete.
-spec index(obj(), write_reason(), term()) -> ok.
-index(Obj, delete, _) ->
+index(Obj, delete, VNodeState) ->
+ Ring = yz_misc:get_ring(transformed),
{Bucket, Key} = BKey = {riak_object:bucket(Obj), riak_object:key(Obj)},
+ BProps = riak_core_bucket:get_bucket(Bucket, Ring),
+ NVal = riak_core_bucket:n_val(BProps),
+ Idx = riak_core_util:chash_key(BKey),
+ IdealPreflist = lists:sublist(riak_core_ring:preflist(Idx, Ring), NVal),
+    IdxN = {first_partition(IdealPreflist), NVal},
+
try
XML = yz_solr:encode_delete({key, Key}),
- yz_solr:delete_by_query(binary_to_list(Bucket), XML)
+ yz_solr:delete_by_query(which_index(Bucket, BProps), XML)
catch _:Err ->
?ERROR("failed to delete docid ~p with error ~p", [BKey, Err])
- end;
+ end,
+
+ update_hashtree(delete, get_partition(VNodeState), IdxN, BKey),
+ ok;
index(Obj, Reason, VNodeState) ->
- {ok, Ring} = riak_core_ring_manager:get_my_ring(),
+ Ring = yz_misc:get_ring(transformed),
LI = yz_cover:logical_index(Ring),
{Bucket, Key} = BKey = {riak_object:bucket(Obj), riak_object:key(Obj)},
- ok = maybe_wait(Reason, Bucket),
BProps = riak_core_bucket:get_bucket(Bucket, Ring),
+ Index = which_index(Bucket, BProps),
+ ok = maybe_wait(Reason, Index),
AllowMult = proplists:get_value(allow_mult, BProps),
NVal = riak_core_bucket:n_val(BProps),
Idx = riak_core_util:chash_key(BKey),
IdealPreflist = lists:sublist(riak_core_ring:preflist(Idx, Ring), NVal),
LFPN = yz_cover:logical_partition(LI, first_partition(IdealPreflist)),
- LP = yz_cover:logical_partition(LI, get_partition(VNodeState)),
- Docs = yz_doc:make_docs(Obj, ?INT_TO_BIN(LFPN), ?INT_TO_BIN(LP)),
+ P = get_partition(VNodeState),
+ LP = yz_cover:logical_partition(LI, P),
+ Docs = yz_doc:make_docs(Obj, ?INT_TO_BIN(LFPN), ?INT_TO_BIN(LP),
+ index_content(BProps)),
+
try
- ok = yz_solr:index(binary_to_list(Bucket), Docs),
+ ok = yz_solr:index(Index, Docs),
case riak_object:value_count(Obj) of
2 ->
case AllowMult of
true ->
- %% An object has crossed the threshold from being a single value
- %% Object, to a sibling value Object, delete the non-sibling ID
- DocID = binary_to_list(yz_doc:doc_id(Obj, ?INT_TO_BIN(LP))),
- yz_solr:delete(binary_to_list(Bucket), DocID);
+                        %% The object has crossed the threshold from a
+                        %% single-value object to a sibling-value object;
+                        %% delete the non-sibling doc ID
+ DocID = binary_to_list(yz_doc:doc_id(Obj,
+ ?INT_TO_BIN(LP))),
+ yz_solr:delete(Index, DocID);
_ -> ok
end;
- %% Delete any siblings
1 ->
+ %% Delete any siblings
XML = yz_solr:encode_delete({key, Key, siblings}),
- yz_solr:delete_by_query(binary_to_list(Bucket), XML);
- _ -> ok
+ yz_solr:delete_by_query(Index, XML);
+ _ ->
+ ok
end
catch _:Err ->
?ERROR("failed to index object ~p with error ~p", [BKey, Err])
+ end,
+
+ IdxN = {first_partition(IdealPreflist), NVal},
+ update_hashtree({insert, yz_kv:hash_object(Obj)}, P, IdxN, BKey),
+ ok.
+
+-spec update_hashtree(delete | {insert, binary()}, p(), {p(),n()},