diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index 475bc657dbb..db76032683f 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -14,6 +14,7 @@ set(FDBCLI_SRCS ProfileCommand.actor.cpp SetClassCommand.actor.cpp SnapshotCommand.actor.cpp + StatusCommand.actor.cpp SuspendCommand.actor.cpp ThrottleCommand.actor.cpp TriggerDDTeamInfoLogCommand.actor.cpp diff --git a/fdbcli/ProfileCommand.actor.cpp b/fdbcli/ProfileCommand.actor.cpp index 4c290c9913c..2ab56488f75 100644 --- a/fdbcli/ProfileCommand.actor.cpp +++ b/fdbcli/ProfileCommand.actor.cpp @@ -51,18 +51,16 @@ ACTOR Future profileCommandActor(Reference tr, std::vector sampleRateValue = - wait(safeThreadFutureToFuture(tr->get(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate)))); - if (sampleRateValue.present() && - !std::isinf(boost::lexical_cast(sampleRateValue.get().toString()))) { - sampleRateStr = sampleRateValue.get().toString(); + std::string sampleRateStr = "default"; + std::string sizeLimitStr = "default"; + const double sampleRateDbl = GlobalConfig::globalConfig().get( + fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); + if (!std::isinf(sampleRateDbl)) { + sampleRateStr = std::to_string(sampleRateDbl); } - Optional sizeLimitValue = - wait(safeThreadFutureToFuture(tr->get(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit)))); - if (sizeLimitValue.present() && boost::lexical_cast(sizeLimitValue.get().toString()) != -1) { - sizeLimitStr = sizeLimitValue.get().toString(); + const int64_t sizeLimit = GlobalConfig::globalConfig().get(fdbClientInfoTxnSizeLimit, -1); + if (sizeLimit != -1) { + sizeLimitStr = boost::lexical_cast(sizeLimit); } printf("Client profiling rate is set to %s and size limit is set to %s.\n", sampleRateStr.c_str(), diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp new file mode 100644 index 00000000000..37606308857 --- /dev/null +++ b/fdbcli/StatusCommand.actor.cpp @@ -0,0 +1,1237 @@ +/* + * StatusCommand.actor.cpp + * + 
* This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/StatusClient.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. +#include + +namespace { + +std::string getCoordinatorsInfoString(StatusObjectReader statusObj) { + std::string outputString; + try { + StatusArray coordinatorsArr = statusObj["client.coordinators.coordinators"].get_array(); + for (StatusObjectReader coor : coordinatorsArr) + outputString += format("\n %s (%s)", + coor["address"].get_str().c_str(), + coor["reachable"].get_bool() ? 
"reachable" : "unreachable"); + } catch (std::runtime_error&) { + outputString = "\n Unable to retrieve list of coordination servers"; + } + + return outputString; +} + +std::string lineWrap(const char* text, int col) { + const char* iter = text; + const char* start = text; + const char* space = nullptr; + std::string out = ""; + do { + iter++; + if (*iter == '\n' || *iter == ' ' || *iter == '\0') + space = iter; + if (*iter == '\n' || *iter == '\0' || (iter - start == col)) { + if (!space) + space = iter; + out += format("%.*s\n", (int)(space - start), start); + start = space; + if (*start == ' ' /* || *start == '\n'*/) + start++; + space = nullptr; + } + } while (*iter); + return out; +} + +std::pair getNumOfNonExcludedProcessAndZones(StatusObjectReader statusObjCluster) { + StatusObjectReader processesMap; + std::set zones; + int numOfNonExcludedProcesses = 0; + if (statusObjCluster.get("processes", processesMap)) { + for (auto proc : processesMap.obj()) { + StatusObjectReader process(proc.second); + if (process.has("excluded") && process.last().get_bool()) + continue; + numOfNonExcludedProcesses++; + std::string zoneId; + if (process.get("locality.zoneid", zoneId)) { + zones.insert(zoneId); + } + } + } + return { numOfNonExcludedProcesses, zones.size() }; +} + +int getNumofNonExcludedMachines(StatusObjectReader statusObjCluster) { + StatusObjectReader machineMap; + int numOfNonExcludedMachines = 0; + if (statusObjCluster.get("machines", machineMap)) { + for (auto mach : machineMap.obj()) { + StatusObjectReader machine(mach.second); + if (machine.has("excluded") && !machine.last().get_bool()) + numOfNonExcludedMachines++; + } + } + return numOfNonExcludedMachines; +} + +std::string getDateInfoString(StatusObjectReader statusObj, std::string key) { + time_t curTime; + if (!statusObj.has(key)) { + return ""; + } + curTime = statusObj.last().get_int64(); + char buffer[128]; + struct tm* timeinfo; + timeinfo = localtime(&curTime); + strftime(buffer, 128, "%m/%d/%y 
%H:%M:%S", timeinfo); + return std::string(buffer); +} + +std::string getProcessAddressByServerID(StatusObjectReader processesMap, std::string serverID) { + if (serverID == "") + return "unknown"; + + for (auto proc : processesMap.obj()) { + try { + StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); + for (StatusObjectReader role : rolesArray) { + if (role["id"].get_str().find(serverID) == 0) { + // If this next line throws, then we found the serverID but the role has no address, so the role is + // skipped. + return proc.second.get_obj()["address"].get_str(); + } + } + } catch (std::exception&) { + // If an entry in the process map is badly formed then something will throw. Since we are + // looking for a positive match, just ignore any read execeptions and move on to the next proc + } + } + return "unknown"; +} + +std::string getWorkloadRates(StatusObjectReader statusObj, + bool unknown, + std::string first, + std::string second, + bool transactionSection = false) { + // Re-point statusObj at either the transactions sub-doc or the operations sub-doc depending on transactionSection + // flag + if (transactionSection) { + if (!statusObj.get("transactions", statusObj)) + return "unknown"; + } else { + if (!statusObj.get("operations", statusObj)) + return "unknown"; + } + + std::string path = first + "." 
+ second; + double value; + if (!unknown && statusObj.get(path, value)) { + return format("%d Hz", (int)round(value)); + } + return "unknown"; +} + +void getBackupDRTags(StatusObjectReader& statusObjCluster, + const char* context, + std::map& tagMap) { + std::string path = format("layers.%s.tags", context); + StatusObjectReader tags; + if (statusObjCluster.tryGet(path, tags)) { + for (auto itr : tags.obj()) { + JSONDoc tag(itr.second); + bool running = false; + tag.tryGet("running_backup", running); + if (running) { + std::string uid; + if (tag.tryGet("mutation_stream_id", uid)) { + tagMap[itr.first] = uid; + } else { + tagMap[itr.first] = ""; + } + } + } + } +} + +std::string logBackupDR(const char* context, std::map const& tagMap) { + std::string outputString = ""; + if (tagMap.size() > 0) { + outputString += format("\n\n%s:", context); + for (auto itr : tagMap) { + outputString += format("\n %-22s", itr.first.c_str()); + if (itr.second.size() > 0) { + outputString += format(" - %s", itr.second.c_str()); + } + } + } + + return outputString; +} + +} // namespace + +namespace fdb_cli { + +void printStatus(StatusObjectReader statusObj, + StatusClient::StatusLevel level, + bool displayDatabaseAvailable, + bool hideErrorMessages) { + if (FlowTransport::transport().incompatibleOutgoingConnectionsPresent()) { + fprintf( + stderr, + "WARNING: One or more of the processes in the cluster is incompatible with this version of fdbcli.\n\n"); + } + + try { + bool printedCoordinators = false; + + // status or status details + if (level == StatusClient::NORMAL || level == StatusClient::DETAILED) { + + StatusObjectReader statusObjClient; + statusObj.get("client", statusObjClient); + + // The way the output string is assembled is to add new line character before addition to the string rather + // than after + std::string outputString = ""; + std::string clusterFilePath; + if (statusObjClient.get("cluster_file.path", clusterFilePath)) + outputString = format("Using cluster file 
`%s'.\n", clusterFilePath.c_str()); + else + outputString = "Using unknown cluster file.\n"; + + StatusObjectReader statusObjCoordinators; + StatusArray coordinatorsArr; + + if (statusObjClient.get("coordinators", statusObjCoordinators)) { + // Look for a second "coordinators", under the first one. + if (statusObjCoordinators.has("coordinators")) + coordinatorsArr = statusObjCoordinators.last().get_array(); + } + + // Check if any coordination servers are unreachable + bool quorum_reachable; + if (statusObjCoordinators.get("quorum_reachable", quorum_reachable) && !quorum_reachable) { + outputString += "\nCould not communicate with a quorum of coordination servers:"; + outputString += getCoordinatorsInfoString(statusObj); + + printf("%s\n", outputString.c_str()); + return; + } else { + for (StatusObjectReader coor : coordinatorsArr) { + bool reachable; + if (coor.get("reachable", reachable) && !reachable) { + outputString += "\nCould not communicate with all of the coordination servers." + "\n The database will remain operational as long as we" + "\n can connect to a quorum of servers, however the fault" + "\n tolerance of the system is reduced as long as the" + "\n servers remain disconnected.\n"; + outputString += getCoordinatorsInfoString(statusObj); + outputString += "\n"; + printedCoordinators = true; + break; + } + } + } + + // print any client messages + if (statusObjClient.has("messages")) { + for (StatusObjectReader message : statusObjClient.last().get_array()) { + std::string desc; + if (message.get("description", desc)) + outputString += "\n" + lineWrap(desc.c_str(), 80); + } + } + + bool fatalRecoveryState = false; + StatusObjectReader statusObjCluster; + try { + if (statusObj.get("cluster", statusObjCluster)) { + + StatusObjectReader recoveryState; + if (statusObjCluster.get("recovery_state", recoveryState)) { + std::string name; + std::string description; + if (recoveryState.get("name", name) && recoveryState.get("description", description) && + name 
!= "accepting_commits" && name != "all_logs_recruited" && + name != "storage_recovered" && name != "fully_recovered") { + fatalRecoveryState = true; + + if (name == "recruiting_transaction_servers") { + description += + format("\nNeed at least %d log servers across unique zones, %d commit proxies, " + "%d GRV proxies and %d resolvers.", + recoveryState["required_logs"].get_int(), + recoveryState["required_commit_proxies"].get_int(), + recoveryState["required_grv_proxies"].get_int(), + recoveryState["required_resolvers"].get_int()); + if (statusObjCluster.has("machines") && statusObjCluster.has("processes")) { + auto numOfNonExcludedProcessesAndZones = + getNumOfNonExcludedProcessAndZones(statusObjCluster); + description += + format("\nHave %d non-excluded processes on %d machines across %d zones.", + numOfNonExcludedProcessesAndZones.first, + getNumofNonExcludedMachines(statusObjCluster), + numOfNonExcludedProcessesAndZones.second); + } + } else if (name == "locking_old_transaction_servers" && + recoveryState["missing_logs"].get_str().size()) { + description += format("\nNeed one or more of the following log servers: %s", + recoveryState["missing_logs"].get_str().c_str()); + } + description = lineWrap(description.c_str(), 80); + if (!printedCoordinators && + (name == "reading_coordinated_state" || name == "locking_coordinated_state" || + name == "configuration_never_created" || name == "writing_coordinated_state")) { + description += getCoordinatorsInfoString(statusObj); + description += "\n"; + printedCoordinators = true; + } + + outputString += "\n" + description; + } + } + } + } catch (std::runtime_error&) { + } + + // Check if cluster controllable is reachable + try { + // print any cluster messages + if (statusObjCluster.has("messages") && statusObjCluster.last().get_array().size()) { + + // any messages we don't want to display + std::set skipMsgs = { "unreachable_process", "" }; + if (fatalRecoveryState) { + skipMsgs.insert("status_incomplete"); + 
skipMsgs.insert("unreadable_configuration"); + skipMsgs.insert("immediate_priority_transaction_start_probe_timeout"); + skipMsgs.insert("batch_priority_transaction_start_probe_timeout"); + skipMsgs.insert("transaction_start_probe_timeout"); + skipMsgs.insert("read_probe_timeout"); + skipMsgs.insert("commit_probe_timeout"); + } + + for (StatusObjectReader msgObj : statusObjCluster.last().get_array()) { + std::string messageName; + if (!msgObj.get("name", messageName)) { + continue; + } + if (skipMsgs.count(messageName)) { + continue; + } else if (messageName == "client_issues") { + if (msgObj.has("issues")) { + for (StatusObjectReader issue : msgObj["issues"].get_array()) { + std::string issueName; + if (!issue.get("name", issueName)) { + continue; + } + + std::string description; + if (!issue.get("description", description)) { + description = issueName; + } + + std::string countStr; + StatusArray addresses; + if (!issue.has("addresses")) { + countStr = "Some client(s)"; + } else { + addresses = issue["addresses"].get_array(); + countStr = format("%d client(s)", addresses.size()); + } + outputString += + format("\n%s reported: %s\n", countStr.c_str(), description.c_str()); + + if (level == StatusClient::StatusLevel::DETAILED) { + for (int i = 0; i < addresses.size() && i < 4; ++i) { + outputString += format(" %s\n", addresses[i].get_str().c_str()); + } + if (addresses.size() > 4) { + outputString += " ...\n"; + } + } + } + } + } else { + if (msgObj.has("description")) + outputString += "\n" + lineWrap(msgObj.last().get_str().c_str(), 80); + } + } + } + } catch (std::runtime_error&) { + } + + if (fatalRecoveryState) { + printf("%s", outputString.c_str()); + return; + } + + StatusObjectReader statusObjConfig; + StatusArray excludedServersArr; + Optional activePrimaryDC; + + if (statusObjCluster.has("active_primary_dc")) { + activePrimaryDC = statusObjCluster["active_primary_dc"].get_str(); + } + if (statusObjCluster.get("configuration", statusObjConfig)) { + if 
(statusObjConfig.has("excluded_servers")) + excludedServersArr = statusObjConfig.last().get_array(); + } + + // If there is a configuration message then there is no configuration information to display + outputString += "\nConfiguration:"; + std::string outputStringCache = outputString; + bool isOldMemory = false; + try { + // Configuration section + // FIXME: Should we suppress this if there are cluster messages implying that the database has no + // configuration? + + outputString += "\n Redundancy mode - "; + std::string strVal; + + if (statusObjConfig.get("redundancy_mode", strVal)) { + outputString += strVal; + } else + outputString += "unknown"; + + outputString += "\n Storage engine - "; + if (statusObjConfig.get("storage_engine", strVal)) { + if (strVal == "memory-1") { + isOldMemory = true; + } + outputString += strVal; + } else + outputString += "unknown"; + + int intVal; + outputString += "\n Coordinators - "; + if (statusObjConfig.get("coordinators_count", intVal)) { + outputString += std::to_string(intVal); + } else + outputString += "unknown"; + + if (excludedServersArr.size()) { + outputString += format("\n Exclusions - %d (type `exclude' for details)", + excludedServersArr.size()); + } + + if (statusObjConfig.get("commit_proxies", intVal)) + outputString += format("\n Desired Commit Proxies - %d", intVal); + + if (statusObjConfig.get("grv_proxies", intVal)) + outputString += format("\n Desired GRV Proxies - %d", intVal); + + if (statusObjConfig.get("resolvers", intVal)) + outputString += format("\n Desired Resolvers - %d", intVal); + + if (statusObjConfig.get("logs", intVal)) + outputString += format("\n Desired Logs - %d", intVal); + + if (statusObjConfig.get("remote_logs", intVal)) + outputString += format("\n Desired Remote Logs - %d", intVal); + + if (statusObjConfig.get("log_routers", intVal)) + outputString += format("\n Desired Log Routers - %d", intVal); + + if (statusObjConfig.get("tss_count", intVal) && intVal > 0) { + int activeTss = 0; + 
if (statusObjCluster.has("active_tss_count")) { + statusObjCluster.get("active_tss_count", activeTss); + } + outputString += format("\n TSS - %d/%d", activeTss, intVal); + + if (statusObjConfig.get("tss_storage_engine", strVal)) + outputString += format("\n TSS Storage Engine - %s", strVal.c_str()); + } + + outputString += "\n Usable Regions - "; + if (statusObjConfig.get("usable_regions", intVal)) { + outputString += std::to_string(intVal); + } else { + outputString += "unknown"; + } + + StatusArray regions; + if (statusObjConfig.has("regions")) { + outputString += "\n Regions: "; + regions = statusObjConfig["regions"].get_array(); + for (StatusObjectReader region : regions) { + bool isPrimary = false; + std::vector regionSatelliteDCs; + std::string regionDC; + for (StatusObjectReader dc : region["datacenters"].get_array()) { + if (!dc.has("satellite")) { + regionDC = dc["id"].get_str(); + if (activePrimaryDC.present() && dc["id"].get_str() == activePrimaryDC.get()) { + isPrimary = true; + } + } else if (dc["satellite"].get_int() == 1) { + regionSatelliteDCs.push_back(dc["id"].get_str()); + } + } + if (activePrimaryDC.present()) { + if (isPrimary) { + outputString += "\n Primary -"; + } else { + outputString += "\n Remote -"; + } + } else { + outputString += "\n Region -"; + } + outputString += format("\n Datacenter - %s", regionDC.c_str()); + if (regionSatelliteDCs.size() > 0) { + outputString += "\n Satellite datacenters - "; + for (int i = 0; i < regionSatelliteDCs.size(); i++) { + if (i != regionSatelliteDCs.size() - 1) { + outputString += format("%s, ", regionSatelliteDCs[i].c_str()); + } else { + outputString += format("%s", regionSatelliteDCs[i].c_str()); + } + } + } + isPrimary = false; + if (region.get("satellite_redundancy_mode", strVal)) { + outputString += format("\n Satellite Redundancy Mode - %s", strVal.c_str()); + } + if (region.get("satellite_anti_quorum", intVal)) { + outputString += format("\n Satellite Anti Quorum - %d", intVal); + } + if 
(region.get("satellite_logs", intVal)) { + outputString += format("\n Satellite Logs - %d", intVal); + } + if (region.get("satellite_log_policy", strVal)) { + outputString += format("\n Satellite Log Policy - %s", strVal.c_str()); + } + if (region.get("satellite_log_replicas", intVal)) { + outputString += format("\n Satellite Log Replicas - %d", intVal); + } + if (region.get("satellite_usable_dcs", intVal)) { + outputString += format("\n Satellite Usable DCs - %d", intVal); + } + } + } + } catch (std::runtime_error&) { + outputString = outputStringCache; + outputString += "\n Unable to retrieve configuration status"; + } + + // Cluster section + outputString += "\n\nCluster:"; + StatusObjectReader processesMap; + StatusObjectReader machinesMap; + + outputStringCache = outputString; + + bool machinesAreZones = true; + std::map zones; + try { + outputString += "\n FoundationDB processes - "; + if (statusObjCluster.get("processes", processesMap)) { + + outputString += format("%d", processesMap.obj().size()); + + int errors = 0; + int processExclusions = 0; + for (auto p : processesMap.obj()) { + StatusObjectReader process(p.second); + bool excluded = process.has("excluded") && process.last().get_bool(); + if (excluded) { + processExclusions++; + } + if (process.has("messages") && process.last().get_array().size()) { + errors++; + } + + std::string zoneId; + if (process.get("locality.zoneid", zoneId)) { + std::string machineId; + if (!process.get("locality.machineid", machineId) || machineId != zoneId) { + machinesAreZones = false; + } + int& nonExcluded = zones[zoneId]; + if (!excluded) { + nonExcluded = 1; + } + } + } + + if (errors > 0 || processExclusions) { + outputString += format(" (less %d excluded; %d with errors)", processExclusions, errors); + } + + } else + outputString += "unknown"; + + if (zones.size() > 0) { + outputString += format("\n Zones - %d", zones.size()); + int zoneExclusions = 0; + for (auto itr : zones) { + if (itr.second == 0) { + 
++zoneExclusions; + } + } + if (zoneExclusions > 0) { + outputString += format(" (less %d excluded)", zoneExclusions); + } + } else { + outputString += "\n Zones - unknown"; + } + + outputString += "\n Machines - "; + if (statusObjCluster.get("machines", machinesMap)) { + outputString += format("%d", machinesMap.obj().size()); + + int machineExclusions = 0; + for (auto mach : machinesMap.obj()) { + StatusObjectReader machine(mach.second); + if (machine.has("excluded") && machine.last().get_bool()) + machineExclusions++; + } + + if (machineExclusions) { + outputString += format(" (less %d excluded)", machineExclusions); + } + + int64_t minMemoryAvailable = std::numeric_limits::max(); + for (auto proc : processesMap.obj()) { + StatusObjectReader process(proc.second); + int64_t availBytes; + if (process.get("memory.available_bytes", availBytes)) { + minMemoryAvailable = std::min(minMemoryAvailable, availBytes); + } + } + + if (minMemoryAvailable < std::numeric_limits::max()) { + double worstServerGb = minMemoryAvailable / (1024.0 * 1024 * 1024); + outputString += "\n Memory availability - "; + outputString += format("%.1f GB per process on machine with least available", worstServerGb); + outputString += minMemoryAvailable < 4294967296 + ? 
"\n >>>>> (WARNING: 4.0 GB recommended) <<<<<" + : ""; + } + + double retransCount = 0; + for (auto mach : machinesMap.obj()) { + StatusObjectReader machine(mach.second); + double hz; + if (machine.get("network.tcp_segments_retransmitted.hz", hz)) + retransCount += hz; + } + + if (retransCount > 0) { + outputString += format("\n Retransmissions rate - %d Hz", (int)round(retransCount)); + } + } else + outputString += "\n Machines - unknown"; + + StatusObjectReader faultTolerance; + if (statusObjCluster.get("fault_tolerance", faultTolerance)) { + int availLoss, dataLoss; + + if (faultTolerance.get("max_zone_failures_without_losing_availability", availLoss) && + faultTolerance.get("max_zone_failures_without_losing_data", dataLoss)) { + + outputString += "\n Fault Tolerance - "; + + int minLoss = std::min(availLoss, dataLoss); + const char* faultDomain = machinesAreZones ? "machine" : "zone"; + outputString += format("%d %ss", minLoss, faultDomain); + + if (dataLoss > availLoss) { + outputString += format(" (%d without data loss)", dataLoss); + } + + if (dataLoss == -1) { + ASSERT_WE_THINK(availLoss == -1); + outputString += format( + "\n\n Warning: the database may have data loss and availability loss. Please restart " + "following tlog interfaces, otherwise storage servers may never be able to catch " + "up.\n"); + StatusObjectReader logs; + if (statusObjCluster.has("logs")) { + for (StatusObjectReader logEpoch : statusObjCluster.last().get_array()) { + bool possiblyLosingData; + if (logEpoch.get("possibly_losing_data", possiblyLosingData) && + !possiblyLosingData) { + continue; + } + // Current epoch doesn't have an end version. 
+ int64_t epoch, beginVersion, endVersion = invalidVersion; + bool current; + logEpoch.get("epoch", epoch); + logEpoch.get("begin_version", beginVersion); + logEpoch.get("end_version", endVersion); + logEpoch.get("current", current); + std::string missing_log_interfaces; + if (logEpoch.has("log_interfaces")) { + for (StatusObjectReader logInterface : logEpoch.last().get_array()) { + bool healthy; + std::string address, id; + if (logInterface.get("healthy", healthy) && !healthy) { + logInterface.get("id", id); + logInterface.get("address", address); + missing_log_interfaces += format("%s,%s ", id.c_str(), address.c_str()); + } + } + } + outputString += format( + " %s log epoch: %ld begin: %ld end: %s, missing " + "log interfaces(id,address): %s\n", + current ? "Current" : "Old", + epoch, + beginVersion, + endVersion == invalidVersion ? "(unknown)" : format("%ld", endVersion).c_str(), + missing_log_interfaces.c_str()); + } + } + } + } + } + + std::string serverTime = getDateInfoString(statusObjCluster, "cluster_controller_timestamp"); + if (serverTime != "") { + outputString += "\n Server time - " + serverTime; + } + } catch (std::runtime_error&) { + outputString = outputStringCache; + outputString += "\n Unable to retrieve cluster status"; + } + + StatusObjectReader statusObjData; + statusObjCluster.get("data", statusObjData); + + // Data section + outputString += "\n\nData:"; + outputStringCache = outputString; + try { + outputString += "\n Replication health - "; + + StatusObjectReader statusObjDataState; + statusObjData.get("state", statusObjDataState); + + std::string dataState; + statusObjDataState.get("name", dataState); + + std::string description = ""; + statusObjDataState.get("description", description); + + bool healthy; + if (statusObjDataState.get("healthy", healthy) && healthy) { + outputString += "Healthy" + (description != "" ? 
" (" + description + ")" : ""); + } else if (dataState == "missing_data") { + outputString += "UNHEALTHY" + (description != "" ? ": " + description : ""); + } else if (dataState == "healing") { + outputString += "HEALING" + (description != "" ? ": " + description : ""); + } else if (description != "") { + outputString += description; + } else { + outputString += "unknown"; + } + + if (statusObjData.has("moving_data")) { + StatusObjectReader movingData = statusObjData.last(); + double dataInQueue, dataInFlight; + if (movingData.get("in_queue_bytes", dataInQueue) && + movingData.get("in_flight_bytes", dataInFlight)) + outputString += format("\n Moving data - %.3f GB", + ((double)dataInQueue + (double)dataInFlight) / 1e9); + } else if (dataState == "initializing") { + outputString += "\n Moving data - unknown (initializing)"; + } else { + outputString += "\n Moving data - unknown"; + } + + outputString += "\n Sum of key-value sizes - "; + + if (statusObjData.has("total_kv_size_bytes")) { + double totalDBBytes = statusObjData.last().get_int64(); + + if (totalDBBytes >= 1e12) + outputString += format("%.3f TB", (totalDBBytes / 1e12)); + + else if (totalDBBytes >= 1e9) + outputString += format("%.3f GB", (totalDBBytes / 1e9)); + + else + // no decimal points for MB + outputString += format("%d MB", (int)round(totalDBBytes / 1e6)); + } else { + outputString += "unknown"; + } + + outputString += "\n Disk space used - "; + + if (statusObjData.has("total_disk_used_bytes")) { + double totalDiskUsed = statusObjData.last().get_int64(); + + if (totalDiskUsed >= 1e12) + outputString += format("%.3f TB", (totalDiskUsed / 1e12)); + + else if (totalDiskUsed >= 1e9) + outputString += format("%.3f GB", (totalDiskUsed / 1e9)); + + else + // no decimal points for MB + outputString += format("%d MB", (int)round(totalDiskUsed / 1e6)); + } else + outputString += "unknown"; + + } catch (std::runtime_error&) { + outputString = outputStringCache; + outputString += "\n Unable to retrieve data 
status"; + } + + // Operating space section + outputString += "\n\nOperating space:"; + std::string operatingSpaceString = ""; + try { + int64_t val; + if (statusObjData.get("least_operating_space_bytes_storage_server", val)) + operatingSpaceString += format("\n Storage server - %.1f GB free on most full server", + std::max(val / 1e9, 0.0)); + + if (statusObjData.get("least_operating_space_bytes_log_server", val)) + operatingSpaceString += format("\n Log server - %.1f GB free on most full server", + std::max(val / 1e9, 0.0)); + + } catch (std::runtime_error&) { + operatingSpaceString = ""; + } + + if (operatingSpaceString.empty()) { + operatingSpaceString += "\n Unable to retrieve operating space status"; + } + outputString += operatingSpaceString; + + // Workload section + outputString += "\n\nWorkload:"; + outputStringCache = outputString; + bool foundLogAndStorage = false; + try { + // Determine which rates are unknown + StatusObjectReader statusObjWorkload; + statusObjCluster.get("workload", statusObjWorkload); + + std::string performanceLimited = ""; + bool unknownMCT = false; + bool unknownRP = false; + + // Print performance limit details if known. + try { + StatusObjectReader limit = statusObjCluster["qos.performance_limited_by"]; + std::string name = limit["name"].get_str(); + if (name != "workload") { + std::string desc = limit["description"].get_str(); + std::string serverID; + limit.get("reason_server_id", serverID); + std::string procAddr = getProcessAddressByServerID(processesMap, serverID); + performanceLimited = format("\n Performance limited by %s: %s", + (procAddr == "unknown") + ? ("server" + (serverID == "" ? "" : (" " + serverID))).c_str() + : "process", + desc.c_str()); + if (procAddr != "unknown") + performanceLimited += format("\n Most limiting process: %s", procAddr.c_str()); + } + } catch (std::exception&) { + // If anything here throws (such as for an incompatible type) ignore it. 
+ } + + // display the known rates + outputString += "\n Read rate - "; + outputString += getWorkloadRates(statusObjWorkload, unknownRP, "reads", "hz"); + + outputString += "\n Write rate - "; + outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "writes", "hz"); + + outputString += "\n Transactions started - "; + outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "started", "hz", true); + + outputString += "\n Transactions committed - "; + outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "committed", "hz", true); + + outputString += "\n Conflict rate - "; + outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "conflicted", "hz", true); + + outputString += unknownRP ? "" : performanceLimited; + + // display any process messages + // FIXME: Above comment is not what this code block does, it actually just looks for a specific message + // in the process map, *by description*, and adds process addresses that have it to a vector. Either + // change the comment or the code. + std::vector messagesAddrs; + for (auto proc : processesMap.obj()) { + StatusObjectReader process(proc.second); + if (process.has("roles")) { + StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); + bool storageRole = false; + bool logRole = false; + for (StatusObjectReader role : rolesArray) { + if (role["role"].get_str() == "storage") { + storageRole = true; + } else if (role["role"].get_str() == "log") { + logRole = true; + } + } + if (storageRole && logRole) { + foundLogAndStorage = true; + } + } + if (process.has("messages")) { + StatusArray processMessagesArr = process.last().get_array(); + if (processMessagesArr.size()) { + for (StatusObjectReader msg : processMessagesArr) { + std::string desc; + std::string addr; + if (msg.get("description", desc) && desc == "Unable to update cluster file." 
&& + process.get("address", addr)) { + messagesAddrs.push_back(addr); + } + } + } + } + } + if (messagesAddrs.size()) { + outputString += format("\n\n%d FoundationDB processes reported unable to update cluster file:", + messagesAddrs.size()); + for (auto msg : messagesAddrs) { + outputString += "\n " + msg; + } + } + } catch (std::runtime_error&) { + outputString = outputStringCache; + outputString += "\n Unable to retrieve workload status"; + } + + // Backup and DR section + outputString += "\n\nBackup and DR:"; + + std::map backupTags; + getBackupDRTags(statusObjCluster, "backup", backupTags); + + std::map drPrimaryTags; + getBackupDRTags(statusObjCluster, "dr_backup", drPrimaryTags); + + std::map drSecondaryTags; + getBackupDRTags(statusObjCluster, "dr_backup_dest", drSecondaryTags); + + outputString += format("\n Running backups - %d", backupTags.size()); + outputString += format("\n Running DRs - "); + + if (drPrimaryTags.size() == 0 && drSecondaryTags.size() == 0) { + outputString += format("%d", 0); + } else { + if (drPrimaryTags.size() > 0) { + outputString += format("%d as primary", drPrimaryTags.size()); + if (drSecondaryTags.size() > 0) { + outputString += ", "; + } + } + if (drSecondaryTags.size() > 0) { + outputString += format("%d as secondary", drSecondaryTags.size()); + } + } + + // status details + if (level == StatusClient::DETAILED) { + outputString += logBackupDR("Running backup tags", backupTags); + outputString += logBackupDR("Running DR tags (as primary)", drPrimaryTags); + outputString += logBackupDR("Running DR tags (as secondary)", drSecondaryTags); + + outputString += "\n\nProcess performance details:"; + outputStringCache = outputString; + try { + // constructs process performance details output + std::map workerDetails; + for (auto proc : processesMap.obj()) { + StatusObjectReader procObj(proc.second); + std::string address; + procObj.get("address", address); + + std::string line; + + NetworkAddress parsedAddress; + try { + 
parsedAddress = NetworkAddress::parse(address); + } catch (Error&) { + // Groups all invalid IP address/port pair in the end of this detail group. + line = format(" %-22s (invalid IP address or port)", address.c_str()); + IPAddress::IPAddressStore maxIp; + for (int i = 0; i < maxIp.size(); ++i) { + maxIp[i] = std::numeric_limits::type>::max(); + } + std::string& lastline = + workerDetails[NetworkAddress(IPAddress(maxIp), std::numeric_limits::max())]; + if (!lastline.empty()) + lastline.append("\n"); + lastline += line; + continue; + } + + try { + double tx = -1, rx = -1, mCPUUtil = -1; + int64_t processTotalSize; + + // Get the machine for this process + // StatusObjectReader mach = machinesMap[procObj["machine_id"].get_str()]; + StatusObjectReader mach; + if (machinesMap.get(procObj["machine_id"].get_str(), mach, false)) { + StatusObjectReader machCPU; + if (mach.get("cpu", machCPU)) { + + machCPU.get("logical_core_utilization", mCPUUtil); + + StatusObjectReader network; + if (mach.get("network", network)) { + network.get("megabits_sent.hz", tx); + network.get("megabits_received.hz", rx); + } + } + } + + procObj.get("memory.used_bytes", processTotalSize); + + StatusObjectReader procCPUObj; + procObj.get("cpu", procCPUObj); + + line = format(" %-22s (", address.c_str()); + + double usageCores; + if (procCPUObj.get("usage_cores", usageCores)) + line += format("%3.0f%% cpu;", usageCores * 100); + + line += mCPUUtil != -1 ? format("%3.0f%% machine;", mCPUUtil * 100) : ""; + line += std::min(tx, rx) != -1 ? format("%6.3f Gbps;", std::max(tx, rx) / 1000.0) : ""; + + double diskBusy; + if (procObj.get("disk.busy", diskBusy)) + line += format("%3.0f%% disk IO;", 100.0 * diskBusy); + + line += processTotalSize != -1 + ? 
format("%4.1f GB", processTotalSize / (1024.0 * 1024 * 1024)) + : ""; + + double availableBytes; + if (procObj.get("memory.available_bytes", availableBytes)) + line += format(" / %3.1f GB RAM )", availableBytes / (1024.0 * 1024 * 1024)); + else + line += " )"; + + if (procObj.has("messages")) { + for (StatusObjectReader message : procObj.last().get_array()) { + std::string desc; + if (message.get("description", desc)) { + if (message.has("type")) { + line += "\n Last logged error: " + desc; + } else { + line += "\n " + desc; + } + } + } + } + + workerDetails[parsedAddress] = line; + } + + catch (std::runtime_error&) { + std::string noMetrics = format(" %-22s (no metrics available)", address.c_str()); + workerDetails[parsedAddress] = noMetrics; + } + } + for (auto w : workerDetails) + outputString += "\n" + format("%s", w.second.c_str()); + } catch (std::runtime_error&) { + outputString = outputStringCache; + outputString += "\n Unable to retrieve process performance details"; + } + + if (!printedCoordinators) { + printedCoordinators = true; + outputString += "\n\nCoordination servers:"; + outputString += getCoordinatorsInfoString(statusObj); + } + } + + // client time + std::string clientTime = getDateInfoString(statusObjClient, "timestamp"); + if (clientTime != "") { + outputString += "\n\nClient time: " + clientTime; + } + + if (processesMap.obj().size() > 1 && isOldMemory) { + outputString += "\n\nWARNING: type `configure memory' to switch to a safer method of persisting data " + "on the transaction logs."; + } + if (processesMap.obj().size() > 9 && foundLogAndStorage) { + outputString += + "\n\nWARNING: A single process is both a transaction log and a storage server.\n For best " + "performance use dedicated disks for the transaction logs by setting process classes."; + } + + if (statusObjCluster.has("data_distribution_disabled")) { + outputString += "\n\nWARNING: Data distribution is off."; + } else { + if 
(statusObjCluster.has("data_distribution_disabled_for_ss_failures")) { + outputString += "\n\nWARNING: Data distribution is currently turned on but disabled for all " + "storage server failures."; + } + if (statusObjCluster.has("data_distribution_disabled_for_rebalance")) { + outputString += "\n\nWARNING: Data distribution is currently turned on but shard size balancing is " + "currently disabled."; + } + } + + printf("%s\n", outputString.c_str()); + } + + // status minimal + else if (level == StatusClient::MINIMAL) { + // Checking for field exsistence is not necessary here because if a field is missing there is no additional + // information that we would be able to display if we continued execution. Instead, any missing fields will + // throw and the catch will display the proper message. + try { + // If any of these throw, can't get status because the result makes no sense. + StatusObjectReader statusObjClient = statusObj["client"].get_obj(); + StatusObjectReader statusObjClientDatabaseStatus = statusObjClient["database_status"].get_obj(); + + bool available = statusObjClientDatabaseStatus["available"].get_bool(); + + // Database unavailable + if (!available) { + printf("%s", "The database is unavailable; type `status' for more information.\n"); + } else { + try { + bool healthy = statusObjClientDatabaseStatus["healthy"].get_bool(); + + // Database available without issues + if (healthy) { + if (displayDatabaseAvailable) { + printf("The database is available.\n"); + } + } else { // Database running but with issues + printf("The database is available, but has issues (type 'status' for more information).\n"); + } + } catch (std::runtime_error&) { + printf("The database is available, but has issues (type 'status' for more information).\n"); + } + } + + bool upToDate; + if (!statusObjClient.get("cluster_file.up_to_date", upToDate) || !upToDate) { + fprintf(stderr, + "WARNING: The cluster file is not up to date. 
Type 'status' for more information.\n");
+				}
+			} catch (std::runtime_error&) {
+				printf("Unable to determine database state, type 'status' for more information.\n");
+			}
+
+		}
+
+		// status JSON
+		else if (level == StatusClient::JSON) {
+			printf("%s\n",
+			       json_spirit::write_string(json_spirit::mValue(statusObj.obj()),
+			                                 json_spirit::Output_options::pretty_print)
+			           .c_str());
+		}
+	} catch (Error&) {
+		if (hideErrorMessages)
+			return;
+		if (level == StatusClient::MINIMAL) {
+			printf("Unable to determine database state, type 'status' for more information.\n");
+		} else if (level == StatusClient::JSON) {
+			printf("Could not retrieve status json.\n\n");
+		} else {
+			printf("Could not retrieve status, type 'status json' for more information.\n");
+		}
+	}
+}
+
+// Implements the fdbcli `status [minimal|details|json]' command.
+//
+// db         - the handle to the multiversion database; it creates the transaction used to read the
+//              status JSON from the "\xff\xff/status/json" special key.
+// localDb    - the native Database object. It is only needed when the transaction created from "db"
+//              is not valid, i.e. "db" has not established a connection to the cluster. In that case
+//              an operation on "db" would return Never, but the status command is expected to always
+//              return, so the default result is fetched through StatusClient::statusFetcher(localDb).
+// tokens     - the tokenized command; tokens[1], when present, selects the status level
+//              ("details", "minimal" or "json"). Any other shape prints the usage for tokens[0].
+// isExecMode - when true, the blank lines otherwise printed before and after the status are omitted.
+//
+// Returns false if the arguments are malformed or if no status JSON could be read from the cluster,
+// and true once the status has been printed.
+ACTOR Future<bool> statusCommandActor(Reference<IDatabase> db,
+                                      Database localDb,
+                                      std::vector<StringRef> tokens,
+                                      bool isExecMode) {
+
+	state StatusClient::StatusLevel level;
+	if (tokens.size() == 1)
+		level = StatusClient::NORMAL;
+	else if (tokens.size() == 2 && tokencmp(tokens[1], "details"))
+		level = StatusClient::DETAILED;
+	else if (tokens.size() == 2 && tokencmp(tokens[1], "minimal"))
+		level = StatusClient::MINIMAL;
+	else if (tokens.size() == 2 && tokencmp(tokens[1], "json"))
+		level = StatusClient::JSON;
+	else {
+		printUsage(tokens[0]);
+		return false;
+	}
+
+	state StatusObject s;
+	state Reference<ITransaction> tr = db->createTransaction();
+	if (!tr->isValid()) {
+		// No usable transaction: fall back to the native database so that the command still returns.
+		StatusObject _s = wait(StatusClient::statusFetcher(localDb));
+		s = _s;
+	} else {
+		state ThreadFuture<Optional<Value>> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json"));
+		Optional<Value> statusValue = wait(safeThreadFutureToFuture(statusValueF));
+		if (!statusValue.present()) {
+			fprintf(stderr, "ERROR: Failed to get status json from the cluster\n");
+			// There is no value to parse, so do not fall through to statusValue.get() below.
+			return false;
+		}
+		json_spirit::mValue mv;
+		json_spirit::read_string(statusValue.get().toString(), mv);
+		s = StatusObject(mv.get_obj());
+	}
+
+	if (!isExecMode)
+		printf("\n");
+	printStatus(s, level);
+	if (!isExecMode)
+		printf("\n");
+	return true;
+}
+
+CommandFactory statusFactory(
+    "status",
+    CommandHelp("status [minimal|details|json]",
+                "get the status of a FoundationDB cluster",
+                "If the cluster is down, this command will print a diagnostic which may be useful in figuring out "
+                "what is wrong. If the cluster is running, this command will print cluster "
+                "statistics.\n\nSpecifying `minimal' will provide a minimal description of the status of your "
+                "database.\n\nSpecifying `details' will provide load information for individual "
+                "workers.\n\nSpecifying `json' will provide status information in a machine readable JSON format."));
+} // namespace fdb_cli
\ No newline at end of file
diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 42501fd948f..c4a269dcfce 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -30,7 +30,6 @@ #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/IKnobCollection.h" #include "fdbclient/NativeAPI.actor.h" -#include "fdbclient/ReadYourWrites.h" #include "fdbclient/ClusterInterface.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/Schemas.h" @@ -41,6 +40,7 @@ #include "fdbclient/ThreadSafeTransaction.h" #include "flow/DeterministicRandom.h" +#include "flow/FastRef.h" #include "flow/Platform.h" #include "flow/TLSConfig.actor.h" @@ -140,41 +140,17 @@ void printAtCol(const char* text, int col) { } while (*iter); } -std::string lineWrap(const char* text, int col) { - const char* iter = text; - const char* start = text; - const char* space = nullptr; - std::string out = ""; - do { - iter++; - if (*iter == '\n' || *iter == ' ' || *iter == '\0') - space = iter; - if (*iter == '\n'
|| *iter == '\0' || (iter - start == col)) { - if (!space) - space = iter; - out += format("%.*s\n", (int)(space - start), start); - start = space; - if (*start == ' ' /* || *start == '\n'*/) - start++; - space = nullptr; - } - } while (*iter); - return out; -} - class FdbOptions { public: // Prints an error and throws invalid_option or invalid_option_value if the option could not be set - // TODO: remove Reference after we refactor all fdbcli code - void setOption(Reference tr, - Reference tr2, + void setOption(Reference tr, StringRef optionStr, bool enabled, Optional arg, bool intrans) { auto transactionItr = transactionOptions.legalOptions.find(optionStr.toString()); if (transactionItr != transactionOptions.legalOptions.end()) - setTransactionOption(tr, tr2, transactionItr->second, enabled, arg, intrans); + setTransactionOption(tr, transactionItr->second, enabled, arg, intrans); else { fprintf(stderr, "ERROR: invalid option '%s'. Try `help options' for a list of available options.\n", @@ -184,13 +160,6 @@ class FdbOptions { } // Applies all enabled transaction options to the given transaction - void apply(Reference tr) { - for (const auto& [name, value] : transactionOptions.options) { - tr->setOption(name, value.castTo()); - } - } - - // TODO: replace the above function after we refactor all fdbcli code void apply(Reference tr) { for (const auto& [name, value] : transactionOptions.options) { tr->setOption(name, value.castTo()); @@ -217,9 +186,7 @@ class FdbOptions { private: // Sets a transaction option. If intrans == true, then this option is also applied to the passed in transaction. 
- // TODO: remove Reference after we refactor all fdbcli code - void setTransactionOption(Reference tr, - Reference tr2, + void setTransactionOption(Reference tr, FDBTransactionOptions::Option option, bool enabled, Optional arg, @@ -231,7 +198,6 @@ class FdbOptions { if (intrans) { tr->setOption(option, arg); - tr2->setOption(option, arg); } transactionOptions.setOption(option, enabled, arg.castTo()); @@ -573,14 +539,6 @@ void initHelp() { "pair in or any LocalityData (like dcid, zoneid, machineid, processid), removes any " "matching exclusions from the excluded servers and localities list. " "(A specified IP will match all IP:* exclusion entries)"); - helpMap["status"] = - CommandHelp("status [minimal|details|json]", - "get the status of a FoundationDB cluster", - "If the cluster is down, this command will print a diagnostic which may be useful in figuring out " - "what is wrong. If the cluster is running, this command will print cluster " - "statistics.\n\nSpecifying `minimal' will provide a minimal description of the status of your " - "database.\n\nSpecifying `details' will provide load information for individual " - "workers.\n\nSpecifying `json' will provide status information in a machine readable JSON format."); helpMap["exit"] = CommandHelp("exit", "exit the CLI", ""); helpMap["quit"] = CommandHelp(); helpMap["waitconnected"] = CommandHelp(); @@ -676,1126 +634,6 @@ void printHelp(StringRef command) { printf("I don't know anything about `%s'\n", formatStringRef(command).c_str()); } -std::string getCoordinatorsInfoString(StatusObjectReader statusObj) { - std::string outputString; - try { - StatusArray coordinatorsArr = statusObj["client.coordinators.coordinators"].get_array(); - for (StatusObjectReader coor : coordinatorsArr) - outputString += format("\n %s (%s)", - coor["address"].get_str().c_str(), - coor["reachable"].get_bool() ? 
"reachable" : "unreachable"); - } catch (std::runtime_error&) { - outputString = "\n Unable to retrieve list of coordination servers"; - } - - return outputString; -} - -std::string getDateInfoString(StatusObjectReader statusObj, std::string key) { - time_t curTime; - if (!statusObj.has(key)) { - return ""; - } - curTime = statusObj.last().get_int64(); - char buffer[128]; - struct tm* timeinfo; - timeinfo = localtime(&curTime); - strftime(buffer, 128, "%m/%d/%y %H:%M:%S", timeinfo); - return std::string(buffer); -} - -std::string getProcessAddressByServerID(StatusObjectReader processesMap, std::string serverID) { - if (serverID == "") - return "unknown"; - - for (auto proc : processesMap.obj()) { - try { - StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); - for (StatusObjectReader role : rolesArray) { - if (role["id"].get_str().find(serverID) == 0) { - // If this next line throws, then we found the serverID but the role has no address, so the role is - // skipped. - return proc.second.get_obj()["address"].get_str(); - } - } - } catch (std::exception&) { - // If an entry in the process map is badly formed then something will throw. Since we are - // looking for a positive match, just ignore any read execeptions and move on to the next proc - } - } - return "unknown"; -} - -std::string getWorkloadRates(StatusObjectReader statusObj, - bool unknown, - std::string first, - std::string second, - bool transactionSection = false) { - // Re-point statusObj at either the transactions sub-doc or the operations sub-doc depending on transactionSection - // flag - if (transactionSection) { - if (!statusObj.get("transactions", statusObj)) - return "unknown"; - } else { - if (!statusObj.get("operations", statusObj)) - return "unknown"; - } - - std::string path = first + "." 
+ second; - double value; - if (!unknown && statusObj.get(path, value)) { - return format("%d Hz", (int)round(value)); - } - return "unknown"; -} - -void getBackupDRTags(StatusObjectReader& statusObjCluster, - const char* context, - std::map& tagMap) { - std::string path = format("layers.%s.tags", context); - StatusObjectReader tags; - if (statusObjCluster.tryGet(path, tags)) { - for (auto itr : tags.obj()) { - JSONDoc tag(itr.second); - bool running = false; - tag.tryGet("running_backup", running); - if (running) { - std::string uid; - if (tag.tryGet("mutation_stream_id", uid)) { - tagMap[itr.first] = uid; - } else { - tagMap[itr.first] = ""; - } - } - } - } -} - -std::string logBackupDR(const char* context, std::map const& tagMap) { - std::string outputString = ""; - if (tagMap.size() > 0) { - outputString += format("\n\n%s:", context); - for (auto itr : tagMap) { - outputString += format("\n %-22s", itr.first.c_str()); - if (itr.second.size() > 0) { - outputString += format(" - %s", itr.second.c_str()); - } - } - } - - return outputString; -} - -int getNumofNonExcludedMachines(StatusObjectReader statusObjCluster) { - StatusObjectReader machineMap; - int numOfNonExcludedMachines = 0; - if (statusObjCluster.get("machines", machineMap)) { - for (auto mach : machineMap.obj()) { - StatusObjectReader machine(mach.second); - if (machine.has("excluded") && !machine.last().get_bool()) - numOfNonExcludedMachines++; - } - } - return numOfNonExcludedMachines; -} - -std::pair getNumOfNonExcludedProcessAndZones(StatusObjectReader statusObjCluster) { - StatusObjectReader processesMap; - std::set zones; - int numOfNonExcludedProcesses = 0; - if (statusObjCluster.get("processes", processesMap)) { - for (auto proc : processesMap.obj()) { - StatusObjectReader process(proc.second); - if (process.has("excluded") && process.last().get_bool()) - continue; - numOfNonExcludedProcesses++; - std::string zoneId; - if (process.get("locality.zoneid", zoneId)) { - zones.insert(zoneId); - } - 
} - } - return { numOfNonExcludedProcesses, zones.size() }; -} - -void printStatus(StatusObjectReader statusObj, - StatusClient::StatusLevel level, - bool displayDatabaseAvailable = true, - bool hideErrorMessages = false) { - if (FlowTransport::transport().incompatibleOutgoingConnectionsPresent()) { - fprintf( - stderr, - "WARNING: One or more of the processes in the cluster is incompatible with this version of fdbcli.\n\n"); - } - - try { - bool printedCoordinators = false; - - // status or status details - if (level == StatusClient::NORMAL || level == StatusClient::DETAILED) { - - StatusObjectReader statusObjClient; - statusObj.get("client", statusObjClient); - - // The way the output string is assembled is to add new line character before addition to the string rather - // than after - std::string outputString = ""; - std::string clusterFilePath; - if (statusObjClient.get("cluster_file.path", clusterFilePath)) - outputString = format("Using cluster file `%s'.\n", clusterFilePath.c_str()); - else - outputString = "Using unknown cluster file.\n"; - - StatusObjectReader statusObjCoordinators; - StatusArray coordinatorsArr; - - if (statusObjClient.get("coordinators", statusObjCoordinators)) { - // Look for a second "coordinators", under the first one. - if (statusObjCoordinators.has("coordinators")) - coordinatorsArr = statusObjCoordinators.last().get_array(); - } - - // Check if any coordination servers are unreachable - bool quorum_reachable; - if (statusObjCoordinators.get("quorum_reachable", quorum_reachable) && !quorum_reachable) { - outputString += "\nCould not communicate with a quorum of coordination servers:"; - outputString += getCoordinatorsInfoString(statusObj); - - printf("%s\n", outputString.c_str()); - return; - } else { - for (StatusObjectReader coor : coordinatorsArr) { - bool reachable; - if (coor.get("reachable", reachable) && !reachable) { - outputString += "\nCould not communicate with all of the coordination servers." 
- "\n The database will remain operational as long as we" - "\n can connect to a quorum of servers, however the fault" - "\n tolerance of the system is reduced as long as the" - "\n servers remain disconnected.\n"; - outputString += getCoordinatorsInfoString(statusObj); - outputString += "\n"; - printedCoordinators = true; - break; - } - } - } - - // print any client messages - if (statusObjClient.has("messages")) { - for (StatusObjectReader message : statusObjClient.last().get_array()) { - std::string desc; - if (message.get("description", desc)) - outputString += "\n" + lineWrap(desc.c_str(), 80); - } - } - - bool fatalRecoveryState = false; - StatusObjectReader statusObjCluster; - try { - if (statusObj.get("cluster", statusObjCluster)) { - - StatusObjectReader recoveryState; - if (statusObjCluster.get("recovery_state", recoveryState)) { - std::string name; - std::string description; - if (recoveryState.get("name", name) && recoveryState.get("description", description) && - name != "accepting_commits" && name != "all_logs_recruited" && - name != "storage_recovered" && name != "fully_recovered") { - fatalRecoveryState = true; - - if (name == "recruiting_transaction_servers") { - description += - format("\nNeed at least %d log servers across unique zones, %d commit proxies, " - "%d GRV proxies and %d resolvers.", - recoveryState["required_logs"].get_int(), - recoveryState["required_commit_proxies"].get_int(), - recoveryState["required_grv_proxies"].get_int(), - recoveryState["required_resolvers"].get_int()); - if (statusObjCluster.has("machines") && statusObjCluster.has("processes")) { - auto numOfNonExcludedProcessesAndZones = - getNumOfNonExcludedProcessAndZones(statusObjCluster); - description += - format("\nHave %d non-excluded processes on %d machines across %d zones.", - numOfNonExcludedProcessesAndZones.first, - getNumofNonExcludedMachines(statusObjCluster), - numOfNonExcludedProcessesAndZones.second); - } - } else if (name == 
"locking_old_transaction_servers" && - recoveryState["missing_logs"].get_str().size()) { - description += format("\nNeed one or more of the following log servers: %s", - recoveryState["missing_logs"].get_str().c_str()); - } - description = lineWrap(description.c_str(), 80); - if (!printedCoordinators && - (name == "reading_coordinated_state" || name == "locking_coordinated_state" || - name == "configuration_never_created" || name == "writing_coordinated_state")) { - description += getCoordinatorsInfoString(statusObj); - description += "\n"; - printedCoordinators = true; - } - - outputString += "\n" + description; - } - } - } - } catch (std::runtime_error&) { - } - - // Check if cluster controllable is reachable - try { - // print any cluster messages - if (statusObjCluster.has("messages") && statusObjCluster.last().get_array().size()) { - - // any messages we don't want to display - std::set skipMsgs = { "unreachable_process", "" }; - if (fatalRecoveryState) { - skipMsgs.insert("status_incomplete"); - skipMsgs.insert("unreadable_configuration"); - skipMsgs.insert("immediate_priority_transaction_start_probe_timeout"); - skipMsgs.insert("batch_priority_transaction_start_probe_timeout"); - skipMsgs.insert("transaction_start_probe_timeout"); - skipMsgs.insert("read_probe_timeout"); - skipMsgs.insert("commit_probe_timeout"); - } - - for (StatusObjectReader msgObj : statusObjCluster.last().get_array()) { - std::string messageName; - if (!msgObj.get("name", messageName)) { - continue; - } - if (skipMsgs.count(messageName)) { - continue; - } else if (messageName == "client_issues") { - if (msgObj.has("issues")) { - for (StatusObjectReader issue : msgObj["issues"].get_array()) { - std::string issueName; - if (!issue.get("name", issueName)) { - continue; - } - - std::string description; - if (!issue.get("description", description)) { - description = issueName; - } - - std::string countStr; - StatusArray addresses; - if (!issue.has("addresses")) { - countStr = "Some 
client(s)"; - } else { - addresses = issue["addresses"].get_array(); - countStr = format("%d client(s)", addresses.size()); - } - outputString += - format("\n%s reported: %s\n", countStr.c_str(), description.c_str()); - - if (level == StatusClient::StatusLevel::DETAILED) { - for (int i = 0; i < addresses.size() && i < 4; ++i) { - outputString += format(" %s\n", addresses[i].get_str().c_str()); - } - if (addresses.size() > 4) { - outputString += " ...\n"; - } - } - } - } - } else { - if (msgObj.has("description")) - outputString += "\n" + lineWrap(msgObj.last().get_str().c_str(), 80); - } - } - } - } catch (std::runtime_error&) { - } - - if (fatalRecoveryState) { - printf("%s", outputString.c_str()); - return; - } - - StatusObjectReader statusObjConfig; - StatusArray excludedServersArr; - Optional activePrimaryDC; - - if (statusObjCluster.has("active_primary_dc")) { - activePrimaryDC = statusObjCluster["active_primary_dc"].get_str(); - } - if (statusObjCluster.get("configuration", statusObjConfig)) { - if (statusObjConfig.has("excluded_servers")) - excludedServersArr = statusObjConfig.last().get_array(); - } - - // If there is a configuration message then there is no configuration information to display - outputString += "\nConfiguration:"; - std::string outputStringCache = outputString; - bool isOldMemory = false; - try { - // Configuration section - // FIXME: Should we suppress this if there are cluster messages implying that the database has no - // configuration? 
- - outputString += "\n Redundancy mode - "; - std::string strVal; - - if (statusObjConfig.get("redundancy_mode", strVal)) { - outputString += strVal; - } else - outputString += "unknown"; - - outputString += "\n Storage engine - "; - if (statusObjConfig.get("storage_engine", strVal)) { - if (strVal == "memory-1") { - isOldMemory = true; - } - outputString += strVal; - } else - outputString += "unknown"; - - int intVal; - outputString += "\n Coordinators - "; - if (statusObjConfig.get("coordinators_count", intVal)) { - outputString += std::to_string(intVal); - } else - outputString += "unknown"; - - if (excludedServersArr.size()) { - outputString += format("\n Exclusions - %d (type `exclude' for details)", - excludedServersArr.size()); - } - - if (statusObjConfig.get("commit_proxies", intVal)) - outputString += format("\n Desired Commit Proxies - %d", intVal); - - if (statusObjConfig.get("grv_proxies", intVal)) - outputString += format("\n Desired GRV Proxies - %d", intVal); - - if (statusObjConfig.get("resolvers", intVal)) - outputString += format("\n Desired Resolvers - %d", intVal); - - if (statusObjConfig.get("logs", intVal)) - outputString += format("\n Desired Logs - %d", intVal); - - if (statusObjConfig.get("remote_logs", intVal)) - outputString += format("\n Desired Remote Logs - %d", intVal); - - if (statusObjConfig.get("log_routers", intVal)) - outputString += format("\n Desired Log Routers - %d", intVal); - - if (statusObjConfig.get("tss_count", intVal) && intVal > 0) { - int activeTss = 0; - if (statusObjCluster.has("active_tss_count")) { - statusObjCluster.get("active_tss_count", activeTss); - } - outputString += format("\n TSS - %d/%d", activeTss, intVal); - - if (statusObjConfig.get("tss_storage_engine", strVal)) - outputString += format("\n TSS Storage Engine - %s", strVal.c_str()); - } - - outputString += "\n Usable Regions - "; - if (statusObjConfig.get("usable_regions", intVal)) { - outputString += std::to_string(intVal); - } else { - 
outputString += "unknown"; - } - - StatusArray regions; - if (statusObjConfig.has("regions")) { - outputString += "\n Regions: "; - regions = statusObjConfig["regions"].get_array(); - for (StatusObjectReader region : regions) { - bool isPrimary = false; - std::vector regionSatelliteDCs; - std::string regionDC; - for (StatusObjectReader dc : region["datacenters"].get_array()) { - if (!dc.has("satellite")) { - regionDC = dc["id"].get_str(); - if (activePrimaryDC.present() && dc["id"].get_str() == activePrimaryDC.get()) { - isPrimary = true; - } - } else if (dc["satellite"].get_int() == 1) { - regionSatelliteDCs.push_back(dc["id"].get_str()); - } - } - if (activePrimaryDC.present()) { - if (isPrimary) { - outputString += "\n Primary -"; - } else { - outputString += "\n Remote -"; - } - } else { - outputString += "\n Region -"; - } - outputString += format("\n Datacenter - %s", regionDC.c_str()); - if (regionSatelliteDCs.size() > 0) { - outputString += "\n Satellite datacenters - "; - for (int i = 0; i < regionSatelliteDCs.size(); i++) { - if (i != regionSatelliteDCs.size() - 1) { - outputString += format("%s, ", regionSatelliteDCs[i].c_str()); - } else { - outputString += format("%s", regionSatelliteDCs[i].c_str()); - } - } - } - isPrimary = false; - if (region.get("satellite_redundancy_mode", strVal)) { - outputString += format("\n Satellite Redundancy Mode - %s", strVal.c_str()); - } - if (region.get("satellite_anti_quorum", intVal)) { - outputString += format("\n Satellite Anti Quorum - %d", intVal); - } - if (region.get("satellite_logs", intVal)) { - outputString += format("\n Satellite Logs - %d", intVal); - } - if (region.get("satellite_log_policy", strVal)) { - outputString += format("\n Satellite Log Policy - %s", strVal.c_str()); - } - if (region.get("satellite_log_replicas", intVal)) { - outputString += format("\n Satellite Log Replicas - %d", intVal); - } - if (region.get("satellite_usable_dcs", intVal)) { - outputString += format("\n Satellite Usable DCs - 
%d", intVal); - } - } - } - } catch (std::runtime_error&) { - outputString = outputStringCache; - outputString += "\n Unable to retrieve configuration status"; - } - - // Cluster section - outputString += "\n\nCluster:"; - StatusObjectReader processesMap; - StatusObjectReader machinesMap; - - outputStringCache = outputString; - - bool machinesAreZones = true; - std::map zones; - try { - outputString += "\n FoundationDB processes - "; - if (statusObjCluster.get("processes", processesMap)) { - - outputString += format("%d", processesMap.obj().size()); - - int errors = 0; - int processExclusions = 0; - for (auto p : processesMap.obj()) { - StatusObjectReader process(p.second); - bool excluded = process.has("excluded") && process.last().get_bool(); - if (excluded) { - processExclusions++; - } - if (process.has("messages") && process.last().get_array().size()) { - errors++; - } - - std::string zoneId; - if (process.get("locality.zoneid", zoneId)) { - std::string machineId; - if (!process.get("locality.machineid", machineId) || machineId != zoneId) { - machinesAreZones = false; - } - int& nonExcluded = zones[zoneId]; - if (!excluded) { - nonExcluded = 1; - } - } - } - - if (errors > 0 || processExclusions) { - outputString += format(" (less %d excluded; %d with errors)", processExclusions, errors); - } - - } else - outputString += "unknown"; - - if (zones.size() > 0) { - outputString += format("\n Zones - %d", zones.size()); - int zoneExclusions = 0; - for (auto itr : zones) { - if (itr.second == 0) { - ++zoneExclusions; - } - } - if (zoneExclusions > 0) { - outputString += format(" (less %d excluded)", zoneExclusions); - } - } else { - outputString += "\n Zones - unknown"; - } - - outputString += "\n Machines - "; - if (statusObjCluster.get("machines", machinesMap)) { - outputString += format("%d", machinesMap.obj().size()); - - int machineExclusions = 0; - for (auto mach : machinesMap.obj()) { - StatusObjectReader machine(mach.second); - if (machine.has("excluded") && 
machine.last().get_bool()) - machineExclusions++; - } - - if (machineExclusions) { - outputString += format(" (less %d excluded)", machineExclusions); - } - - int64_t minMemoryAvailable = std::numeric_limits::max(); - for (auto proc : processesMap.obj()) { - StatusObjectReader process(proc.second); - int64_t availBytes; - if (process.get("memory.available_bytes", availBytes)) { - minMemoryAvailable = std::min(minMemoryAvailable, availBytes); - } - } - - if (minMemoryAvailable < std::numeric_limits::max()) { - double worstServerGb = minMemoryAvailable / (1024.0 * 1024 * 1024); - outputString += "\n Memory availability - "; - outputString += format("%.1f GB per process on machine with least available", worstServerGb); - outputString += minMemoryAvailable < 4294967296 - ? "\n >>>>> (WARNING: 4.0 GB recommended) <<<<<" - : ""; - } - - double retransCount = 0; - for (auto mach : machinesMap.obj()) { - StatusObjectReader machine(mach.second); - double hz; - if (machine.get("network.tcp_segments_retransmitted.hz", hz)) - retransCount += hz; - } - - if (retransCount > 0) { - outputString += format("\n Retransmissions rate - %d Hz", (int)round(retransCount)); - } - } else - outputString += "\n Machines - unknown"; - - StatusObjectReader faultTolerance; - if (statusObjCluster.get("fault_tolerance", faultTolerance)) { - int availLoss, dataLoss; - - if (faultTolerance.get("max_zone_failures_without_losing_availability", availLoss) && - faultTolerance.get("max_zone_failures_without_losing_data", dataLoss)) { - - outputString += "\n Fault Tolerance - "; - - int minLoss = std::min(availLoss, dataLoss); - const char* faultDomain = machinesAreZones ? "machine" : "zone"; - outputString += format("%d %ss", minLoss, faultDomain); - - if (dataLoss > availLoss) { - outputString += format(" (%d without data loss)", dataLoss); - } - - if (dataLoss == -1) { - ASSERT_WE_THINK(availLoss == -1); - outputString += format( - "\n\n Warning: the database may have data loss and availability loss. 
Please restart " - "following tlog interfaces, otherwise storage servers may never be able to catch " - "up.\n"); - StatusObjectReader logs; - if (statusObjCluster.has("logs")) { - for (StatusObjectReader logEpoch : statusObjCluster.last().get_array()) { - bool possiblyLosingData; - if (logEpoch.get("possibly_losing_data", possiblyLosingData) && - !possiblyLosingData) { - continue; - } - // Current epoch doesn't have an end version. - int64_t epoch, beginVersion, endVersion = invalidVersion; - bool current; - logEpoch.get("epoch", epoch); - logEpoch.get("begin_version", beginVersion); - logEpoch.get("end_version", endVersion); - logEpoch.get("current", current); - std::string missing_log_interfaces; - if (logEpoch.has("log_interfaces")) { - for (StatusObjectReader logInterface : logEpoch.last().get_array()) { - bool healthy; - std::string address, id; - if (logInterface.get("healthy", healthy) && !healthy) { - logInterface.get("id", id); - logInterface.get("address", address); - missing_log_interfaces += format("%s,%s ", id.c_str(), address.c_str()); - } - } - } - outputString += format( - " %s log epoch: %ld begin: %ld end: %s, missing " - "log interfaces(id,address): %s\n", - current ? "Current" : "Old", - epoch, - beginVersion, - endVersion == invalidVersion ? 
"(unknown)" : format("%ld", endVersion).c_str(), - missing_log_interfaces.c_str()); - } - } - } - } - } - - std::string serverTime = getDateInfoString(statusObjCluster, "cluster_controller_timestamp"); - if (serverTime != "") { - outputString += "\n Server time - " + serverTime; - } - } catch (std::runtime_error&) { - outputString = outputStringCache; - outputString += "\n Unable to retrieve cluster status"; - } - - StatusObjectReader statusObjData; - statusObjCluster.get("data", statusObjData); - - // Data section - outputString += "\n\nData:"; - outputStringCache = outputString; - try { - outputString += "\n Replication health - "; - - StatusObjectReader statusObjDataState; - statusObjData.get("state", statusObjDataState); - - std::string dataState; - statusObjDataState.get("name", dataState); - - std::string description = ""; - statusObjDataState.get("description", description); - - bool healthy; - if (statusObjDataState.get("healthy", healthy) && healthy) { - outputString += "Healthy" + (description != "" ? " (" + description + ")" : ""); - } else if (dataState == "missing_data") { - outputString += "UNHEALTHY" + (description != "" ? ": " + description : ""); - } else if (dataState == "healing") { - outputString += "HEALING" + (description != "" ? 
": " + description : ""); - } else if (description != "") { - outputString += description; - } else { - outputString += "unknown"; - } - - if (statusObjData.has("moving_data")) { - StatusObjectReader movingData = statusObjData.last(); - double dataInQueue, dataInFlight; - if (movingData.get("in_queue_bytes", dataInQueue) && - movingData.get("in_flight_bytes", dataInFlight)) - outputString += format("\n Moving data - %.3f GB", - ((double)dataInQueue + (double)dataInFlight) / 1e9); - } else if (dataState == "initializing") { - outputString += "\n Moving data - unknown (initializing)"; - } else { - outputString += "\n Moving data - unknown"; - } - - outputString += "\n Sum of key-value sizes - "; - - if (statusObjData.has("total_kv_size_bytes")) { - double totalDBBytes = statusObjData.last().get_int64(); - - if (totalDBBytes >= 1e12) - outputString += format("%.3f TB", (totalDBBytes / 1e12)); - - else if (totalDBBytes >= 1e9) - outputString += format("%.3f GB", (totalDBBytes / 1e9)); - - else - // no decimal points for MB - outputString += format("%d MB", (int)round(totalDBBytes / 1e6)); - } else { - outputString += "unknown"; - } - - outputString += "\n Disk space used - "; - - if (statusObjData.has("total_disk_used_bytes")) { - double totalDiskUsed = statusObjData.last().get_int64(); - - if (totalDiskUsed >= 1e12) - outputString += format("%.3f TB", (totalDiskUsed / 1e12)); - - else if (totalDiskUsed >= 1e9) - outputString += format("%.3f GB", (totalDiskUsed / 1e9)); - - else - // no decimal points for MB - outputString += format("%d MB", (int)round(totalDiskUsed / 1e6)); - } else - outputString += "unknown"; - - } catch (std::runtime_error&) { - outputString = outputStringCache; - outputString += "\n Unable to retrieve data status"; - } - - // Operating space section - outputString += "\n\nOperating space:"; - std::string operatingSpaceString = ""; - try { - int64_t val; - if (statusObjData.get("least_operating_space_bytes_storage_server", val)) - 
operatingSpaceString += format("\n Storage server - %.1f GB free on most full server", - std::max(val / 1e9, 0.0)); - - if (statusObjData.get("least_operating_space_bytes_log_server", val)) - operatingSpaceString += format("\n Log server - %.1f GB free on most full server", - std::max(val / 1e9, 0.0)); - - } catch (std::runtime_error&) { - operatingSpaceString = ""; - } - - if (operatingSpaceString.empty()) { - operatingSpaceString += "\n Unable to retrieve operating space status"; - } - outputString += operatingSpaceString; - - // Workload section - outputString += "\n\nWorkload:"; - outputStringCache = outputString; - bool foundLogAndStorage = false; - try { - // Determine which rates are unknown - StatusObjectReader statusObjWorkload; - statusObjCluster.get("workload", statusObjWorkload); - - std::string performanceLimited = ""; - bool unknownMCT = false; - bool unknownRP = false; - - // Print performance limit details if known. - try { - StatusObjectReader limit = statusObjCluster["qos.performance_limited_by"]; - std::string name = limit["name"].get_str(); - if (name != "workload") { - std::string desc = limit["description"].get_str(); - std::string serverID; - limit.get("reason_server_id", serverID); - std::string procAddr = getProcessAddressByServerID(processesMap, serverID); - performanceLimited = format("\n Performance limited by %s: %s", - (procAddr == "unknown") - ? ("server" + (serverID == "" ? "" : (" " + serverID))).c_str() - : "process", - desc.c_str()); - if (procAddr != "unknown") - performanceLimited += format("\n Most limiting process: %s", procAddr.c_str()); - } - } catch (std::exception&) { - // If anything here throws (such as for an incompatible type) ignore it. 
- } - - // display the known rates - outputString += "\n Read rate - "; - outputString += getWorkloadRates(statusObjWorkload, unknownRP, "reads", "hz"); - - outputString += "\n Write rate - "; - outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "writes", "hz"); - - outputString += "\n Transactions started - "; - outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "started", "hz", true); - - outputString += "\n Transactions committed - "; - outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "committed", "hz", true); - - outputString += "\n Conflict rate - "; - outputString += getWorkloadRates(statusObjWorkload, unknownMCT, "conflicted", "hz", true); - - outputString += unknownRP ? "" : performanceLimited; - - // display any process messages - // FIXME: Above comment is not what this code block does, it actually just looks for a specific message - // in the process map, *by description*, and adds process addresses that have it to a vector. Either - // change the comment or the code. - std::vector messagesAddrs; - for (auto proc : processesMap.obj()) { - StatusObjectReader process(proc.second); - if (process.has("roles")) { - StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); - bool storageRole = false; - bool logRole = false; - for (StatusObjectReader role : rolesArray) { - if (role["role"].get_str() == "storage") { - storageRole = true; - } else if (role["role"].get_str() == "log") { - logRole = true; - } - } - if (storageRole && logRole) { - foundLogAndStorage = true; - } - } - if (process.has("messages")) { - StatusArray processMessagesArr = process.last().get_array(); - if (processMessagesArr.size()) { - for (StatusObjectReader msg : processMessagesArr) { - std::string desc; - std::string addr; - if (msg.get("description", desc) && desc == "Unable to update cluster file." 
&& - process.get("address", addr)) { - messagesAddrs.push_back(addr); - } - } - } - } - } - if (messagesAddrs.size()) { - outputString += format("\n\n%d FoundationDB processes reported unable to update cluster file:", - messagesAddrs.size()); - for (auto msg : messagesAddrs) { - outputString += "\n " + msg; - } - } - } catch (std::runtime_error&) { - outputString = outputStringCache; - outputString += "\n Unable to retrieve workload status"; - } - - // Backup and DR section - outputString += "\n\nBackup and DR:"; - - std::map backupTags; - getBackupDRTags(statusObjCluster, "backup", backupTags); - - std::map drPrimaryTags; - getBackupDRTags(statusObjCluster, "dr_backup", drPrimaryTags); - - std::map drSecondaryTags; - getBackupDRTags(statusObjCluster, "dr_backup_dest", drSecondaryTags); - - outputString += format("\n Running backups - %d", backupTags.size()); - outputString += format("\n Running DRs - "); - - if (drPrimaryTags.size() == 0 && drSecondaryTags.size() == 0) { - outputString += format("%d", 0); - } else { - if (drPrimaryTags.size() > 0) { - outputString += format("%d as primary", drPrimaryTags.size()); - if (drSecondaryTags.size() > 0) { - outputString += ", "; - } - } - if (drSecondaryTags.size() > 0) { - outputString += format("%d as secondary", drSecondaryTags.size()); - } - } - - // status details - if (level == StatusClient::DETAILED) { - outputString += logBackupDR("Running backup tags", backupTags); - outputString += logBackupDR("Running DR tags (as primary)", drPrimaryTags); - outputString += logBackupDR("Running DR tags (as secondary)", drSecondaryTags); - - outputString += "\n\nProcess performance details:"; - outputStringCache = outputString; - try { - // constructs process performance details output - std::map workerDetails; - for (auto proc : processesMap.obj()) { - StatusObjectReader procObj(proc.second); - std::string address; - procObj.get("address", address); - - std::string line; - - NetworkAddress parsedAddress; - try { - 
parsedAddress = NetworkAddress::parse(address); - } catch (Error&) { - // Groups all invalid IP address/port pair in the end of this detail group. - line = format(" %-22s (invalid IP address or port)", address.c_str()); - IPAddress::IPAddressStore maxIp; - for (int i = 0; i < maxIp.size(); ++i) { - maxIp[i] = std::numeric_limits::type>::max(); - } - std::string& lastline = - workerDetails[NetworkAddress(IPAddress(maxIp), std::numeric_limits::max())]; - if (!lastline.empty()) - lastline.append("\n"); - lastline += line; - continue; - } - - try { - double tx = -1, rx = -1, mCPUUtil = -1; - int64_t processTotalSize; - - // Get the machine for this process - // StatusObjectReader mach = machinesMap[procObj["machine_id"].get_str()]; - StatusObjectReader mach; - if (machinesMap.get(procObj["machine_id"].get_str(), mach, false)) { - StatusObjectReader machCPU; - if (mach.get("cpu", machCPU)) { - - machCPU.get("logical_core_utilization", mCPUUtil); - - StatusObjectReader network; - if (mach.get("network", network)) { - network.get("megabits_sent.hz", tx); - network.get("megabits_received.hz", rx); - } - } - } - - procObj.get("memory.used_bytes", processTotalSize); - - StatusObjectReader procCPUObj; - procObj.get("cpu", procCPUObj); - - line = format(" %-22s (", address.c_str()); - - double usageCores; - if (procCPUObj.get("usage_cores", usageCores)) - line += format("%3.0f%% cpu;", usageCores * 100); - - line += mCPUUtil != -1 ? format("%3.0f%% machine;", mCPUUtil * 100) : ""; - line += std::min(tx, rx) != -1 ? format("%6.3f Gbps;", std::max(tx, rx) / 1000.0) : ""; - - double diskBusy; - if (procObj.get("disk.busy", diskBusy)) - line += format("%3.0f%% disk IO;", 100.0 * diskBusy); - - line += processTotalSize != -1 - ? 
format("%4.1f GB", processTotalSize / (1024.0 * 1024 * 1024)) - : ""; - - double availableBytes; - if (procObj.get("memory.available_bytes", availableBytes)) - line += format(" / %3.1f GB RAM )", availableBytes / (1024.0 * 1024 * 1024)); - else - line += " )"; - - if (procObj.has("messages")) { - for (StatusObjectReader message : procObj.last().get_array()) { - std::string desc; - if (message.get("description", desc)) { - if (message.has("type")) { - line += "\n Last logged error: " + desc; - } else { - line += "\n " + desc; - } - } - } - } - - workerDetails[parsedAddress] = line; - } - - catch (std::runtime_error&) { - std::string noMetrics = format(" %-22s (no metrics available)", address.c_str()); - workerDetails[parsedAddress] = noMetrics; - } - } - for (auto w : workerDetails) - outputString += "\n" + format("%s", w.second.c_str()); - } catch (std::runtime_error&) { - outputString = outputStringCache; - outputString += "\n Unable to retrieve process performance details"; - } - - if (!printedCoordinators) { - printedCoordinators = true; - outputString += "\n\nCoordination servers:"; - outputString += getCoordinatorsInfoString(statusObj); - } - } - - // client time - std::string clientTime = getDateInfoString(statusObjClient, "timestamp"); - if (clientTime != "") { - outputString += "\n\nClient time: " + clientTime; - } - - if (processesMap.obj().size() > 1 && isOldMemory) { - outputString += "\n\nWARNING: type `configure memory' to switch to a safer method of persisting data " - "on the transaction logs."; - } - if (processesMap.obj().size() > 9 && foundLogAndStorage) { - outputString += - "\n\nWARNING: A single process is both a transaction log and a storage server.\n For best " - "performance use dedicated disks for the transaction logs by setting process classes."; - } - - if (statusObjCluster.has("data_distribution_disabled")) { - outputString += "\n\nWARNING: Data distribution is off."; - } else { - if 
(statusObjCluster.has("data_distribution_disabled_for_ss_failures")) { - outputString += "\n\nWARNING: Data distribution is currently turned on but disabled for all " - "storage server failures."; - } - if (statusObjCluster.has("data_distribution_disabled_for_rebalance")) { - outputString += "\n\nWARNING: Data distribution is currently turned on but shard size balancing is " - "currently disabled."; - } - } - - printf("%s\n", outputString.c_str()); - } - - // status minimal - else if (level == StatusClient::MINIMAL) { - // Checking for field exsistence is not necessary here because if a field is missing there is no additional - // information that we would be able to display if we continued execution. Instead, any missing fields will - // throw and the catch will display the proper message. - try { - // If any of these throw, can't get status because the result makes no sense. - StatusObjectReader statusObjClient = statusObj["client"].get_obj(); - StatusObjectReader statusObjClientDatabaseStatus = statusObjClient["database_status"].get_obj(); - - bool available = statusObjClientDatabaseStatus["available"].get_bool(); - - // Database unavailable - if (!available) { - printf("%s", "The database is unavailable; type `status' for more information.\n"); - } else { - try { - bool healthy = statusObjClientDatabaseStatus["healthy"].get_bool(); - - // Database available without issues - if (healthy) { - if (displayDatabaseAvailable) { - printf("The database is available.\n"); - } - } else { // Database running but with issues - printf("The database is available, but has issues (type 'status' for more information).\n"); - } - } catch (std::runtime_error&) { - printf("The database is available, but has issues (type 'status' for more information).\n"); - } - } - - bool upToDate; - if (!statusObjClient.get("cluster_file.up_to_date", upToDate) || !upToDate) { - fprintf(stderr, - "WARNING: The cluster file is not up to date. 
Type 'status' for more information.\n"); - } - } catch (std::runtime_error&) { - printf("Unable to determine database state, type 'status' for more information.\n"); - } - - } - - // status JSON - else if (level == StatusClient::JSON) { - printf("%s\n", - json_spirit::write_string(json_spirit::mValue(statusObj.obj()), - json_spirit::Output_options::pretty_print) - .c_str()); - } - } catch (Error&) { - if (hideErrorMessages) - return; - if (level == StatusClient::MINIMAL) { - printf("Unable to determine database state, type 'status' for more information.\n"); - } else if (level == StatusClient::JSON) { - printf("Could not retrieve status json.\n\n"); - } else { - printf("Could not retrieve status, type 'status json' for more information.\n"); - } - } - return; -} - int printStatusFromJSON(std::string const& jsonFileName) { try { json_spirit::mValue value; @@ -1823,9 +661,27 @@ ACTOR Future timeWarning(double when, const char* msg) { return Void(); } -ACTOR Future checkStatus(Future f, Database db, bool displayDatabaseAvailable = true) { +ACTOR Future checkStatus(Future f, + Reference db, + Database localDb, + bool displayDatabaseAvailable = true) { wait(f); - StatusObject s = wait(StatusClient::statusFetcher(db)); + state Reference tr = db->createTransaction(); + state StatusObject s; + if (!tr->isValid()) { + StatusObject _s = wait(StatusClient::statusFetcher(localDb)); + s = _s; + } else { + state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); + if (!statusValue.present()) { + fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); + return Void(); + } + json_spirit::mValue mv; + json_spirit::read_string(statusValue.get().toString(), mv); + s = StatusObject(mv.get_obj()); + } printf("\n"); printStatus(s, StatusClient::MINIMAL, displayDatabaseAvailable); printf("\n"); @@ -1844,16 +700,6 @@ Future makeInterruptable(Future f) { } } -ACTOR Future 
commitTransaction(Reference tr) { - wait(makeInterruptable(tr->commit())); - auto ver = tr->getCommittedVersion(); - if (ver != invalidVersion) - printf("Committed (%" PRId64 ")\n", ver); - else - printf("Nothing to commit\n"); - return Void(); -} - ACTOR Future commitTransaction(Reference tr) { wait(makeInterruptable(safeThreadFutureToFuture(tr->commit()))); auto ver = tr->getCommittedVersion(); @@ -2629,37 +1475,19 @@ ACTOR Future createSnapshot(Database db, std::vector tokens) { return false; } -Reference getTransaction(Database db, - Reference& tr, - FdbOptions* options, - bool intrans) { - if (!tr || !intrans) { - tr = makeReference(db); - options->apply(tr); - } - - return tr; -} - -// TODO: Update the function to get rid of Database and ReadYourWritesTransaction after refactoring -// The original ReadYourWritesTransaciton handle "tr" is needed as some commands can be called inside a -// transaction and "tr" holds the pointer to the ongoing transaction object. As it's not easy to get ride of "tr" in -// one shot and we are refactoring the code to use Reference (tr2), we need to let "tr2" point to the same -// underlying transaction like "tr". Thus everytime we need to use "tr2", we first update "tr" and let "tr2" points to -// "tr1". 
"tr2" is always having the same lifetime as "tr1" -Reference getTransaction(Database db, - Reference& tr, - Reference& tr2, +// TODO: Update the function to get rid of the Database after refactoring +Reference getTransaction(Reference db, + Reference& tr, FdbOptions* options, bool intrans) { // Update "tr" to point to a brand new transaction object when it's not initialized or "intrans" flag is "false", // which indicates we need a new transaction object if (!tr || !intrans) { - tr = makeReference(db); + tr = db->createTransaction(); options->apply(tr); } - tr2 = Reference(new ThreadSafeTransaction(tr.getPtr())); - return tr2; + + return tr; } std::string newCompletion(const char* base, const char* name) { @@ -3103,11 +1931,9 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state LineNoise& linenoise = *plinenoise; state bool intrans = false; - state Database db; - state Reference tr; - // TODO: refactoring work, will replace db, tr when we have all commands through the general fdb interface - state Reference db2; - state Reference tr2; + state Database localDb; + state Reference db; + state Reference tr; state bool writeMode = false; @@ -3135,19 +1961,11 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { TraceEvent::setNetworkThread(); try { - db = Database::createDatabase(ccf, -1, IsInternal::False); + localDb = Database::createDatabase(ccf, -1, IsInternal::False); if (!opt.exec.present()) { printf("Using cluster file `%s'.\n", ccf->getFilename().c_str()); } - } catch (Error& e) { - fprintf(stderr, "ERROR: %s (%d)\n", e.what(), e.code()); - printf("Unable to connect to cluster from `%s'\n", ccf->getFilename().c_str()); - return 1; - } - - // Note: refactoring work, will remove the above code finally - try { - db2 = API->createDatabase(opt.clusterFile.c_str()); + db = API->createDatabase(opt.clusterFile.c_str()); } catch (Error& e) { fprintf(stderr, "ERROR: %s (%d)\n", e.what(), e.code()); printf("Unable to connect to cluster from `%s'\n", 
ccf->getFilename().c_str()); @@ -3168,9 +1986,31 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { .trackLatest("ProgramStart"); } + // Used to catch the first cluster_version_changed error when using external clients. + // With external clients, cluster_version_changed is thrown the first time a connection to the cluster is + // established. Thus, we catch it here by issuing a read version request to establish the connection. + // The 3.0 timeout is a guard to avoid waiting forever when the cli cannot talk to any coordinators. + loop { + try { + getTransaction(db, tr, options, intrans); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + wait(delay(3.0) || success(safeThreadFutureToFuture(tr->getReadVersion()))); + break; + } catch (Error& e) { + if (e.code() == error_code_cluster_version_changed) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } else { + // unexpected errors + fprintf(stderr, "ERROR: unexpected error %d while initializing the multiversion database\n", e.code()); + tr->reset(); + break; + } + } + } + if (!opt.exec.present()) { if (opt.initialStatusCheck) { - Future checkStatusF = checkStatus(Void(), db); + Future checkStatusF = checkStatus(Void(), db, localDb); wait(makeInterruptable(success(checkStatusF))); } else { printf("\n"); @@ -3208,7 +2048,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { linenoise.historyAdd(line); } - warn = checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db); + warn = checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb); try { state UID randomID = deterministicRandom()->randomUniqueID(); @@ -3308,13 +2148,12 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "waitconnected")) { - wait(makeInterruptable(db->onConnected())); + wait(makeInterruptable(localDb->onConnected())); continue; } if (tokencmp(tokens[0], "waitopen")) { - wait(success( - safeThreadFutureToFuture(getTransaction(db, 
tr, tr2, options, intrans)->getReadVersion()))); + wait(success(safeThreadFutureToFuture(getTransaction(db, tr, options, intrans)->getReadVersion()))); continue; } @@ -3339,46 +2178,26 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { // Warn at 7 seconds since status will spend as long as 5 seconds trying to read/write from the // database warn = timeWarning(7.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"); - - state StatusClient::StatusLevel level; - if (tokens.size() == 1) - level = StatusClient::NORMAL; - else if (tokens.size() == 2 && tokencmp(tokens[1], "details")) - level = StatusClient::DETAILED; - else if (tokens.size() == 2 && tokencmp(tokens[1], "minimal")) - level = StatusClient::MINIMAL; - else if (tokens.size() == 2 && tokencmp(tokens[1], "json")) - level = StatusClient::JSON; - else { - printUsage(tokens[0]); + bool _result = wait(makeInterruptable(statusCommandActor(db, localDb, tokens, opt.exec.present()))); + if (!_result) is_error = true; - continue; - } - - StatusObject s = wait(makeInterruptable(StatusClient::statusFetcher(db))); - - if (!opt.exec.present()) - printf("\n"); - printStatus(s, level); - if (!opt.exec.present()) - printf("\n"); continue; } if (tokencmp(tokens[0], "triggerddteaminfolog")) { - wait(triggerddteaminfologCommandActor(db2)); + wait(triggerddteaminfologCommandActor(db)); continue; } if (tokencmp(tokens[0], "tssq")) { - bool _result = wait(makeInterruptable(tssqCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(tssqCommandActor(db, tokens))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "configure")) { - bool err = wait(configure(db, tokens, db->getConnectionFile(), &linenoise, warn)); + bool err = wait(configure(localDb, tokens, localDb->getConnectionFile(), &linenoise, warn)); if (err) is_error = true; continue; @@ -3387,7 +2206,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (tokencmp(tokens[0], "fileconfigure")) { if (tokens.size() == 2 || 
(tokens.size() == 3 && (tokens[1] == LiteralStringRef("new") || tokens[1] == LiteralStringRef("FORCE")))) { - bool err = wait(fileConfigure(db, + bool err = wait(fileConfigure(localDb, tokens.back().toString(), tokens[1] == LiteralStringRef("new"), tokens[1] == LiteralStringRef("FORCE"))); @@ -3401,7 +2220,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "coordinators")) { - auto cs = ClusterConnectionFile(db->getConnectionFile()->getFilename()).getConnectionString(); + auto cs = ClusterConnectionFile(localDb->getConnectionFile()->getFilename()).getConnectionString(); if (tokens.size() < 2) { printf("Cluster description: %s\n", cs.clusterKeyName().toString().c_str()); printf("Cluster coordinators (%zu): %s\n", @@ -3409,7 +2228,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { describe(cs.coordinators()).c_str()); printf("Type `help coordinators' to learn how to change this information.\n"); } else { - bool err = wait(coordinators(db, tokens, cs.coordinators()[0].isTLS())); + bool err = wait(coordinators(localDb, tokens, cs.coordinators()[0].isTLS())); if (err) is_error = true; } @@ -3417,7 +2236,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "exclude")) { - bool err = wait(exclude(db, tokens, db->getConnectionFile(), warn)); + bool err = wait(exclude(localDb, tokens, localDb->getConnectionFile(), warn)); if (err) is_error = true; continue; @@ -3428,7 +2247,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printUsage(tokens[0]); is_error = true; } else { - bool err = wait(include(db, tokens)); + bool err = wait(include(localDb, tokens)); if (err) is_error = true; } @@ -3436,7 +2255,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "snapshot")) { - bool _result = wait(snapshotCommandActor(db2, tokens)); + bool _result = wait(snapshotCommandActor(db, tokens)); if (!_result) is_error = true; continue; @@ -3449,7 +2268,7 @@ 
ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } else { state UID lockUID = deterministicRandom()->randomUniqueID(); printf("Locking database with lockUID: %s\n", lockUID.toString().c_str()); - wait(makeInterruptable(lockDatabase(db, lockUID))); + wait(makeInterruptable(lockDatabase(localDb, lockUID))); printf("Database locked.\n"); } continue; @@ -3468,11 +2287,12 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { fflush(stdout); Optional input = wait(linenoise.read(format("Repeat the above passphrase if you would like to proceed:"))); - warn = checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db); + warn = + checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb); if (input.present() && input.get() == passPhrase) { UID unlockUID = UID::fromString(tokens[1].toString()); try { - wait(makeInterruptable(unlockDatabase(db, unlockUID))); + wait(makeInterruptable(unlockDatabase(localDb, unlockUID))); printf("Database unlocked.\n"); } catch (Error& e) { if (e.code() == error_code_database_locked) { @@ -3490,7 +2310,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "setclass")) { - bool _result = wait(makeInterruptable(setClassCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(setClassCommandActor(db, tokens))); if (!_result) is_error = true; continue; @@ -3506,7 +2326,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } else { activeOptions = FdbOptions(globalOptions); options = &activeOptions; - getTransaction(db, tr, tr2, options, false); + getTransaction(db, tr, options, false); intrans = true; printf("Transaction started\n"); } @@ -3521,7 +2341,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { fprintf(stderr, "ERROR: No active transaction\n"); is_error = true; } else { - wait(commitTransaction(tr2)); + wait(commitTransaction(tr)); intrans = false; options = &globalOptions; } @@ -3538,11 +2358,9 @@ 
ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { is_error = true; } else { tr->reset(); - tr2->reset(); activeOptions = FdbOptions(globalOptions); options = &activeOptions; options->apply(tr); - options->apply(tr2); printf("Transaction reset\n"); } continue; @@ -3568,8 +2386,9 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printUsage(tokens[0]); is_error = true; } else { - Optional> v = wait(makeInterruptable( - safeThreadFutureToFuture(getTransaction(db, tr, tr2, options, intrans)->get(tokens[1])))); + state ThreadFuture> valueF = + getTransaction(db, tr, options, intrans)->get(tokens[1]); + Optional> v = wait(makeInterruptable(safeThreadFutureToFuture(valueF))); if (v.present()) printf("`%s' is `%s'\n", printable(tokens[1]).c_str(), printable(v.get()).c_str()); @@ -3585,69 +2404,69 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { is_error = true; } else { Version v = wait(makeInterruptable( - safeThreadFutureToFuture(getTransaction(db, tr, tr2, options, intrans)->getReadVersion()))); + safeThreadFutureToFuture(getTransaction(db, tr, options, intrans)->getReadVersion()))); printf("%ld\n", v); } continue; } if (tokencmp(tokens[0], "advanceversion")) { - bool _result = wait(makeInterruptable(advanceVersionCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(advanceVersionCommandActor(db, tokens))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "kill")) { - getTransaction(db, tr, tr2, options, intrans); - bool _result = wait(makeInterruptable(killCommandActor(db2, tr2, tokens, &address_interface))); + getTransaction(db, tr, options, intrans); + bool _result = wait(makeInterruptable(killCommandActor(db, tr, tokens, &address_interface))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "suspend")) { - getTransaction(db, tr, tr2, options, intrans); - bool _result = wait(makeInterruptable(suspendCommandActor(db2, tr2, tokens, &address_interface))); + getTransaction(db, tr, options, 
intrans); + bool _result = wait(makeInterruptable(suspendCommandActor(db, tr, tokens, &address_interface))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "force_recovery_with_data_loss")) { - bool _result = wait(makeInterruptable(forceRecoveryWithDataLossCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(forceRecoveryWithDataLossCommandActor(db, tokens))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "maintenance")) { - bool _result = wait(makeInterruptable(maintenanceCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(maintenanceCommandActor(db, tokens))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "consistencycheck")) { - getTransaction(db, tr, tr2, options, intrans); - bool _result = wait(makeInterruptable(consistencyCheckCommandActor(tr2, tokens, intrans))); + getTransaction(db, tr, options, intrans); + bool _result = wait(makeInterruptable(consistencyCheckCommandActor(tr, tokens, intrans))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "profile")) { - getTransaction(db, tr, tr2, options, intrans); - bool _result = wait(makeInterruptable(profileCommandActor(tr2, tokens, intrans))); + getTransaction(db, tr, options, intrans); + bool _result = wait(makeInterruptable(profileCommandActor(tr, tokens, intrans))); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "expensive_data_check")) { - getTransaction(db, tr, tr2, options, intrans); + getTransaction(db, tr, options, intrans); bool _result = - wait(makeInterruptable(expensiveDataCheckCommandActor(db2, tr2, tokens, &address_interface))); + wait(makeInterruptable(expensiveDataCheckCommandActor(db, tr, tokens, &address_interface))); if (!_result) is_error = true; continue; @@ -3705,9 +2524,9 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { endKey = strinc(tokens[1]); } - RangeResult kvs = wait(makeInterruptable( - safeThreadFutureToFuture(getTransaction(db, tr, tr2, 
options, intrans) - ->getRange(KeyRangeRef(tokens[1], endKey), limit)))); + state ThreadFuture kvsF = + getTransaction(db, tr, options, intrans)->getRange(KeyRangeRef(tokens[1], endKey), limit); + RangeResult kvs = wait(makeInterruptable(safeThreadFutureToFuture(kvsF))); printf("\nRange limited to %d keys\n", limit); for (auto iter = kvs.begin(); iter < kvs.end(); iter++) { @@ -3750,11 +2569,11 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printUsage(tokens[0]); is_error = true; } else { - getTransaction(db, tr, tr2, options, intrans); - tr2->set(tokens[1], tokens[2]); + getTransaction(db, tr, options, intrans); + tr->set(tokens[1], tokens[2]); if (!intrans) { - wait(commitTransaction(tr2)); + wait(commitTransaction(tr)); } } continue; @@ -3771,11 +2590,11 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printUsage(tokens[0]); is_error = true; } else { - getTransaction(db, tr, tr2, options, intrans); - tr2->clear(tokens[1]); + getTransaction(db, tr, options, intrans); + tr->clear(tokens[1]); if (!intrans) { - wait(commitTransaction(tr2)); + wait(commitTransaction(tr)); } } continue; @@ -3792,18 +2611,18 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printUsage(tokens[0]); is_error = true; } else { - getTransaction(db, tr, tr2, options, intrans); - tr2->clear(KeyRangeRef(tokens[1], tokens[2])); + getTransaction(db, tr, options, intrans); + tr->clear(KeyRangeRef(tokens[1], tokens[2])); if (!intrans) { - wait(commitTransaction(tr2)); + wait(commitTransaction(tr)); } } continue; } if (tokencmp(tokens[0], "datadistribution")) { - bool _result = wait(makeInterruptable(dataDistributionCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(dataDistributionCommandActor(db, tokens))); if (!_result) is_error = true; continue; @@ -3853,7 +2672,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { Optional arg = (tokens.size() > 3) ? 
tokens[3] : Optional(); try { - options->setOption(tr, tr2, tokens[2], isOn, arg, intrans); + options->setOption(tr, tokens[2], isOn, arg, intrans); printf("Option %s for %s\n", isOn ? "enabled" : "disabled", intrans ? "current transaction" : "all transactions"); @@ -3868,14 +2687,14 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "throttle")) { - bool _result = wait(throttleCommandActor(db2, tokens)); + bool _result = wait(throttleCommandActor(db, tokens)); if (!_result) is_error = true; continue; } if (tokencmp(tokens[0], "cache_range")) { - bool _result = wait(makeInterruptable(cacheRangeCommandActor(db2, tokens))); + bool _result = wait(makeInterruptable(cacheRangeCommandActor(db, tokens))); if (!_result) is_error = true; continue; @@ -3896,7 +2715,6 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { intrans = false; options = &globalOptions; options->apply(tr); - options->apply(tr2); } } diff --git a/fdbcli/fdbcli.actor.h b/fdbcli/fdbcli.actor.h index 23fc862d88f..dab31df05f9 100644 --- a/fdbcli/fdbcli.actor.h +++ b/fdbcli/fdbcli.actor.h @@ -30,6 +30,7 @@ #include "fdbclient/CoordinationInterface.h" #include "fdbclient/IClientApi.h" +#include "fdbclient/StatusClient.h" #include "flow/Arena.h" #include "flow/actorcompiler.h" // This must be the last #include. 
@@ -93,6 +94,11 @@ ACTOR Future getWorkerInterfaces(Reference tr, ACTOR Future verifyAndAddInterface(std::map>* address_interface, Reference connectLock, KeyValue kv); +// print cluster status info +void printStatus(StatusObjectReader statusObj, + StatusClient::StatusLevel level, + bool displayDatabaseAvailable = true, + bool hideErrorMessages = false); // All fdbcli commands (alphabetically) // advanceversion command @@ -130,6 +136,11 @@ ACTOR Future profileCommandActor(Reference tr, std::vector setClassCommandActor(Reference db, std::vector tokens); // snapshot command ACTOR Future snapshotCommandActor(Reference db, std::vector tokens); +// status command +ACTOR Future statusCommandActor(Reference db, + Database localDb, + std::vector tokens, + bool isExecMode = false); // suspend command ACTOR Future suspendCommandActor(Reference db, Reference tr, diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index 79e45c0842b..cf304202bb6 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -92,6 +92,10 @@ class ITransaction { // used in template functions as returned Future type template using FutureT = ThreadFuture; + // internal use only, return true by default + // Only if it's a MultiVersionTransaction and the underlying transaction handler is null, + // it will return false + virtual bool isValid() { return true; } }; // An interface that represents a connection to a cluster made by a client diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index f016201f31f..61425401c2a 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -877,6 +877,11 @@ void MultiVersionTransaction::reset() { updateTransaction(); } +bool MultiVersionTransaction::isValid() { + auto tr = getTransaction(); + return tr.transaction.isValid(); +} + // MultiVersionDatabase MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api, int threadIdx, diff --git 
a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 274df7dd840..f3d7f5d8ce4 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -388,6 +388,9 @@ class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted< void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } + // return true if the underlying transaction pointer is not empty + bool isValid() override; + private: const Reference db; ThreadSpinLock lock; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index b2e5ff445f5..1590f3c199e 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -49,7 +49,9 @@ set(FDBSERVER_SRCS LocalConfiguration.h LogProtocolMessage.h LogRouter.actor.cpp + LogSystem.cpp LogSystem.h + LogSystemConfig.cpp LogSystemConfig.h LogSystemDiskQueueAdapter.actor.cpp LogSystemDiskQueueAdapter.h @@ -117,6 +119,7 @@ set(FDBSERVER_SRCS StorageMetrics.h storageserver.actor.cpp TagPartitionedLogSystem.actor.cpp + TagPartitionedLogSystem.actor.h template_fdb.h tester.actor.cpp TesterInterface.actor.h diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 06a90ad1d17..fdfcb1ace8b 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -236,10 +236,15 @@ struct RocksDBKeyValueStore : IKeyValueStore { a.done.sendError(statusToError(status)); } else { TraceEvent(SevInfo, "RocksDB").detail("Path", a.path).detail("Method", "Open"); - onMainThread([&] { - a.metrics = rocksDBMetricLogger(options.statistics, db); - return Future(true); - }).blockUntilReady(); + // The current thread and main thread are same when the code runs in simulation. + // blockUntilReady() is getting the thread into deadlock state, so avoiding the + // metric logger in simulation. 
+ if (!g_network->isSimulated()) { + onMainThread([&] { + a.metrics = rocksDBMetricLogger(options.statistics, db); + return Future(true); + }).blockUntilReady(); + } a.done.send(Void()); } } @@ -459,10 +464,12 @@ struct RocksDBKeyValueStore : IKeyValueStore { int accumulatedBytes = 0; rocksdb::Status s; auto options = getReadOptions(); - uint64_t deadlineMircos = + // TODO: Deadline option is not supported with current rocksdb verion. Re-enable the code + // below when deadline option is supported. + /* uint64_t deadlineMircos = db->GetEnv()->NowMicros() + (readRangeTimeout - (timer_monotonic() - a.startTime)) * 1000000; std::chrono::seconds deadlineSeconds(deadlineMircos / 1000000); - options.deadline = std::chrono::duration_cast(deadlineSeconds); + options.deadline = std::chrono::duration_cast(deadlineSeconds); */ // When using a prefix extractor, ensure that keys are returned in order even if they cross // a prefix boundary. options.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0); diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 7e3316b28d5..7f91aeb3b35 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -415,7 +415,7 @@ void peekMessagesFromMemory(LogRouterData* self, Tag tag, Version begin, BinaryW auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) -> bool { return l.first < r.first; }); Version currentVersion = -1; for (; it != deque.end(); ++it) { diff --git a/fdbserver/LogSystem.cpp b/fdbserver/LogSystem.cpp new file mode 100644 index 00000000000..f9863346133 --- /dev/null +++ b/fdbserver/LogSystem.cpp @@ -0,0 +1,346 @@ +/* + * LogSystem.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/LogSystem.h" + +std::string LogSet::logRouterString() { + std::string result; + for (int i = 0; i < logRouters.size(); i++) { + if (i > 0) { + result += ", "; + } + result += logRouters[i]->get().id().toString(); + } + return result; +} + +bool LogSet::hasLogRouter(UID id) const { + for (const auto& router : logRouters) { + if (router->get().id() == id) { + return true; + } + } + return false; +} + +bool LogSet::hasBackupWorker(UID id) const { + for (const auto& worker : backupWorkers) { + if (worker->get().id() == id) { + return true; + } + } + return false; +} + +std::string LogSet::logServerString() { + std::string result; + for (int i = 0; i < logServers.size(); i++) { + if (i > 0) { + result += ", "; + } + result += logServers[i]->get().id().toString(); + } + return result; +} + +void LogSet::populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) { + satelliteTagLocations.clear(); + satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags }) + 1); + + std::map server_usedBest; + std::set> used_servers; + for (int i = 0; i < tLogLocalities.size(); i++) { + used_servers.insert(std::make_pair(0, i)); + } + + Reference serverSet = Reference(new LocalityMap>()); + LocalityMap>* serverMap = (LocalityMap>*)serverSet.getPtr(); + std::vector> resultPairs; + for (int loc 
= 0; loc < satelliteTagLocations.size(); loc++) { + int team = loc; + if (loc < logRouterTags) { + team = loc + 1; + } else if (loc == logRouterTags) { + team = 0; + } + + bool teamComplete = false; + alsoServers.resize(1); + serverMap->clear(); + resultPairs.clear(); + for (auto& used_idx : used_servers) { + auto entry = serverMap->add(tLogLocalities[used_idx.second], &used_idx); + if (!resultPairs.size()) { + resultPairs.push_back(used_idx); + alsoServers[0] = entry; + } + + resultEntries.clear(); + if (serverSet->selectReplicas(tLogPolicy, alsoServers, resultEntries)) { + for (auto& entry : resultEntries) { + resultPairs.push_back(*serverMap->getObject(entry)); + } + int firstBestUsed = server_usedBest[resultPairs[0].second]; + for (int i = 1; i < resultPairs.size(); i++) { + int thisBestUsed = server_usedBest[resultPairs[i].second]; + if (thisBestUsed < firstBestUsed) { + std::swap(resultPairs[0], resultPairs[i]); + firstBestUsed = thisBestUsed; + } + } + server_usedBest[resultPairs[0].second]++; + + for (auto& res : resultPairs) { + satelliteTagLocations[team].push_back(res.second); + used_servers.erase(res); + res.first++; + used_servers.insert(res); + } + teamComplete = true; + break; + } + } + ASSERT(teamComplete); + } + + checkSatelliteTagLocations(); +} + +void LogSet::checkSatelliteTagLocations() { + std::vector usedBest; + std::vector used; + usedBest.resize(tLogLocalities.size()); + used.resize(tLogLocalities.size()); + for (auto team : satelliteTagLocations) { + usedBest[team[0]]++; + for (auto loc : team) { + used[loc]++; + } + } + + int minUsedBest = satelliteTagLocations.size(); + int maxUsedBest = 0; + for (auto i : usedBest) { + minUsedBest = std::min(minUsedBest, i); + maxUsedBest = std::max(maxUsedBest, i); + } + + int minUsed = satelliteTagLocations.size(); + int maxUsed = 0; + for (auto i : used) { + minUsed = std::min(minUsed, i); + maxUsed = std::max(maxUsed, i); + } + + bool foundDuplicate = false; + std::set> zones; + std::set> dcs; + for 
(auto& loc : tLogLocalities) { + if (zones.count(loc.zoneId())) { + foundDuplicate = true; + break; + } + zones.insert(loc.zoneId()); + dcs.insert(loc.dcId()); + } + bool moreThanOneDC = dcs.size() > 1 ? true : false; + + TraceEvent(((maxUsed - minUsed > 1) || (maxUsedBest - minUsedBest > 1)) + ? (g_network->isSimulated() && !foundDuplicate && !moreThanOneDC ? SevError : SevWarnAlways) + : SevInfo, + "CheckSatelliteTagLocations") + .detail("MinUsed", minUsed) + .detail("MaxUsed", maxUsed) + .detail("MinUsedBest", minUsedBest) + .detail("MaxUsedBest", maxUsedBest) + .detail("DuplicateZones", foundDuplicate) + .detail("NumOfDCs", dcs.size()); +} + +int LogSet::bestLocationFor(Tag tag) { + if (locality == tagLocalitySatellite) { + return satelliteTagLocations[tag == txsTag ? 0 : tag.id + 1][0]; + } + + // the following logic supports upgrades from 5.X + if (tag == txsTag) + return txsTagOld % logServers.size(); + return tag.id % logServers.size(); +} + +void LogSet::updateLocalitySet(std::vector const& localities) { + LocalityMap* logServerMap; + + logServerSet = Reference(new LocalityMap()); + logServerMap = (LocalityMap*)logServerSet.getPtr(); + + logEntryArray.clear(); + logEntryArray.reserve(localities.size()); + logIndexArray.clear(); + logIndexArray.reserve(localities.size()); + + for (int i = 0; i < localities.size(); i++) { + logIndexArray.push_back(i); + logEntryArray.push_back(logServerMap->add(localities[i], &logIndexArray.back())); + } +} + +bool LogSet::satisfiesPolicy(const std::vector& locations) { + resultEntries.clear(); + + // Run the policy, assert if unable to satify + bool result = logServerSet->selectReplicas(tLogPolicy, locations, resultEntries); + ASSERT(result); + + return resultEntries.size() == 0; +} + +void LogSet::getPushLocations(VectorRef tags, std::vector& locations, int locationOffset, bool allLocations) { + if (locality == tagLocalitySatellite) { + for (auto& t : tags) { + if (t == txsTag || t.locality == tagLocalityTxs || t.locality 
== tagLocalityLogRouter) { + for (int loc : satelliteTagLocations[t == txsTag ? 0 : t.id + 1]) { + locations.push_back(locationOffset + loc); + } + } + } + uniquify(locations); + return; + } + + newLocations.clear(); + alsoServers.clear(); + resultEntries.clear(); + + if (allLocations) { + // special handling for allLocations + TraceEvent("AllLocationsSet").log(); + for (int i = 0; i < logServers.size(); i++) { + newLocations.push_back(i); + } + } else { + for (auto& t : tags) { + if (locality == tagLocalitySpecial || t.locality == locality || t.locality < 0) { + newLocations.push_back(bestLocationFor(t)); + } + } + } + + uniquify(newLocations); + + if (newLocations.size()) + alsoServers.reserve(newLocations.size()); + + // Convert locations to the also servers + for (auto location : newLocations) { + locations.push_back(locationOffset + location); + alsoServers.push_back(logEntryArray[location]); + } + + // Run the policy, assert if unable to satify + bool result = logServerSet->selectReplicas(tLogPolicy, alsoServers, resultEntries); + ASSERT(result); + + // Add the new servers to the location array + LocalityMap* logServerMap = (LocalityMap*)logServerSet.getPtr(); + for (auto entry : resultEntries) { + locations.push_back(locationOffset + *logServerMap->getObject(entry)); + } + //TraceEvent("GetPushLocations").detail("Policy", tLogPolicy->info()) + // .detail("Results", locations.size()).detail("Selection", logServerSet->size()) + // .detail("Included", alsoServers.size()).detail("Duration", timer() - t); +} + +void LogPushData::addTxsTag() { + if (logSystem->getTLogVersion() >= TLogVersion::V4) { + next_message_tags.push_back(logSystem->getRandomTxsTag()); + } else { + next_message_tags.push_back(txsTag); + } +} + +void LogPushData::addTransactionInfo(SpanID const& context) { + TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanID + spanContext = context; + writtenLocations.clear(); +} + +void LogPushData::writeMessage(StringRef 
rawMessageWithoutLength, bool usePreviousLocations) { + if (!usePreviousLocations) { + prev_tags.clear(); + if (logSystem->hasRemoteLogs()) { + prev_tags.push_back(logSystem->getRandomRouterTag()); + } + for (auto& tag : next_message_tags) { + prev_tags.push_back(tag); + } + msg_locations.clear(); + logSystem->getPushLocations(prev_tags, msg_locations); + next_message_tags.clear(); + } + uint32_t subseq = this->subsequence++; + uint32_t msgsize = + rawMessageWithoutLength.size() + sizeof(subseq) + sizeof(uint16_t) + sizeof(Tag) * prev_tags.size(); + for (int loc : msg_locations) { + BinaryWriter& wr = messagesWriter[loc]; + wr << msgsize << subseq << uint16_t(prev_tags.size()); + for (auto& tag : prev_tags) + wr << tag; + wr.serializeBytes(rawMessageWithoutLength); + } +} + +void LogPushData::recordEmptyMessage(int loc, const Standalone& value) { + if (!isEmptyMessage[loc]) { + BinaryWriter w(AssumeVersion(g_network->protocolVersion())); + Standalone v = w.toValue(); + if (value.size() > v.size()) { + isEmptyMessage[loc] = true; + } + } +} + +float LogPushData::getEmptyMessageRatio() const { + auto count = std::count(isEmptyMessage.begin(), isEmptyMessage.end(), false); + ASSERT_WE_THINK(isEmptyMessage.size() > 0); + return 1.0 * count / isEmptyMessage.size(); +} + +bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) { + if (!FLOW_KNOBS->WRITE_TRACING_ENABLED || logSystem->getTLogVersion() < TLogVersion::V6 || + writtenLocations.count(location) != 0) { + return false; + } + + TEST(true); // Wrote SpanContextMessage to a transaction log + writtenLocations.insert(location); + + BinaryWriter& wr = messagesWriter[location]; + SpanContextMessage contextMessage(spanContext); + + int offset = wr.getLength(); + wr << uint32_t(0) << subseq << uint16_t(prev_tags.size()); + for (auto& tag : prev_tags) + wr << tag; + wr << contextMessage; + int length = wr.getLength() - offset; + *(uint32_t*)((uint8_t*)wr.getData() + offset) = length - sizeof(uint32_t); + 
return true; +} diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index fa6341f9259..557bab5ffc6 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -41,6 +41,8 @@ struct DBCoreState; struct TLogSet; struct CoreTLogSet; +struct LogPushData; +struct LocalityData; struct ConnectionResetInfo : public ReferenceCounted { double lastReset; @@ -79,256 +81,28 @@ class LogSet : NonCopyable, public ReferenceCounted { LogSet(const TLogSet& tlogSet); LogSet(const CoreTLogSet& coreSet); - std::string logRouterString() { - std::string result; - for (int i = 0; i < logRouters.size(); i++) { - if (i > 0) { - result += ", "; - } - result += logRouters[i]->get().id().toString(); - } - return result; - } + std::string logRouterString(); - bool hasLogRouter(UID id) const { - for (const auto& router : logRouters) { - if (router->get().id() == id) { - return true; - } - } - return false; - } + bool hasLogRouter(UID id) const; - bool hasBackupWorker(UID id) const { - for (const auto& worker : backupWorkers) { - if (worker->get().id() == id) { - return true; - } - } - return false; - } + bool hasBackupWorker(UID id) const; - std::string logServerString() { - std::string result; - for (int i = 0; i < logServers.size(); i++) { - if (i > 0) { - result += ", "; - } - result += logServers[i]->get().id().toString(); - } - return result; - } + std::string logServerString(); - void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) { - satelliteTagLocations.clear(); - satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags }) + 1); + void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags); - std::map server_usedBest; - std::set> used_servers; - for (int i = 0; i < tLogLocalities.size(); i++) { - used_servers.insert(std::make_pair(0, i)); - } + void checkSatelliteTagLocations(); - Reference serverSet = Reference(new LocalityMap>()); - 
LocalityMap>* serverMap = (LocalityMap>*)serverSet.getPtr(); - std::vector> resultPairs; - for (int loc = 0; loc < satelliteTagLocations.size(); loc++) { - int team = loc; - if (loc < logRouterTags) { - team = loc + 1; - } else if (loc == logRouterTags) { - team = 0; - } + int bestLocationFor(Tag tag); - bool teamComplete = false; - alsoServers.resize(1); - serverMap->clear(); - resultPairs.clear(); - for (auto& used_idx : used_servers) { - auto entry = serverMap->add(tLogLocalities[used_idx.second], &used_idx); - if (!resultPairs.size()) { - resultPairs.push_back(used_idx); - alsoServers[0] = entry; - } + void updateLocalitySet(std::vector const& localities); - resultEntries.clear(); - if (serverSet->selectReplicas(tLogPolicy, alsoServers, resultEntries)) { - for (auto& entry : resultEntries) { - resultPairs.push_back(*serverMap->getObject(entry)); - } - int firstBestUsed = server_usedBest[resultPairs[0].second]; - for (int i = 1; i < resultPairs.size(); i++) { - int thisBestUsed = server_usedBest[resultPairs[i].second]; - if (thisBestUsed < firstBestUsed) { - std::swap(resultPairs[0], resultPairs[i]); - firstBestUsed = thisBestUsed; - } - } - server_usedBest[resultPairs[0].second]++; - - for (auto& res : resultPairs) { - satelliteTagLocations[team].push_back(res.second); - used_servers.erase(res); - res.first++; - used_servers.insert(res); - } - teamComplete = true; - break; - } - } - ASSERT(teamComplete); - } - - checkSatelliteTagLocations(); - } - - void checkSatelliteTagLocations() { - std::vector usedBest; - std::vector used; - usedBest.resize(tLogLocalities.size()); - used.resize(tLogLocalities.size()); - for (auto team : satelliteTagLocations) { - usedBest[team[0]]++; - for (auto loc : team) { - used[loc]++; - } - } - - int minUsedBest = satelliteTagLocations.size(); - int maxUsedBest = 0; - for (auto i : usedBest) { - minUsedBest = std::min(minUsedBest, i); - maxUsedBest = std::max(maxUsedBest, i); - } - - int minUsed = satelliteTagLocations.size(); - int 
maxUsed = 0; - for (auto i : used) { - minUsed = std::min(minUsed, i); - maxUsed = std::max(maxUsed, i); - } - - bool foundDuplicate = false; - std::set> zones; - std::set> dcs; - for (auto& loc : tLogLocalities) { - if (zones.count(loc.zoneId())) { - foundDuplicate = true; - break; - } - zones.insert(loc.zoneId()); - dcs.insert(loc.dcId()); - } - bool moreThanOneDC = dcs.size() > 1 ? true : false; - - TraceEvent(((maxUsed - minUsed > 1) || (maxUsedBest - minUsedBest > 1)) - ? (g_network->isSimulated() && !foundDuplicate && !moreThanOneDC ? SevError : SevWarnAlways) - : SevInfo, - "CheckSatelliteTagLocations") - .detail("MinUsed", minUsed) - .detail("MaxUsed", maxUsed) - .detail("MinUsedBest", minUsedBest) - .detail("MaxUsedBest", maxUsedBest) - .detail("DuplicateZones", foundDuplicate) - .detail("NumOfDCs", dcs.size()); - } - - int bestLocationFor(Tag tag) { - if (locality == tagLocalitySatellite) { - return satelliteTagLocations[tag == txsTag ? 0 : tag.id + 1][0]; - } - - // the following logic supports upgrades from 5.X - if (tag == txsTag) - return txsTagOld % logServers.size(); - return tag.id % logServers.size(); - } - - void updateLocalitySet(std::vector const& localities) { - LocalityMap* logServerMap; - - logServerSet = Reference(new LocalityMap()); - logServerMap = (LocalityMap*)logServerSet.getPtr(); - - logEntryArray.clear(); - logEntryArray.reserve(localities.size()); - logIndexArray.clear(); - logIndexArray.reserve(localities.size()); - - for (int i = 0; i < localities.size(); i++) { - logIndexArray.push_back(i); - logEntryArray.push_back(logServerMap->add(localities[i], &logIndexArray.back())); - } - } - - bool satisfiesPolicy(const std::vector& locations) { - resultEntries.clear(); - - // Run the policy, assert if unable to satify - bool result = logServerSet->selectReplicas(tLogPolicy, locations, resultEntries); - ASSERT(result); - - return resultEntries.size() == 0; - } + bool satisfiesPolicy(const std::vector& locations); void 
getPushLocations(VectorRef tags, std::vector& locations, int locationOffset, - bool allLocations = false) { - if (locality == tagLocalitySatellite) { - for (auto& t : tags) { - if (t == txsTag || t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) { - for (int loc : satelliteTagLocations[t == txsTag ? 0 : t.id + 1]) { - locations.push_back(locationOffset + loc); - } - } - } - uniquify(locations); - return; - } - - newLocations.clear(); - alsoServers.clear(); - resultEntries.clear(); - - if (allLocations) { - // special handling for allLocations - TraceEvent("AllLocationsSet").log(); - for (int i = 0; i < logServers.size(); i++) { - newLocations.push_back(i); - } - } else { - for (auto& t : tags) { - if (locality == tagLocalitySpecial || t.locality == locality || t.locality < 0) { - newLocations.push_back(bestLocationFor(t)); - } - } - } - - uniquify(newLocations); - - if (newLocations.size()) - alsoServers.reserve(newLocations.size()); - - // Convert locations to the also servers - for (auto location : newLocations) { - locations.push_back(locationOffset + location); - alsoServers.push_back(logEntryArray[location]); - } - - // Run the policy, assert if unable to satify - bool result = logServerSet->selectReplicas(tLogPolicy, alsoServers, resultEntries); - ASSERT(result); - - // Add the new servers to the location array - LocalityMap* logServerMap = (LocalityMap*)logServerSet.getPtr(); - for (auto entry : resultEntries) { - locations.push_back(locationOffset + *logServerMap->getObject(entry)); - } - //TraceEvent("GetPushLocations").detail("Policy", tLogPolicy->info()) - // .detail("Results", locations.size()).detail("Selection", logServerSet->size()) - // .detail("Included", alsoServers.size()).detail("Duration", timer() - t); - } + bool allLocations = false); private: std::vector alsoServers, resultEntries; @@ -743,7 +517,7 @@ struct ILogSystem { Version version, Version knownCommittedVersion, Version minKnownCommittedVersion, - struct LogPushData& 
data, + LogPushData& data, SpanID const& spanContext, Optional debugID = Optional()) = 0; // Waits for the version number of the bundle (in this epoch) to be prevVersion (i.e. for all pushes ordered @@ -812,13 +586,13 @@ struct ILogSystem { static Reference fromServerDBInfo( UID const& dbgid, - struct ServerDBInfo const& db, + ServerDBInfo const& db, bool useRecoveredAt = false, Optional>> addActor = Optional>>()); static Reference fromLogSystemConfig( UID const& dbgid, - struct LocalityData const&, - struct LogSystemConfig const&, + LocalityData const&, + LogSystemConfig const&, bool excludeRemote = false, bool useRecoveredAt = false, Optional>> addActor = Optional>>()); @@ -826,9 +600,7 @@ struct ILogSystem { // reference if there isn't a fully recovered log system available. The caller can peek() the returned log system // and can push() if it has version numbers reserved for it and prevVersions - static Reference fromOldLogSystemConfig(UID const& dbgid, - struct LocalityData const&, - struct LogSystemConfig const&); + static Reference fromOldLogSystemConfig(UID const& dbgid, LocalityData const&, LogSystemConfig const&); // Constructs a new ILogSystem implementation from the old log data within a ServerDBInfo/LogSystemConfig. Might // return a null reference if there isn't a fully recovered log system available. @@ -863,7 +635,7 @@ struct ILogSystem { virtual std::map getOldEpochTagsVersionsInfo() const = 0; virtual Future> newEpoch( - struct RecruitFromConfigurationReply const& recr, + RecruitFromConfigurationReply const& recr, Future const& fRemoteWorkers, DatabaseConfiguration const& config, LogEpoch recoveryCount, @@ -948,11 +720,6 @@ struct LengthPrefixedStringRef { LengthPrefixedStringRef(uint32_t* length) : length(length) {} }; -template -struct CompareFirst { - bool operator()(T const& lhs, T const& rhs) const { return lhs.first < rhs.first; } -}; - // Structure to store serialized mutations sent from the proxy to the // transaction logs. 
The serialization repeats with the following format: // @@ -980,13 +747,7 @@ struct LogPushData : NonCopyable { isEmptyMessage = std::vector(messagesWriter.size(), false); } - void addTxsTag() { - if (logSystem->getTLogVersion() >= TLogVersion::V4) { - next_message_tags.push_back(logSystem->getRandomTxsTag()); - } else { - next_message_tags.push_back(txsTag); - } - } + void addTxsTag(); // addTag() adds a tag for the *next* message to be added void addTag(Tag tag) { next_message_tags.push_back(tag); } @@ -997,125 +758,22 @@ struct LogPushData : NonCopyable { } // Add transaction info to be written before the first mutation in the transaction. - void addTransactionInfo(SpanID const& context) { - TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanID - spanContext = context; - writtenLocations.clear(); - } + void addTransactionInfo(SpanID const& context); - void writeMessage(StringRef rawMessageWithoutLength, bool usePreviousLocations) { - if (!usePreviousLocations) { - prev_tags.clear(); - if (logSystem->hasRemoteLogs()) { - prev_tags.push_back(logSystem->getRandomRouterTag()); - } - for (auto& tag : next_message_tags) { - prev_tags.push_back(tag); - } - msg_locations.clear(); - logSystem->getPushLocations(prev_tags, msg_locations); - next_message_tags.clear(); - } - uint32_t subseq = this->subsequence++; - uint32_t msgsize = - rawMessageWithoutLength.size() + sizeof(subseq) + sizeof(uint16_t) + sizeof(Tag) * prev_tags.size(); - for (int loc : msg_locations) { - BinaryWriter& wr = messagesWriter[loc]; - wr << msgsize << subseq << uint16_t(prev_tags.size()); - for (auto& tag : prev_tags) - wr << tag; - wr.serializeBytes(rawMessageWithoutLength); - } - } + void writeMessage(StringRef rawMessageWithoutLength, bool usePreviousLocations); template - void writeTypedMessage(T const& item, bool metadataMessage = false, bool allLocations = false) { - prev_tags.clear(); - if (logSystem->hasRemoteLogs()) { - 
prev_tags.push_back(logSystem->getRandomRouterTag()); - } - for (auto& tag : next_message_tags) { - prev_tags.push_back(tag); - } - msg_locations.clear(); - logSystem->getPushLocations(prev_tags, msg_locations, allLocations); - - BinaryWriter bw(AssumeVersion(g_network->protocolVersion())); - - // Metadata messages (currently LogProtocolMessage is the only metadata - // message) should be written before span information. If this isn't a - // metadata message, make sure all locations have had transaction info - // written to them. Mutations may have different sets of tags, so it - // is necessary to check all tag locations each time a mutation is - // written. - if (!metadataMessage) { - uint32_t subseq = this->subsequence++; - bool updatedLocation = false; - for (int loc : msg_locations) { - updatedLocation = writeTransactionInfo(loc, subseq) || updatedLocation; - } - // If this message doesn't write to any new locations, the - // subsequence wasn't actually used and can be decremented. - if (!updatedLocation) { - this->subsequence--; - TEST(true); // No new SpanContextMessage written to transaction logs - ASSERT(this->subsequence > 0); - } - } else { - // When writing a metadata message, make sure transaction state has - // been reset. If you are running into this assertion, make sure - // you are calling addTransactionInfo before each transaction. 
- ASSERT(writtenLocations.size() == 0); - } - - uint32_t subseq = this->subsequence++; - bool first = true; - int firstOffset = -1, firstLength = -1; - for (int loc : msg_locations) { - BinaryWriter& wr = messagesWriter[loc]; - - if (first) { - firstOffset = wr.getLength(); - wr << uint32_t(0) << subseq << uint16_t(prev_tags.size()); - for (auto& tag : prev_tags) - wr << tag; - wr << item; - firstLength = wr.getLength() - firstOffset; - *(uint32_t*)((uint8_t*)wr.getData() + firstOffset) = firstLength - sizeof(uint32_t); - DEBUG_TAGS_AND_MESSAGE("ProxyPushLocations", - invalidVersion, - StringRef(((uint8_t*)wr.getData() + firstOffset), firstLength)) - .detail("PushLocations", msg_locations); - first = false; - } else { - BinaryWriter& from = messagesWriter[msg_locations[0]]; - wr.serializeBytes((uint8_t*)from.getData() + firstOffset, firstLength); - } - } - next_message_tags.clear(); - } + void writeTypedMessage(T const& item, bool metadataMessage = false, bool allLocations = false); Standalone getMessages(int loc) { return messagesWriter[loc].toValue(); } // Records if a tlog (specified by "loc") will receive an empty version batch message. // "value" is the message returned by getMessages() call. - void recordEmptyMessage(int loc, const Standalone& value) { - if (!isEmptyMessage[loc]) { - BinaryWriter w(AssumeVersion(g_network->protocolVersion())); - Standalone v = w.toValue(); - if (value.size() > v.size()) { - isEmptyMessage[loc] = true; - } - } - } + void recordEmptyMessage(int loc, const Standalone& value); // Returns the ratio of empty messages in this version batch. // MUST be called after getMessages() and recordEmptyMessage(). 
- float getEmptyMessageRatio() const { - auto count = std::count(isEmptyMessage.begin(), isEmptyMessage.end(), false); - ASSERT_WE_THINK(isEmptyMessage.size() > 0); - return 1.0 * count / isEmptyMessage.size(); - } + float getEmptyMessageRatio() const; private: Reference logSystem; @@ -1135,27 +793,73 @@ struct LogPushData : NonCopyable { // it has not already been written (for the current transaction). Returns // true on a successful write, and false if the location has already been // written. - bool writeTransactionInfo(int location, uint32_t subseq) { - if (!FLOW_KNOBS->WRITE_TRACING_ENABLED || logSystem->getTLogVersion() < TLogVersion::V6 || - writtenLocations.count(location) != 0) { - return false; - } + bool writeTransactionInfo(int location, uint32_t subseq); +}; + +template +void LogPushData::writeTypedMessage(T const& item, bool metadataMessage, bool allLocations) { + prev_tags.clear(); + if (logSystem->hasRemoteLogs()) { + prev_tags.push_back(logSystem->getRandomRouterTag()); + } + for (auto& tag : next_message_tags) { + prev_tags.push_back(tag); + } + msg_locations.clear(); + logSystem->getPushLocations(prev_tags, msg_locations, allLocations); - TEST(true); // Wrote SpanContextMessage to a transaction log - writtenLocations.insert(location); + BinaryWriter bw(AssumeVersion(g_network->protocolVersion())); + + // Metadata messages (currently LogProtocolMessage is the only metadata + // message) should be written before span information. If this isn't a + // metadata message, make sure all locations have had transaction info + // written to them. Mutations may have different sets of tags, so it + // is necessary to check all tag locations each time a mutation is + // written. 
+ if (!metadataMessage) { + uint32_t subseq = this->subsequence++; + bool updatedLocation = false; + for (int loc : msg_locations) { + updatedLocation = writeTransactionInfo(loc, subseq) || updatedLocation; + } + // If this message doesn't write to any new locations, the + // subsequence wasn't actually used and can be decremented. + if (!updatedLocation) { + this->subsequence--; + TEST(true); // No new SpanContextMessage written to transaction logs + ASSERT(this->subsequence > 0); + } + } else { + // When writing a metadata message, make sure transaction state has + // been reset. If you are running into this assertion, make sure + // you are calling addTransactionInfo before each transaction. + ASSERT(writtenLocations.size() == 0); + } - BinaryWriter& wr = messagesWriter[location]; - SpanContextMessage contextMessage(spanContext); + uint32_t subseq = this->subsequence++; + bool first = true; + int firstOffset = -1, firstLength = -1; + for (int loc : msg_locations) { + BinaryWriter& wr = messagesWriter[loc]; - int offset = wr.getLength(); - wr << uint32_t(0) << subseq << uint16_t(prev_tags.size()); - for (auto& tag : prev_tags) - wr << tag; - wr << contextMessage; - int length = wr.getLength() - offset; - *(uint32_t*)((uint8_t*)wr.getData() + offset) = length - sizeof(uint32_t); - return true; + if (first) { + firstOffset = wr.getLength(); + wr << uint32_t(0) << subseq << uint16_t(prev_tags.size()); + for (auto& tag : prev_tags) + wr << tag; + wr << item; + firstLength = wr.getLength() - firstOffset; + *(uint32_t*)((uint8_t*)wr.getData() + firstOffset) = firstLength - sizeof(uint32_t); + DEBUG_TAGS_AND_MESSAGE( + "ProxyPushLocations", invalidVersion, StringRef(((uint8_t*)wr.getData() + firstOffset), firstLength)) + .detail("PushLocations", msg_locations); + first = false; + } else { + BinaryWriter& from = messagesWriter[msg_locations[0]]; + wr.serializeBytes((uint8_t*)from.getData() + firstOffset, firstLength); + } } -}; + next_message_tags.clear(); +} -#endif 
+#endif // FDBSERVER_LOGSYSTEM_H diff --git a/fdbserver/LogSystemConfig.cpp b/fdbserver/LogSystemConfig.cpp new file mode 100644 index 00000000000..16bbd46551b --- /dev/null +++ b/fdbserver/LogSystemConfig.cpp @@ -0,0 +1,333 @@ +/* + * LogSystemConfig.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/LogSystemConfig.h" + +std::string TLogSet::toString() const { + return format("anti: %d replication: %d local: %d routers: %d tLogs: %s backupWorkers: %s locality: %d", + tLogWriteAntiQuorum, + tLogReplicationFactor, + isLocal, + logRouters.size(), + describe(tLogs).c_str(), + describe(backupWorkers).c_str(), + locality); +} + +bool TLogSet::operator==(const TLogSet& rhs) const { + if (tLogWriteAntiQuorum != rhs.tLogWriteAntiQuorum || tLogReplicationFactor != rhs.tLogReplicationFactor || + isLocal != rhs.isLocal || satelliteTagLocations != rhs.satelliteTagLocations || + startVersion != rhs.startVersion || tLogs.size() != rhs.tLogs.size() || locality != rhs.locality || + logRouters.size() != rhs.logRouters.size() || backupWorkers.size() != rhs.backupWorkers.size()) { + return false; + } + if ((tLogPolicy && !rhs.tLogPolicy) || (!tLogPolicy && rhs.tLogPolicy) || + (tLogPolicy && (tLogPolicy->info() != rhs.tLogPolicy->info()))) { + return false; + } + for (int j = 0; j < tLogs.size(); 
j++) { + if (tLogs[j].id() != rhs.tLogs[j].id() || tLogs[j].present() != rhs.tLogs[j].present() || + (tLogs[j].present() && + tLogs[j].interf().commit.getEndpoint().token != rhs.tLogs[j].interf().commit.getEndpoint().token)) { + return false; + } + } + for (int j = 0; j < logRouters.size(); j++) { + if (logRouters[j].id() != rhs.logRouters[j].id() || logRouters[j].present() != rhs.logRouters[j].present() || + (logRouters[j].present() && logRouters[j].interf().commit.getEndpoint().token != + rhs.logRouters[j].interf().commit.getEndpoint().token)) { + return false; + } + } + for (int j = 0; j < backupWorkers.size(); j++) { + if (backupWorkers[j].id() != rhs.backupWorkers[j].id() || + backupWorkers[j].present() != rhs.backupWorkers[j].present() || + (backupWorkers[j].present() && + backupWorkers[j].interf().getToken() != rhs.backupWorkers[j].interf().getToken())) { + return false; + } + } + return true; +} + +bool TLogSet::isEqualIds(TLogSet const& r) const { + if (tLogWriteAntiQuorum != r.tLogWriteAntiQuorum || tLogReplicationFactor != r.tLogReplicationFactor || + isLocal != r.isLocal || satelliteTagLocations != r.satelliteTagLocations || startVersion != r.startVersion || + tLogs.size() != r.tLogs.size() || locality != r.locality) { + return false; + } + if ((tLogPolicy && !r.tLogPolicy) || (!tLogPolicy && r.tLogPolicy) || + (tLogPolicy && (tLogPolicy->info() != r.tLogPolicy->info()))) { + return false; + } + for (int i = 0; i < tLogs.size(); i++) { + if (tLogs[i].id() != r.tLogs[i].id()) { + return false; + } + } + return true; +} + +bool OldTLogConf::operator==(const OldTLogConf& rhs) const { + return tLogs == rhs.tLogs && epochBegin == rhs.epochBegin && epochEnd == rhs.epochEnd && + logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && pseudoLocalities == rhs.pseudoLocalities && + epoch == rhs.epoch; +} + +bool OldTLogConf::isEqualIds(OldTLogConf const& r) const { + if (tLogs.size() != r.tLogs.size()) { + return false; + } + for (int i = 0; i < 
tLogs.size(); i++) { + if (!tLogs[i].isEqualIds(r.tLogs[i])) { + return false; + } + } + return true; +} + +std::string LogSystemConfig::toString() const { + return format("type: %d oldGenerations: %d tags: %d %s", + logSystemType, + oldTLogs.size(), + logRouterTags, + describe(tLogs).c_str()); +} + +Optional LogSystemConfig::getRemoteDcId() const { + for (int i = 0; i < tLogs.size(); i++) { + if (!tLogs[i].isLocal) { + for (int j = 0; j < tLogs[i].tLogs.size(); j++) { + if (tLogs[i].tLogs[j].present()) { + return tLogs[i].tLogs[j].interf().filteredLocality.dcId(); + } + } + } + } + return Optional(); +} + +std::vector LogSystemConfig::allLocalLogs(bool includeSatellite) const { + std::vector results; + for (int i = 0; i < tLogs.size(); i++) { + // skip satellite TLogs, if it was not needed + if (!includeSatellite && tLogs[i].locality == tagLocalitySatellite) { + continue; + } + if (tLogs[i].isLocal) { + for (int j = 0; j < tLogs[i].tLogs.size(); j++) { + if (tLogs[i].tLogs[j].present()) { + results.push_back(tLogs[i].tLogs[j].interf()); + } + } + } + } + return results; +} + +std::vector LogSystemConfig::allPresentLogs() const { + std::vector results; + for (int i = 0; i < tLogs.size(); i++) { + for (int j = 0; j < tLogs[i].tLogs.size(); j++) { + if (tLogs[i].tLogs[j].present()) { + results.push_back(tLogs[i].tLogs[j].interf()); + } + } + } + return results; +} + +std::pair LogSystemConfig::getLocalityForDcId(Optional dcId) const { + std::map matchingLocalities; + std::map allLocalities; + for (auto& tLogSet : tLogs) { + for (auto& tLog : tLogSet.tLogs) { + if (tLogSet.locality >= 0) { + if (tLog.present() && tLog.interf().filteredLocality.dcId() == dcId) { + matchingLocalities[tLogSet.locality]++; + } else { + allLocalities[tLogSet.locality]++; + } + } + } + } + + for (auto& oldLog : oldTLogs) { + for (auto& tLogSet : oldLog.tLogs) { + for (auto& tLog : tLogSet.tLogs) { + if (tLogSet.locality >= 0) { + if (tLog.present() && tLog.interf().filteredLocality.dcId() 
== dcId) { + matchingLocalities[tLogSet.locality]++; + } else { + allLocalities[tLogSet.locality]++; + } + } + } + } + } + + int8_t bestLoc = tagLocalityInvalid; + int bestLocalityCount = -1; + for (auto& it : matchingLocalities) { + if (it.second > bestLocalityCount) { + bestLoc = it.first; + bestLocalityCount = it.second; + } + } + + int8_t secondLoc = tagLocalityInvalid; + int8_t thirdLoc = tagLocalityInvalid; + int secondLocalityCount = -1; + int thirdLocalityCount = -1; + for (auto& it : allLocalities) { + if (bestLoc != it.first) { + if (it.second > secondLocalityCount) { + thirdLoc = secondLoc; + thirdLocalityCount = secondLocalityCount; + secondLoc = it.first; + secondLocalityCount = it.second; + } else if (it.second > thirdLocalityCount) { + thirdLoc = it.first; + thirdLocalityCount = it.second; + } + } + } + + if (bestLoc != tagLocalityInvalid) { + return std::make_pair(bestLoc, secondLoc); + } + return std::make_pair(secondLoc, thirdLoc); +} + +std::vector> LogSystemConfig::allSharedLogs() const { + typedef std::pair IdAddrPair; + std::vector results; + for (auto& tLogSet : tLogs) { + for (auto& tLog : tLogSet.tLogs) { + if (tLog.present()) + results.push_back(IdAddrPair(tLog.interf().getSharedTLogID(), tLog.interf().address())); + } + } + + for (auto& oldLog : oldTLogs) { + for (auto& tLogSet : oldLog.tLogs) { + for (auto& tLog : tLogSet.tLogs) { + if (tLog.present()) + results.push_back(IdAddrPair(tLog.interf().getSharedTLogID(), tLog.interf().address())); + } + } + } + uniquify(results); + // This assert depends on the fact that uniquify will sort the elements based on order + ASSERT_WE_THINK(std::unique(results.begin(), results.end(), [](IdAddrPair& x, IdAddrPair& y) { + return x.first == y.first; + }) == results.end()); + return results; +} + +bool LogSystemConfig::isEqual(LogSystemConfig const& r) const { + return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && + expectedLogSets == r.expectedLogSets && 
logRouterTags == r.logRouterTags && txsTags == r.txsTags && + recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && + pseudoLocalities == r.pseudoLocalities && epoch == r.epoch && oldestBackupEpoch == r.oldestBackupEpoch; +} + +bool LogSystemConfig::isEqualIds(LogSystemConfig const& r) const { + for (auto& i : r.tLogs) { + for (auto& j : tLogs) { + if (i.isEqualIds(j)) { + return true; + } + } + } + return false; +} + +bool LogSystemConfig::isNextGenerationOf(LogSystemConfig const& r) const { + if (!oldTLogs.size()) { + return false; + } + + for (auto& i : r.tLogs) { + for (auto& j : oldTLogs[0].tLogs) { + if (i.isEqualIds(j)) { + return true; + } + } + } + return false; +} + +bool LogSystemConfig::hasTLog(UID tid) const { + for (const auto& log : tLogs) { + if (std::count(log.tLogs.begin(), log.tLogs.end(), tid) > 0) { + return true; + } + } + for (const auto& old : oldTLogs) { + for (const auto& log : old.tLogs) { + if (std::count(log.tLogs.begin(), log.tLogs.end(), tid) > 0) { + return true; + } + } + } + return false; +} + +bool LogSystemConfig::hasLogRouter(UID rid) const { + for (const auto& log : tLogs) { + if (std::count(log.logRouters.begin(), log.logRouters.end(), rid) > 0) { + return true; + } + } + for (const auto& old : oldTLogs) { + for (const auto& log : old.tLogs) { + if (std::count(log.logRouters.begin(), log.logRouters.end(), rid) > 0) { + return true; + } + } + } + return false; +} + +bool LogSystemConfig::hasBackupWorker(UID bid) const { + for (const auto& log : tLogs) { + if (std::count(log.backupWorkers.begin(), log.backupWorkers.end(), bid) > 0) { + return true; + } + } + for (const auto& old : oldTLogs) { + for (const auto& log : old.tLogs) { + if (std::count(log.backupWorkers.begin(), log.backupWorkers.end(), bid) > 0) { + return true; + } + } + } + return false; +} + +Version LogSystemConfig::getEpochEndVersion(LogEpoch epoch) const { + for (const auto& old : oldTLogs) { + if (old.epoch == epoch) { + 
return old.epochEnd; + } + } + return invalidVersion; +} diff --git a/fdbserver/LogSystemConfig.h b/fdbserver/LogSystemConfig.h index b86fd17f5e1..1be515131d4 100644 --- a/fdbserver/LogSystemConfig.h +++ b/fdbserver/LogSystemConfig.h @@ -47,19 +47,23 @@ struct OptionalInterface { bool operator==(UID const& r) const { return ident == r; } template - void serialize(Ar& ar) { - serializer(ar, iface); - if (!iface.present()) - serializer(ar, ident); - else - ident = iface.get().id(); - } + void serialize(Ar& ar); protected: UID ident; Optional iface; }; +template +template +void OptionalInterface::serialize(Ar& ar) { + serializer(ar, iface); + if (!iface.present()) + serializer(ar, ident); + else + ident = iface.get().id(); +} + class LogSet; struct OldLogData; @@ -101,90 +105,33 @@ struct TLogSet { startVersion(invalidVersion) {} explicit TLogSet(const LogSet& rhs); - std::string toString() const { - return format("anti: %d replication: %d local: %d routers: %d tLogs: %s backupWorkers: %s locality: %d", - tLogWriteAntiQuorum, - tLogReplicationFactor, - isLocal, - logRouters.size(), - describe(tLogs).c_str(), - describe(backupWorkers).c_str(), - locality); - } + std::string toString() const; - bool operator==(const TLogSet& rhs) const { - if (tLogWriteAntiQuorum != rhs.tLogWriteAntiQuorum || tLogReplicationFactor != rhs.tLogReplicationFactor || - isLocal != rhs.isLocal || satelliteTagLocations != rhs.satelliteTagLocations || - startVersion != rhs.startVersion || tLogs.size() != rhs.tLogs.size() || locality != rhs.locality || - logRouters.size() != rhs.logRouters.size() || backupWorkers.size() != rhs.backupWorkers.size()) { - return false; - } - if ((tLogPolicy && !rhs.tLogPolicy) || (!tLogPolicy && rhs.tLogPolicy) || - (tLogPolicy && (tLogPolicy->info() != rhs.tLogPolicy->info()))) { - return false; - } - for (int j = 0; j < tLogs.size(); j++) { - if (tLogs[j].id() != rhs.tLogs[j].id() || tLogs[j].present() != rhs.tLogs[j].present() || - (tLogs[j].present() && - 
tLogs[j].interf().commit.getEndpoint().token != rhs.tLogs[j].interf().commit.getEndpoint().token)) { - return false; - } - } - for (int j = 0; j < logRouters.size(); j++) { - if (logRouters[j].id() != rhs.logRouters[j].id() || - logRouters[j].present() != rhs.logRouters[j].present() || - (logRouters[j].present() && logRouters[j].interf().commit.getEndpoint().token != - rhs.logRouters[j].interf().commit.getEndpoint().token)) { - return false; - } - } - for (int j = 0; j < backupWorkers.size(); j++) { - if (backupWorkers[j].id() != rhs.backupWorkers[j].id() || - backupWorkers[j].present() != rhs.backupWorkers[j].present() || - (backupWorkers[j].present() && - backupWorkers[j].interf().getToken() != rhs.backupWorkers[j].interf().getToken())) { - return false; - } - } - return true; - } + bool operator==(const TLogSet& rhs) const; - bool isEqualIds(TLogSet const& r) const { - if (tLogWriteAntiQuorum != r.tLogWriteAntiQuorum || tLogReplicationFactor != r.tLogReplicationFactor || - isLocal != r.isLocal || satelliteTagLocations != r.satelliteTagLocations || - startVersion != r.startVersion || tLogs.size() != r.tLogs.size() || locality != r.locality) { - return false; - } - if ((tLogPolicy && !r.tLogPolicy) || (!tLogPolicy && r.tLogPolicy) || - (tLogPolicy && (tLogPolicy->info() != r.tLogPolicy->info()))) { - return false; - } - for (int i = 0; i < tLogs.size(); i++) { - if (tLogs[i].id() != r.tLogs[i].id()) { - return false; - } - } - return true; - } + bool isEqualIds(TLogSet const& r) const; template - void serialize(Ar& ar) { - serializer(ar, - tLogs, - logRouters, - tLogWriteAntiQuorum, - tLogReplicationFactor, - tLogPolicy, - tLogLocalities, - isLocal, - locality, - startVersion, - satelliteTagLocations, - tLogVersion, - backupWorkers); - } + void serialize(Ar& ar); }; +template +void TLogSet::serialize(Ar& ar) { + serializer(ar, + tLogs, + logRouters, + tLogWriteAntiQuorum, + tLogReplicationFactor, + tLogPolicy, + tLogLocalities, + isLocal, + locality, + 
startVersion, + satelliteTagLocations, + tLogVersion, + backupWorkers); +} + struct OldTLogConf { constexpr static FileIdentifier file_identifier = 16233772; std::vector tLogs; @@ -202,23 +149,9 @@ struct OldTLogConf { return format("end: %d tags: %d %s", epochEnd, logRouterTags, describe(tLogs).c_str()); } - bool operator==(const OldTLogConf& rhs) const { - return tLogs == rhs.tLogs && epochBegin == rhs.epochBegin && epochEnd == rhs.epochEnd && - logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && - pseudoLocalities == rhs.pseudoLocalities && epoch == rhs.epoch; - } + bool operator==(const OldTLogConf& rhs) const; - bool isEqualIds(OldTLogConf const& r) const { - if (tLogs.size() != r.tLogs.size()) { - return false; - } - for (int i = 0; i < tLogs.size(); i++) { - if (!tLogs[i].isEqualIds(r.tLogs[i])) { - return false; - } - } - return true; - } + bool isEqualIds(OldTLogConf const& r) const; template void serialize(Ar& ar) { @@ -253,253 +186,53 @@ struct LogSystemConfig { : logSystemType(LogSystemType::empty), logRouterTags(0), txsTags(0), expectedLogSets(0), stopped(false), epoch(e), oldestBackupEpoch(e) {} - std::string toString() const { - return format("type: %d oldGenerations: %d tags: %d %s", - logSystemType, - oldTLogs.size(), - logRouterTags, - describe(tLogs).c_str()); - } + std::string toString() const; - Optional getRemoteDcId() const { - for (int i = 0; i < tLogs.size(); i++) { - if (!tLogs[i].isLocal) { - for (int j = 0; j < tLogs[i].tLogs.size(); j++) { - if (tLogs[i].tLogs[j].present()) { - return tLogs[i].tLogs[j].interf().filteredLocality.dcId(); - } - } - } - } - return Optional(); - } + Optional getRemoteDcId() const; - std::vector allLocalLogs(bool includeSatellite = true) const { - std::vector results; - for (int i = 0; i < tLogs.size(); i++) { - // skip satellite TLogs, if it was not needed - if (!includeSatellite && tLogs[i].locality == tagLocalitySatellite) { - continue; - } - if (tLogs[i].isLocal) { - for (int j = 0; j < 
tLogs[i].tLogs.size(); j++) { - if (tLogs[i].tLogs[j].present()) { - results.push_back(tLogs[i].tLogs[j].interf()); - } - } - } - } - return results; - } + std::vector allLocalLogs(bool includeSatellite = true) const; - std::vector allPresentLogs() const { - std::vector results; - for (int i = 0; i < tLogs.size(); i++) { - for (int j = 0; j < tLogs[i].tLogs.size(); j++) { - if (tLogs[i].tLogs[j].present()) { - results.push_back(tLogs[i].tLogs[j].interf()); - } - } - } - return results; - } + std::vector allPresentLogs() const; - std::pair getLocalityForDcId(Optional dcId) const { - std::map matchingLocalities; - std::map allLocalities; - for (auto& tLogSet : tLogs) { - for (auto& tLog : tLogSet.tLogs) { - if (tLogSet.locality >= 0) { - if (tLog.present() && tLog.interf().filteredLocality.dcId() == dcId) { - matchingLocalities[tLogSet.locality]++; - } else { - allLocalities[tLogSet.locality]++; - } - } - } - } + std::pair getLocalityForDcId(Optional dcId) const; - for (auto& oldLog : oldTLogs) { - for (auto& tLogSet : oldLog.tLogs) { - for (auto& tLog : tLogSet.tLogs) { - if (tLogSet.locality >= 0) { - if (tLog.present() && tLog.interf().filteredLocality.dcId() == dcId) { - matchingLocalities[tLogSet.locality]++; - } else { - allLocalities[tLogSet.locality]++; - } - } - } - } - } - - int8_t bestLoc = tagLocalityInvalid; - int bestLocalityCount = -1; - for (auto& it : matchingLocalities) { - if (it.second > bestLocalityCount) { - bestLoc = it.first; - bestLocalityCount = it.second; - } - } - - int8_t secondLoc = tagLocalityInvalid; - int8_t thirdLoc = tagLocalityInvalid; - int secondLocalityCount = -1; - int thirdLocalityCount = -1; - for (auto& it : allLocalities) { - if (bestLoc != it.first) { - if (it.second > secondLocalityCount) { - thirdLoc = secondLoc; - thirdLocalityCount = secondLocalityCount; - secondLoc = it.first; - secondLocalityCount = it.second; - } else if (it.second > thirdLocalityCount) { - thirdLoc = it.first; - thirdLocalityCount = it.second; - } 
- } - } - - if (bestLoc != tagLocalityInvalid) { - return std::make_pair(bestLoc, secondLoc); - } - return std::make_pair(secondLoc, thirdLoc); - } - - std::vector> allSharedLogs() const { - typedef std::pair IdAddrPair; - std::vector results; - for (auto& tLogSet : tLogs) { - for (auto& tLog : tLogSet.tLogs) { - if (tLog.present()) - results.push_back(IdAddrPair(tLog.interf().getSharedTLogID(), tLog.interf().address())); - } - } - - for (auto& oldLog : oldTLogs) { - for (auto& tLogSet : oldLog.tLogs) { - for (auto& tLog : tLogSet.tLogs) { - if (tLog.present()) - results.push_back(IdAddrPair(tLog.interf().getSharedTLogID(), tLog.interf().address())); - } - } - } - uniquify(results); - // This assert depends on the fact that uniquify will sort the elements based on order - ASSERT_WE_THINK(std::unique(results.begin(), results.end(), [](IdAddrPair& x, IdAddrPair& y) { - return x.first == y.first; - }) == results.end()); - return results; - } + std::vector> allSharedLogs() const; bool operator==(const LogSystemConfig& rhs) const { return isEqual(rhs); } - bool isEqual(LogSystemConfig const& r) const { - return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && - expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && txsTags == r.txsTags && - recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && - pseudoLocalities == r.pseudoLocalities && epoch == r.epoch && oldestBackupEpoch == r.oldestBackupEpoch; - } + bool isEqual(LogSystemConfig const& r) const; - bool isEqualIds(LogSystemConfig const& r) const { - for (auto& i : r.tLogs) { - for (auto& j : tLogs) { - if (i.isEqualIds(j)) { - return true; - } - } - } - return false; - } + bool isEqualIds(LogSystemConfig const& r) const; - bool isNextGenerationOf(LogSystemConfig const& r) const { - if (!oldTLogs.size()) { - return false; - } - - for (auto& i : r.tLogs) { - for (auto& j : oldTLogs[0].tLogs) { - if (i.isEqualIds(j)) { - return 
true; - } - } - } - return false; - } + bool isNextGenerationOf(LogSystemConfig const& r) const; - bool hasTLog(UID tid) const { - for (const auto& log : tLogs) { - if (std::count(log.tLogs.begin(), log.tLogs.end(), tid) > 0) { - return true; - } - } - for (const auto& old : oldTLogs) { - for (const auto& log : old.tLogs) { - if (std::count(log.tLogs.begin(), log.tLogs.end(), tid) > 0) { - return true; - } - } - } - return false; - } + bool hasTLog(UID tid) const; - bool hasLogRouter(UID rid) const { - for (const auto& log : tLogs) { - if (std::count(log.logRouters.begin(), log.logRouters.end(), rid) > 0) { - return true; - } - } - for (const auto& old : oldTLogs) { - for (const auto& log : old.tLogs) { - if (std::count(log.logRouters.begin(), log.logRouters.end(), rid) > 0) { - return true; - } - } - } - return false; - } + bool hasLogRouter(UID rid) const; - bool hasBackupWorker(UID bid) const { - for (const auto& log : tLogs) { - if (std::count(log.backupWorkers.begin(), log.backupWorkers.end(), bid) > 0) { - return true; - } - } - for (const auto& old : oldTLogs) { - for (const auto& log : old.tLogs) { - if (std::count(log.backupWorkers.begin(), log.backupWorkers.end(), bid) > 0) { - return true; - } - } - } - return false; - } + bool hasBackupWorker(UID bid) const; - Version getEpochEndVersion(LogEpoch epoch) const { - for (const auto& old : oldTLogs) { - if (old.epoch == epoch) { - return old.epochEnd; - } - } - return invalidVersion; - } + Version getEpochEndVersion(LogEpoch epoch) const; template - void serialize(Ar& ar) { - serializer(ar, - logSystemType, - tLogs, - logRouterTags, - oldTLogs, - expectedLogSets, - recruitmentID, - stopped, - recoveredAt, - pseudoLocalities, - txsTags, - epoch, - oldestBackupEpoch); - } + void serialize(Ar& ar); }; -#endif +template +void LogSystemConfig::serialize(Ar& ar) { + serializer(ar, + logSystemType, + tLogs, + logRouterTags, + oldTLogs, + expectedLogSets, + recruitmentID, + stopped, + recoveredAt, + 
pseudoLocalities, + txsTags, + epoch, + oldestBackupEpoch); +} + +#endif // FDBSERVER_LOGSYSTEMCONFIG_H diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 7cb4c565bff..53fa5114355 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -595,7 +595,7 @@ ACTOR Future updatePersistentData(TLogData* self, Reference logDa msg = std::upper_bound(tag->value.version_messages.begin(), tag->value.version_messages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); } } @@ -748,7 +748,7 @@ ACTOR Future updateStorage(TLogData* self) { auto it = std::lower_bound(tag->value.version_messages.begin(), tag->value.version_messages.end(), std::make_pair(prevVersion, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); for (; it != tag->value.version_messages.end() && it->first < nextVersion; ++it) { totalSize += it->second.expectedSize(); } @@ -945,7 +945,7 @@ void peekMessagesFromMemory(Reference self, auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); Version currentVersion = -1; for (; it != deque.end(); ++it) { diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 9ed12b17395..a2ddfcfdc71 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -747,7 +747,7 @@ ACTOR Future updatePersistentData(TLogData* self, Reference logDa msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); } } @@ -1187,7 +1187,7 @@ void 
peekMessagesFromMemory(Reference self, auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); Version currentVersion = -1; for (; it != deque.end(); ++it) { diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index c7a8a75184c..7c33aa66b9a 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1007,7 +1007,7 @@ ACTOR Future updatePersistentData(TLogData* self, Reference logDa msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); } } } @@ -1504,7 +1504,7 @@ void peekMessagesFromMemory(Reference self, auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) { return l.first < r.first; }); Version currentVersion = -1; for (; it != deque.end(); ++it) { diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 250ef8b8078..c40b6ac06ab 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1297,9 +1297,7 @@ void SimulationConfig::setDatacenters(const TestConfig& testConfig) { // Sets storage engine based on testConfig details void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { - // Using [0, 4) to disable the RocksDB storage engine. - // TODO: Figure out what is broken with the RocksDB engine in simulation. 
- int storage_engine_type = deterministicRandom()->randomInt(0, 4); + int storage_engine_type = deterministicRandom()->randomInt(0, 5); if (testConfig.storageEngineType.present()) { storage_engine_type = testConfig.storageEngineType.get(); } else { @@ -1337,7 +1335,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { set_config("ssd-rocksdb-experimental"); // Tests using the RocksDB engine are necessarily non-deterministic because of RocksDB // background threads. - TraceEvent(SevWarn, "RocksDBNonDeterminism") + TraceEvent(SevWarnAlways, "RocksDBNonDeterminism") .detail("Explanation", "The RocksDB storage engine is threaded and non-deterministic"); noUnseed = true; break; @@ -2179,6 +2177,13 @@ ACTOR void setupAndRun(std::string dataFolder, testConfig.storageEngineExcludeTypes.push_back(4); } + // TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine + // possibly due to running the rocksdb in single thread in simulation. + // Re-enable the backup and restore related simulation tests when the tests are passing again. + if (std::string_view(testFile).find("Backup") != std::string_view::npos) { + testConfig.storageEngineExcludeTypes.push_back(4); + } + // The RocksDB engine is not always built with the rest of fdbserver. Don't try to use it if it is not included // in the build. 
if (!rocksDBEnabled) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 64646cacb63..48339c4f831 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1027,10 +1027,11 @@ ACTOR Future updatePersistentData(TLogData* self, Reference logDa Future f = yield(TaskPriority::UpdateStorage); if (!f.isReady()) { wait(f); - msg = std::upper_bound(tagData->versionMessages.begin(), - tagData->versionMessages.end(), - std::make_pair(currentVersion, LengthPrefixedStringRef()), - CompareFirst>()); + msg = std::upper_bound( + tagData->versionMessages.begin(), + tagData->versionMessages.end(), + std::make_pair(currentVersion, LengthPrefixedStringRef()), + [](const auto& l, const auto& r) -> bool { return l.first < r.first; }); } } } @@ -1535,7 +1536,7 @@ void peekMessagesFromMemory(Reference self, auto it = std::lower_bound(deque.begin(), deque.end(), std::make_pair(begin, LengthPrefixedStringRef()), - CompareFirst>()); + [](const auto& l, const auto& r) -> bool { return l.first < r.first; }); Version currentVersion = -1; for (; it != deque.end(); ++it) { diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index f8d75e1c725..e6374cdb891 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,18 +18,8 @@ * limitations under the License. 
*/ -#include "flow/ActorCollection.h" -#include "fdbserver/LogSystem.h" -#include "fdbserver/ServerDBInfo.h" -#include "fdbserver/DBCoreState.h" -#include "fdbserver/WaitFailure.h" -#include "fdbclient/SystemData.h" -#include "fdbrpc/simulator.h" -#include "fdbrpc/Replication.h" -#include "fdbrpc/ReplicationUtils.h" -#include "fdbserver/Knobs.h" -#include "fdbserver/RecoveryState.h" -#include "fdbserver/LogProtocolMessage.h" +#include "fdbserver/TagPartitionedLogSystem.actor.h" + #include "flow/actorcompiler.h" // This must be the last #include. ACTOR Future minVersionWhenReady(Future f, std::vector> replies) { @@ -43,39 +33,6 @@ ACTOR Future minVersionWhenReady(Future f, std::vector> tLogs; - int32_t logRouterTags; - int32_t txsTags; // The number of txsTags, which may change across generations. - Version epochBegin, epochEnd; - std::set pseudoLocalities; - LogEpoch epoch; - - OldLogData() : logRouterTags(0), txsTags(0), epochBegin(0), epochEnd(0), epoch(0) {} - - // Constructor for T of OldTLogConf and OldTLogCoreData - template - explicit OldLogData(const T& conf) - : logRouterTags(conf.logRouterTags), txsTags(conf.txsTags), epochBegin(conf.epochBegin), epochEnd(conf.epochEnd), - pseudoLocalities(conf.pseudoLocalities), epoch(conf.epoch) { - tLogs.resize(conf.tLogs.size()); - for (int j = 0; j < conf.tLogs.size(); j++) { - auto logSet = makeReference(conf.tLogs[j]); - tLogs[j] = logSet; - } - } -}; - -struct LogLockInfo { - Version epochEnd; - bool isCurrent; - Reference logSet; - std::vector> replies; - - LogLockInfo() : epochEnd(std::numeric_limits::max()), isCurrent(false) {} -}; - LogSet::LogSet(const TLogSet& tLogSet) : tLogWriteAntiQuorum(tLogSet.tLogWriteAntiQuorum), tLogReplicationFactor(tLogSet.tLogReplicationFactor), tLogLocalities(tLogSet.tLogLocalities), tLogVersion(tLogSet.tLogVersion), tLogPolicy(tLogSet.tLogPolicy), @@ -152,309 +109,337 @@ OldTLogCoreData::OldTLogCoreData(const OldLogData& oldData) } } -struct TagPartitionedLogSystem final : 
ILogSystem, ReferenceCounted { - const UID dbgid; - LogSystemType logSystemType; - std::vector> tLogs; // LogSets in different locations: primary, satellite, or remote - int expectedLogSets; - int logRouterTags; - int txsTags; - UID recruitmentID; - int repopulateRegionAntiQuorum; - bool stopped; - std::set pseudoLocalities; // Represent special localities that will be mapped to tagLocalityLogRouter - const LogEpoch epoch; - LogEpoch oldestBackupEpoch; - - // new members - std::map pseudoLocalityPopVersion; - Future rejoins; - Future recoveryComplete; - Future remoteRecovery; - Future remoteRecoveryComplete; - std::vector lockResults; - AsyncVar recoveryCompleteWrittenToCoreState; - bool remoteLogsWrittenToCoreState; - bool hasRemoteServers; - AsyncTrigger backupWorkerChanged; - std::set removedBackupWorkers; // Workers that are removed before setting them. - - Optional recoverAt; - Optional recoveredAt; - Version knownCommittedVersion; - Version backupStartVersion = invalidVersion; // max(tLogs[0].startVersion, previous epochEnd). - LocalityData locality; - // For each currently running popFromLog actor, outstandingPops is - // (logID, tag)->(max popped version, durableKnownCommittedVersion). - // Why do we need durableKnownCommittedVersion? knownCommittedVersion gives the lower bound of what data - // will need to be copied into the next generation to restore the replication factor. - // Guess: It probably serves as a minimum version of what data should be on a TLog in the next generation and - // sending a pop for anything less than durableKnownCommittedVersion for the TLog will be absurd. - std::map, std::pair> outstandingPops; - - Optional>> addActor; - ActorCollection popActors; - std::vector oldLogData; // each element has the log info. in one old epoch. 
- AsyncTrigger logSystemConfigChanged; - - TagPartitionedLogSystem(UID dbgid, - LocalityData locality, - LogEpoch e, - Optional>> addActor = Optional>>()) - : dbgid(dbgid), logSystemType(LogSystemType::empty), expectedLogSets(0), logRouterTags(0), txsTags(0), - repopulateRegionAntiQuorum(0), stopped(false), epoch(e), oldestBackupEpoch(0), - recoveryCompleteWrittenToCoreState(false), remoteLogsWrittenToCoreState(false), hasRemoteServers(false), - locality(locality), addActor(addActor), popActors(false) {} - - void stopRejoins() final { rejoins = Future(); } - - void addref() final { ReferenceCounted::addref(); } - - void delref() final { ReferenceCounted::delref(); } - - std::string describe() const final { - std::string result; - for (int i = 0; i < tLogs.size(); i++) { - result += format("%d: ", i); - for (int j = 0; j < tLogs[i]->logServers.size(); j++) { - result += tLogs[i]->logServers[j]->get().id().toString() + - ((j == tLogs[i]->logServers.size() - 1) ? " " : ", "); - } - } - return result; - } - - UID getDebugID() const final { return dbgid; } - - void addPseudoLocality(int8_t locality) { - ASSERT(locality < 0); - pseudoLocalities.insert(locality); - for (uint16_t i = 0; i < logRouterTags; i++) { - pseudoLocalityPopVersion[Tag(locality, i)] = 0; - } - } - - Tag getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const final { - switch (type) { - case ProcessClass::LogRouterClass: - if (tag.locality == tagLocalityLogRouter) { - ASSERT(pseudoLocalities.count(tagLocalityLogRouterMapped) > 0); - tag.locality = tagLocalityLogRouterMapped; - } - break; +Future ILogSystem::recoverAndEndEpoch(Reference>> const& outLogSystem, + UID const& dbgid, + DBCoreState const& oldState, + FutureStream const& rejoins, + LocalityData const& locality, + bool* forceRecovery) { + return TagPartitionedLogSystem::recoverAndEndEpoch(outLogSystem, dbgid, oldState, rejoins, locality, forceRecovery); +} - case ProcessClass::BackupClass: - if (tag.locality == tagLocalityLogRouter) { - 
ASSERT(pseudoLocalities.count(tagLocalityBackup) > 0); - tag.locality = tagLocalityBackup; - } - break; +Reference ILogSystem::fromLogSystemConfig(UID const& dbgid, + struct LocalityData const& locality, + struct LogSystemConfig const& conf, + bool excludeRemote, + bool useRecoveredAt, + Optional>> addActor) { + if (conf.logSystemType == LogSystemType::empty) + return Reference(); + else if (conf.logSystemType == LogSystemType::tagPartitioned) + return TagPartitionedLogSystem::fromLogSystemConfig( + dbgid, locality, conf, excludeRemote, useRecoveredAt, addActor); + else + throw internal_error(); +} - default: // This should be an error at caller site. - break; - } - return tag; - } +Reference ILogSystem::fromOldLogSystemConfig(UID const& dbgid, + struct LocalityData const& locality, + struct LogSystemConfig const& conf) { + if (conf.logSystemType == LogSystemType::empty) + return Reference(); + else if (conf.logSystemType == LogSystemType::tagPartitioned) + return TagPartitionedLogSystem::fromOldLogSystemConfig(dbgid, locality, conf); + else + throw internal_error(); +} + +Reference ILogSystem::fromServerDBInfo(UID const& dbgid, + ServerDBInfo const& dbInfo, + bool useRecoveredAt, + Optional>> addActor) { + return fromLogSystemConfig(dbgid, dbInfo.myLocality, dbInfo.logSystemConfig, false, useRecoveredAt, addActor); +} + +void TagPartitionedLogSystem::stopRejoins() { + rejoins = Future(); +} - bool hasPseudoLocality(int8_t locality) const final { return pseudoLocalities.count(locality) > 0; } +void TagPartitionedLogSystem::addref() { + ReferenceCounted::addref(); +} - // Return the min version of all pseudoLocalities, i.e., logRouter and backupTag - Version popPseudoLocalityTag(Tag tag, Version upTo) final { - ASSERT(isPseudoLocality(tag.locality) && hasPseudoLocality(tag.locality)); +void TagPartitionedLogSystem::delref() { + ReferenceCounted::delref(); +} - Version& localityVersion = pseudoLocalityPopVersion[tag]; - localityVersion = std::max(localityVersion, 
upTo); - Version minVersion = localityVersion; - // Why do we need to use the minimum popped version among all tags? Reason: for example, - // 2 pseudo tags pop 100 or 150, respectively. It's only safe to pop min(100, 150), - // because [101,150) is needed by another pseudo tag. - for (const int8_t locality : pseudoLocalities) { - minVersion = std::min(minVersion, pseudoLocalityPopVersion[Tag(locality, tag.id)]); +std::string TagPartitionedLogSystem::describe() const { + std::string result; + for (int i = 0; i < tLogs.size(); i++) { + result += format("%d: ", i); + for (int j = 0; j < tLogs[i]->logServers.size(); j++) { + result += + tLogs[i]->logServers[j]->get().id().toString() + ((j == tLogs[i]->logServers.size() - 1) ? " " : ", "); } - // TraceEvent("TLogPopPseudoTag", dbgid).detail("Tag", tag.toString()).detail("Version", upTo).detail("PopVersion", minVersion); - return minVersion; } + return result; +} - static Future recoverAndEndEpoch(Reference>> const& outLogSystem, - UID const& dbgid, - DBCoreState const& oldState, - FutureStream const& rejoins, - LocalityData const& locality, - bool* forceRecovery) { - return epochEnd(outLogSystem, dbgid, oldState, rejoins, locality, forceRecovery); - } +UID TagPartitionedLogSystem::getDebugID() const { + return dbgid; +} - static Reference fromLogSystemConfig(UID const& dbgid, - LocalityData const& locality, - LogSystemConfig const& lsConf, - bool excludeRemote, - bool useRecoveredAt, - Optional>> addActor) { - ASSERT(lsConf.logSystemType == LogSystemType::tagPartitioned || - (lsConf.logSystemType == LogSystemType::empty && !lsConf.tLogs.size())); - // ASSERT(lsConf.epoch == epoch); //< FIXME - auto logSystem = makeReference(dbgid, locality, lsConf.epoch, addActor); +void TagPartitionedLogSystem::addPseudoLocality(int8_t locality) { + ASSERT(locality < 0); + pseudoLocalities.insert(locality); + for (uint16_t i = 0; i < logRouterTags; i++) { + pseudoLocalityPopVersion[Tag(locality, i)] = 0; + } +} - 
logSystem->tLogs.reserve(lsConf.tLogs.size()); - logSystem->expectedLogSets = lsConf.expectedLogSets; - logSystem->logRouterTags = lsConf.logRouterTags; - logSystem->txsTags = lsConf.txsTags; - logSystem->recruitmentID = lsConf.recruitmentID; - logSystem->stopped = lsConf.stopped; - if (useRecoveredAt) { - logSystem->recoveredAt = lsConf.recoveredAt; - } - logSystem->pseudoLocalities = lsConf.pseudoLocalities; - for (const TLogSet& tLogSet : lsConf.tLogs) { - if (!excludeRemote || tLogSet.isLocal) { - logSystem->tLogs.push_back(makeReference(tLogSet)); - } +Tag TagPartitionedLogSystem::getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const { + switch (type) { + case ProcessClass::LogRouterClass: + if (tag.locality == tagLocalityLogRouter) { + ASSERT(pseudoLocalities.count(tagLocalityLogRouterMapped) > 0); + tag.locality = tagLocalityLogRouterMapped; } + break; - for (const auto& oldTlogConf : lsConf.oldTLogs) { - logSystem->oldLogData.emplace_back(oldTlogConf); - //TraceEvent("BWFromLSConf") - // .detail("Epoch", logSystem->oldLogData.back().epoch) - // .detail("Version", logSystem->oldLogData.back().epochEnd); + case ProcessClass::BackupClass: + if (tag.locality == tagLocalityLogRouter) { + ASSERT(pseudoLocalities.count(tagLocalityBackup) > 0); + tag.locality = tagLocalityBackup; } + break; - logSystem->logSystemType = lsConf.logSystemType; - logSystem->oldestBackupEpoch = lsConf.oldestBackupEpoch; - return logSystem; + default: // This should be an error at caller site. + break; } + return tag; +} - static Reference fromOldLogSystemConfig(UID const& dbgid, - LocalityData const& locality, - LogSystemConfig const& lsConf) { - ASSERT(lsConf.logSystemType == LogSystemType::tagPartitioned || - (lsConf.logSystemType == LogSystemType::empty && !lsConf.tLogs.size())); - // ASSERT(lsConf.epoch == epoch); //< FIXME - const LogEpoch e = lsConf.oldTLogs.size() > 0 ? 
lsConf.oldTLogs[0].epoch : 0; - auto logSystem = makeReference(dbgid, locality, e); +bool TagPartitionedLogSystem::hasPseudoLocality(int8_t locality) const { + return pseudoLocalities.count(locality) > 0; +} - if (lsConf.oldTLogs.size()) { - for (const TLogSet& tLogSet : lsConf.oldTLogs[0].tLogs) { - logSystem->tLogs.push_back(makeReference(tLogSet)); - } - logSystem->logRouterTags = lsConf.oldTLogs[0].logRouterTags; - logSystem->txsTags = lsConf.oldTLogs[0].txsTags; - // logSystem->epochEnd = lsConf.oldTLogs[0].epochEnd; +Version TagPartitionedLogSystem::popPseudoLocalityTag(Tag tag, Version upTo) { + ASSERT(isPseudoLocality(tag.locality) && hasPseudoLocality(tag.locality)); - for (int i = 1; i < lsConf.oldTLogs.size(); i++) { - logSystem->oldLogData.emplace_back(lsConf.oldTLogs[i]); - } + Version& localityVersion = pseudoLocalityPopVersion[tag]; + localityVersion = std::max(localityVersion, upTo); + Version minVersion = localityVersion; + // Why do we need to use the minimum popped version among all tags? Reason: for example, + // 2 pseudo tags pop 100 or 150, respectively. It's only safe to pop min(100, 150), + // because [101,150) is needed by another pseudo tag. 
+ for (const int8_t locality : pseudoLocalities) { + minVersion = std::min(minVersion, pseudoLocalityPopVersion[Tag(locality, tag.id)]); + } + // TraceEvent("TLogPopPseudoTag", dbgid).detail("Tag", tag.toString()).detail("Version", upTo).detail("PopVersion", minVersion); + return minVersion; +} + +Future TagPartitionedLogSystem::recoverAndEndEpoch(Reference>> const& outLogSystem, + UID const& dbgid, + DBCoreState const& oldState, + FutureStream const& rejoins, + LocalityData const& locality, + bool* forceRecovery) { + return epochEnd(outLogSystem, dbgid, oldState, rejoins, locality, forceRecovery); +} + +Reference TagPartitionedLogSystem::fromLogSystemConfig(UID const& dbgid, + LocalityData const& locality, + LogSystemConfig const& lsConf, + bool excludeRemote, + bool useRecoveredAt, + Optional>> addActor) { + ASSERT(lsConf.logSystemType == LogSystemType::tagPartitioned || + (lsConf.logSystemType == LogSystemType::empty && !lsConf.tLogs.size())); + // ASSERT(lsConf.epoch == epoch); //< FIXME + auto logSystem = makeReference(dbgid, locality, lsConf.epoch, addActor); + + logSystem->tLogs.reserve(lsConf.tLogs.size()); + logSystem->expectedLogSets = lsConf.expectedLogSets; + logSystem->logRouterTags = lsConf.logRouterTags; + logSystem->txsTags = lsConf.txsTags; + logSystem->recruitmentID = lsConf.recruitmentID; + logSystem->stopped = lsConf.stopped; + if (useRecoveredAt) { + logSystem->recoveredAt = lsConf.recoveredAt; + } + logSystem->pseudoLocalities = lsConf.pseudoLocalities; + for (const TLogSet& tLogSet : lsConf.tLogs) { + if (!excludeRemote || tLogSet.isLocal) { + logSystem->tLogs.push_back(makeReference(tLogSet)); + } + } + + for (const auto& oldTlogConf : lsConf.oldTLogs) { + logSystem->oldLogData.emplace_back(oldTlogConf); + //TraceEvent("BWFromLSConf") + // .detail("Epoch", logSystem->oldLogData.back().epoch) + // .detail("Version", logSystem->oldLogData.back().epochEnd); + } + + logSystem->logSystemType = lsConf.logSystemType; + logSystem->oldestBackupEpoch 
= lsConf.oldestBackupEpoch; + return logSystem; +} + +Reference TagPartitionedLogSystem::fromOldLogSystemConfig(UID const& dbgid, + LocalityData const& locality, + LogSystemConfig const& lsConf) { + ASSERT(lsConf.logSystemType == LogSystemType::tagPartitioned || + (lsConf.logSystemType == LogSystemType::empty && !lsConf.tLogs.size())); + // ASSERT(lsConf.epoch == epoch); //< FIXME + const LogEpoch e = lsConf.oldTLogs.size() > 0 ? lsConf.oldTLogs[0].epoch : 0; + auto logSystem = makeReference(dbgid, locality, e); + + if (lsConf.oldTLogs.size()) { + for (const TLogSet& tLogSet : lsConf.oldTLogs[0].tLogs) { + logSystem->tLogs.push_back(makeReference(tLogSet)); } - logSystem->logSystemType = lsConf.logSystemType; - logSystem->stopped = true; - logSystem->pseudoLocalities = lsConf.pseudoLocalities; + logSystem->logRouterTags = lsConf.oldTLogs[0].logRouterTags; + logSystem->txsTags = lsConf.oldTLogs[0].txsTags; + // logSystem->epochEnd = lsConf.oldTLogs[0].epochEnd; - return logSystem; + for (int i = 1; i < lsConf.oldTLogs.size(); i++) { + logSystem->oldLogData.emplace_back(lsConf.oldTLogs[i]); + } } + logSystem->logSystemType = lsConf.logSystemType; + logSystem->stopped = true; + logSystem->pseudoLocalities = lsConf.pseudoLocalities; - // Convert TagPartitionedLogSystem to DBCoreState and override input newState as return value - void toCoreState(DBCoreState& newState) final { - if (recoveryComplete.isValid() && recoveryComplete.isError()) - throw recoveryComplete.getError(); + return logSystem; +} - if (remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isError()) - throw remoteRecoveryComplete.getError(); +void TagPartitionedLogSystem::toCoreState(DBCoreState& newState) { + if (recoveryComplete.isValid() && recoveryComplete.isError()) + throw recoveryComplete.getError(); - newState.tLogs.clear(); - newState.logRouterTags = logRouterTags; - newState.txsTags = txsTags; - newState.pseudoLocalities = pseudoLocalities; - for (const auto& t : tLogs) { - if 
(t->logServers.size()) { - newState.tLogs.emplace_back(*t); - newState.tLogs.back().tLogLocalities.clear(); - for (const auto& log : t->logServers) { - newState.tLogs.back().tLogLocalities.push_back(log->get().interf().filteredLocality); - } - } - } + if (remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isError()) + throw remoteRecoveryComplete.getError(); - newState.oldTLogData.clear(); - if (!recoveryComplete.isValid() || !recoveryComplete.isReady() || - (repopulateRegionAntiQuorum == 0 && - (!remoteRecoveryComplete.isValid() || !remoteRecoveryComplete.isReady())) || - epoch != oldestBackupEpoch) { - for (const auto& oldData : oldLogData) { - newState.oldTLogData.emplace_back(oldData); - TraceEvent("BWToCore") - .detail("Epoch", newState.oldTLogData.back().epoch) - .detail("TotalTags", newState.oldTLogData.back().logRouterTags) - .detail("BeginVersion", newState.oldTLogData.back().epochBegin) - .detail("EndVersion", newState.oldTLogData.back().epochEnd); + newState.tLogs.clear(); + newState.logRouterTags = logRouterTags; + newState.txsTags = txsTags; + newState.pseudoLocalities = pseudoLocalities; + for (const auto& t : tLogs) { + if (t->logServers.size()) { + newState.tLogs.emplace_back(*t); + newState.tLogs.back().tLogLocalities.clear(); + for (const auto& log : t->logServers) { + newState.tLogs.back().tLogLocalities.push_back(log->get().interf().filteredLocality); } } + } - newState.logSystemType = logSystemType; + newState.oldTLogData.clear(); + if (!recoveryComplete.isValid() || !recoveryComplete.isReady() || + (repopulateRegionAntiQuorum == 0 && (!remoteRecoveryComplete.isValid() || !remoteRecoveryComplete.isReady())) || + epoch != oldestBackupEpoch) { + for (const auto& oldData : oldLogData) { + newState.oldTLogData.emplace_back(oldData); + TraceEvent("BWToCore") + .detail("Epoch", newState.oldTLogData.back().epoch) + .detail("TotalTags", newState.oldTLogData.back().logRouterTags) + .detail("BeginVersion", newState.oldTLogData.back().epochBegin) 
+ .detail("EndVersion", newState.oldTLogData.back().epochEnd); + } } - bool remoteStorageRecovered() final { return remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isReady(); } + newState.logSystemType = logSystemType; +} - Future onCoreStateChanged() final { - std::vector> changes; - changes.push_back(Never()); - if (recoveryComplete.isValid() && !recoveryComplete.isReady()) { - changes.push_back(recoveryComplete); - } - if (remoteRecovery.isValid() && !remoteRecovery.isReady()) { - changes.push_back(remoteRecovery); - } - if (remoteRecoveryComplete.isValid() && !remoteRecoveryComplete.isReady()) { - changes.push_back(remoteRecoveryComplete); - } - changes.push_back(backupWorkerChanged.onTrigger()); // changes to oldestBackupEpoch - return waitForAny(changes); +bool TagPartitionedLogSystem::remoteStorageRecovered() { + return remoteRecoveryComplete.isValid() && remoteRecoveryComplete.isReady(); +} + +Future TagPartitionedLogSystem::onCoreStateChanged() { + std::vector> changes; + changes.push_back(Never()); + if (recoveryComplete.isValid() && !recoveryComplete.isReady()) { + changes.push_back(recoveryComplete); + } + if (remoteRecovery.isValid() && !remoteRecovery.isReady()) { + changes.push_back(remoteRecovery); } + if (remoteRecoveryComplete.isValid() && !remoteRecoveryComplete.isReady()) { + changes.push_back(remoteRecoveryComplete); + } + changes.push_back(backupWorkerChanged.onTrigger()); // changes to oldestBackupEpoch + return waitForAny(changes); +} - void coreStateWritten(DBCoreState const& newState) final { - if (!newState.oldTLogData.size()) { - recoveryCompleteWrittenToCoreState.set(true); - } - for (auto& t : newState.tLogs) { - if (!t.isLocal) { - TraceEvent("RemoteLogsWritten", dbgid).log(); - remoteLogsWrittenToCoreState = true; - break; - } +void TagPartitionedLogSystem::coreStateWritten(DBCoreState const& newState) { + if (!newState.oldTLogData.size()) { + recoveryCompleteWrittenToCoreState.set(true); + } + for (auto& t : 
newState.tLogs) { + if (!t.isLocal) { + TraceEvent("RemoteLogsWritten", dbgid).log(); + remoteLogsWrittenToCoreState = true; + break; } } +} - Future onError() final { return onError_internal(this); } +Future TagPartitionedLogSystem::onError() { + return onError_internal(this); +} - ACTOR static Future onError_internal(TagPartitionedLogSystem* self) { - // Never returns normally, but throws an error if the subsystem stops working - loop { - std::vector> failed; - std::vector> backupFailed(1, Never()); - std::vector> changes; +ACTOR Future TagPartitionedLogSystem::onError_internal(TagPartitionedLogSystem* self) { + // Never returns normally, but throws an error if the subsystem stops working + loop { + std::vector> failed; + std::vector> backupFailed(1, Never()); + std::vector> changes; - for (auto& it : self->tLogs) { - for (auto& t : it->logServers) { - if (t->get().present()) { - failed.push_back(waitFailureClient(t->get().interf().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } else { - changes.push_back(t->onChange()); - } + for (auto& it : self->tLogs) { + for (auto& t : it->logServers) { + if (t->get().present()) { + failed.push_back( + waitFailureClient(t->get().interf().waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } else { + changes.push_back(t->onChange()); } - for (auto& t : it->logRouters) { - if (t->get().present()) { - failed.push_back(waitFailureClient(t->get().interf().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } else { - changes.push_back(t->onChange()); + } + for (auto& t : it->logRouters) { + if (t->get().present()) { + failed.push_back( + waitFailureClient(t->get().interf().waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / 
SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } else { + changes.push_back(t->onChange()); + } + } + for (const auto& worker : it->backupWorkers) { + if (worker->get().present()) { + backupFailed.push_back( + waitFailureClient(worker->get().interf().waitFailure, + SERVER_KNOBS->BACKUP_TIMEOUT, + -SERVER_KNOBS->BACKUP_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } else { + changes.push_back(worker->onChange()); + } + } + } + + if (!self->recoveryCompleteWrittenToCoreState.get()) { + for (auto& old : self->oldLogData) { + for (auto& it : old.tLogs) { + for (auto& t : it->logRouters) { + if (t->get().present()) { + failed.push_back(waitFailureClient(t->get().interf().waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / + SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } else { + changes.push_back(t->onChange()); + } } } - for (const auto& worker : it->backupWorkers) { + // Monitor changes of backup workers for old epochs. + for (const auto& worker : old.tLogs[0]->backupWorkers) { if (worker->get().present()) { backupFailed.push_back(waitFailureClient(worker->get().interf().waitFailure, SERVER_KNOBS->BACKUP_TIMEOUT, @@ -466,487 +451,476 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCountedrecoveryCompleteWrittenToCoreState.get()) { - for (auto& old : self->oldLogData) { - for (auto& it : old.tLogs) { - for (auto& t : it->logRouters) { - if (t->get().present()) { - failed.push_back(waitFailureClient(t->get().interf().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } else { - changes.push_back(t->onChange()); - } - } - } - // Monitor changes of backup workers for old epochs. 
- for (const auto& worker : old.tLogs[0]->backupWorkers) { - if (worker->get().present()) { - backupFailed.push_back(waitFailureClient(worker->get().interf().waitFailure, - SERVER_KNOBS->BACKUP_TIMEOUT, - -SERVER_KNOBS->BACKUP_TIMEOUT / - SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } else { - changes.push_back(worker->onChange()); - } - } - } - } - - if (self->hasRemoteServers && (!self->remoteRecovery.isReady() || self->remoteRecovery.isError())) { - changes.push_back(self->remoteRecovery); - } + if (self->hasRemoteServers && (!self->remoteRecovery.isReady() || self->remoteRecovery.isError())) { + changes.push_back(self->remoteRecovery); + } - changes.push_back(self->recoveryCompleteWrittenToCoreState.onChange()); - changes.push_back(self->backupWorkerChanged.onTrigger()); + changes.push_back(self->recoveryCompleteWrittenToCoreState.onChange()); + changes.push_back(self->backupWorkerChanged.onTrigger()); - ASSERT(failed.size() >= 1); - wait(quorum(changes, 1) || tagError(quorum(failed, 1), master_tlog_failed()) || - tagError(quorum(backupFailed, 1), master_backup_worker_failed())); - } + ASSERT(failed.size() >= 1); + wait(quorum(changes, 1) || tagError(quorum(failed, 1), master_tlog_failed()) || + tagError(quorum(backupFailed, 1), master_backup_worker_failed())); } +} + +ACTOR Future TagPartitionedLogSystem::pushResetChecker(Reference self, NetworkAddress addr) { + self->slowReplies = 0; + self->fastReplies = 0; + wait(delay(SERVER_KNOBS->PUSH_STATS_INTERVAL)); + TraceEvent("SlowPushStats") + .detail("PeerAddress", addr) + .detail("SlowReplies", self->slowReplies) + .detail("FastReplies", self->fastReplies); + if (self->slowReplies >= SERVER_KNOBS->PUSH_STATS_SLOW_AMOUNT && + self->slowReplies / double(self->slowReplies + self->fastReplies) >= SERVER_KNOBS->PUSH_STATS_SLOW_RATIO) { + FlowTransport::transport().resetConnection(addr); + self->lastReset = now(); + } + return Void(); +} - ACTOR static Future pushResetChecker(Reference self, 
NetworkAddress addr) { - self->slowReplies = 0; - self->fastReplies = 0; - wait(delay(SERVER_KNOBS->PUSH_STATS_INTERVAL)); - TraceEvent("SlowPushStats") - .detail("PeerAddress", addr) - .detail("SlowReplies", self->slowReplies) - .detail("FastReplies", self->fastReplies); - if (self->slowReplies >= SERVER_KNOBS->PUSH_STATS_SLOW_AMOUNT && - self->slowReplies / double(self->slowReplies + self->fastReplies) >= SERVER_KNOBS->PUSH_STATS_SLOW_RATIO) { - FlowTransport::transport().resetConnection(addr); - self->lastReset = now(); +ACTOR Future TagPartitionedLogSystem::recordPushMetrics(Reference self, + Reference dist, + NetworkAddress addr, + Future in) { + state double startTime = now(); + TLogCommitReply t = wait(in); + if (now() - self->lastReset > SERVER_KNOBS->PUSH_RESET_INTERVAL) { + if (now() - startTime > SERVER_KNOBS->PUSH_MAX_LATENCY) { + if (self->resetCheck.isReady()) { + self->resetCheck = TagPartitionedLogSystem::pushResetChecker(self, addr); + } + self->slowReplies++; + } else { + self->fastReplies++; } - return Void(); } + dist->sampleSeconds(now() - startTime); + return t; +} - ACTOR static Future recordPushMetrics(Reference self, - Reference dist, - NetworkAddress addr, - Future in) { - state double startTime = now(); - TLogCommitReply t = wait(in); - if (now() - self->lastReset > SERVER_KNOBS->PUSH_RESET_INTERVAL) { - if (now() - startTime > SERVER_KNOBS->PUSH_MAX_LATENCY) { - if (self->resetCheck.isReady()) { - self->resetCheck = pushResetChecker(self, addr); - } - self->slowReplies++; - } else { - self->fastReplies++; - } - } - dist->sampleSeconds(now() - startTime); - return t; - } - - Future push(Version prevVersion, - Version version, - Version knownCommittedVersion, - Version minKnownCommittedVersion, - LogPushData& data, - SpanID const& spanContext, - Optional debugID) final { - // FIXME: Randomize request order as in LegacyLogSystem? 
- vector> quorumResults; - vector> allReplies; - int location = 0; - Span span("TPLS:push"_loc, spanContext); - for (auto& it : tLogs) { - if (it->isLocal && it->logServers.size()) { - if (it->connectionResetTrackers.size() == 0) { - for (int i = 0; i < it->logServers.size(); i++) { - it->connectionResetTrackers.push_back(makeReference()); - } - } - if (it->tlogPushDistTrackers.empty()) { - for (int i = 0; i < it->logServers.size(); i++) { - it->tlogPushDistTrackers.push_back( - Histogram::getHistogram("ToTlog_" + it->logServers[i]->get().interf().uniqueID.toString(), - it->logServers[i]->get().interf().address().toString(), - Histogram::Unit::microseconds)); - } - } - vector> tLogCommitResults; - for (int loc = 0; loc < it->logServers.size(); loc++) { - Standalone msg = data.getMessages(location); - data.recordEmptyMessage(location, msg); - allReplies.push_back(recordPushMetrics( - it->connectionResetTrackers[loc], - it->tlogPushDistTrackers[loc], - it->logServers[loc]->get().interf().address(), - it->logServers[loc]->get().interf().commit.getReply(TLogCommitRequest(spanContext, - msg.arena(), - prevVersion, - version, - knownCommittedVersion, - minKnownCommittedVersion, - msg, - debugID), - TaskPriority::ProxyTLogCommitReply))); - Future commitSuccess = success(allReplies.back()); - addActor.get().send(commitSuccess); - tLogCommitResults.push_back(commitSuccess); - location++; - } - quorumResults.push_back(quorum(tLogCommitResults, tLogCommitResults.size() - it->tLogWriteAntiQuorum)); +Future TagPartitionedLogSystem::push(Version prevVersion, + Version version, + Version knownCommittedVersion, + Version minKnownCommittedVersion, + LogPushData& data, + SpanID const& spanContext, + Optional debugID) { + // FIXME: Randomize request order as in LegacyLogSystem? 
+ vector> quorumResults; + vector> allReplies; + int location = 0; + Span span("TPLS:push"_loc, spanContext); + for (auto& it : tLogs) { + if (it->isLocal && it->logServers.size()) { + if (it->connectionResetTrackers.size() == 0) { + for (int i = 0; i < it->logServers.size(); i++) { + it->connectionResetTrackers.push_back(makeReference()); + } + } + if (it->tlogPushDistTrackers.empty()) { + for (int i = 0; i < it->logServers.size(); i++) { + it->tlogPushDistTrackers.push_back( + Histogram::getHistogram("ToTlog_" + it->logServers[i]->get().interf().uniqueID.toString(), + it->logServers[i]->get().interf().address().toString(), + Histogram::Unit::microseconds)); + } + } + vector> tLogCommitResults; + for (int loc = 0; loc < it->logServers.size(); loc++) { + Standalone msg = data.getMessages(location); + data.recordEmptyMessage(location, msg); + allReplies.push_back(recordPushMetrics( + it->connectionResetTrackers[loc], + it->tlogPushDistTrackers[loc], + it->logServers[loc]->get().interf().address(), + it->logServers[loc]->get().interf().commit.getReply(TLogCommitRequest(spanContext, + msg.arena(), + prevVersion, + version, + knownCommittedVersion, + minKnownCommittedVersion, + msg, + debugID), + TaskPriority::ProxyTLogCommitReply))); + Future commitSuccess = success(allReplies.back()); + addActor.get().send(commitSuccess); + tLogCommitResults.push_back(commitSuccess); + location++; + } + quorumResults.push_back(quorum(tLogCommitResults, tLogCommitResults.size() - it->tLogWriteAntiQuorum)); + } + } + + return minVersionWhenReady(waitForAll(quorumResults), allReplies); +} + +Reference TagPartitionedLogSystem::peekAll(UID dbgid, + Version begin, + Version end, + Tag tag, + bool parallelGetMore) { + int bestSet = 0; + std::vector> localSets; + Version lastBegin = 0; + bool foundSpecial = false; + for (auto& log : tLogs) { + if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { + foundSpecial = true; + } + if (log->isLocal && 
log->logServers.size() && + (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || + log->locality == tag.locality || tag == txsTag || tag.locality == tagLocalityTxs || + tag.locality == tagLocalityLogRouter || + ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { + lastBegin = std::max(lastBegin, log->startVersion); + localSets.push_back(log); + if (log->locality != tagLocalitySatellite) { + bestSet = localSets.size() - 1; } } - - return minVersionWhenReady(waitForAll(quorumResults), allReplies); } - Reference peekAll(UID dbgid, Version begin, Version end, Tag tag, bool parallelGetMore) { - int bestSet = 0; - std::vector> localSets; - Version lastBegin = 0; - bool foundSpecial = false; - for (auto& log : tLogs) { - if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { - foundSpecial = true; - } - if (log->isLocal && log->logServers.size() && - (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || - log->locality == tag.locality || tag == txsTag || tag.locality == tagLocalityTxs || - tag.locality == tagLocalityLogRouter || - ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { - lastBegin = std::max(lastBegin, log->startVersion); - localSets.push_back(log); - if (log->locality != tagLocalitySatellite) { - bestSet = localSets.size() - 1; - } - } - } + if (!localSets.size()) { + lastBegin = end; + } - if (!localSets.size()) { - lastBegin = end; - } + if (begin >= lastBegin && localSets.size()) { + TraceEvent("TLogPeekAllCurrentOnly", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .detail("BestLogs", localSets[bestSet]->logServerString()); + return makeReference( + localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, end, parallelGetMore); + } else { + std::vector> cursors; + std::vector epochEnds; - if (begin >= lastBegin && 
localSets.size()) { - TraceEvent("TLogPeekAllCurrentOnly", dbgid) + if (lastBegin < end && localSets.size()) { + TraceEvent("TLogPeekAllAddingCurrent", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) .detail("BestLogs", localSets[bestSet]->logServerString()); - return makeReference( - localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, end, parallelGetMore); - } else { - std::vector> cursors; - std::vector epochEnds; - - if (lastBegin < end && localSets.size()) { - TraceEvent("TLogPeekAllAddingCurrent", dbgid) + cursors.push_back(makeReference( + localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, lastBegin, end, parallelGetMore)); + } + for (int i = 0; begin < lastBegin; i++) { + if (i == oldLogData.size()) { + if (tag == txsTag || tag.locality == tagLocalityTxs || tag == cacheTag) { + break; + } + TraceEvent("TLogPeekAllDead", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) - .detail("BestLogs", localSets[bestSet]->logServerString()); - cursors.push_back(makeReference(localSets, - bestSet, - localSets[bestSet]->bestLocationFor(tag), - tag, - lastBegin, - end, - parallelGetMore)); - } - for (int i = 0; begin < lastBegin; i++) { - if (i == oldLogData.size()) { - if (tag == txsTag || tag.locality == tagLocalityTxs || tag == cacheTag) { - break; - } - TraceEvent("TLogPeekAllDead", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("LastBegin", lastBegin) - .detail("OldLogDataSize", oldLogData.size()); - return makeReference( - Reference>>(), - tag, - begin, - getPeekEnd(), - false, - false); - } - - int bestOldSet = 0; - std::vector> localOldSets; - Version thisBegin = begin; - bool thisSpecial = false; - for (auto& log : oldLogData[i].tLogs) { - if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { - thisSpecial = true; - } - if (log->isLocal && log->logServers.size() && - (log->locality 
== tagLocalitySpecial || log->locality == tagLocalityUpgraded || - log->locality == tag.locality || tag == txsTag || tag.locality == tagLocalityTxs || - tag.locality == tagLocalityLogRouter || - ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && - log->locality != tagLocalitySatellite))) { - thisBegin = std::max(thisBegin, log->startVersion); - localOldSets.push_back(log); - if (log->locality != tagLocalitySatellite) { - bestOldSet = localOldSets.size() - 1; - } - } - } - - if (!localOldSets.size()) { - TraceEvent("TLogPeekAllNoLocalSets", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("LastBegin", lastBegin); - if (!cursors.size() && !foundSpecial) { - continue; - } - return makeReference( - Reference>>(), - tag, - begin, - getPeekEnd(), - false, - false); - } - if (thisSpecial) { - foundSpecial = true; - } + .detail("LastBegin", lastBegin) + .detail("OldLogDataSize", oldLogData.size()); + return makeReference( + Reference>>(), tag, begin, getPeekEnd(), false, false); + } - if (thisBegin < lastBegin) { - if (thisBegin < end) { - TraceEvent("TLogPeekAllAddingOld", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("BestLogs", localOldSets[bestOldSet]->logServerString()) - .detail("LastBegin", lastBegin) - .detail("ThisBegin", thisBegin); - cursors.push_back( - makeReference(localOldSets, - bestOldSet, - localOldSets[bestOldSet]->bestLocationFor(tag), - tag, - thisBegin, - std::min(lastBegin, end), - parallelGetMore)); - epochEnds.push_back(LogMessageVersion(std::min(lastBegin, end))); + int bestOldSet = 0; + std::vector> localOldSets; + Version thisBegin = begin; + bool thisSpecial = false; + for (auto& log : oldLogData[i].tLogs) { + if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { + thisSpecial = true; + } + if (log->isLocal && log->logServers.size() && + (log->locality == tagLocalitySpecial || log->locality == 
tagLocalityUpgraded || + log->locality == tag.locality || tag == txsTag || tag.locality == tagLocalityTxs || + tag.locality == tagLocalityLogRouter || + ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && + log->locality != tagLocalitySatellite))) { + thisBegin = std::max(thisBegin, log->startVersion); + localOldSets.push_back(log); + if (log->locality != tagLocalitySatellite) { + bestOldSet = localOldSets.size() - 1; } - lastBegin = thisBegin; } } - return makeReference(cursors, epochEnds); - } - } - - Reference peekRemote(UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore) { - int bestSet = -1; - Version lastBegin = recoveredAt.present() ? recoveredAt.get() + 1 : 0; - for (int t = 0; t < tLogs.size(); t++) { - if (tLogs[t]->isLocal) { - lastBegin = std::max(lastBegin, tLogs[t]->startVersion); + if (!localOldSets.size()) { + TraceEvent("TLogPeekAllNoLocalSets", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .detail("LastBegin", lastBegin); + if (!cursors.size() && !foundSpecial) { + continue; + } + return makeReference( + Reference>>(), tag, begin, getPeekEnd(), false, false); } - - if (tLogs[t]->logRouters.size()) { - ASSERT(bestSet == -1); - bestSet = t; + if (thisSpecial) { + foundSpecial = true; } - } - if (bestSet == -1) { - TraceEvent("TLogPeekRemoteNoBestSet", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end.present() ? end.get() : getPeekEnd()); - return makeReference(Reference>>(), - tag, - begin, - getPeekEnd(), - false, - parallelGetMore); - } - if (begin >= lastBegin) { - TraceEvent("TLogPeekRemoteBestOnly", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end.present() ? end.get() : getPeekEnd()) - .detail("BestSet", bestSet) - .detail("BestSetStart", lastBegin) - .detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - return makeReference( - tLogs[bestSet]->logRouters, tag, begin, end.present() ? 
end.get() + 1 : getPeekEnd(), parallelGetMore); - } else { - std::vector> cursors; - std::vector epochEnds; - TraceEvent("TLogPeekRemoteAddingBest", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end.present() ? end.get() : getPeekEnd()) - .detail("BestSet", bestSet) - .detail("BestSetStart", lastBegin) - .detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - cursors.push_back(makeReference(tLogs[bestSet]->logRouters, - tag, - lastBegin, - end.present() ? end.get() + 1 : getPeekEnd(), - parallelGetMore)); - int i = 0; - while (begin < lastBegin) { - if (i == oldLogData.size()) { - TraceEvent("TLogPeekRemoteDead", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end.present() ? end.get() : getPeekEnd()) - .detail("LastBegin", lastBegin) - .detail("OldLogDataSize", oldLogData.size()); - return makeReference( - Reference>>(), - tag, - begin, - getPeekEnd(), - false, - parallelGetMore); - } - - int bestOldSet = -1; - Version thisBegin = begin; - for (int t = 0; t < oldLogData[i].tLogs.size(); t++) { - if (oldLogData[i].tLogs[t]->isLocal) { - thisBegin = std::max(thisBegin, oldLogData[i].tLogs[t]->startVersion); - } - - if (oldLogData[i].tLogs[t]->logRouters.size()) { - ASSERT(bestOldSet == -1); - bestOldSet = t; - } - } - if (bestOldSet == -1) { - TraceEvent("TLogPeekRemoteNoOldBestSet", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end.present() ? end.get() : getPeekEnd()); - return makeReference( - Reference>>(), - tag, - begin, - getPeekEnd(), - false, - parallelGetMore); - } - if (thisBegin < lastBegin) { - TraceEvent("TLogPeekRemoteAddingOldBest", dbgid) + if (thisBegin < lastBegin) { + if (thisBegin < end) { + TraceEvent("TLogPeekAllAddingOld", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) - .detail("End", end.present() ? 
end.get() : getPeekEnd()) - .detail("BestOldSet", bestOldSet) - .detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) + .detail("End", end) + .detail("BestLogs", localOldSets[bestOldSet]->logServerString()) .detail("LastBegin", lastBegin) - .detail("ThisBegin", thisBegin) - .detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion); - cursors.push_back(makeReference( - oldLogData[i].tLogs[bestOldSet]->logRouters, tag, thisBegin, lastBegin, parallelGetMore)); - epochEnds.emplace_back(lastBegin); - lastBegin = thisBegin; + .detail("ThisBegin", thisBegin); + cursors.push_back( + makeReference(localOldSets, + bestOldSet, + localOldSets[bestOldSet]->bestLocationFor(tag), + tag, + thisBegin, + std::min(lastBegin, end), + parallelGetMore)); + epochEnds.push_back(LogMessageVersion(std::min(lastBegin, end))); } - i++; + lastBegin = thisBegin; } - - return makeReference(cursors, epochEnds); - } - } - - Reference peek(UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore) final { - if (!tLogs.size()) { - TraceEvent("TLogPeekNoLogSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); - return makeReference( - Reference>>(), tag, begin, getPeekEnd(), false, false); } - if (tag.locality == tagLocalityRemoteLog) { - return peekRemote(dbgid, begin, end, tag, parallelGetMore); - } else { - return peekAll(dbgid, begin, getPeekEnd(), tag, parallelGetMore); - } + return makeReference(cursors, epochEnds); } +} - Reference peek(UID dbgid, - Version begin, - Optional end, - std::vector tags, - bool parallelGetMore) final { - if (tags.empty()) { - TraceEvent("TLogPeekNoTags", dbgid).detail("Begin", begin); - return makeReference( - Reference>>(), invalidTag, begin, getPeekEnd(), false, false); +Reference TagPartitionedLogSystem::peekRemote(UID dbgid, + Version begin, + Optional end, + Tag tag, + bool parallelGetMore) { + int bestSet = -1; + Version lastBegin = recoveredAt.present() ? 
recoveredAt.get() + 1 : 0; + for (int t = 0; t < tLogs.size(); t++) { + if (tLogs[t]->isLocal) { + lastBegin = std::max(lastBegin, tLogs[t]->startVersion); } - if (tags.size() == 1) { - return peek(dbgid, begin, end, tags[0], parallelGetMore); + if (tLogs[t]->logRouters.size()) { + ASSERT(bestSet == -1); + bestSet = t; } - + } + if (bestSet == -1) { + TraceEvent("TLogPeekRemoteNoBestSet", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end.present() ? end.get() : getPeekEnd()); + return makeReference( + Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore); + } + if (begin >= lastBegin) { + TraceEvent("TLogPeekRemoteBestOnly", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end.present() ? end.get() : getPeekEnd()) + .detail("BestSet", bestSet) + .detail("BestSetStart", lastBegin) + .detail("LogRouterIds", tLogs[bestSet]->logRouterString()); + return makeReference( + tLogs[bestSet]->logRouters, tag, begin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore); + } else { std::vector> cursors; - cursors.reserve(tags.size()); - for (auto tag : tags) { - cursors.push_back(peek(dbgid, begin, end, tag, parallelGetMore)); - } - return makeReference(cursors, - begin, - end.present() ? 
end.get() + 1 : getPeekEnd(), - true, - tLogs[0]->locality == tagLocalityUpgraded, - false); - } - - Reference peekLocal(UID dbgid, - Tag tag, - Version begin, - Version end, - bool useMergePeekCursors, - int8_t peekLocality = tagLocalityInvalid) { - if (tag.locality >= 0 || tag.locality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial) { - peekLocality = tag.locality; - } - ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial); - - int bestSet = -1; - bool foundSpecial = false; - int logCount = 0; - for (int t = 0; t < tLogs.size(); t++) { - if (tLogs[t]->logServers.size() && tLogs[t]->locality != tagLocalitySatellite) { - logCount++; - } - if (tLogs[t]->logServers.size() && - (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || - tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || - peekLocality == tagLocalitySpecial)) { - if (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded) { - foundSpecial = true; + std::vector epochEnds; + TraceEvent("TLogPeekRemoteAddingBest", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end.present() ? end.get() : getPeekEnd()) + .detail("BestSet", bestSet) + .detail("BestSetStart", lastBegin) + .detail("LogRouterIds", tLogs[bestSet]->logRouterString()); + cursors.push_back(makeReference( + tLogs[bestSet]->logRouters, tag, lastBegin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore)); + int i = 0; + while (begin < lastBegin) { + if (i == oldLogData.size()) { + TraceEvent("TLogPeekRemoteDead", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end.present() ? 
end.get() : getPeekEnd()) + .detail("LastBegin", lastBegin) + .detail("OldLogDataSize", oldLogData.size()); + return makeReference( + Reference>>(), + tag, + begin, + getPeekEnd(), + false, + parallelGetMore); + } + + int bestOldSet = -1; + Version thisBegin = begin; + for (int t = 0; t < oldLogData[i].tLogs.size(); t++) { + if (oldLogData[i].tLogs[t]->isLocal) { + thisBegin = std::max(thisBegin, oldLogData[i].tLogs[t]->startVersion); + } + + if (oldLogData[i].tLogs[t]->logRouters.size()) { + ASSERT(bestOldSet == -1); + bestOldSet = t; } - bestSet = t; - break; } - } - if (bestSet == -1) { - TraceEvent("TLogPeekLocalNoBestSet", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("LogCount", logCount); - if (useMergePeekCursors || logCount > 1) { - throw worker_removed(); - } else { + if (bestOldSet == -1) { + TraceEvent("TLogPeekRemoteNoOldBestSet", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end.present() ? end.get() : getPeekEnd()); return makeReference( - Reference>>(), tag, begin, getPeekEnd(), false, false); + Reference>>(), + tag, + begin, + getPeekEnd(), + false, + parallelGetMore); } + + if (thisBegin < lastBegin) { + TraceEvent("TLogPeekRemoteAddingOldBest", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end.present() ? 
end.get() : getPeekEnd()) + .detail("BestOldSet", bestOldSet) + .detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) + .detail("LastBegin", lastBegin) + .detail("ThisBegin", thisBegin) + .detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion); + cursors.push_back(makeReference( + oldLogData[i].tLogs[bestOldSet]->logRouters, tag, thisBegin, lastBegin, parallelGetMore)); + epochEnds.emplace_back(lastBegin); + lastBegin = thisBegin; + } + i++; } - if (begin >= tLogs[bestSet]->startVersion) { - TraceEvent("TLogPeekLocalBestOnly", dbgid) + return makeReference(cursors, epochEnds); + } +} + +Reference TagPartitionedLogSystem::peek(UID dbgid, + Version begin, + Optional end, + Tag tag, + bool parallelGetMore) { + if (!tLogs.size()) { + TraceEvent("TLogPeekNoLogSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); + return makeReference( + Reference>>(), tag, begin, getPeekEnd(), false, false); + } + + if (tag.locality == tagLocalityRemoteLog) { + return peekRemote(dbgid, begin, end, tag, parallelGetMore); + } else { + return peekAll(dbgid, begin, getPeekEnd(), tag, parallelGetMore); + } +} + +Reference TagPartitionedLogSystem::peek(UID dbgid, + Version begin, + Optional end, + std::vector tags, + bool parallelGetMore) { + if (tags.empty()) { + TraceEvent("TLogPeekNoTags", dbgid).detail("Begin", begin); + return makeReference( + Reference>>(), invalidTag, begin, getPeekEnd(), false, false); + } + + if (tags.size() == 1) { + return peek(dbgid, begin, end, tags[0], parallelGetMore); + } + + std::vector> cursors; + cursors.reserve(tags.size()); + for (auto tag : tags) { + cursors.push_back(peek(dbgid, begin, end, tag, parallelGetMore)); + } + return makeReference(cursors, + begin, + end.present() ? 
end.get() + 1 : getPeekEnd(), + true, + tLogs[0]->locality == tagLocalityUpgraded, + false); +} + +Reference TagPartitionedLogSystem::peekLocal(UID dbgid, + Tag tag, + Version begin, + Version end, + bool useMergePeekCursors, + int8_t peekLocality) { + if (tag.locality >= 0 || tag.locality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial) { + peekLocality = tag.locality; + } + ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial); + + int bestSet = -1; + bool foundSpecial = false; + int logCount = 0; + for (int t = 0; t < tLogs.size(); t++) { + if (tLogs[t]->logServers.size() && tLogs[t]->locality != tagLocalitySatellite) { + logCount++; + } + if (tLogs[t]->logServers.size() && + (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || + tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || + peekLocality == tagLocalitySpecial)) { + if (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded) { + foundSpecial = true; + } + bestSet = t; + break; + } + } + if (bestSet == -1) { + TraceEvent("TLogPeekLocalNoBestSet", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .detail("LogCount", logCount); + if (useMergePeekCursors || logCount > 1) { + throw worker_removed(); + } else { + return makeReference( + Reference>>(), tag, begin, getPeekEnd(), false, false); + } + } + + if (begin >= tLogs[bestSet]->startVersion) { + TraceEvent("TLogPeekLocalBestOnly", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .detail("BestSet", bestSet) + .detail("BestSetStart", tLogs[bestSet]->startVersion) + .detail("LogId", tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)]->get().id()); + if (useMergePeekCursors) { + return makeReference(tLogs[bestSet]->logServers, + tLogs[bestSet]->bestLocationFor(tag), + tLogs[bestSet]->logServers.size() + 1 - + 
tLogs[bestSet]->tLogReplicationFactor, + tag, + begin, + end, + true, + tLogs[bestSet]->tLogLocalities, + tLogs[bestSet]->tLogPolicy, + tLogs[bestSet]->tLogReplicationFactor); + } else { + return makeReference( + tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)], tag, begin, end, false, false); + } + } else { + std::vector> cursors; + std::vector epochEnds; + + if (tLogs[bestSet]->startVersion < end) { + TraceEvent("TLogPeekLocalAddingBest", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) @@ -954,1931 +928,1973 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCountedstartVersion) .detail("LogId", tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)]->get().id()); if (useMergePeekCursors) { - return makeReference(tLogs[bestSet]->logServers, - tLogs[bestSet]->bestLocationFor(tag), - tLogs[bestSet]->logServers.size() + 1 - - tLogs[bestSet]->tLogReplicationFactor, - tag, - begin, - end, - true, - tLogs[bestSet]->tLogLocalities, - tLogs[bestSet]->tLogPolicy, - tLogs[bestSet]->tLogReplicationFactor); + cursors.push_back(makeReference(tLogs[bestSet]->logServers, + tLogs[bestSet]->bestLocationFor(tag), + tLogs[bestSet]->logServers.size() + 1 - + tLogs[bestSet]->tLogReplicationFactor, + tag, + tLogs[bestSet]->startVersion, + end, + true, + tLogs[bestSet]->tLogLocalities, + tLogs[bestSet]->tLogPolicy, + tLogs[bestSet]->tLogReplicationFactor)); } else { - return makeReference( - tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)], tag, begin, end, false, false); + cursors.push_back(makeReference( + tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)], + tag, + tLogs[bestSet]->startVersion, + end, + false, + false)); } - } else { - std::vector> cursors; - std::vector epochEnds; - - if (tLogs[bestSet]->startVersion < end) { - TraceEvent("TLogPeekLocalAddingBest", dbgid) + } + Version lastBegin = tLogs[bestSet]->startVersion; + for (int i = 0; begin < lastBegin; i++) { + if (i == 
oldLogData.size()) { + if ((tag == txsTag || tag.locality == tagLocalityTxs) && cursors.size()) { + break; + } + TraceEvent("TLogPeekLocalDead", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) - .detail("BestSet", bestSet) - .detail("BestSetStart", tLogs[bestSet]->startVersion) - .detail("LogId", tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)]->get().id()); - if (useMergePeekCursors) { - cursors.push_back(makeReference( - tLogs[bestSet]->logServers, - tLogs[bestSet]->bestLocationFor(tag), - tLogs[bestSet]->logServers.size() + 1 - tLogs[bestSet]->tLogReplicationFactor, - tag, - tLogs[bestSet]->startVersion, - end, - true, - tLogs[bestSet]->tLogLocalities, - tLogs[bestSet]->tLogPolicy, - tLogs[bestSet]->tLogReplicationFactor)); - } else { - cursors.push_back(makeReference( - tLogs[bestSet]->logServers[tLogs[bestSet]->bestLocationFor(tag)], - tag, - tLogs[bestSet]->startVersion, - end, - false, - false)); - } + .detail("LastBegin", lastBegin) + .detail("OldLogDataSize", oldLogData.size()); + throw worker_removed(); } - Version lastBegin = tLogs[bestSet]->startVersion; - for (int i = 0; begin < lastBegin; i++) { - if (i == oldLogData.size()) { - if ((tag == txsTag || tag.locality == tagLocalityTxs) && cursors.size()) { - break; - } - TraceEvent("TLogPeekLocalDead", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("LastBegin", lastBegin) - .detail("OldLogDataSize", oldLogData.size()); - throw worker_removed(); - } - int bestOldSet = -1; - logCount = 0; - bool nextFoundSpecial = false; - for (int t = 0; t < oldLogData[i].tLogs.size(); t++) { - if (oldLogData[i].tLogs[t]->logServers.size() && - oldLogData[i].tLogs[t]->locality != tagLocalitySatellite) { - logCount++; + int bestOldSet = -1; + logCount = 0; + bool nextFoundSpecial = false; + for (int t = 0; t < oldLogData[i].tLogs.size(); t++) { + if (oldLogData[i].tLogs[t]->logServers.size() && + 
oldLogData[i].tLogs[t]->locality != tagLocalitySatellite) { + logCount++; + } + if (oldLogData[i].tLogs[t]->logServers.size() && + (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || + oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || + oldLogData[i].tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || + peekLocality == tagLocalitySpecial)) { + if (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || + oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded) { + nextFoundSpecial = true; } - if (oldLogData[i].tLogs[t]->logServers.size() && - (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || - oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || - oldLogData[i].tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || - peekLocality == tagLocalitySpecial)) { - if (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || - oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded) { - nextFoundSpecial = true; - } - if (foundSpecial && !oldLogData[i].tLogs[t]->isLocal) { - TraceEvent("TLogPeekLocalRemoteBeforeSpecial", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("LastBegin", lastBegin) - .detail("OldLogDataSize", oldLogData.size()) - .detail("Idx", i); - throw worker_removed(); - } - bestOldSet = t; - break; + if (foundSpecial && !oldLogData[i].tLogs[t]->isLocal) { + TraceEvent("TLogPeekLocalRemoteBeforeSpecial", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .detail("LastBegin", lastBegin) + .detail("OldLogDataSize", oldLogData.size()) + .detail("Idx", i); + throw worker_removed(); } + bestOldSet = t; + break; } + } - if (bestOldSet == -1) { - TraceEvent("TLogPeekLocalNoBestSet", dbgid) + if (bestOldSet == -1) { + TraceEvent("TLogPeekLocalNoBestSet", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("End", end) + .detail("LastBegin", lastBegin) + 
.detail("OldLogDataSize", oldLogData.size()) + .detail("Idx", i) + .detail("LogRouterTags", oldLogData[i].logRouterTags) + .detail("LogCount", logCount) + .detail("FoundSpecial", foundSpecial); + if (oldLogData[i].logRouterTags == 0 || logCount > 1 || foundSpecial) { + throw worker_removed(); + } + continue; + } + + foundSpecial = nextFoundSpecial; + + Version thisBegin = std::max(oldLogData[i].tLogs[bestOldSet]->startVersion, begin); + if (thisBegin < lastBegin) { + if (thisBegin < end) { + TraceEvent("TLogPeekLocalAddingOldBest", dbgid) .detail("Tag", tag.toString()) .detail("Begin", begin) .detail("End", end) - .detail("LastBegin", lastBegin) - .detail("OldLogDataSize", oldLogData.size()) - .detail("Idx", i) - .detail("LogRouterTags", oldLogData[i].logRouterTags) - .detail("LogCount", logCount) - .detail("FoundSpecial", foundSpecial); - if (oldLogData[i].logRouterTags == 0 || logCount > 1 || foundSpecial) { - throw worker_removed(); - } - continue; + .detail("BestOldSet", bestOldSet) + .detail("LogServers", oldLogData[i].tLogs[bestOldSet]->logServerString()) + .detail("ThisBegin", thisBegin) + .detail("LastBegin", lastBegin); + // detail("LogId", + // oldLogData[i].tLogs[bestOldSet]->logServers[tLogs[bestOldSet]->bestLocationFor( tag + // )]->get().id()); + cursors.push_back(makeReference( + oldLogData[i].tLogs[bestOldSet]->logServers, + oldLogData[i].tLogs[bestOldSet]->bestLocationFor(tag), + oldLogData[i].tLogs[bestOldSet]->logServers.size() + 1 - + oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor, + tag, + thisBegin, + std::min(lastBegin, end), + useMergePeekCursors, + oldLogData[i].tLogs[bestOldSet]->tLogLocalities, + oldLogData[i].tLogs[bestOldSet]->tLogPolicy, + oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor)); + epochEnds.emplace_back(std::min(lastBegin, end)); } + lastBegin = thisBegin; + } + } - foundSpecial = nextFoundSpecial; + return makeReference(cursors, epochEnds); + } +} - Version thisBegin = 
std::max(oldLogData[i].tLogs[bestOldSet]->startVersion, begin); - if (thisBegin < lastBegin) { - if (thisBegin < end) { - TraceEvent("TLogPeekLocalAddingOldBest", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("End", end) - .detail("BestOldSet", bestOldSet) - .detail("LogServers", oldLogData[i].tLogs[bestOldSet]->logServerString()) - .detail("ThisBegin", thisBegin) - .detail("LastBegin", lastBegin); - // detail("LogId", - // oldLogData[i].tLogs[bestOldSet]->logServers[tLogs[bestOldSet]->bestLocationFor( tag - // )]->get().id()); - cursors.push_back(makeReference( - oldLogData[i].tLogs[bestOldSet]->logServers, - oldLogData[i].tLogs[bestOldSet]->bestLocationFor(tag), - oldLogData[i].tLogs[bestOldSet]->logServers.size() + 1 - - oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor, - tag, - thisBegin, - std::min(lastBegin, end), - useMergePeekCursors, - oldLogData[i].tLogs[bestOldSet]->tLogLocalities, - oldLogData[i].tLogs[bestOldSet]->tLogPolicy, - oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor)); - epochEnds.emplace_back(std::min(lastBegin, end)); - } - lastBegin = thisBegin; - } - } +Reference TagPartitionedLogSystem::peekTxs(UID dbgid, + Version begin, + int8_t peekLocality, + Version localEnd, + bool canDiscardPopped) { + Version end = getEnd(); + if (!tLogs.size()) { + TraceEvent("TLogPeekTxsNoLogs", dbgid).log(); + return makeReference( + Reference>>(), txsTag, begin, end, false, false); + } + TraceEvent("TLogPeekTxs", dbgid) + .detail("Begin", begin) + .detail("End", end) + .detail("LocalEnd", localEnd) + .detail("PeekLocality", peekLocality) + .detail("CanDiscardPopped", canDiscardPopped); - return makeReference(cursors, epochEnds); - } + int maxTxsTags = txsTags; + bool needsOldTxs = tLogs[0]->tLogVersion < TLogVersion::V4; + for (auto& it : oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; } - Reference peekTxs(UID dbgid, - Version 
begin, - int8_t peekLocality, - Version localEnd, - bool canDiscardPopped) final { - Version end = getEnd(); - if (!tLogs.size()) { - TraceEvent("TLogPeekTxsNoLogs", dbgid).log(); - return makeReference( - Reference>>(), txsTag, begin, end, false, false); + if (peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { + std::vector> cursors; + cursors.reserve(maxTxsTags); + for (int i = 0; i < maxTxsTags; i++) { + cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } - TraceEvent("TLogPeekTxs", dbgid) - .detail("Begin", begin) - .detail("End", end) - .detail("LocalEnd", localEnd) - .detail("PeekLocality", peekLocality) - .detail("CanDiscardPopped", canDiscardPopped); - - int maxTxsTags = txsTags; - bool needsOldTxs = tLogs[0]->tLogVersion < TLogVersion::V4; - for (auto& it : oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.txsTags); - needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; + // SOMEDAY: remove once upgrades from 6.2 are no longer supported + if (needsOldTxs) { + cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); } - if (peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { + return makeReference(cursors, begin, end, false, false, canDiscardPopped); + } + + try { + if (localEnd >= end) { std::vector> cursors; cursors.reserve(maxTxsTags); for (int i = 0; i < maxTxsTags; i++) { - cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); + cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); } // SOMEDAY: remove once upgrades from 6.2 are no longer supported if (needsOldTxs) { - cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); + cursors.push_back(peekLocal(dbgid, txsTag, begin, end, true, peekLocality)); } return makeReference(cursors, begin, end, false, false, canDiscardPopped); } - try { - if (localEnd >= end) { - std::vector> cursors; - cursors.reserve(maxTxsTags); - for (int i = 0; i < 
maxTxsTags; i++) { - cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); - } - // SOMEDAY: remove once upgrades from 6.2 are no longer supported - if (needsOldTxs) { - cursors.push_back(peekLocal(dbgid, txsTag, begin, end, true, peekLocality)); - } + std::vector> cursors; + std::vector epochEnds; - return makeReference(cursors, begin, end, false, false, canDiscardPopped); - } + cursors.resize(2); - std::vector> cursors; - std::vector epochEnds; + std::vector> localCursors; + std::vector> allCursors; + for (int i = 0; i < maxTxsTags; i++) { + localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); + allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); + } + // SOMEDAY: remove once upgrades from 6.2 are no longer supported + if (needsOldTxs) { + localCursors.push_back(peekLocal(dbgid, txsTag, begin, localEnd, true, peekLocality)); + allCursors.push_back(peekAll(dbgid, localEnd, end, txsTag, true)); + } - cursors.resize(2); + cursors[1] = + makeReference(localCursors, begin, localEnd, false, false, canDiscardPopped); + cursors[0] = makeReference(allCursors, localEnd, end, false, false, false); + epochEnds.emplace_back(localEnd); - std::vector> localCursors; - std::vector> allCursors; + return makeReference(cursors, epochEnds); + } catch (Error& e) { + if (e.code() == error_code_worker_removed) { + std::vector> cursors; + cursors.reserve(maxTxsTags); for (int i = 0; i < maxTxsTags; i++) { - localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); - allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); + cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } // SOMEDAY: remove once upgrades from 6.2 are no longer supported if (needsOldTxs) { - localCursors.push_back(peekLocal(dbgid, txsTag, begin, localEnd, true, peekLocality)); - 
allCursors.push_back(peekAll(dbgid, localEnd, end, txsTag, true)); + cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); } - cursors[1] = makeReference( - localCursors, begin, localEnd, false, false, canDiscardPopped); - cursors[0] = makeReference(allCursors, localEnd, end, false, false, false); - epochEnds.emplace_back(localEnd); - - return makeReference(cursors, epochEnds); - } catch (Error& e) { - if (e.code() == error_code_worker_removed) { - std::vector> cursors; - cursors.reserve(maxTxsTags); - for (int i = 0; i < maxTxsTags; i++) { - cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); - } - // SOMEDAY: remove once upgrades from 6.2 are no longer supported - if (needsOldTxs) { - cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); - } - - return makeReference(cursors, begin, end, false, false, canDiscardPopped); - } - throw; + return makeReference(cursors, begin, end, false, false, canDiscardPopped); } + throw; } +} - Reference peekSingle(UID dbgid, - Version begin, - Tag tag, - std::vector> history) final { - while (history.size() && begin >= history.back().first) { - history.pop_back(); - } +Reference TagPartitionedLogSystem::peekSingle(UID dbgid, + Version begin, + Tag tag, + std::vector> history) { + while (history.size() && begin >= history.back().first) { + history.pop_back(); + } - if (history.size() == 0) { - TraceEvent("TLogPeekSingleNoHistory", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); - return peekLocal(dbgid, tag, begin, getPeekEnd(), false); - } else { - std::vector> cursors; - std::vector epochEnds; + if (history.size() == 0) { + TraceEvent("TLogPeekSingleNoHistory", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); + return peekLocal(dbgid, tag, begin, getPeekEnd(), false); + } else { + std::vector> cursors; + std::vector epochEnds; - TraceEvent("TLogPeekSingleAddingLocal", dbgid) + TraceEvent("TLogPeekSingleAddingLocal", dbgid).detail("Tag", tag.toString()).detail("Begin", 
history[0].first); + cursors.push_back(peekLocal(dbgid, tag, history[0].first, getPeekEnd(), false)); + + for (int i = 0; i < history.size(); i++) { + TraceEvent("TLogPeekSingleAddingOld", dbgid) .detail("Tag", tag.toString()) - .detail("Begin", history[0].first); - cursors.push_back(peekLocal(dbgid, tag, history[0].first, getPeekEnd(), false)); + .detail("HistoryTag", history[i].second.toString()) + .detail("Begin", i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin)) + .detail("End", history[i].first); + cursors.push_back(peekLocal(dbgid, + history[i].second, + i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin), + history[i].first, + false)); + epochEnds.emplace_back(history[i].first); + } - for (int i = 0; i < history.size(); i++) { - TraceEvent("TLogPeekSingleAddingOld", dbgid) - .detail("Tag", tag.toString()) - .detail("HistoryTag", history[i].second.toString()) - .detail("Begin", i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin)) - .detail("End", history[i].first); - cursors.push_back(peekLocal(dbgid, - history[i].second, - i + 1 == history.size() ? 
begin : std::max(history[i + 1].first, begin), - history[i].first, - false)); - epochEnds.emplace_back(history[i].first); - } + return makeReference(cursors, epochEnds); + } +} - return makeReference(cursors, epochEnds); +Reference TagPartitionedLogSystem::peekLogRouter(UID dbgid, Version begin, Tag tag) { + bool found = false; + for (const auto& log : tLogs) { + found = log->hasLogRouter(dbgid) || log->hasBackupWorker(dbgid); + if (found) { + break; } } + if (found) { + if (stopped) { + std::vector> localSets; + int bestPrimarySet = 0; + int bestSatelliteSet = -1; + for (auto& log : tLogs) { + if (log->isLocal && log->logServers.size()) { + TraceEvent("TLogPeekLogRouterLocalSet", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("LogServers", log->logServerString()); + localSets.push_back(log); + if (log->locality == tagLocalitySatellite) { + bestSatelliteSet = localSets.size() - 1; + } else { + bestPrimarySet = localSets.size() - 1; + } + } + } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && + tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { + bestSet = bestSatelliteSet; + } - // LogRouter or BackupWorker use this function to obtain a cursor for peeking tlogs of a generation (i.e., epoch). - // Specifically, the epoch is determined by looking up "dbgid" in tlog sets of generations. - // The returned cursor can peek data at the "tag" from the given "begin" version to that epoch's end version or - // the recovery version for the latest old epoch. For the current epoch, the cursor has no end version. 
- Reference peekLogRouter(UID dbgid, Version begin, Tag tag) final { - bool found = false; - for (const auto& log : tLogs) { + TraceEvent("TLogPeekLogRouterSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); + // FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies + // across the WAN + return makeReference( + localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, getPeekEnd(), true); + } else { + int bestPrimarySet = -1; + int bestSatelliteSet = -1; + for (int i = 0; i < tLogs.size(); i++) { + const auto& log = tLogs[i]; + if (log->logServers.size() && log->isLocal) { + if (log->locality == tagLocalitySatellite) { + bestSatelliteSet = i; + break; + } else { + if (bestPrimarySet == -1) + bestPrimarySet = i; + } + } + } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && + tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { + bestSet = bestSatelliteSet; + } + const auto& log = tLogs[bestSet]; + TraceEvent("TLogPeekLogRouterBestOnly", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("LogId", log->logServers[log->bestLocationFor(tag)]->get().id()); + return makeReference( + log->logServers[log->bestLocationFor(tag)], tag, begin, getPeekEnd(), false, true); + } + } + bool firstOld = true; + for (const auto& old : oldLogData) { + found = false; + for (const auto& log : old.tLogs) { found = log->hasLogRouter(dbgid) || log->hasBackupWorker(dbgid); if (found) { break; } } if (found) { - if (stopped) { - std::vector> localSets; - int bestPrimarySet = 0; - int bestSatelliteSet = -1; - for (auto& log : tLogs) { - if (log->isLocal && log->logServers.size()) { - TraceEvent("TLogPeekLogRouterLocalSet", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("LogServers", log->logServerString()); - localSets.push_back(log); - if (log->locality == tagLocalitySatellite) { - 
bestSatelliteSet = localSets.size() - 1; - } else { - bestPrimarySet = localSets.size() - 1; - } - } - } - int bestSet = bestPrimarySet; - if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && - tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { - bestSet = bestSatelliteSet; - } - - TraceEvent("TLogPeekLogRouterSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); - // FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies - // across the WAN - return makeReference( - localSets, bestSet, localSets[bestSet]->bestLocationFor(tag), tag, begin, getPeekEnd(), true); - } else { - int bestPrimarySet = -1; - int bestSatelliteSet = -1; - for (int i = 0; i < tLogs.size(); i++) { - const auto& log = tLogs[i]; - if (log->logServers.size() && log->isLocal) { - if (log->locality == tagLocalitySatellite) { - bestSatelliteSet = i; - break; - } else { - if (bestPrimarySet == -1) - bestPrimarySet = i; - } + int bestPrimarySet = 0; + int bestSatelliteSet = -1; + std::vector> localSets; + for (auto& log : old.tLogs) { + if (log->isLocal && log->logServers.size()) { + TraceEvent("TLogPeekLogRouterOldLocalSet", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("LogServers", log->logServerString()); + localSets.push_back(log); + if (log->locality == tagLocalitySatellite) { + bestSatelliteSet = localSets.size() - 1; + } else { + bestPrimarySet = localSets.size() - 1; } } - int bestSet = bestPrimarySet; - if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && - tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { - bestSet = bestSatelliteSet; - } - const auto& log = tLogs[bestSet]; - TraceEvent("TLogPeekLogRouterBestOnly", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("LogId", log->logServers[log->bestLocationFor(tag)]->get().id()); - return makeReference( - 
log->logServers[log->bestLocationFor(tag)], tag, begin, getPeekEnd(), false, true); } - } - bool firstOld = true; - for (const auto& old : oldLogData) { - found = false; - for (const auto& log : old.tLogs) { - found = log->hasLogRouter(dbgid) || log->hasBackupWorker(dbgid); - if (found) { - break; - } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && + old.tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { + bestSet = bestSatelliteSet; } - if (found) { - int bestPrimarySet = 0; - int bestSatelliteSet = -1; - std::vector> localSets; - for (auto& log : old.tLogs) { - if (log->isLocal && log->logServers.size()) { - TraceEvent("TLogPeekLogRouterOldLocalSet", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("LogServers", log->logServerString()); - localSets.push_back(log); - if (log->locality == tagLocalitySatellite) { - bestSatelliteSet = localSets.size() - 1; - } else { - bestPrimarySet = localSets.size() - 1; - } - } - } - int bestSet = bestPrimarySet; - if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && bestSatelliteSet != -1 && - old.tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4) { - bestSet = bestSatelliteSet; - } - TraceEvent("TLogPeekLogRouterOldSets", dbgid) - .detail("Tag", tag.toString()) - .detail("Begin", begin) - .detail("OldEpoch", old.epochEnd) - .detail("RecoveredAt", recoveredAt.present() ? recoveredAt.get() : -1) - .detail("FirstOld", firstOld); - // FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies - // across the WAN - return makeReference( - localSets, - bestSet, - localSets[bestSet]->bestLocationFor(tag), - tag, - begin, - firstOld && recoveredAt.present() ? 
recoveredAt.get() + 1 : old.epochEnd, - true); - } - firstOld = false; + TraceEvent("TLogPeekLogRouterOldSets", dbgid) + .detail("Tag", tag.toString()) + .detail("Begin", begin) + .detail("OldEpoch", old.epochEnd) + .detail("RecoveredAt", recoveredAt.present() ? recoveredAt.get() : -1) + .detail("FirstOld", firstOld); + // FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies + // across the WAN + return makeReference(localSets, + bestSet, + localSets[bestSet]->bestLocationFor(tag), + tag, + begin, + firstOld && recoveredAt.present() ? recoveredAt.get() + 1 + : old.epochEnd, + true); + } + firstOld = false; + } + return makeReference( + Reference>>(), tag, begin, getPeekEnd(), false, false); +} + +Version TagPartitionedLogSystem::getKnownCommittedVersion() { + Version result = invalidVersion; + for (auto& it : lockResults) { + auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, it); + if (versions.present()) { + result = std::max(result, versions.get().first); } - return makeReference( - Reference>>(), tag, begin, getPeekEnd(), false, false); } + return result; +} - Version getKnownCommittedVersion() final { - Version result = invalidVersion; - for (auto& it : lockResults) { - auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, it); - if (versions.present()) { - result = std::max(result, versions.get().first); - } - } - return result; +Future TagPartitionedLogSystem::onKnownCommittedVersionChange() { + std::vector> result; + for (auto& it : lockResults) { + result.push_back(TagPartitionedLogSystem::getDurableVersionChanged(it)); } + if (!result.size()) { + return Never(); + } + return waitForAny(result); +} - Future onKnownCommittedVersionChange() final { - std::vector> result; - for (auto& it : lockResults) { - result.push_back(TagPartitionedLogSystem::getDurableVersionChanged(it)); - } - if (!result.size()) { - return Never(); +void TagPartitionedLogSystem::popLogRouter( + Version upTo, + Tag 
tag, + Version durableKnownCommittedVersion, + int8_t popLocality) { // FIXME: do not need to pop all generations of old logs + if (!upTo) + return; + for (auto& t : tLogs) { + if (t->locality == popLocality) { + for (auto& log : t->logRouters) { + Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; + if (prev < upTo) + outstandingPops[std::make_pair(log->get().id(), tag)] = + std::make_pair(upTo, durableKnownCommittedVersion); + if (prev == 0) { + popActors.add(popFromLog( + this, log, tag, 0.0)); // Fast pop time because log routers can only hold 5 seconds of data. + } + } } - return waitForAny(result); } - void popLogRouter(Version upTo, - Tag tag, - Version durableKnownCommittedVersion, - int8_t popLocality) { // FIXME: do not need to pop all generations of old logs - if (!upTo) - return; - for (auto& t : tLogs) { + for (auto& old : oldLogData) { + for (auto& t : old.tLogs) { if (t->locality == popLocality) { for (auto& log : t->logRouters) { Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; if (prev < upTo) outstandingPops[std::make_pair(log->get().id(), tag)] = std::make_pair(upTo, durableKnownCommittedVersion); - if (prev == 0) { - popActors.add(popFromLog( - this, log, tag, 0.0)); // Fast pop time because log routers can only hold 5 seconds of data. 
- } - } - } - } - - for (auto& old : oldLogData) { - for (auto& t : old.tLogs) { - if (t->locality == popLocality) { - for (auto& log : t->logRouters) { - Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; - if (prev < upTo) - outstandingPops[std::make_pair(log->get().id(), tag)] = - std::make_pair(upTo, durableKnownCommittedVersion); - if (prev == 0) - popActors.add(popFromLog(this, log, tag, 0.0)); - } + if (prev == 0) + popActors.add(popFromLog(this, log, tag, 0.0)); } } } } +} - void popTxs(Version upTo, int8_t popLocality) final { - if (getTLogVersion() < TLogVersion::V4) { - pop(upTo, txsTag, 0, popLocality); - } else { - for (int i = 0; i < txsTags; i++) { - pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); - } +void TagPartitionedLogSystem::popTxs(Version upTo, int8_t popLocality) { + if (getTLogVersion() < TLogVersion::V4) { + pop(upTo, txsTag, 0, popLocality); + } else { + for (int i = 0; i < txsTags; i++) { + pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); } } +} - // pop 'tag.locality' type data up to the 'upTo' version - void pop(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality) final { - if (upTo <= 0) - return; - if (tag.locality == tagLocalityRemoteLog) { - popLogRouter(upTo, tag, durableKnownCommittedVersion, popLocality); - return; - } - for (auto& t : tLogs) { - if (t->locality == tagLocalitySpecial || t->locality == tag.locality || - tag.locality == tagLocalityUpgraded || - (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == t->isLocal))) { - for (auto& log : t->logServers) { - Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; - if (prev < upTo) { - // update pop version for popFromLog actor - outstandingPops[std::make_pair(log->get().id(), tag)] = - std::make_pair(upTo, durableKnownCommittedVersion); - } - if (prev == 0) { - // pop tag from log upto version defined in outstandingPops[].first - popActors.add(popFromLog(this, log, tag, 1.0)); //< FIXME: 
knob - } +void TagPartitionedLogSystem::pop(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality) { + if (upTo <= 0) + return; + if (tag.locality == tagLocalityRemoteLog) { + popLogRouter(upTo, tag, durableKnownCommittedVersion, popLocality); + return; + } + for (auto& t : tLogs) { + if (t->locality == tagLocalitySpecial || t->locality == tag.locality || tag.locality == tagLocalityUpgraded || + (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == t->isLocal))) { + for (auto& log : t->logServers) { + Version prev = outstandingPops[std::make_pair(log->get().id(), tag)].first; + if (prev < upTo) { + // update pop version for popFromLog actor + outstandingPops[std::make_pair(log->get().id(), tag)] = + std::make_pair(upTo, durableKnownCommittedVersion); + } + if (prev == 0) { + // pop tag from log upto version defined in outstandingPops[].first + popActors.add(popFromLog(this, log, tag, 1.0)); //< FIXME: knob } } } } +} + +ACTOR Future TagPartitionedLogSystem::popFromLog(TagPartitionedLogSystem* self, + Reference>> log, + Tag tag, + double time) { + state Version last = 0; + loop { + wait(delay(time, TaskPriority::TLogPop)); - // pop tag from log up to the version defined in self->outstandingPops[].first - ACTOR static Future popFromLog(TagPartitionedLogSystem* self, - Reference>> log, - Tag tag, - double time) { - state Version last = 0; - loop { - wait(delay(time, TaskPriority::TLogPop)); + // to: first is upto version, second is durableKnownComittedVersion + state std::pair to = self->outstandingPops[std::make_pair(log->get().id(), tag)]; - // to: first is upto version, second is durableKnownComittedVersion - state std::pair to = self->outstandingPops[std::make_pair(log->get().id(), tag)]; + if (to.first <= last) { + self->outstandingPops.erase(std::make_pair(log->get().id(), tag)); + return Void(); + } - if (to.first <= last) { - self->outstandingPops.erase(std::make_pair(log->get().id(), tag)); + try { + if (!log->get().present()) 
return Void(); - } + wait(log->get().interf().popMessages.getReply(TLogPopRequest(to.first, to.second, tag), + TaskPriority::TLogPop)); - try { - if (!log->get().present()) - return Void(); - wait(log->get().interf().popMessages.getReply(TLogPopRequest(to.first, to.second, tag), - TaskPriority::TLogPop)); + last = to.first; + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) + throw; + TraceEvent((e.code() == error_code_broken_promise) ? SevInfo : SevError, "LogPopError", self->dbgid) + .error(e) + .detail("Log", log->get().id()); + return Void(); // Leaving outstandingPops filled in means no further pop requests to this tlog from this + // logSystem + } + } +} + +ACTOR Future TagPartitionedLogSystem::getPoppedFromTLog( + Reference>> log, + Tag tag) { - last = to.first; - } catch (Error& e) { - if (e.code() == error_code_actor_cancelled) - throw; - TraceEvent((e.code() == error_code_broken_promise) ? SevInfo : SevError, "LogPopError", self->dbgid) - .error(e) - .detail("Log", log->get().id()); - return Void(); // Leaving outstandingPops filled in means no further pop requests to this tlog from this - // logSystem + loop { + choose { + when(TLogPeekReply rep = + wait(log->get().present() ? brokenPromiseToNever(log->get().interf().peekMessages.getReply( + TLogPeekRequest(-1, tag, false, false))) + : Never())) { + ASSERT(rep.popped.present()); + return rep.popped.get(); } + when(wait(log->onChange())) {} } } +} - ACTOR static Future getPoppedFromTLog(Reference>> log, Tag tag) { - loop { - choose { - when(TLogPeekReply rep = - wait(log->get().present() ? 
brokenPromiseToNever(log->get().interf().peekMessages.getReply( - TLogPeekRequest(-1, tag, false, false))) - : Never())) { - ASSERT(rep.popped.present()); - return rep.popped.get(); - } - when(wait(log->onChange())) {} +ACTOR Future TagPartitionedLogSystem::getPoppedTxs(TagPartitionedLogSystem* self) { + state std::vector>> poppedFutures; + state std::vector> poppedReady; + if (self->tLogs.size()) { + poppedFutures.push_back(std::vector>()); + for (auto& it : self->tLogs) { + for (auto& log : it->logServers) { + poppedFutures.back().push_back(TagPartitionedLogSystem::getPoppedFromTLog( + log, self->tLogs[0]->tLogVersion < TLogVersion::V4 ? txsTag : Tag(tagLocalityTxs, 0))); } } + poppedReady.push_back(waitForAny(poppedFutures.back())); } - ACTOR static Future getPoppedTxs(TagPartitionedLogSystem* self) { - state std::vector>> poppedFutures; - state std::vector> poppedReady; - if (self->tLogs.size()) { + for (auto& old : self->oldLogData) { + if (old.tLogs.size()) { poppedFutures.push_back(std::vector>()); - for (auto& it : self->tLogs) { + for (auto& it : old.tLogs) { for (auto& log : it->logServers) { - poppedFutures.back().push_back(getPoppedFromTLog( - log, self->tLogs[0]->tLogVersion < TLogVersion::V4 ? txsTag : Tag(tagLocalityTxs, 0))); + poppedFutures.back().push_back(TagPartitionedLogSystem::getPoppedFromTLog( + log, old.tLogs[0]->tLogVersion < TLogVersion::V4 ? txsTag : Tag(tagLocalityTxs, 0))); } } poppedReady.push_back(waitForAny(poppedFutures.back())); } + } - for (auto& old : self->oldLogData) { - if (old.tLogs.size()) { - poppedFutures.push_back(std::vector>()); - for (auto& it : old.tLogs) { - for (auto& log : it->logServers) { - poppedFutures.back().push_back(getPoppedFromTLog( - log, old.tLogs[0]->tLogVersion < TLogVersion::V4 ? 
txsTag : Tag(tagLocalityTxs, 0))); - } - } - poppedReady.push_back(waitForAny(poppedFutures.back())); - } - } - - state UID dbgid = self->dbgid; - state Future maxGetPoppedDuration = delay(SERVER_KNOBS->TXS_POPPED_MAX_DELAY); - wait(waitForAll(poppedReady) || maxGetPoppedDuration); + state UID dbgid = self->dbgid; + state Future maxGetPoppedDuration = delay(SERVER_KNOBS->TXS_POPPED_MAX_DELAY); + wait(waitForAll(poppedReady) || maxGetPoppedDuration); - if (maxGetPoppedDuration.isReady()) { - TraceEvent(SevWarnAlways, "PoppedTxsNotReady", dbgid).log(); - } + if (maxGetPoppedDuration.isReady()) { + TraceEvent(SevWarnAlways, "PoppedTxsNotReady", dbgid).log(); + } - Version maxPopped = 1; - for (auto& it : poppedFutures) { - for (auto& v : it) { - if (v.isReady()) { - maxPopped = std::max(maxPopped, v.get()); - } + Version maxPopped = 1; + for (auto& it : poppedFutures) { + for (auto& v : it) { + if (v.isReady()) { + maxPopped = std::max(maxPopped, v.get()); } } - return maxPopped; } + return maxPopped; +} - Future getTxsPoppedVersion() final { return getPoppedTxs(this); } +Future TagPartitionedLogSystem::getTxsPoppedVersion() { + return getPoppedTxs(this); +} - ACTOR static Future confirmEpochLive_internal(Reference logSet, Optional debugID) { - state vector> alive; - int numPresent = 0; - for (auto& t : logSet->logServers) { - if (t->get().present()) { - alive.push_back(brokenPromiseToNever(t->get().interf().confirmRunning.getReply( - TLogConfirmRunningRequest(debugID), TaskPriority::TLogConfirmRunningReply))); - numPresent++; - } else { - alive.push_back(Never()); - } +ACTOR Future TagPartitionedLogSystem::confirmEpochLive_internal(Reference logSet, Optional debugID) { + state vector> alive; + int numPresent = 0; + for (auto& t : logSet->logServers) { + if (t->get().present()) { + alive.push_back(brokenPromiseToNever(t->get().interf().confirmRunning.getReply( + TLogConfirmRunningRequest(debugID), TaskPriority::TLogConfirmRunningReply))); + numPresent++; + } else { + 
alive.push_back(Never()); } + } - wait(quorum(alive, std::min(logSet->tLogReplicationFactor, numPresent - logSet->tLogWriteAntiQuorum))); + wait(quorum(alive, std::min(logSet->tLogReplicationFactor, numPresent - logSet->tLogWriteAntiQuorum))); - state std::vector aliveEntries; - state std::vector responded(alive.size(), false); - loop { - for (int i = 0; i < alive.size(); i++) { - if (!responded[i] && alive[i].isReady() && !alive[i].isError()) { - aliveEntries.push_back(logSet->logEntryArray[i]); - responded[i] = true; - } - } - - if (logSet->satisfiesPolicy(aliveEntries)) { - return Void(); + state std::vector aliveEntries; + state std::vector responded(alive.size(), false); + loop { + for (int i = 0; i < alive.size(); i++) { + if (!responded[i] && alive[i].isReady() && !alive[i].isError()) { + aliveEntries.push_back(logSet->logEntryArray[i]); + responded[i] = true; } + } - // The current set of responders that we have weren't enough to form a quorum, so we must - // wait for more responses and try again. - std::vector> changes; - for (int i = 0; i < alive.size(); i++) { - if (!alive[i].isReady()) { - changes.push_back(ready(alive[i])); - } else if (alive[i].isReady() && alive[i].isError() && - alive[i].getError().code() == error_code_tlog_stopped) { - // All commits must go to all TLogs. If any TLog is stopped, then our epoch has ended. - return Never(); - } - } - ASSERT(changes.size() != 0); - wait(waitForAny(changes)); + if (logSet->satisfiesPolicy(aliveEntries)) { + return Void(); } - } - // Returns success after confirming that pushes in the current epoch are still possible - Future confirmEpochLive(Optional debugID) final { - vector> quorumResults; - for (auto& it : tLogs) { - if (it->isLocal && it->logServers.size()) { - quorumResults.push_back(confirmEpochLive_internal(it, debugID)); + // The current set of responders that we have weren't enough to form a quorum, so we must + // wait for more responses and try again. 
+ std::vector> changes; + for (int i = 0; i < alive.size(); i++) { + if (!alive[i].isReady()) { + changes.push_back(ready(alive[i])); + } else if (alive[i].isReady() && alive[i].isError() && + alive[i].getError().code() == error_code_tlog_stopped) { + // All commits must go to all TLogs. If any TLog is stopped, then our epoch has ended. + return Never(); } } - - return waitForAll(quorumResults); + ASSERT(changes.size() != 0); + wait(waitForAny(changes)); } +} - Future endEpoch() final { - std::vector> lockResults; - for (auto& logSet : tLogs) { - for (auto& log : logSet->logServers) { - lockResults.push_back(success(lockTLog(dbgid, log))); - } +Future TagPartitionedLogSystem::confirmEpochLive(Optional debugID) { + vector> quorumResults; + for (auto& it : tLogs) { + if (it->isLocal && it->logServers.size()) { + quorumResults.push_back(confirmEpochLive_internal(it, debugID)); } - return waitForAll(lockResults); } - // Call only after end_epoch() has successfully completed. Returns a new epoch immediately following this one. - // The new epoch is only provisional until the caller updates the coordinated DBCoreState. 
- Future> newEpoch(RecruitFromConfigurationReply const& recr, - Future const& fRemoteWorkers, - DatabaseConfiguration const& config, - LogEpoch recoveryCount, - int8_t primaryLocality, - int8_t remoteLocality, - std::vector const& allTags, - Reference> const& recruitmentStalled) final { - return newEpoch(Reference::addRef(this), - recr, - fRemoteWorkers, - config, - recoveryCount, - primaryLocality, - remoteLocality, - allTags, - recruitmentStalled); - } + return waitForAll(quorumResults); +} - LogSystemConfig getLogSystemConfig() const final { - LogSystemConfig logSystemConfig(epoch); - logSystemConfig.logSystemType = logSystemType; - logSystemConfig.expectedLogSets = expectedLogSets; - logSystemConfig.logRouterTags = logRouterTags; - logSystemConfig.txsTags = txsTags; - logSystemConfig.recruitmentID = recruitmentID; - logSystemConfig.stopped = stopped; - logSystemConfig.recoveredAt = recoveredAt; - logSystemConfig.pseudoLocalities = pseudoLocalities; - logSystemConfig.oldestBackupEpoch = oldestBackupEpoch; - for (const Reference& logSet : tLogs) { - if (logSet->isLocal || remoteLogsWrittenToCoreState) { - logSystemConfig.tLogs.emplace_back(*logSet); - } +Future TagPartitionedLogSystem::endEpoch() { + std::vector> lockResults; + for (auto& logSet : tLogs) { + for (auto& log : logSet->logServers) { + lockResults.push_back(success(lockTLog(dbgid, log))); } + } + return waitForAll(lockResults); +} - if (!recoveryCompleteWrittenToCoreState.get()) { - for (const auto& oldData : oldLogData) { - logSystemConfig.oldTLogs.emplace_back(oldData); - } +Future> TagPartitionedLogSystem::newEpoch( + RecruitFromConfigurationReply const& recr, + Future const& fRemoteWorkers, + DatabaseConfiguration const& config, + LogEpoch recoveryCount, + int8_t primaryLocality, + int8_t remoteLocality, + std::vector const& allTags, + Reference> const& recruitmentStalled) { + return newEpoch(Reference::addRef(this), + recr, + fRemoteWorkers, + config, + recoveryCount, + primaryLocality, + 
remoteLocality, + allTags, + recruitmentStalled); +} + +LogSystemConfig TagPartitionedLogSystem::getLogSystemConfig() const { + LogSystemConfig logSystemConfig(epoch); + logSystemConfig.logSystemType = logSystemType; + logSystemConfig.expectedLogSets = expectedLogSets; + logSystemConfig.logRouterTags = logRouterTags; + logSystemConfig.txsTags = txsTags; + logSystemConfig.recruitmentID = recruitmentID; + logSystemConfig.stopped = stopped; + logSystemConfig.recoveredAt = recoveredAt; + logSystemConfig.pseudoLocalities = pseudoLocalities; + logSystemConfig.oldestBackupEpoch = oldestBackupEpoch; + for (const Reference& logSet : tLogs) { + if (logSet->isLocal || remoteLogsWrittenToCoreState) { + logSystemConfig.tLogs.emplace_back(*logSet); } - return logSystemConfig; } - Standalone getLogsValue() const final { - vector> logs; - vector> oldLogs; - for (auto& t : tLogs) { - if (t->isLocal || remoteLogsWrittenToCoreState) { - for (int i = 0; i < t->logServers.size(); i++) { - logs.emplace_back(t->logServers[i]->get().id(), - t->logServers[i]->get().present() ? t->logServers[i]->get().interf().address() - : NetworkAddress()); - } - } - } - if (!recoveryCompleteWrittenToCoreState.get()) { - for (int i = 0; i < oldLogData.size(); i++) { - for (auto& t : oldLogData[i].tLogs) { - for (int j = 0; j < t->logServers.size(); j++) { - oldLogs.emplace_back(t->logServers[j]->get().id(), - t->logServers[j]->get().present() - ? 
t->logServers[j]->get().interf().address() - : NetworkAddress()); - } - } - } + if (!recoveryCompleteWrittenToCoreState.get()) { + for (const auto& oldData : oldLogData) { + logSystemConfig.oldTLogs.emplace_back(oldData); } - return logsValue(logs, oldLogs); } + return logSystemConfig; +} - Future onLogSystemConfigChange() final { - std::vector> changes; - changes.push_back(logSystemConfigChanged.onTrigger()); - for (auto& t : tLogs) { +Standalone TagPartitionedLogSystem::getLogsValue() const { + vector> logs; + vector> oldLogs; + for (auto& t : tLogs) { + if (t->isLocal || remoteLogsWrittenToCoreState) { for (int i = 0; i < t->logServers.size(); i++) { - changes.push_back(t->logServers[i]->onChange()); + logs.emplace_back(t->logServers[i]->get().id(), + t->logServers[i]->get().present() ? t->logServers[i]->get().interf().address() + : NetworkAddress()); } } + } + if (!recoveryCompleteWrittenToCoreState.get()) { for (int i = 0; i < oldLogData.size(); i++) { for (auto& t : oldLogData[i].tLogs) { for (int j = 0; j < t->logServers.size(); j++) { - changes.push_back(t->logServers[j]->onChange()); + oldLogs.emplace_back(t->logServers[j]->get().id(), + t->logServers[j]->get().present() ? 
t->logServers[j]->get().interf().address() + : NetworkAddress()); } } } + } + return logsValue(logs, oldLogs); +} - if (hasRemoteServers && !remoteRecovery.isReady()) { - changes.push_back(remoteRecovery); +Future TagPartitionedLogSystem::onLogSystemConfigChange() { + std::vector> changes; + changes.push_back(logSystemConfigChanged.onTrigger()); + for (auto& t : tLogs) { + for (int i = 0; i < t->logServers.size(); i++) { + changes.push_back(t->logServers[i]->onChange()); } - - return waitForAny(changes); } - - Version getEnd() const final { - ASSERT(recoverAt.present()); - return recoverAt.get() + 1; + for (int i = 0; i < oldLogData.size(); i++) { + for (auto& t : oldLogData[i].tLogs) { + for (int j = 0; j < t->logServers.size(); j++) { + changes.push_back(t->logServers[j]->onChange()); + } + } } - Version getPeekEnd() const { - if (recoverAt.present()) - return getEnd(); - else - return std::numeric_limits::max(); + if (hasRemoteServers && !remoteRecovery.isReady()) { + changes.push_back(remoteRecovery); } - void getPushLocations(VectorRef tags, std::vector& locations, bool allLocations) const final { - int locationOffset = 0; - for (auto& log : tLogs) { - if (log->isLocal && log->logServers.size()) { - log->getPushLocations(tags, locations, locationOffset, allLocations); - locationOffset += log->logServers.size(); - } + return waitForAny(changes); +} + +Version TagPartitionedLogSystem::getEnd() const { + ASSERT(recoverAt.present()); + return recoverAt.get() + 1; +} + +Version TagPartitionedLogSystem::getPeekEnd() const { + if (recoverAt.present()) + return getEnd(); + else + return std::numeric_limits::max(); +} + +void TagPartitionedLogSystem::getPushLocations(VectorRef tags, + std::vector& locations, + bool allLocations) const { + int locationOffset = 0; + for (auto& log : tLogs) { + if (log->isLocal && log->logServers.size()) { + log->getPushLocations(tags, locations, locationOffset, allLocations); + locationOffset += log->logServers.size(); } } +} + +bool 
TagPartitionedLogSystem::hasRemoteLogs() const { + return logRouterTags > 0 || pseudoLocalities.size() > 0; +} - bool hasRemoteLogs() const final { return logRouterTags > 0 || pseudoLocalities.size() > 0; } +Tag TagPartitionedLogSystem::getRandomRouterTag() const { + return Tag(tagLocalityLogRouter, deterministicRandom()->randomInt(0, logRouterTags)); +} - Tag getRandomRouterTag() const final { - return Tag(tagLocalityLogRouter, deterministicRandom()->randomInt(0, logRouterTags)); - } +Tag TagPartitionedLogSystem::getRandomTxsTag() const { + return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, txsTags)); +} - Tag getRandomTxsTag() const final { return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, txsTags)); } +TLogVersion TagPartitionedLogSystem::getTLogVersion() const { + return tLogs[0]->tLogVersion; +} + +int TagPartitionedLogSystem::getLogRouterTags() const { + return logRouterTags; +} - TLogVersion getTLogVersion() const final { return tLogs[0]->tLogVersion; } +Version TagPartitionedLogSystem::getBackupStartVersion() const { + ASSERT(tLogs.size() > 0); + return backupStartVersion; +} - int getLogRouterTags() const final { return logRouterTags; } +std::map TagPartitionedLogSystem::getOldEpochTagsVersionsInfo() const { + std::map epochInfos; + for (const auto& old : oldLogData) { + epochInfos.insert( + { old.epoch, ILogSystem::EpochTagsVersionsInfo(old.logRouterTags, old.epochBegin, old.epochEnd) }); + TraceEvent("OldEpochTagsVersions", dbgid) + .detail("Epoch", old.epoch) + .detail("Tags", old.logRouterTags) + .detail("BeginVersion", old.epochBegin) + .detail("EndVersion", old.epochEnd); + } + return epochInfos; +} - Version getBackupStartVersion() const final { - ASSERT(tLogs.size() > 0); - return backupStartVersion; +inline Reference TagPartitionedLogSystem::getEpochLogSet(LogEpoch epoch) const { + for (const auto& old : oldLogData) { + if (epoch == old.epoch) + return old.tLogs[0]; } + return Reference(nullptr); +} - std::map 
getOldEpochTagsVersionsInfo() const final { - std::map epochInfos; - for (const auto& old : oldLogData) { - epochInfos.insert( - { old.epoch, ILogSystem::EpochTagsVersionsInfo(old.logRouterTags, old.epochBegin, old.epochEnd) }); - TraceEvent("OldEpochTagsVersions", dbgid) - .detail("Epoch", old.epoch) - .detail("Tags", old.logRouterTags) - .detail("BeginVersion", old.epochBegin) - .detail("EndVersion", old.epochEnd); +void TagPartitionedLogSystem::setBackupWorkers(const std::vector& replies) { + ASSERT(tLogs.size() > 0); + + Reference logset = tLogs[0]; // Master recruits this epoch's worker first. + LogEpoch logsetEpoch = this->epoch; + oldestBackupEpoch = this->epoch; + for (const auto& reply : replies) { + if (removedBackupWorkers.count(reply.interf.id()) > 0) { + removedBackupWorkers.erase(reply.interf.id()); + continue; + } + auto worker = makeReference>>( + OptionalInterface(reply.interf)); + if (reply.backupEpoch != logsetEpoch) { + // find the logset from oldLogData + logsetEpoch = reply.backupEpoch; + oldestBackupEpoch = std::min(oldestBackupEpoch, logsetEpoch); + logset = getEpochLogSet(logsetEpoch); + ASSERT(logset.isValid()); + } + logset->backupWorkers.push_back(worker); + TraceEvent("AddBackupWorker", dbgid).detail("Epoch", logsetEpoch).detail("BackupWorkerID", reply.interf.id()); + } + TraceEvent("SetOldestBackupEpoch", dbgid).detail("Epoch", oldestBackupEpoch); + backupWorkerChanged.trigger(); +} + +bool TagPartitionedLogSystem::removeBackupWorker(const BackupWorkerDoneRequest& req) { + bool removed = false; + Reference logset = getEpochLogSet(req.backupEpoch); + if (logset.isValid()) { + for (auto it = logset->backupWorkers.begin(); it != logset->backupWorkers.end(); it++) { + if (it->getPtr()->get().interf().id() == req.workerUID) { + logset->backupWorkers.erase(it); + removed = true; + break; + } } - return epochInfos; } - inline Reference getEpochLogSet(LogEpoch epoch) const { + if (removed) { + oldestBackupEpoch = epoch; for (const auto& old : 
oldLogData) { - if (epoch == old.epoch) - return old.tLogs[0]; + if (old.epoch < oldestBackupEpoch && old.tLogs[0]->backupWorkers.size() > 0) { + oldestBackupEpoch = old.epoch; + } } - return Reference(nullptr); + backupWorkerChanged.trigger(); + } else { + removedBackupWorkers.insert(req.workerUID); } - void setBackupWorkers(const std::vector& replies) final { - ASSERT(tLogs.size() > 0); + TraceEvent("RemoveBackupWorker", dbgid) + .detail("Removed", removed) + .detail("BackupEpoch", req.backupEpoch) + .detail("WorkerID", req.workerUID) + .detail("OldestBackupEpoch", oldestBackupEpoch); + return removed; +} - Reference logset = tLogs[0]; // Master recruits this epoch's worker first. - LogEpoch logsetEpoch = this->epoch; - oldestBackupEpoch = this->epoch; - for (const auto& reply : replies) { - if (removedBackupWorkers.count(reply.interf.id()) > 0) { - removedBackupWorkers.erase(reply.interf.id()); - continue; - } - auto worker = makeReference>>( - OptionalInterface(reply.interf)); - if (reply.backupEpoch != logsetEpoch) { - // find the logset from oldLogData - logsetEpoch = reply.backupEpoch; - oldestBackupEpoch = std::min(oldestBackupEpoch, logsetEpoch); - logset = getEpochLogSet(logsetEpoch); - ASSERT(logset.isValid()); - } - logset->backupWorkers.push_back(worker); - TraceEvent("AddBackupWorker", dbgid) - .detail("Epoch", logsetEpoch) - .detail("BackupWorkerID", reply.interf.id()); - } - TraceEvent("SetOldestBackupEpoch", dbgid).detail("Epoch", oldestBackupEpoch); - backupWorkerChanged.trigger(); - } +LogEpoch TagPartitionedLogSystem::getOldestBackupEpoch() const { + return oldestBackupEpoch; +} - bool removeBackupWorker(const BackupWorkerDoneRequest& req) final { - bool removed = false; - Reference logset = getEpochLogSet(req.backupEpoch); - if (logset.isValid()) { - for (auto it = logset->backupWorkers.begin(); it != logset->backupWorkers.end(); it++) { - if (it->getPtr()->get().interf().id() == req.workerUID) { - logset->backupWorkers.erase(it); - removed = 
true; - break; - } - } - } +void TagPartitionedLogSystem::setOldestBackupEpoch(LogEpoch epoch) { + oldestBackupEpoch = epoch; + backupWorkerChanged.trigger(); +} - if (removed) { - oldestBackupEpoch = epoch; - for (const auto& old : oldLogData) { - if (old.epoch < oldestBackupEpoch && old.tLogs[0]->backupWorkers.size() > 0) { - oldestBackupEpoch = old.epoch; - } - } - backupWorkerChanged.trigger(); +ACTOR Future TagPartitionedLogSystem::monitorLog(Reference>> logServer, + Reference> failed) { + state Future waitFailure; + loop { + if (logServer->get().present()) + waitFailure = waitFailureTracker(logServer->get().interf().waitFailure, failed); + else + failed->set(true); + wait(logServer->onChange()); + } +} + +Optional> TagPartitionedLogSystem::getDurableVersion( + UID dbgid, + LogLockInfo lockInfo, + std::vector>> failed, + Optional lastEnd) { + + Reference logSet = lockInfo.logSet; + // To ensure consistent recovery, the number of servers NOT in the write quorum plus the number of servers NOT + // in the read quorum have to be strictly less than the replication factor. Otherwise there could be a replica + // set consistent entirely of servers that are out of date due to not being in the write quorum or unavailable + // due to not being in the read quorum. So with N = # of tlogs, W = antiquorum, R = required count, F = + // replication factor, W + (N - R) < F, and optimally (N-W)+(N-R)=F-1. Thus R=N+1-F+W. 
+ int requiredCount = + (int)logSet->logServers.size() + 1 - logSet->tLogReplicationFactor + logSet->tLogWriteAntiQuorum; + ASSERT(requiredCount > 0 && requiredCount <= logSet->logServers.size()); + ASSERT(logSet->tLogReplicationFactor >= 1 && logSet->tLogReplicationFactor <= logSet->logServers.size()); + ASSERT(logSet->tLogWriteAntiQuorum >= 0 && logSet->tLogWriteAntiQuorum < logSet->logServers.size()); + + std::vector availableItems, badCombo; + std::vector results; + std::string sServerState; + LocalityGroup unResponsiveSet; + + for (int t = 0; t < logSet->logServers.size(); t++) { + if (lockInfo.replies[t].isReady() && !lockInfo.replies[t].isError() && (!failed.size() || !failed[t]->get())) { + results.push_back(lockInfo.replies[t].get()); + availableItems.push_back(logSet->tLogLocalities[t]); + sServerState += 'a'; } else { - removedBackupWorkers.insert(req.workerUID); + unResponsiveSet.add(logSet->tLogLocalities[t]); + sServerState += 'f'; } - - TraceEvent("RemoveBackupWorker", dbgid) - .detail("Removed", removed) - .detail("BackupEpoch", req.backupEpoch) - .detail("WorkerID", req.workerUID) - .detail("OldestBackupEpoch", oldestBackupEpoch); - return removed; } - LogEpoch getOldestBackupEpoch() const final { return oldestBackupEpoch; } + // Check if the list of results is not larger than the anti quorum + bool bTooManyFailures = (results.size() <= logSet->tLogWriteAntiQuorum); - void setOldestBackupEpoch(LogEpoch epoch) final { - oldestBackupEpoch = epoch; - backupWorkerChanged.trigger(); + // Check if failed logs complete the policy + bTooManyFailures = bTooManyFailures || ((unResponsiveSet.size() >= logSet->tLogReplicationFactor) && + (unResponsiveSet.validate(logSet->tLogPolicy))); + + // Check all combinations of the AntiQuorum within the failed + if (!bTooManyFailures && (logSet->tLogWriteAntiQuorum) && + (!validateAllCombinations( + badCombo, unResponsiveSet, logSet->tLogPolicy, availableItems, logSet->tLogWriteAntiQuorum, false))) { + 
TraceEvent("EpochEndBadCombo", dbgid) + .detail("Required", requiredCount) + .detail("Present", results.size()) + .detail("ServerState", sServerState); + bTooManyFailures = true; } - ACTOR static Future monitorLog(Reference>> logServer, - Reference> failed) { - state Future waitFailure; - loop { - if (logServer->get().present()) - waitFailure = waitFailureTracker(logServer->get().interf().waitFailure, failed); - else - failed->set(true); - wait(logServer->onChange()); - } - } - - Optional> static getDurableVersion( - UID dbgid, - LogLockInfo lockInfo, - std::vector>> failed = std::vector>>(), - Optional lastEnd = Optional()) { - Reference logSet = lockInfo.logSet; - // To ensure consistent recovery, the number of servers NOT in the write quorum plus the number of servers NOT - // in the read quorum have to be strictly less than the replication factor. Otherwise there could be a replica - // set consistent entirely of servers that are out of date due to not being in the write quorum or unavailable - // due to not being in the read quorum. So with N = # of tlogs, W = antiquorum, R = required count, F = - // replication factor, W + (N - R) < F, and optimally (N-W)+(N-R)=F-1. Thus R=N+1-F+W. 
- int requiredCount = - (int)logSet->logServers.size() + 1 - logSet->tLogReplicationFactor + logSet->tLogWriteAntiQuorum; - ASSERT(requiredCount > 0 && requiredCount <= logSet->logServers.size()); - ASSERT(logSet->tLogReplicationFactor >= 1 && logSet->tLogReplicationFactor <= logSet->logServers.size()); - ASSERT(logSet->tLogWriteAntiQuorum >= 0 && logSet->tLogWriteAntiQuorum < logSet->logServers.size()); - - std::vector availableItems, badCombo; - std::vector results; - std::string sServerState; - LocalityGroup unResponsiveSet; - - for (int t = 0; t < logSet->logServers.size(); t++) { - if (lockInfo.replies[t].isReady() && !lockInfo.replies[t].isError() && - (!failed.size() || !failed[t]->get())) { - results.push_back(lockInfo.replies[t].get()); - availableItems.push_back(logSet->tLogLocalities[t]); - sServerState += 'a'; - } else { - unResponsiveSet.add(logSet->tLogLocalities[t]); - sServerState += 'f'; - } - } + ASSERT(logSet->logServers.size() == lockInfo.replies.size()); + if (!bTooManyFailures) { + std::sort(results.begin(), results.end(), [](const TLogLockResult& a, const TLogLockResult& b) -> bool { + return a.end < b.end; + }); + int absent = logSet->logServers.size() - results.size(); + int safe_range_begin = logSet->tLogWriteAntiQuorum; + int new_safe_range_begin = std::min(logSet->tLogWriteAntiQuorum, (int)(results.size() - 1)); + int safe_range_end = logSet->tLogReplicationFactor - absent; - // Check if the list of results is not larger than the anti quorum - bool bTooManyFailures = (results.size() <= logSet->tLogWriteAntiQuorum); + if (!lastEnd.present() || ((safe_range_end > 0) && (safe_range_end - 1 < results.size()) && + results[safe_range_end - 1].end < lastEnd.get())) { + Version knownCommittedVersion = 0; + for (int i = 0; i < results.size(); i++) { + knownCommittedVersion = std::max(knownCommittedVersion, results[i].knownCommittedVersion); + } - // Check if failed logs complete the policy - bTooManyFailures = bTooManyFailures || 
((unResponsiveSet.size() >= logSet->tLogReplicationFactor) && - (unResponsiveSet.validate(logSet->tLogPolicy))); + if (knownCommittedVersion > results[new_safe_range_begin].end) { + knownCommittedVersion = results[new_safe_range_begin].end; + } - // Check all combinations of the AntiQuorum within the failed - if (!bTooManyFailures && (logSet->tLogWriteAntiQuorum) && - (!validateAllCombinations( - badCombo, unResponsiveSet, logSet->tLogPolicy, availableItems, logSet->tLogWriteAntiQuorum, false))) { - TraceEvent("EpochEndBadCombo", dbgid) + TraceEvent("GetDurableResult", dbgid) .detail("Required", requiredCount) .detail("Present", results.size()) - .detail("ServerState", sServerState); - bTooManyFailures = true; - } - - ASSERT(logSet->logServers.size() == lockInfo.replies.size()); - if (!bTooManyFailures) { - std::sort(results.begin(), results.end(), sort_by_end()); - int absent = logSet->logServers.size() - results.size(); - int safe_range_begin = logSet->tLogWriteAntiQuorum; - int new_safe_range_begin = std::min(logSet->tLogWriteAntiQuorum, (int)(results.size() - 1)); - int safe_range_end = logSet->tLogReplicationFactor - absent; - - if (!lastEnd.present() || ((safe_range_end > 0) && (safe_range_end - 1 < results.size()) && - results[safe_range_end - 1].end < lastEnd.get())) { - Version knownCommittedVersion = 0; - for (int i = 0; i < results.size(); i++) { - knownCommittedVersion = std::max(knownCommittedVersion, results[i].knownCommittedVersion); - } + .detail("ServerState", sServerState) + .detail("RecoveryVersion", + ((safe_range_end > 0) && (safe_range_end - 1 < results.size())) + ? 
results[safe_range_end - 1].end + : -1) + .detail("EndVersion", results[new_safe_range_begin].end) + .detail("SafeBegin", safe_range_begin) + .detail("SafeEnd", safe_range_end) + .detail("NewSafeBegin", new_safe_range_begin) + .detail("KnownCommittedVersion", knownCommittedVersion) + .detail("EpochEnd", lockInfo.epochEnd); + + return std::make_pair(knownCommittedVersion, results[new_safe_range_begin].end); + } + } + TraceEvent("GetDurableResultWaiting", dbgid) + .detail("Required", requiredCount) + .detail("Present", results.size()) + .detail("ServerState", sServerState); + return Optional>(); +} - if (knownCommittedVersion > results[new_safe_range_begin].end) { - knownCommittedVersion = results[new_safe_range_begin].end; - } +ACTOR Future TagPartitionedLogSystem::getDurableVersionChanged(LogLockInfo lockInfo, + std::vector>> failed) { + // Wait for anything relevant to change + std::vector> changes; + for (int j = 0; j < lockInfo.logSet->logServers.size(); j++) { + if (!lockInfo.replies[j].isReady()) + changes.push_back(ready(lockInfo.replies[j])); + else { + changes.push_back(lockInfo.logSet->logServers[j]->onChange()); + if (failed.size()) { + changes.push_back(failed[j]->onChange()); + } + } + } + ASSERT(changes.size()); + wait(waitForAny(changes)); + return Void(); +} - TraceEvent("GetDurableResult", dbgid) - .detail("Required", requiredCount) - .detail("Present", results.size()) - .detail("ServerState", sServerState) - .detail("RecoveryVersion", - ((safe_range_end > 0) && (safe_range_end - 1 < results.size())) - ? 
results[safe_range_end - 1].end - : -1) - .detail("EndVersion", results[new_safe_range_begin].end) - .detail("SafeBegin", safe_range_begin) - .detail("SafeEnd", safe_range_end) - .detail("NewSafeBegin", new_safe_range_begin) - .detail("KnownCommittedVersion", knownCommittedVersion) - .detail("EpochEnd", lockInfo.epochEnd); - - return std::make_pair(knownCommittedVersion, results[new_safe_range_begin].end); - } - } - TraceEvent("GetDurableResultWaiting", dbgid) - .detail("Required", requiredCount) - .detail("Present", results.size()) - .detail("ServerState", sServerState); - return Optional>(); +ACTOR Future TagPartitionedLogSystem::epochEnd(Reference>> outLogSystem, + UID dbgid, + DBCoreState prevState, + FutureStream rejoinRequests, + LocalityData locality, + bool* forceRecovery) { + // Stops a co-quorum of tlogs so that no further versions can be committed until the DBCoreState coordination + // state is changed Creates a new logSystem representing the (now frozen) epoch No other important side effects. 
+ // The writeQuorum in the master info is from the previous configuration + + if (!prevState.tLogs.size()) { + // This is a brand new database + auto logSystem = makeReference(dbgid, locality, 0); + logSystem->logSystemType = prevState.logSystemType; + logSystem->recoverAt = 0; + logSystem->knownCommittedVersion = 0; + logSystem->stopped = true; + outLogSystem->set(logSystem); + wait(Future(Never())); + throw internal_error(); } - ACTOR static Future getDurableVersionChanged( - LogLockInfo lockInfo, - std::vector>> failed = std::vector>>()) { - // Wait for anything relevant to change - std::vector> changes; - for (int j = 0; j < lockInfo.logSet->logServers.size(); j++) { - if (!lockInfo.replies[j].isReady()) - changes.push_back(ready(lockInfo.replies[j])); - else { - changes.push_back(lockInfo.logSet->logServers[j]->onChange()); - if (failed.size()) { - changes.push_back(failed[j]->onChange()); - } + if (*forceRecovery) { + DBCoreState modifiedState = prevState; + + int8_t primaryLocality = -1; + for (auto& coreSet : modifiedState.tLogs) { + if (coreSet.isLocal && coreSet.locality >= 0 && coreSet.tLogLocalities[0].dcId() != locality.dcId()) { + primaryLocality = coreSet.locality; + break; } } - ASSERT(changes.size()); - wait(waitForAny(changes)); - return Void(); - } - - ACTOR static Future epochEnd(Reference>> outLogSystem, - UID dbgid, - DBCoreState prevState, - FutureStream rejoinRequests, - LocalityData locality, - bool* forceRecovery) { - // Stops a co-quorum of tlogs so that no further versions can be committed until the DBCoreState coordination - // state is changed Creates a new logSystem representing the (now frozen) epoch No other important side effects. 
- // The writeQuorum in the master info is from the previous configuration - - if (!prevState.tLogs.size()) { - // This is a brand new database - auto logSystem = makeReference(dbgid, locality, 0); - logSystem->logSystemType = prevState.logSystemType; - logSystem->recoverAt = 0; - logSystem->knownCommittedVersion = 0; - logSystem->stopped = true; - outLogSystem->set(logSystem); - wait(Future(Never())); - throw internal_error(); - } - - if (*forceRecovery) { - DBCoreState modifiedState = prevState; - int8_t primaryLocality = -1; - for (auto& coreSet : modifiedState.tLogs) { - if (coreSet.isLocal && coreSet.locality >= 0 && coreSet.tLogLocalities[0].dcId() != locality.dcId()) { - primaryLocality = coreSet.locality; + bool foundRemote = false; + int8_t remoteLocality = -1; + int modifiedLogSets = 0; + int removedLogSets = 0; + if (primaryLocality >= 0) { + auto copiedLogs = modifiedState.tLogs; + for (auto& coreSet : copiedLogs) { + if (coreSet.locality != primaryLocality && coreSet.locality >= 0) { + foundRemote = true; + remoteLocality = coreSet.locality; + modifiedState.tLogs.clear(); + modifiedState.tLogs.push_back(coreSet); + modifiedState.tLogs[0].isLocal = true; + modifiedState.logRouterTags = 0; + modifiedLogSets++; break; } } - bool foundRemote = false; - int8_t remoteLocality = -1; - int modifiedLogSets = 0; - int removedLogSets = 0; - if (primaryLocality >= 0) { - auto copiedLogs = modifiedState.tLogs; - for (auto& coreSet : copiedLogs) { - if (coreSet.locality != primaryLocality && coreSet.locality >= 0) { + while (!foundRemote && modifiedState.oldTLogData.size()) { + for (auto& coreSet : modifiedState.oldTLogData[0].tLogs) { + if (coreSet.locality != primaryLocality && coreSet.locality >= tagLocalitySpecial) { foundRemote = true; remoteLocality = coreSet.locality; modifiedState.tLogs.clear(); modifiedState.tLogs.push_back(coreSet); modifiedState.tLogs[0].isLocal = true; modifiedState.logRouterTags = 0; + modifiedState.txsTags = 
modifiedState.oldTLogData[0].txsTags; modifiedLogSets++; break; } } - - while (!foundRemote && modifiedState.oldTLogData.size()) { - for (auto& coreSet : modifiedState.oldTLogData[0].tLogs) { - if (coreSet.locality != primaryLocality && coreSet.locality >= tagLocalitySpecial) { - foundRemote = true; - remoteLocality = coreSet.locality; - modifiedState.tLogs.clear(); - modifiedState.tLogs.push_back(coreSet); - modifiedState.tLogs[0].isLocal = true; - modifiedState.logRouterTags = 0; - modifiedState.txsTags = modifiedState.oldTLogData[0].txsTags; - modifiedLogSets++; + modifiedState.oldTLogData.erase(modifiedState.oldTLogData.begin()); + removedLogSets++; + } + + if (foundRemote) { + for (int i = 0; i < modifiedState.oldTLogData.size(); i++) { + bool found = false; + auto copiedLogs = modifiedState.oldTLogData[i].tLogs; + for (auto& coreSet : copiedLogs) { + if (coreSet.locality == remoteLocality || coreSet.locality == tagLocalitySpecial) { + found = true; + if (!coreSet.isLocal || copiedLogs.size() > 1) { + modifiedState.oldTLogData[i].tLogs.clear(); + modifiedState.oldTLogData[i].tLogs.push_back(coreSet); + modifiedState.oldTLogData[i].tLogs[0].isLocal = true; + modifiedState.oldTLogData[i].logRouterTags = 0; + modifiedState.oldTLogData[i].epochBegin = + modifiedState.oldTLogData[i].tLogs[0].startVersion; + modifiedState.oldTLogData[i].epochEnd = + (i == 0 ? 
modifiedState.tLogs[0].startVersion + : modifiedState.oldTLogData[i - 1].tLogs[0].startVersion); + modifiedLogSets++; + } break; } } - modifiedState.oldTLogData.erase(modifiedState.oldTLogData.begin()); - removedLogSets++; - } - - if (foundRemote) { - for (int i = 0; i < modifiedState.oldTLogData.size(); i++) { - bool found = false; - auto copiedLogs = modifiedState.oldTLogData[i].tLogs; - for (auto& coreSet : copiedLogs) { - if (coreSet.locality == remoteLocality || coreSet.locality == tagLocalitySpecial) { - found = true; - if (!coreSet.isLocal || copiedLogs.size() > 1) { - modifiedState.oldTLogData[i].tLogs.clear(); - modifiedState.oldTLogData[i].tLogs.push_back(coreSet); - modifiedState.oldTLogData[i].tLogs[0].isLocal = true; - modifiedState.oldTLogData[i].logRouterTags = 0; - modifiedState.oldTLogData[i].epochBegin = - modifiedState.oldTLogData[i].tLogs[0].startVersion; - modifiedState.oldTLogData[i].epochEnd = - (i == 0 ? modifiedState.tLogs[0].startVersion - : modifiedState.oldTLogData[i - 1].tLogs[0].startVersion); - modifiedLogSets++; - } - break; - } - } - if (!found) { - modifiedState.oldTLogData.erase(modifiedState.oldTLogData.begin() + i); - removedLogSets++; - i--; - } + if (!found) { + modifiedState.oldTLogData.erase(modifiedState.oldTLogData.begin() + i); + removedLogSets++; + i--; } - prevState = modifiedState; - } else { - *forceRecovery = false; } + prevState = modifiedState; } else { *forceRecovery = false; } - TraceEvent(SevWarnAlways, "ForcedRecovery", dbgid) - .detail("PrimaryLocality", primaryLocality) - .detail("RemoteLocality", remoteLocality) - .detail("FoundRemote", foundRemote) - .detail("Modified", modifiedLogSets) - .detail("Removed", removedLogSets); - for (int i = 0; i < prevState.tLogs.size(); i++) { + } else { + *forceRecovery = false; + } + TraceEvent(SevWarnAlways, "ForcedRecovery", dbgid) + .detail("PrimaryLocality", primaryLocality) + .detail("RemoteLocality", remoteLocality) + .detail("FoundRemote", foundRemote) + 
.detail("Modified", modifiedLogSets) + .detail("Removed", removedLogSets); + for (int i = 0; i < prevState.tLogs.size(); i++) { + TraceEvent("ForcedRecoveryTLogs", dbgid) + .detail("I", i) + .detail("Log", ::describe(prevState.tLogs[i].tLogs)) + .detail("Loc", prevState.tLogs[i].locality) + .detail("Txs", prevState.txsTags); + } + for (int i = 0; i < prevState.oldTLogData.size(); i++) { + for (int j = 0; j < prevState.oldTLogData[i].tLogs.size(); j++) { TraceEvent("ForcedRecoveryTLogs", dbgid) .detail("I", i) - .detail("Log", ::describe(prevState.tLogs[i].tLogs)) - .detail("Loc", prevState.tLogs[i].locality) - .detail("Txs", prevState.txsTags); - } - for (int i = 0; i < prevState.oldTLogData.size(); i++) { - for (int j = 0; j < prevState.oldTLogData[i].tLogs.size(); j++) { - TraceEvent("ForcedRecoveryTLogs", dbgid) - .detail("I", i) - .detail("J", j) - .detail("Log", ::describe(prevState.oldTLogData[i].tLogs[j].tLogs)) - .detail("Loc", prevState.oldTLogData[i].tLogs[j].locality) - .detail("Txs", prevState.oldTLogData[i].txsTags); + .detail("J", j) + .detail("Log", ::describe(prevState.oldTLogData[i].tLogs[j].tLogs)) + .detail("Loc", prevState.oldTLogData[i].tLogs[j].locality) + .detail("Txs", prevState.oldTLogData[i].txsTags); + } + } + } + + TEST(true); // Master recovery from pre-existing database + + // trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their + // TLogInterfaces + state std::vector lockResults; + state std::vector>>, Reference>> + allLogServers; + state std::vector> logServers; + state std::vector oldLogData; + state std::vector>>> logFailed; + state std::vector> failureTrackers; + + for (const CoreTLogSet& coreSet : prevState.tLogs) { + logServers.push_back(makeReference(coreSet)); + std::vector>> failed; + + for (const auto& logVar : logServers.back()->logServers) { + allLogServers.emplace_back(logVar, coreSet.tLogPolicy); + failed.push_back(makeReference>()); + 
failureTrackers.push_back(TagPartitionedLogSystem::monitorLog(logVar, failed.back())); + } + logFailed.push_back(failed); + } + + for (const auto& oldTlogData : prevState.oldTLogData) { + oldLogData.emplace_back(oldTlogData); + + for (const auto& logSet : oldLogData.back().tLogs) { + for (const auto& logVar : logSet->logServers) { + allLogServers.emplace_back(logVar, logSet->tLogPolicy); + } + } + } + state Future rejoins = TagPartitionedLogSystem::trackRejoins(dbgid, allLogServers, rejoinRequests); + + lockResults.resize(logServers.size()); + std::set lockedLocalities; + bool foundSpecial = false; + for (int i = 0; i < logServers.size(); i++) { + if (logServers[i]->locality == tagLocalitySpecial || logServers[i]->locality == tagLocalityUpgraded) { + foundSpecial = true; + } + lockedLocalities.insert(logServers[i]->locality); + lockResults[i].isCurrent = true; + lockResults[i].logSet = logServers[i]; + for (int t = 0; t < logServers[i]->logServers.size(); t++) { + lockResults[i].replies.push_back(TagPartitionedLogSystem::lockTLog(dbgid, logServers[i]->logServers[t])); + } + } + + for (auto& old : oldLogData) { + if (foundSpecial) { + break; + } + for (auto& log : old.tLogs) { + if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { + foundSpecial = true; + break; + } + if (!lockedLocalities.count(log->locality)) { + TraceEvent("EpochEndLockExtra").detail("Locality", log->locality); + TEST(true); // locking old generations for version information + lockedLocalities.insert(log->locality); + LogLockInfo lockResult; + lockResult.epochEnd = old.epochEnd; + lockResult.logSet = log; + for (int t = 0; t < log->logServers.size(); t++) { + lockResult.replies.push_back(TagPartitionedLogSystem::lockTLog(dbgid, log->logServers[t])); } + lockResults.push_back(lockResult); } } + } - TEST(true); // Master recovery from pre-existing database - - // trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their - // 
TLogInterfaces - state std::vector lockResults; - state - std::vector>>, Reference>> - allLogServers; - state std::vector> logServers; - state std::vector oldLogData; - state std::vector>>> logFailed; - state std::vector> failureTrackers; - - for (const CoreTLogSet& coreSet : prevState.tLogs) { - logServers.push_back(makeReference(coreSet)); - std::vector>> failed; - - for (const auto& logVar : logServers.back()->logServers) { - allLogServers.emplace_back(logVar, coreSet.tLogPolicy); - failed.push_back(makeReference>()); - failureTrackers.push_back(monitorLog(logVar, failed.back())); + if (*forceRecovery) { + state std::vector allLockResults; + ASSERT(lockResults.size() == 1); + allLockResults.push_back(lockResults[0]); + for (auto& old : oldLogData) { + ASSERT(old.tLogs.size() == 1); + LogLockInfo lockResult; + lockResult.epochEnd = old.epochEnd; + lockResult.logSet = old.tLogs[0]; + for (int t = 0; t < old.tLogs[0]->logServers.size(); t++) { + lockResult.replies.push_back(TagPartitionedLogSystem::lockTLog(dbgid, old.tLogs[0]->logServers[t])); } - logFailed.push_back(failed); + allLockResults.push_back(lockResult); } - for (const auto& oldTlogData : prevState.oldTLogData) { - oldLogData.emplace_back(oldTlogData); - - for (const auto& logSet : oldLogData.back().tLogs) { - for (const auto& logVar : logSet->logServers) { - allLogServers.emplace_back(logVar, logSet->tLogPolicy); - } + state int lockNum = 0; + state Version maxRecoveryVersion = 0; + state int maxRecoveryIndex = 0; + while (lockNum < allLockResults.size()) { + auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, allLockResults[lockNum]); + if (versions.present()) { + if (versions.get().second > maxRecoveryVersion) { + TraceEvent("HigherRecoveryVersion", dbgid) + .detail("Idx", lockNum) + .detail("Ver", versions.get().second); + maxRecoveryVersion = versions.get().second; + maxRecoveryIndex = lockNum; + } + lockNum++; + } else { + 
wait(TagPartitionedLogSystem::getDurableVersionChanged(allLockResults[lockNum])); } } - state Future rejoins = trackRejoins(dbgid, allLogServers, rejoinRequests); + if (maxRecoveryIndex > 0) { + logServers = oldLogData[maxRecoveryIndex - 1].tLogs; + prevState.txsTags = oldLogData[maxRecoveryIndex - 1].txsTags; + lockResults[0] = allLockResults[maxRecoveryIndex]; + lockResults[0].isCurrent = true; - lockResults.resize(logServers.size()); - std::set lockedLocalities; - bool foundSpecial = false; - for (int i = 0; i < logServers.size(); i++) { - if (logServers[i]->locality == tagLocalitySpecial || logServers[i]->locality == tagLocalityUpgraded) { - foundSpecial = true; - } - lockedLocalities.insert(logServers[i]->locality); - lockResults[i].isCurrent = true; - lockResults[i].logSet = logServers[i]; - for (int t = 0; t < logServers[i]->logServers.size(); t++) { - lockResults[i].replies.push_back(lockTLog(dbgid, logServers[i]->logServers[t])); + std::vector>> failed; + for (auto& log : logServers[0]->logServers) { + failed.push_back(makeReference>()); + failureTrackers.push_back(TagPartitionedLogSystem::monitorLog(log, failed.back())); } + ASSERT(logFailed.size() == 1); + logFailed[0] = failed; + oldLogData.erase(oldLogData.begin(), oldLogData.begin() + maxRecoveryIndex); } + } - for (auto& old : oldLogData) { - if (foundSpecial) { - break; + state Optional lastEnd; + state Version knownCommittedVersion = 0; + loop { + Version minEnd = std::numeric_limits::max(); + Version maxEnd = 0; + std::vector> changes; + for (int log = 0; log < logServers.size(); log++) { + if (!logServers[log]->isLocal) { + continue; } - for (auto& log : old.tLogs) { - if (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded) { - foundSpecial = true; - break; - } - if (!lockedLocalities.count(log->locality)) { - TraceEvent("EpochEndLockExtra").detail("Locality", log->locality); - TEST(true); // locking old generations for version information - 
lockedLocalities.insert(log->locality); - LogLockInfo lockResult; - lockResult.epochEnd = old.epochEnd; - lockResult.logSet = log; - for (int t = 0; t < log->logServers.size(); t++) { - lockResult.replies.push_back(lockTLog(dbgid, log->logServers[t])); - } - lockResults.push_back(lockResult); - } + auto versions = + TagPartitionedLogSystem::getDurableVersion(dbgid, lockResults[log], logFailed[log], lastEnd); + if (versions.present()) { + knownCommittedVersion = std::max(knownCommittedVersion, versions.get().first); + maxEnd = std::max(maxEnd, versions.get().second); + minEnd = std::min(minEnd, versions.get().second); } + changes.push_back(TagPartitionedLogSystem::getDurableVersionChanged(lockResults[log], logFailed[log])); } - if (*forceRecovery) { - state std::vector allLockResults; - ASSERT(lockResults.size() == 1); - allLockResults.push_back(lockResults[0]); - for (auto& old : oldLogData) { - ASSERT(old.tLogs.size() == 1); - LogLockInfo lockResult; - lockResult.epochEnd = old.epochEnd; - lockResult.logSet = old.tLogs[0]; - for (int t = 0; t < old.tLogs[0]->logServers.size(); t++) { - lockResult.replies.push_back(lockTLog(dbgid, old.tLogs[0]->logServers[t])); - } - allLockResults.push_back(lockResult); - } + if (maxEnd > 0 && (!lastEnd.present() || maxEnd < lastEnd.get())) { + TEST(lastEnd.present()); // Restarting recovery at an earlier point - state int lockNum = 0; - state Version maxRecoveryVersion = 0; - state int maxRecoveryIndex = 0; - while (lockNum < allLockResults.size()) { - auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, allLockResults[lockNum]); - if (versions.present()) { - if (versions.get().second > maxRecoveryVersion) { - TraceEvent("HigherRecoveryVersion", dbgid) - .detail("Idx", lockNum) - .detail("Ver", versions.get().second); - maxRecoveryVersion = versions.get().second; - maxRecoveryIndex = lockNum; - } - lockNum++; - } else { - wait(TagPartitionedLogSystem::getDurableVersionChanged(allLockResults[lockNum])); - } - } - if 
(maxRecoveryIndex > 0) { - logServers = oldLogData[maxRecoveryIndex - 1].tLogs; - prevState.txsTags = oldLogData[maxRecoveryIndex - 1].txsTags; - lockResults[0] = allLockResults[maxRecoveryIndex]; - lockResults[0].isCurrent = true; + auto logSystem = makeReference(dbgid, locality, prevState.recoveryCount); - std::vector>> failed; - for (auto& log : logServers[0]->logServers) { - failed.push_back(makeReference>()); - failureTrackers.push_back(monitorLog(log, failed.back())); - } - ASSERT(logFailed.size() == 1); - logFailed[0] = failed; - oldLogData.erase(oldLogData.begin(), oldLogData.begin() + maxRecoveryIndex); - } + lastEnd = minEnd; + logSystem->tLogs = logServers; + logSystem->logRouterTags = prevState.logRouterTags; + logSystem->txsTags = prevState.txsTags; + logSystem->oldLogData = oldLogData; + logSystem->logSystemType = prevState.logSystemType; + logSystem->rejoins = rejoins; + logSystem->lockResults = lockResults; + if (knownCommittedVersion > minEnd) { + knownCommittedVersion = minEnd; + } + logSystem->recoverAt = minEnd; + logSystem->knownCommittedVersion = knownCommittedVersion; + TraceEvent(SevDebug, "FinalRecoveryVersionInfo") + .detail("KCV", knownCommittedVersion) + .detail("MinEnd", minEnd); + logSystem->remoteLogsWrittenToCoreState = true; + logSystem->stopped = true; + logSystem->pseudoLocalities = prevState.pseudoLocalities; + + outLogSystem->set(logSystem); } - state Optional lastEnd; - state Version knownCommittedVersion = 0; - loop { - Version minEnd = std::numeric_limits::max(); - Version maxEnd = 0; - std::vector> changes; - for (int log = 0; log < logServers.size(); log++) { - if (!logServers[log]->isLocal) { - continue; - } - auto versions = - TagPartitionedLogSystem::getDurableVersion(dbgid, lockResults[log], logFailed[log], lastEnd); - if (versions.present()) { - knownCommittedVersion = std::max(knownCommittedVersion, versions.get().first); - maxEnd = std::max(maxEnd, versions.get().second); - minEnd = std::min(minEnd, 
versions.get().second); - } - changes.push_back(TagPartitionedLogSystem::getDurableVersionChanged(lockResults[log], logFailed[log])); + wait(waitForAny(changes)); + } +} + +ACTOR Future TagPartitionedLogSystem::recruitOldLogRouters(TagPartitionedLogSystem* self, + vector workers, + LogEpoch recoveryCount, + int8_t locality, + Version startVersion, + std::vector tLogLocalities, + Reference tLogPolicy, + bool forRemote) { + state vector>> logRouterInitializationReplies; + state vector> allReplies; + int nextRouter = 0; + state Version lastStart = std::numeric_limits::max(); + + if (!forRemote) { + Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(self->tLogs); + + lastStart = std::max(startVersion, maxStart); + if (self->logRouterTags == 0) { + ASSERT_WE_THINK(false); + self->logSystemConfigChanged.trigger(); + return Void(); + } + + bool found = false; + for (auto& tLogs : self->tLogs) { + if (tLogs->locality == locality) { + found = true; } - if (maxEnd > 0 && (!lastEnd.present() || maxEnd < lastEnd.get())) { - TEST(lastEnd.present()); // Restarting recovery at an earlier point + tLogs->logRouters.clear(); + } - auto logSystem = makeReference(dbgid, locality, prevState.recoveryCount); + if (!found) { + TraceEvent("RecruitingOldLogRoutersAddingLocality") + .detail("Locality", locality) + .detail("LastStart", lastStart); + auto newLogSet = makeReference(); + newLogSet->locality = locality; + newLogSet->startVersion = lastStart; + newLogSet->isLocal = false; + self->tLogs.push_back(newLogSet); + } - lastEnd = minEnd; - logSystem->tLogs = logServers; - logSystem->logRouterTags = prevState.logRouterTags; - logSystem->txsTags = prevState.txsTags; - logSystem->oldLogData = oldLogData; - logSystem->logSystemType = prevState.logSystemType; - logSystem->rejoins = rejoins; - logSystem->lockResults = lockResults; - if (knownCommittedVersion > minEnd) { - knownCommittedVersion = minEnd; + for (auto& tLogs : self->tLogs) { + // Recruit log routers for old 
generations of the primary locality + if (tLogs->locality == locality) { + logRouterInitializationReplies.emplace_back(); + for (int i = 0; i < self->logRouterTags; i++) { + InitializeLogRouterRequest req; + req.recoveryCount = recoveryCount; + req.routerTag = Tag(tagLocalityLogRouter, i); + req.startVersion = lastStart; + req.tLogLocalities = tLogLocalities; + req.tLogPolicy = tLogPolicy; + req.locality = locality; + auto reply = transformErrors( + throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed()); + logRouterInitializationReplies.back().push_back(reply); + allReplies.push_back(reply); + nextRouter = (nextRouter + 1) % workers.size(); } - logSystem->recoverAt = minEnd; - logSystem->knownCommittedVersion = knownCommittedVersion; - TraceEvent(SevDebug, "FinalRecoveryVersionInfo") - .detail("KCV", knownCommittedVersion) - .detail("MinEnd", minEnd); - logSystem->remoteLogsWrittenToCoreState = true; - logSystem->stopped = true; - logSystem->pseudoLocalities = prevState.pseudoLocalities; - - outLogSystem->set(logSystem); - } - - wait(waitForAny(changes)); - } - } - - ACTOR static Future recruitOldLogRouters(TagPartitionedLogSystem* self, - vector workers, - LogEpoch recoveryCount, - int8_t locality, - Version startVersion, - std::vector tLogLocalities, - Reference tLogPolicy, - bool forRemote) { - state vector>> logRouterInitializationReplies; - state vector> allReplies; - int nextRouter = 0; - state Version lastStart = std::numeric_limits::max(); - - if (!forRemote) { - Version maxStart = getMaxLocalStartVersion(self->tLogs); - - lastStart = std::max(startVersion, maxStart); - if (self->logRouterTags == 0) { - ASSERT_WE_THINK(false); - self->logSystemConfigChanged.trigger(); - return Void(); } + } + } - bool found = false; - for (auto& tLogs : self->tLogs) { - if (tLogs->locality == locality) { - found = true; - } + for (auto& old : 
self->oldLogData) { + Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(old.tLogs); - tLogs->logRouters.clear(); - } - - if (!found) { - TraceEvent("RecruitingOldLogRoutersAddingLocality") - .detail("Locality", locality) - .detail("LastStart", lastStart); - auto newLogSet = makeReference(); - newLogSet->locality = locality; - newLogSet->startVersion = lastStart; - newLogSet->isLocal = false; - self->tLogs.push_back(newLogSet); - } - - for (auto& tLogs : self->tLogs) { - // Recruit log routers for old generations of the primary locality - if (tLogs->locality == locality) { - logRouterInitializationReplies.emplace_back(); - for (int i = 0; i < self->logRouterTags; i++) { - InitializeLogRouterRequest req; - req.recoveryCount = recoveryCount; - req.routerTag = Tag(tagLocalityLogRouter, i); - req.startVersion = lastStart; - req.tLogLocalities = tLogLocalities; - req.tLogPolicy = tLogPolicy; - req.locality = locality; - auto reply = transformErrors( - throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( - req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed()); - logRouterInitializationReplies.back().push_back(reply); - allReplies.push_back(reply); - nextRouter = (nextRouter + 1) % workers.size(); - } - } + if (old.logRouterTags == 0 || maxStart >= lastStart) { + break; + } + lastStart = std::max(startVersion, maxStart); + bool found = false; + for (auto& tLogs : old.tLogs) { + if (tLogs->locality == locality) { + found = true; } + tLogs->logRouters.clear(); } - for (auto& old : self->oldLogData) { - Version maxStart = getMaxLocalStartVersion(old.tLogs); + if (!found) { + TraceEvent("RecruitingOldLogRoutersAddingLocality") + .detail("Locality", locality) + .detail("LastStart", lastStart); + auto newLogSet = makeReference(); + newLogSet->locality = locality; + newLogSet->startVersion = lastStart; + old.tLogs.push_back(newLogSet); + } - if (old.logRouterTags == 0 || maxStart >= 
lastStart) { - break; - } - lastStart = std::max(startVersion, maxStart); - bool found = false; - for (auto& tLogs : old.tLogs) { - if (tLogs->locality == locality) { - found = true; - } - tLogs->logRouters.clear(); - } - - if (!found) { - TraceEvent("RecruitingOldLogRoutersAddingLocality") - .detail("Locality", locality) - .detail("LastStart", lastStart); - auto newLogSet = makeReference(); - newLogSet->locality = locality; - newLogSet->startVersion = lastStart; - old.tLogs.push_back(newLogSet); - } - - for (auto& tLogs : old.tLogs) { - // Recruit log routers for old generations of the primary locality - if (tLogs->locality == locality) { - logRouterInitializationReplies.emplace_back(); - for (int i = 0; i < old.logRouterTags; i++) { - InitializeLogRouterRequest req; - req.recoveryCount = recoveryCount; - req.routerTag = Tag(tagLocalityLogRouter, i); - req.startVersion = lastStart; - req.tLogLocalities = tLogLocalities; - req.tLogPolicy = tLogPolicy; - req.locality = locality; - auto reply = transformErrors( - throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( - req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed()); - logRouterInitializationReplies.back().push_back(reply); - allReplies.push_back(reply); - nextRouter = (nextRouter + 1) % workers.size(); - } + for (auto& tLogs : old.tLogs) { + // Recruit log routers for old generations of the primary locality + if (tLogs->locality == locality) { + logRouterInitializationReplies.emplace_back(); + for (int i = 0; i < old.logRouterTags; i++) { + InitializeLogRouterRequest req; + req.recoveryCount = recoveryCount; + req.routerTag = Tag(tagLocalityLogRouter, i); + req.startVersion = lastStart; + req.tLogLocalities = tLogLocalities; + req.tLogPolicy = tLogPolicy; + req.locality = locality; + auto reply = transformErrors( + throwErrorOr(workers[nextRouter].logRouter.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, 
SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed()); + logRouterInitializationReplies.back().push_back(reply); + allReplies.push_back(reply); + nextRouter = (nextRouter + 1) % workers.size(); } } } + } + + wait(waitForAll(allReplies)); - wait(waitForAll(allReplies)); + int nextReplies = 0; + lastStart = std::numeric_limits::max(); + vector> failed; - int nextReplies = 0; - lastStart = std::numeric_limits::max(); - vector> failed; + if (!forRemote) { + Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(self->tLogs); - if (!forRemote) { - Version maxStart = getMaxLocalStartVersion(self->tLogs); + lastStart = std::max(startVersion, maxStart); + for (auto& tLogs : self->tLogs) { + if (tLogs->locality == locality) { + for (int i = 0; i < logRouterInitializationReplies[nextReplies].size(); i++) { + tLogs->logRouters.push_back(makeReference>>( + OptionalInterface(logRouterInitializationReplies[nextReplies][i].get()))); + failed.push_back( + waitFailureClient(logRouterInitializationReplies[nextReplies][i].get().waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } + nextReplies++; + } + } + } - lastStart = std::max(startVersion, maxStart); - for (auto& tLogs : self->tLogs) { - if (tLogs->locality == locality) { - for (int i = 0; i < logRouterInitializationReplies[nextReplies].size(); i++) { - tLogs->logRouters.push_back(makeReference>>( - OptionalInterface(logRouterInitializationReplies[nextReplies][i].get()))); + for (auto& old : self->oldLogData) { + Version maxStart = TagPartitionedLogSystem::getMaxLocalStartVersion(old.tLogs); + if (old.logRouterTags == 0 || maxStart >= lastStart) { + break; + } + lastStart = std::max(startVersion, maxStart); + for (auto& tLogs : old.tLogs) { + if (tLogs->locality == locality) { + for (int i = 0; i < logRouterInitializationReplies[nextReplies].size(); i++) { + 
tLogs->logRouters.push_back(makeReference>>( + OptionalInterface(logRouterInitializationReplies[nextReplies][i].get()))); + if (!forRemote) { failed.push_back(waitFailureClient( logRouterInitializationReplies[nextReplies][i].get().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, /*trace=*/true)); } - nextReplies++; - } - } - } - - for (auto& old : self->oldLogData) { - Version maxStart = getMaxLocalStartVersion(old.tLogs); - if (old.logRouterTags == 0 || maxStart >= lastStart) { - break; - } - lastStart = std::max(startVersion, maxStart); - for (auto& tLogs : old.tLogs) { - if (tLogs->locality == locality) { - for (int i = 0; i < logRouterInitializationReplies[nextReplies].size(); i++) { - tLogs->logRouters.push_back(makeReference>>( - OptionalInterface(logRouterInitializationReplies[nextReplies][i].get()))); - if (!forRemote) { - failed.push_back(waitFailureClient( - logRouterInitializationReplies[nextReplies][i].get().waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } - } - nextReplies++; } + nextReplies++; } } + } - if (!forRemote) { - self->logSystemConfigChanged.trigger(); - wait(failed.size() ? tagError(quorum(failed, 1), master_tlog_failed()) : Future(Never())); - throw internal_error(); - } - return Void(); + if (!forRemote) { + self->logSystemConfigChanged.trigger(); + wait(failed.size() ? 
tagError(quorum(failed, 1), master_tlog_failed()) : Future(Never())); + throw internal_error(); } + return Void(); +} - static Version getMaxLocalStartVersion(const std::vector>& tLogs) { - Version maxStart = 0; - for (const auto& logSet : tLogs) { - if (logSet->isLocal) { - maxStart = std::max(maxStart, logSet->startVersion); - } +Version TagPartitionedLogSystem::getMaxLocalStartVersion(const std::vector>& tLogs) { + Version maxStart = 0; + for (const auto& logSet : tLogs) { + if (logSet->isLocal) { + maxStart = std::max(maxStart, logSet->startVersion); } - return maxStart; } + return maxStart; +} - static std::vector getLocalTags(int8_t locality, const std::vector& allTags) { - std::vector localTags; - for (const auto& tag : allTags) { - if (locality == tagLocalitySpecial || locality == tag.locality || tag.locality < 0) { - localTags.push_back(tag); - } +std::vector TagPartitionedLogSystem::getLocalTags(int8_t locality, const std::vector& allTags) { + std::vector localTags; + for (const auto& tag : allTags) { + if (locality == tagLocalitySpecial || locality == tag.locality || tag.locality < 0) { + localTags.push_back(tag); } - return localTags; } + return localTags; +} - ACTOR static Future newRemoteEpoch(TagPartitionedLogSystem* self, - Reference oldLogSystem, - Future fRemoteWorkers, - DatabaseConfiguration configuration, - LogEpoch recoveryCount, - int8_t remoteLocality, - std::vector allTags) { - TraceEvent("RemoteLogRecruitment_WaitingForWorkers").log(); - state RecruitRemoteFromConfigurationReply remoteWorkers = wait(fRemoteWorkers); - - state Reference logSet(new LogSet()); - logSet->tLogReplicationFactor = configuration.getRemoteTLogReplicationFactor(); - logSet->tLogVersion = configuration.tLogVersion; - logSet->tLogPolicy = configuration.getRemoteTLogPolicy(); - logSet->isLocal = false; - logSet->locality = remoteLocality; - - logSet->startVersion = oldLogSystem->knownCommittedVersion + 1; - state int lockNum = 0; - while (lockNum < 
oldLogSystem->lockResults.size()) { - if (oldLogSystem->lockResults[lockNum].logSet->locality == remoteLocality) { - loop { - auto versions = - TagPartitionedLogSystem::getDurableVersion(self->dbgid, oldLogSystem->lockResults[lockNum]); - if (versions.present()) { - logSet->startVersion = - std::min(std::min(versions.get().first + 1, oldLogSystem->lockResults[lockNum].epochEnd), - logSet->startVersion); - break; - } - wait(TagPartitionedLogSystem::getDurableVersionChanged(oldLogSystem->lockResults[lockNum])); +ACTOR Future TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSystem* self, + Reference oldLogSystem, + Future fRemoteWorkers, + DatabaseConfiguration configuration, + LogEpoch recoveryCount, + int8_t remoteLocality, + std::vector allTags) { + TraceEvent("RemoteLogRecruitment_WaitingForWorkers").log(); + state RecruitRemoteFromConfigurationReply remoteWorkers = wait(fRemoteWorkers); + + state Reference logSet(new LogSet()); + logSet->tLogReplicationFactor = configuration.getRemoteTLogReplicationFactor(); + logSet->tLogVersion = configuration.tLogVersion; + logSet->tLogPolicy = configuration.getRemoteTLogPolicy(); + logSet->isLocal = false; + logSet->locality = remoteLocality; + + logSet->startVersion = oldLogSystem->knownCommittedVersion + 1; + state int lockNum = 0; + while (lockNum < oldLogSystem->lockResults.size()) { + if (oldLogSystem->lockResults[lockNum].logSet->locality == remoteLocality) { + loop { + auto versions = + TagPartitionedLogSystem::getDurableVersion(self->dbgid, oldLogSystem->lockResults[lockNum]); + if (versions.present()) { + logSet->startVersion = + std::min(std::min(versions.get().first + 1, oldLogSystem->lockResults[lockNum].epochEnd), + logSet->startVersion); + break; } - break; + wait(TagPartitionedLogSystem::getDurableVersionChanged(oldLogSystem->lockResults[lockNum])); } - lockNum++; - } - - vector localities; - localities.resize(remoteWorkers.remoteTLogs.size()); - for (int i = 0; i < remoteWorkers.remoteTLogs.size(); 
i++) { - localities[i] = remoteWorkers.remoteTLogs[i].locality; - } - - state Future oldRouterRecruitment = Void(); - if (logSet->startVersion < oldLogSystem->knownCommittedVersion + 1) { - ASSERT(oldLogSystem->logRouterTags > 0); - oldRouterRecruitment = TagPartitionedLogSystem::recruitOldLogRouters(self, - remoteWorkers.logRouters, - recoveryCount, - remoteLocality, - logSet->startVersion, - localities, - logSet->tLogPolicy, - true); - } - - state vector> logRouterInitializationReplies; - const Version startVersion = oldLogSystem->logRouterTags == 0 - ? oldLogSystem->recoverAt.get() + 1 - : std::max(self->tLogs[0]->startVersion, logSet->startVersion); - for (int i = 0; i < self->logRouterTags; i++) { - InitializeLogRouterRequest req; - req.recoveryCount = recoveryCount; - req.routerTag = Tag(tagLocalityLogRouter, i); - req.startVersion = startVersion; - req.tLogLocalities = localities; - req.tLogPolicy = logSet->tLogPolicy; - req.locality = remoteLocality; - logRouterInitializationReplies.push_back(transformErrors( - throwErrorOr( - remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].logRouter.getReplyUnlessFailedFor( - req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed())); + break; + } + lockNum++; + } + + vector localities; + localities.resize(remoteWorkers.remoteTLogs.size()); + for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) { + localities[i] = remoteWorkers.remoteTLogs[i].locality; + } + + state Future oldRouterRecruitment = Void(); + if (logSet->startVersion < oldLogSystem->knownCommittedVersion + 1) { + ASSERT(oldLogSystem->logRouterTags > 0); + oldRouterRecruitment = TagPartitionedLogSystem::recruitOldLogRouters(self, + remoteWorkers.logRouters, + recoveryCount, + remoteLocality, + logSet->startVersion, + localities, + logSet->tLogPolicy, + true); + } + + state vector> logRouterInitializationReplies; + const Version startVersion = oldLogSystem->logRouterTags == 0 + ? 
oldLogSystem->recoverAt.get() + 1 + : std::max(self->tLogs[0]->startVersion, logSet->startVersion); + for (int i = 0; i < self->logRouterTags; i++) { + InitializeLogRouterRequest req; + req.recoveryCount = recoveryCount; + req.routerTag = Tag(tagLocalityLogRouter, i); + req.startVersion = startVersion; + req.tLogLocalities = localities; + req.tLogPolicy = logSet->tLogPolicy; + req.locality = remoteLocality; + logRouterInitializationReplies.push_back(transformErrors( + throwErrorOr( + remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].logRouter.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + } + + std::vector localTags = TagPartitionedLogSystem::getLocalTags(remoteLocality, allTags); + LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); + + logSet->tLogLocalities.resize(remoteWorkers.remoteTLogs.size()); + logSet->logServers.resize( + remoteWorkers.remoteTLogs + .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSet->updateLocalitySet(localities); + + state vector> remoteTLogInitializationReplies; + vector remoteTLogReqs(remoteWorkers.remoteTLogs.size()); + + bool nonShardedTxs = self->getTLogVersion() < TLogVersion::V4; + if (oldLogSystem->logRouterTags == 0) { + std::vector locations; + for (Tag tag : localTags) { + locations.clear(); + logSet->getPushLocations(VectorRef(&tag, 1), locations, 0); + for (int loc : locations) + remoteTLogReqs[loc].recoverTags.push_back(tag); } - std::vector localTags = getLocalTags(remoteLocality, allTags); - LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); - - logSet->tLogLocalities.resize(remoteWorkers.remoteTLogs.size()); - logSet->logServers.resize( - remoteWorkers.remoteTLogs - .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSet->updateLocalitySet(localities); - - state 
vector> remoteTLogInitializationReplies; - vector remoteTLogReqs(remoteWorkers.remoteTLogs.size()); - - bool nonShardedTxs = self->getTLogVersion() < TLogVersion::V4; - if (oldLogSystem->logRouterTags == 0) { - std::vector locations; - for (Tag tag : localTags) { + if (oldLogSystem->tLogs.size()) { + int maxTxsTags = oldLogSystem->txsTags; + bool needsOldTxs = oldLogSystem->tLogs[0]->tLogVersion < TLogVersion::V4; + for (auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; + } + for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { + Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); + Tag pushTag = (i == -1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i % self->txsTags); locations.clear(); - logSet->getPushLocations(VectorRef(&tag, 1), locations, 0); + logSet->getPushLocations(VectorRef(&pushTag, 1), locations, 0); for (int loc : locations) remoteTLogReqs[loc].recoverTags.push_back(tag); } - - if (oldLogSystem->tLogs.size()) { - int maxTxsTags = oldLogSystem->txsTags; - bool needsOldTxs = oldLogSystem->tLogs[0]->tLogVersion < TLogVersion::V4; - for (auto& it : oldLogSystem->oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.txsTags); - needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; - } - for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { - Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); - Tag pushTag = (i == -1 || nonShardedTxs) ? 
txsTag : Tag(tagLocalityTxs, i % self->txsTags); - locations.clear(); - logSet->getPushLocations(VectorRef(&pushTag, 1), locations, 0); - for (int loc : locations) - remoteTLogReqs[loc].recoverTags.push_back(tag); - } - } } + } - if (oldLogSystem->tLogs.size()) { - if (nonShardedTxs) { - localTags.push_back(txsTag); - } else { - for (int i = 0; i < self->txsTags; i++) { - localTags.push_back(Tag(tagLocalityTxs, i)); - } - } - } - - for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) { - InitializeTLogRequest& req = remoteTLogReqs[i]; - req.recruitmentID = self->recruitmentID; - req.logVersion = configuration.tLogVersion; - req.storeType = configuration.tLogDataStoreType; - req.spillType = configuration.tLogSpillType; - req.recoverFrom = oldLogSystemConfig; - req.recoverAt = oldLogSystem->recoverAt.get(); - req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; - req.epoch = recoveryCount; - req.remoteTag = Tag(tagLocalityRemoteLog, i); - req.locality = remoteLocality; - req.isPrimary = false; - req.allTags = localTags; - req.startVersion = logSet->startVersion; - req.logRouterTags = 0; - req.txsTags = self->txsTags; - } - - remoteTLogInitializationReplies.reserve(remoteWorkers.remoteTLogs.size()); - for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) - remoteTLogInitializationReplies.push_back(transformErrors( - throwErrorOr(remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor( - remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed())); - - TraceEvent("RemoteLogRecruitment_InitializingRemoteLogs") - .detail("StartVersion", logSet->startVersion) - .detail("LocalStart", self->tLogs[0]->startVersion) - .detail("LogRouterTags", self->logRouterTags); - wait(waitForAll(remoteTLogInitializationReplies) && waitForAll(logRouterInitializationReplies) && - oldRouterRecruitment); + if (oldLogSystem->tLogs.size()) { + if (nonShardedTxs) { + localTags.push_back(txsTag); + } else 
{ + for (int i = 0; i < self->txsTags; i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } + } + } + + for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) { + InitializeTLogRequest& req = remoteTLogReqs[i]; + req.recruitmentID = self->recruitmentID; + req.logVersion = configuration.tLogVersion; + req.storeType = configuration.tLogDataStoreType; + req.spillType = configuration.tLogSpillType; + req.recoverFrom = oldLogSystemConfig; + req.recoverAt = oldLogSystem->recoverAt.get(); + req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; + req.epoch = recoveryCount; + req.remoteTag = Tag(tagLocalityRemoteLog, i); + req.locality = remoteLocality; + req.isPrimary = false; + req.allTags = localTags; + req.startVersion = logSet->startVersion; + req.logRouterTags = 0; + req.txsTags = self->txsTags; + } + + remoteTLogInitializationReplies.reserve(remoteWorkers.remoteTLogs.size()); + for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) + remoteTLogInitializationReplies.push_back(transformErrors( + throwErrorOr(remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor( + remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + + TraceEvent("RemoteLogRecruitment_InitializingRemoteLogs") + .detail("StartVersion", logSet->startVersion) + .detail("LocalStart", self->tLogs[0]->startVersion) + .detail("LogRouterTags", self->logRouterTags); + wait(waitForAll(remoteTLogInitializationReplies) && waitForAll(logRouterInitializationReplies) && + oldRouterRecruitment); + + for (int i = 0; i < logRouterInitializationReplies.size(); i++) { + logSet->logRouters.push_back(makeReference>>( + OptionalInterface(logRouterInitializationReplies[i].get()))); + } + + for (int i = 0; i < remoteTLogInitializationReplies.size(); i++) { + logSet->logServers[i] = makeReference>>( + OptionalInterface(remoteTLogInitializationReplies[i].get())); + logSet->tLogLocalities[i] = 
remoteWorkers.remoteTLogs[i].locality; + } + filterLocalityDataForPolicy(logSet->tLogPolicy, &logSet->tLogLocalities); + + std::vector> recoveryComplete; + recoveryComplete.reserve(logSet->logServers.size()); + for (int i = 0; i < logSet->logServers.size(); i++) + recoveryComplete.push_back( + transformErrors(throwErrorOr(logSet->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( + TLogRecoveryFinishedRequest(), + SERVER_KNOBS->TLOG_TIMEOUT, + SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + + self->remoteRecoveryComplete = waitForAll(recoveryComplete); + self->tLogs.push_back(logSet); + TraceEvent("RemoteLogRecruitment_CompletingRecovery").log(); + return Void(); +} - for (int i = 0; i < logRouterInitializationReplies.size(); i++) { - logSet->logRouters.push_back(makeReference>>( - OptionalInterface(logRouterInitializationReplies[i].get()))); +ACTOR Future> TagPartitionedLogSystem::newEpoch( + Reference oldLogSystem, + RecruitFromConfigurationReply recr, + Future fRemoteWorkers, + DatabaseConfiguration configuration, + LogEpoch recoveryCount, + int8_t primaryLocality, + int8_t remoteLocality, + std::vector allTags, + Reference> recruitmentStalled) { + state double startTime = now(); + state Reference logSystem( + new TagPartitionedLogSystem(oldLogSystem->getDebugID(), oldLogSystem->locality, recoveryCount)); + logSystem->logSystemType = LogSystemType::tagPartitioned; + logSystem->expectedLogSets = 1; + logSystem->recoveredAt = oldLogSystem->recoverAt; + logSystem->repopulateRegionAntiQuorum = configuration.repopulateRegionAntiQuorum; + logSystem->recruitmentID = deterministicRandom()->randomUniqueID(); + logSystem->txsTags = configuration.tLogVersion >= TLogVersion::V4 ? 
recr.tLogs.size() : 0; + oldLogSystem->recruitmentID = logSystem->recruitmentID; + + if (configuration.usableRegions > 1) { + logSystem->logRouterTags = + recr.tLogs.size() * + std::max(1, configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())); + logSystem->expectedLogSets++; + logSystem->addPseudoLocality(tagLocalityLogRouterMapped); + TraceEvent e("AddPseudoLocality", logSystem->getDebugID()); + e.detail("Locality1", "LogRouterMapped"); + if (configuration.backupWorkerEnabled) { + logSystem->addPseudoLocality(tagLocalityBackup); + e.detail("Locality2", "Backup"); } + } else if (configuration.backupWorkerEnabled) { + // Single region uses log router tag for backup workers. + logSystem->logRouterTags = + recr.tLogs.size() * + std::max(1, configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())); + logSystem->addPseudoLocality(tagLocalityBackup); + TraceEvent("AddPseudoLocality", logSystem->getDebugID()).detail("Locality", "Backup"); + } - for (int i = 0; i < remoteTLogInitializationReplies.size(); i++) { - logSet->logServers[i] = makeReference>>( - OptionalInterface(remoteTLogInitializationReplies[i].get())); - logSet->tLogLocalities[i] = remoteWorkers.remoteTLogs[i].locality; - } - filterLocalityDataForPolicy(logSet->tLogPolicy, &logSet->tLogLocalities); + logSystem->tLogs.push_back(makeReference()); + logSystem->tLogs[0]->tLogVersion = configuration.tLogVersion; + logSystem->tLogs[0]->tLogWriteAntiQuorum = configuration.tLogWriteAntiQuorum; + logSystem->tLogs[0]->tLogReplicationFactor = configuration.tLogReplicationFactor; + logSystem->tLogs[0]->tLogPolicy = configuration.tLogPolicy; + logSystem->tLogs[0]->isLocal = true; + logSystem->tLogs[0]->locality = primaryLocality; - std::vector> recoveryComplete; - recoveryComplete.reserve(logSet->logServers.size()); - for (int i = 0; i < logSet->logServers.size(); i++) - recoveryComplete.push_back(transformErrors( - 
throwErrorOr(logSet->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( - TLogRecoveryFinishedRequest(), - SERVER_KNOBS->TLOG_TIMEOUT, - SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed())); + state RegionInfo region = configuration.getRegion(recr.dcId); - self->remoteRecoveryComplete = waitForAll(recoveryComplete); - self->tLogs.push_back(logSet); - TraceEvent("RemoteLogRecruitment_CompletingRecovery").log(); - return Void(); - } - - ACTOR static Future> newEpoch(Reference oldLogSystem, - RecruitFromConfigurationReply recr, - Future fRemoteWorkers, - DatabaseConfiguration configuration, - LogEpoch recoveryCount, - int8_t primaryLocality, - int8_t remoteLocality, - std::vector allTags, - Reference> recruitmentStalled) { - state double startTime = now(); - state Reference logSystem( - new TagPartitionedLogSystem(oldLogSystem->getDebugID(), oldLogSystem->locality, recoveryCount)); - logSystem->logSystemType = LogSystemType::tagPartitioned; - logSystem->expectedLogSets = 1; - logSystem->recoveredAt = oldLogSystem->recoverAt; - logSystem->repopulateRegionAntiQuorum = configuration.repopulateRegionAntiQuorum; - logSystem->recruitmentID = deterministicRandom()->randomUniqueID(); - logSystem->txsTags = configuration.tLogVersion >= TLogVersion::V4 ? 
recr.tLogs.size() : 0; - oldLogSystem->recruitmentID = logSystem->recruitmentID; - - if (configuration.usableRegions > 1) { - logSystem->logRouterTags = - recr.tLogs.size() * - std::max(1, configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())); - logSystem->expectedLogSets++; - logSystem->addPseudoLocality(tagLocalityLogRouterMapped); - TraceEvent e("AddPseudoLocality", logSystem->getDebugID()); - e.detail("Locality1", "LogRouterMapped"); - if (configuration.backupWorkerEnabled) { - logSystem->addPseudoLocality(tagLocalityBackup); - e.detail("Locality2", "Backup"); - } - } else if (configuration.backupWorkerEnabled) { - // Single region uses log router tag for backup workers. - logSystem->logRouterTags = - recr.tLogs.size() * - std::max(1, configuration.desiredLogRouterCount / std::max(1, recr.tLogs.size())); - logSystem->addPseudoLocality(tagLocalityBackup); - TraceEvent("AddPseudoLocality", logSystem->getDebugID()).detail("Locality", "Backup"); - } + state int maxTxsTags = oldLogSystem->txsTags; + state bool needsOldTxs = oldLogSystem->tLogs.size() && oldLogSystem->getTLogVersion() < TLogVersion::V4; + for (auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; + } + if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { logSystem->tLogs.push_back(makeReference()); - logSystem->tLogs[0]->tLogVersion = configuration.tLogVersion; - logSystem->tLogs[0]->tLogWriteAntiQuorum = configuration.tLogWriteAntiQuorum; - logSystem->tLogs[0]->tLogReplicationFactor = configuration.tLogReplicationFactor; - logSystem->tLogs[0]->tLogPolicy = configuration.tLogPolicy; - logSystem->tLogs[0]->isLocal = true; - logSystem->tLogs[0]->locality = primaryLocality; - - state RegionInfo region = configuration.getRegion(recr.dcId); - - state int maxTxsTags = oldLogSystem->txsTags; - state bool needsOldTxs = oldLogSystem->tLogs.size() && 
oldLogSystem->getTLogVersion() < TLogVersion::V4; - for (auto& it : oldLogSystem->oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.txsTags); - needsOldTxs = needsOldTxs || it.tLogs[0]->tLogVersion < TLogVersion::V4; - } - - if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { - logSystem->tLogs.push_back(makeReference()); - if (recr.satelliteFallback) { - logSystem->tLogs[1]->tLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorumFallback; - logSystem->tLogs[1]->tLogReplicationFactor = region.satelliteTLogReplicationFactorFallback; - logSystem->tLogs[1]->tLogPolicy = region.satelliteTLogPolicyFallback; - } else { - logSystem->tLogs[1]->tLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorum; - logSystem->tLogs[1]->tLogReplicationFactor = region.satelliteTLogReplicationFactor; - logSystem->tLogs[1]->tLogPolicy = region.satelliteTLogPolicy; - } - logSystem->tLogs[1]->isLocal = true; - logSystem->tLogs[1]->locality = tagLocalitySatellite; - logSystem->tLogs[1]->tLogVersion = configuration.tLogVersion; - logSystem->tLogs[1]->startVersion = oldLogSystem->knownCommittedVersion + 1; - - logSystem->tLogs[1]->tLogLocalities.resize(recr.satelliteTLogs.size()); - for (int i = 0; i < recr.satelliteTLogs.size(); i++) { - logSystem->tLogs[1]->tLogLocalities[i] = recr.satelliteTLogs[i].locality; - } - filterLocalityDataForPolicy(logSystem->tLogs[1]->tLogPolicy, &logSystem->tLogs[1]->tLogLocalities); + if (recr.satelliteFallback) { + logSystem->tLogs[1]->tLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorumFallback; + logSystem->tLogs[1]->tLogReplicationFactor = region.satelliteTLogReplicationFactorFallback; + logSystem->tLogs[1]->tLogPolicy = region.satelliteTLogPolicyFallback; + } else { + logSystem->tLogs[1]->tLogWriteAntiQuorum = region.satelliteTLogWriteAntiQuorum; + logSystem->tLogs[1]->tLogReplicationFactor = region.satelliteTLogReplicationFactor; + logSystem->tLogs[1]->tLogPolicy = region.satelliteTLogPolicy; + } + 
logSystem->tLogs[1]->isLocal = true; + logSystem->tLogs[1]->locality = tagLocalitySatellite; + logSystem->tLogs[1]->tLogVersion = configuration.tLogVersion; + logSystem->tLogs[1]->startVersion = oldLogSystem->knownCommittedVersion + 1; - logSystem->tLogs[1]->logServers.resize( - recr.satelliteTLogs - .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations( - logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags); - logSystem->expectedLogSets++; + logSystem->tLogs[1]->tLogLocalities.resize(recr.satelliteTLogs.size()); + for (int i = 0; i < recr.satelliteTLogs.size(); i++) { + logSystem->tLogs[1]->tLogLocalities[i] = recr.satelliteTLogs[i].locality; } + filterLocalityDataForPolicy(logSystem->tLogs[1]->tLogPolicy, &logSystem->tLogs[1]->tLogLocalities); - if (oldLogSystem->tLogs.size()) { - logSystem->oldLogData.emplace_back(); - logSystem->oldLogData[0].tLogs = oldLogSystem->tLogs; - logSystem->oldLogData[0].epochBegin = oldLogSystem->tLogs[0]->startVersion; - logSystem->oldLogData[0].epochEnd = oldLogSystem->knownCommittedVersion + 1; - logSystem->oldLogData[0].logRouterTags = oldLogSystem->logRouterTags; - logSystem->oldLogData[0].txsTags = oldLogSystem->txsTags; - logSystem->oldLogData[0].pseudoLocalities = oldLogSystem->pseudoLocalities; - logSystem->oldLogData[0].epoch = oldLogSystem->epoch; - } - logSystem->oldLogData.insert( - logSystem->oldLogData.end(), oldLogSystem->oldLogData.begin(), oldLogSystem->oldLogData.end()); - - logSystem->tLogs[0]->startVersion = oldLogSystem->knownCommittedVersion + 1; - logSystem->backupStartVersion = oldLogSystem->knownCommittedVersion + 1; - state int lockNum = 0; - while (lockNum < oldLogSystem->lockResults.size()) { - if (oldLogSystem->lockResults[lockNum].logSet->locality == primaryLocality) { - if 
(oldLogSystem->lockResults[lockNum].isCurrent && - oldLogSystem->lockResults[lockNum].logSet->isLocal) { + logSystem->tLogs[1]->logServers.resize( + recr.satelliteTLogs + .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); + logSystem->tLogs[1]->populateSatelliteTagLocations( + logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags); + logSystem->expectedLogSets++; + } + + if (oldLogSystem->tLogs.size()) { + logSystem->oldLogData.emplace_back(); + logSystem->oldLogData[0].tLogs = oldLogSystem->tLogs; + logSystem->oldLogData[0].epochBegin = oldLogSystem->tLogs[0]->startVersion; + logSystem->oldLogData[0].epochEnd = oldLogSystem->knownCommittedVersion + 1; + logSystem->oldLogData[0].logRouterTags = oldLogSystem->logRouterTags; + logSystem->oldLogData[0].txsTags = oldLogSystem->txsTags; + logSystem->oldLogData[0].pseudoLocalities = oldLogSystem->pseudoLocalities; + logSystem->oldLogData[0].epoch = oldLogSystem->epoch; + } + logSystem->oldLogData.insert( + logSystem->oldLogData.end(), oldLogSystem->oldLogData.begin(), oldLogSystem->oldLogData.end()); + + logSystem->tLogs[0]->startVersion = oldLogSystem->knownCommittedVersion + 1; + logSystem->backupStartVersion = oldLogSystem->knownCommittedVersion + 1; + state int lockNum = 0; + while (lockNum < oldLogSystem->lockResults.size()) { + if (oldLogSystem->lockResults[lockNum].logSet->locality == primaryLocality) { + if (oldLogSystem->lockResults[lockNum].isCurrent && oldLogSystem->lockResults[lockNum].logSet->isLocal) { + break; + } + state Future stalledAfter = setAfter(recruitmentStalled, SERVER_KNOBS->MAX_RECOVERY_TIME, true); + loop { + auto versions = + TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, oldLogSystem->lockResults[lockNum]); + if (versions.present()) { + logSystem->tLogs[0]->startVersion = + std::min(std::min(versions.get().first + 1, 
oldLogSystem->lockResults[lockNum].epochEnd), + logSystem->tLogs[0]->startVersion); break; } - state Future stalledAfter = setAfter(recruitmentStalled, SERVER_KNOBS->MAX_RECOVERY_TIME, true); - loop { - auto versions = TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, - oldLogSystem->lockResults[lockNum]); - if (versions.present()) { - logSystem->tLogs[0]->startVersion = - std::min(std::min(versions.get().first + 1, oldLogSystem->lockResults[lockNum].epochEnd), - logSystem->tLogs[0]->startVersion); - break; - } - wait(TagPartitionedLogSystem::getDurableVersionChanged(oldLogSystem->lockResults[lockNum])); - } - stalledAfter.cancel(); - break; + wait(TagPartitionedLogSystem::getDurableVersionChanged(oldLogSystem->lockResults[lockNum])); } - lockNum++; - } - - vector localities; - localities.resize(recr.tLogs.size()); - for (int i = 0; i < recr.tLogs.size(); i++) { - localities[i] = recr.tLogs[i].locality; - } - - state Future oldRouterRecruitment = Never(); - TraceEvent("NewEpochStartVersion", oldLogSystem->getDebugID()) - .detail("StartVersion", logSystem->tLogs[0]->startVersion) - .detail("EpochEnd", oldLogSystem->knownCommittedVersion + 1) - .detail("Locality", primaryLocality) - .detail("OldLogRouterTags", oldLogSystem->logRouterTags); - if (oldLogSystem->logRouterTags > 0 || - logSystem->tLogs[0]->startVersion < oldLogSystem->knownCommittedVersion + 1) { - oldRouterRecruitment = TagPartitionedLogSystem::recruitOldLogRouters(oldLogSystem.getPtr(), - recr.oldLogRouters, - recoveryCount, - primaryLocality, - logSystem->tLogs[0]->startVersion, - localities, - logSystem->tLogs[0]->tLogPolicy, - false); - if (oldLogSystem->knownCommittedVersion - logSystem->tLogs[0]->startVersion > - SERVER_KNOBS->MAX_RECOVERY_VERSIONS) { - // make sure we can recover in the other DC. 
- for (auto& lockResult : oldLogSystem->lockResults) { - if (lockResult.logSet->locality == remoteLocality) { - if (TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, lockResult).present()) { - recruitmentStalled->set(true); - } + stalledAfter.cancel(); + break; + } + lockNum++; + } + + vector localities; + localities.resize(recr.tLogs.size()); + for (int i = 0; i < recr.tLogs.size(); i++) { + localities[i] = recr.tLogs[i].locality; + } + + state Future oldRouterRecruitment = Never(); + TraceEvent("NewEpochStartVersion", oldLogSystem->getDebugID()) + .detail("StartVersion", logSystem->tLogs[0]->startVersion) + .detail("EpochEnd", oldLogSystem->knownCommittedVersion + 1) + .detail("Locality", primaryLocality) + .detail("OldLogRouterTags", oldLogSystem->logRouterTags); + if (oldLogSystem->logRouterTags > 0 || + logSystem->tLogs[0]->startVersion < oldLogSystem->knownCommittedVersion + 1) { + oldRouterRecruitment = TagPartitionedLogSystem::recruitOldLogRouters(oldLogSystem.getPtr(), + recr.oldLogRouters, + recoveryCount, + primaryLocality, + logSystem->tLogs[0]->startVersion, + localities, + logSystem->tLogs[0]->tLogPolicy, + false); + if (oldLogSystem->knownCommittedVersion - logSystem->tLogs[0]->startVersion > + SERVER_KNOBS->MAX_RECOVERY_VERSIONS) { + // make sure we can recover in the other DC. 
+ for (auto& lockResult : oldLogSystem->lockResults) { + if (lockResult.logSet->locality == remoteLocality) { + if (TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, lockResult).present()) { + recruitmentStalled->set(true); } } } - } else { - oldLogSystem->logSystemConfigChanged.trigger(); } + } else { + oldLogSystem->logSystemConfigChanged.trigger(); + } - std::vector localTags = getLocalTags(primaryLocality, allTags); - state LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); + std::vector localTags = TagPartitionedLogSystem::getLocalTags(primaryLocality, allTags); + state LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); - state vector> initializationReplies; - vector reqs(recr.tLogs.size()); + state vector> initializationReplies; + vector reqs(recr.tLogs.size()); - logSystem->tLogs[0]->tLogLocalities.resize(recr.tLogs.size()); - logSystem->tLogs[0]->logServers.resize( - recr.tLogs.size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSystem->tLogs[0]->updateLocalitySet(localities); + logSystem->tLogs[0]->tLogLocalities.resize(recr.tLogs.size()); + logSystem->tLogs[0]->logServers.resize( + recr.tLogs.size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSystem->tLogs[0]->updateLocalitySet(localities); - std::vector locations; - for (Tag tag : localTags) { + std::vector locations; + for (Tag tag : localTags) { + locations.clear(); + logSystem->tLogs[0]->getPushLocations(VectorRef(&tag, 1), locations, 0); + for (int loc : locations) + reqs[loc].recoverTags.push_back(tag); + } + for (int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + reqs[logSystem->tLogs[0]->bestLocationFor(tag)].recoverTags.push_back(tag); + } + bool nonShardedTxs = logSystem->getTLogVersion() < TLogVersion::V4; + if (oldLogSystem->tLogs.size()) { + for (int i = needsOldTxs ? 
-1 : 0; i < maxTxsTags; i++) { + Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); + Tag pushTag = (i == -1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i % logSystem->txsTags); locations.clear(); - logSystem->tLogs[0]->getPushLocations(VectorRef(&tag, 1), locations, 0); + logSystem->tLogs[0]->getPushLocations(VectorRef(&pushTag, 1), locations, 0); for (int loc : locations) reqs[loc].recoverTags.push_back(tag); } - for (int i = 0; i < oldLogSystem->logRouterTags; i++) { - Tag tag = Tag(tagLocalityLogRouter, i); - reqs[logSystem->tLogs[0]->bestLocationFor(tag)].recoverTags.push_back(tag); + if (nonShardedTxs) { + localTags.push_back(txsTag); + } else { + for (int i = 0; i < logSystem->txsTags; i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } + } + } + + for (int i = 0; i < recr.tLogs.size(); i++) { + InitializeTLogRequest& req = reqs[i]; + req.recruitmentID = logSystem->recruitmentID; + req.logVersion = configuration.tLogVersion; + req.storeType = configuration.tLogDataStoreType; + req.spillType = configuration.tLogSpillType; + req.recoverFrom = oldLogSystemConfig; + req.recoverAt = oldLogSystem->recoverAt.get(); + req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; + req.epoch = recoveryCount; + req.locality = primaryLocality; + req.remoteTag = Tag(tagLocalityRemoteLog, i); + req.isPrimary = true; + req.allTags = localTags; + req.startVersion = logSystem->tLogs[0]->startVersion; + req.logRouterTags = logSystem->logRouterTags; + req.txsTags = logSystem->txsTags; + } + + initializationReplies.reserve(recr.tLogs.size()); + for (int i = 0; i < recr.tLogs.size(); i++) + initializationReplies.push_back(transformErrors( + throwErrorOr(recr.tLogs[i].tLog.getReplyUnlessFailedFor( + reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + + state std::vector> recoveryComplete; + + if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { + state vector> 
satelliteInitializationReplies; + vector sreqs(recr.satelliteTLogs.size()); + std::vector satelliteTags; + + if (logSystem->logRouterTags) { + for (int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + // Satellite logs will index a mutation with tagLocalityLogRouter with an id greater than + // the number of log routers as having an id mod the number of log routers. We thus need + // to make sure that if we're going from more log routers in the previous generation to + // less log routers in the newer one, that we map the log router tags onto satellites that + // are the preferred location for id%logRouterTags. + Tag pushLocation = Tag(tagLocalityLogRouter, i % logSystem->logRouterTags); + locations.clear(); + logSystem->tLogs[1]->getPushLocations(VectorRef(&pushLocation, 1), locations, 0); + for (int loc : locations) + sreqs[loc].recoverTags.push_back(tag); + } } - bool nonShardedTxs = logSystem->getTLogVersion() < TLogVersion::V4; if (oldLogSystem->tLogs.size()) { for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); Tag pushTag = (i == -1 || nonShardedTxs) ? 
txsTag : Tag(tagLocalityTxs, i % logSystem->txsTags); locations.clear(); - logSystem->tLogs[0]->getPushLocations(VectorRef(&pushTag, 1), locations, 0); + logSystem->tLogs[1]->getPushLocations(VectorRef(&pushTag, 1), locations, 0); for (int loc : locations) - reqs[loc].recoverTags.push_back(tag); + sreqs[loc].recoverTags.push_back(tag); } if (nonShardedTxs) { - localTags.push_back(txsTag); + satelliteTags.push_back(txsTag); } else { for (int i = 0; i < logSystem->txsTags; i++) { - localTags.push_back(Tag(tagLocalityTxs, i)); + satelliteTags.push_back(Tag(tagLocalityTxs, i)); } } } - for (int i = 0; i < recr.tLogs.size(); i++) { - InitializeTLogRequest& req = reqs[i]; + for (int i = 0; i < recr.satelliteTLogs.size(); i++) { + InitializeTLogRequest& req = sreqs[i]; req.recruitmentID = logSystem->recruitmentID; req.logVersion = configuration.tLogVersion; req.storeType = configuration.tLogDataStoreType; @@ -2887,455 +2903,157 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCountedrecoverAt.get(); req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; req.epoch = recoveryCount; - req.locality = primaryLocality; - req.remoteTag = Tag(tagLocalityRemoteLog, i); + req.locality = tagLocalitySatellite; + req.remoteTag = Tag(); req.isPrimary = true; - req.allTags = localTags; - req.startVersion = logSystem->tLogs[0]->startVersion; + req.allTags = satelliteTags; + req.startVersion = oldLogSystem->knownCommittedVersion + 1; req.logRouterTags = logSystem->logRouterTags; req.txsTags = logSystem->txsTags; } - initializationReplies.reserve(recr.tLogs.size()); - for (int i = 0; i < recr.tLogs.size(); i++) - initializationReplies.push_back(transformErrors( - throwErrorOr(recr.tLogs[i].tLog.getReplyUnlessFailedFor( - reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + satelliteInitializationReplies.reserve(recr.satelliteTLogs.size()); + for (int i = 0; i < recr.satelliteTLogs.size(); i++) + 
satelliteInitializationReplies.push_back(transformErrors( + throwErrorOr(recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( + sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), master_recovery_failed())); - state std::vector> recoveryComplete; - - if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { - state vector> satelliteInitializationReplies; - vector sreqs(recr.satelliteTLogs.size()); - std::vector satelliteTags; - - if (logSystem->logRouterTags) { - for (int i = 0; i < oldLogSystem->logRouterTags; i++) { - Tag tag = Tag(tagLocalityLogRouter, i); - // Satellite logs will index a mutation with tagLocalityLogRouter with an id greater than - // the number of log routers as having an id mod the number of log routers. We thus need - // to make sure that if we're going from more log routers in the previous generation to - // less log routers in the newer one, that we map the log router tags onto satellites that - // are the preferred location for id%logRouterTags. - Tag pushLocation = Tag(tagLocalityLogRouter, i % logSystem->logRouterTags); - locations.clear(); - logSystem->tLogs[1]->getPushLocations(VectorRef(&pushLocation, 1), locations, 0); - for (int loc : locations) - sreqs[loc].recoverTags.push_back(tag); - } - } - if (oldLogSystem->tLogs.size()) { - for (int i = needsOldTxs ? -1 : 0; i < maxTxsTags; i++) { - Tag tag = i == -1 ? txsTag : Tag(tagLocalityTxs, i); - Tag pushTag = (i == -1 || nonShardedTxs) ? 
txsTag : Tag(tagLocalityTxs, i % logSystem->txsTags); - locations.clear(); - logSystem->tLogs[1]->getPushLocations(VectorRef(&pushTag, 1), locations, 0); - for (int loc : locations) - sreqs[loc].recoverTags.push_back(tag); - } - if (nonShardedTxs) { - satelliteTags.push_back(txsTag); - } else { - for (int i = 0; i < logSystem->txsTags; i++) { - satelliteTags.push_back(Tag(tagLocalityTxs, i)); - } - } - } + wait(waitForAll(satelliteInitializationReplies) || oldRouterRecruitment); - for (int i = 0; i < recr.satelliteTLogs.size(); i++) { - InitializeTLogRequest& req = sreqs[i]; - req.recruitmentID = logSystem->recruitmentID; - req.logVersion = configuration.tLogVersion; - req.storeType = configuration.tLogDataStoreType; - req.spillType = configuration.tLogSpillType; - req.recoverFrom = oldLogSystemConfig; - req.recoverAt = oldLogSystem->recoverAt.get(); - req.knownCommittedVersion = oldLogSystem->knownCommittedVersion; - req.epoch = recoveryCount; - req.locality = tagLocalitySatellite; - req.remoteTag = Tag(); - req.isPrimary = true; - req.allTags = satelliteTags; - req.startVersion = oldLogSystem->knownCommittedVersion + 1; - req.logRouterTags = logSystem->logRouterTags; - req.txsTags = logSystem->txsTags; - } - - satelliteInitializationReplies.reserve(recr.satelliteTLogs.size()); - for (int i = 0; i < recr.satelliteTLogs.size(); i++) - satelliteInitializationReplies.push_back(transformErrors( - throwErrorOr(recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( - sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed())); - - wait(waitForAll(satelliteInitializationReplies) || oldRouterRecruitment); - - for (int i = 0; i < satelliteInitializationReplies.size(); i++) { - logSystem->tLogs[1]->logServers[i] = makeReference>>( - OptionalInterface(satelliteInitializationReplies[i].get())); - } - - for (int i = 0; i < logSystem->tLogs[1]->logServers.size(); i++) - recoveryComplete.push_back(transformErrors( - 
throwErrorOr( - logSystem->tLogs[1]->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( - TLogRecoveryFinishedRequest(), - SERVER_KNOBS->TLOG_TIMEOUT, - SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), - master_recovery_failed())); - } - - wait(waitForAll(initializationReplies) || oldRouterRecruitment); - - for (int i = 0; i < initializationReplies.size(); i++) { - logSystem->tLogs[0]->logServers[i] = makeReference>>( - OptionalInterface(initializationReplies[i].get())); - logSystem->tLogs[0]->tLogLocalities[i] = recr.tLogs[i].locality; - } - filterLocalityDataForPolicy(logSystem->tLogs[0]->tLogPolicy, &logSystem->tLogs[0]->tLogLocalities); - - // Don't force failure of recovery if it took us a long time to recover. This avoids multiple long running - // recoveries causing tests to timeout - if (BUGGIFY && now() - startTime < 300 && g_network->isSimulated() && g_simulator.speedUpSimulation) - throw master_recovery_failed(); - - for (int i = 0; i < logSystem->tLogs[0]->logServers.size(); i++) + for (int i = 0; i < satelliteInitializationReplies.size(); i++) { + logSystem->tLogs[1]->logServers[i] = makeReference>>( + OptionalInterface(satelliteInitializationReplies[i].get())); + } + + for (int i = 0; i < logSystem->tLogs[1]->logServers.size(); i++) recoveryComplete.push_back(transformErrors( throwErrorOr( - logSystem->tLogs[0]->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( + logSystem->tLogs[1]->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), master_recovery_failed())); - logSystem->recoveryComplete = waitForAll(recoveryComplete); - - if (configuration.usableRegions > 1) { - logSystem->hasRemoteServers = true; - logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch(logSystem.getPtr(), - oldLogSystem, - fRemoteWorkers, - configuration, - recoveryCount, - 
remoteLocality, - allTags); - if (oldLogSystem->tLogs.size() > 0 && oldLogSystem->tLogs[0]->locality == tagLocalitySpecial) { - // The wait is required so that we know both primary logs and remote logs have copied the data between - // the known committed version and the recovery version. - // FIXME: we can remove this wait once we are able to have log routers which can ship data to the remote - // logs without using log router tags. - wait(logSystem->remoteRecovery); - } - } else { - logSystem->hasRemoteServers = false; - logSystem->remoteRecovery = logSystem->recoveryComplete; - logSystem->remoteRecoveryComplete = logSystem->recoveryComplete; - } + } - return logSystem; + wait(waitForAll(initializationReplies) || oldRouterRecruitment); + + for (int i = 0; i < initializationReplies.size(); i++) { + logSystem->tLogs[0]->logServers[i] = makeReference>>( + OptionalInterface(initializationReplies[i].get())); + logSystem->tLogs[0]->tLogLocalities[i] = recr.tLogs[i].locality; } + filterLocalityDataForPolicy(logSystem->tLogs[0]->tLogPolicy, &logSystem->tLogs[0]->tLogLocalities); + + // Don't force failure of recovery if it took us a long time to recover. 
This avoids multiple long running + // recoveries causing tests to timeout + if (BUGGIFY && now() - startTime < 300 && g_network->isSimulated() && g_simulator.speedUpSimulation) + throw master_recovery_failed(); - ACTOR static Future trackRejoins( - UID dbgid, - std::vector>>, Reference>> - logServers, - FutureStream rejoinRequests) { - state std::map> lastReply; - state std::set logsWaiting; - state double startTime = now(); - state Future warnTimeout = delay(SERVER_KNOBS->TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS); + for (int i = 0; i < logSystem->tLogs[0]->logServers.size(); i++) + recoveryComplete.push_back(transformErrors( + throwErrorOr(logSystem->tLogs[0]->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( + TLogRecoveryFinishedRequest(), + SERVER_KNOBS->TLOG_TIMEOUT, + SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + master_recovery_failed())); + logSystem->recoveryComplete = waitForAll(recoveryComplete); - for (const auto& log : logServers) { - logsWaiting.insert(log.first->get().id()); + if (configuration.usableRegions > 1) { + logSystem->hasRemoteServers = true; + logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch( + logSystem.getPtr(), oldLogSystem, fRemoteWorkers, configuration, recoveryCount, remoteLocality, allTags); + if (oldLogSystem->tLogs.size() > 0 && oldLogSystem->tLogs[0]->locality == tagLocalitySpecial) { + // The wait is required so that we know both primary logs and remote logs have copied the data between + // the known committed version and the recovery version. + // FIXME: we can remove this wait once we are able to have log routers which can ship data to the remote + // logs without using log router tags. 
+ wait(logSystem->remoteRecovery); } + } else { + logSystem->hasRemoteServers = false; + logSystem->remoteRecovery = logSystem->recoveryComplete; + logSystem->remoteRecoveryComplete = logSystem->recoveryComplete; + } - try { - loop choose { - when(TLogRejoinRequest req = waitNext(rejoinRequests)) { - int pos = -1; - for (int i = 0; i < logServers.size(); i++) { - if (logServers[i].first->get().id() == req.myInterface.id()) { - pos = i; - logsWaiting.erase(logServers[i].first->get().id()); - break; - } - } - if (pos != -1) { - TraceEvent("TLogJoinedMe", dbgid) - .detail("TLog", req.myInterface.id()) - .detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString()); - if (!logServers[pos].first->get().present() || - req.myInterface.commit.getEndpoint() != - logServers[pos].first->get().interf().commit.getEndpoint()) { - TLogInterface interf = req.myInterface; - filterLocalityDataForPolicyDcAndProcess(logServers[pos].second, &interf.filteredLocality); - logServers[pos].first->setUnconditional(OptionalInterface(interf)); - } - lastReply[req.myInterface.id()].send(TLogRejoinReply{ false }); - lastReply[req.myInterface.id()] = req.reply; - } else { - TraceEvent("TLogJoinedMeUnknown", dbgid) - .detail("TLog", req.myInterface.id()) - .detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString()); - req.reply.send(true); + return logSystem; +} + +ACTOR Future TagPartitionedLogSystem::trackRejoins( + UID dbgid, + std::vector>>, Reference>> + logServers, + FutureStream rejoinRequests) { + state std::map> lastReply; + state std::set logsWaiting; + state double startTime = now(); + state Future warnTimeout = delay(SERVER_KNOBS->TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS); + + for (const auto& log : logServers) { + logsWaiting.insert(log.first->get().id()); + } + + try { + loop choose { + when(TLogRejoinRequest req = waitNext(rejoinRequests)) { + int pos = -1; + for (int i = 0; i < logServers.size(); i++) { + if 
(logServers[i].first->get().id() == req.myInterface.id()) { + pos = i; + logsWaiting.erase(logServers[i].first->get().id()); + break; } } - when(wait(warnTimeout)) { - for (const auto& logId : logsWaiting) { - TraceEvent(SevWarnAlways, "TLogRejoinSlow", dbgid) - .detail("Elapsed", startTime - now()) - .detail("LogId", logId); + if (pos != -1) { + TraceEvent("TLogJoinedMe", dbgid) + .detail("TLog", req.myInterface.id()) + .detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString()); + if (!logServers[pos].first->get().present() || + req.myInterface.commit.getEndpoint() != + logServers[pos].first->get().interf().commit.getEndpoint()) { + TLogInterface interf = req.myInterface; + filterLocalityDataForPolicyDcAndProcess(logServers[pos].second, &interf.filteredLocality); + logServers[pos].first->setUnconditional(OptionalInterface(interf)); } - warnTimeout = Never(); + lastReply[req.myInterface.id()].send(TLogRejoinReply{ false }); + lastReply[req.myInterface.id()] = req.reply; + } else { + TraceEvent("TLogJoinedMeUnknown", dbgid) + .detail("TLog", req.myInterface.id()) + .detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString()); + req.reply.send(true); + } + } + when(wait(warnTimeout)) { + for (const auto& logId : logsWaiting) { + TraceEvent(SevWarnAlways, "TLogRejoinSlow", dbgid) + .detail("Elapsed", startTime - now()) + .detail("LogId", logId); } + warnTimeout = Never(); } - } catch (...) { - for (auto it = lastReply.begin(); it != lastReply.end(); ++it) - it->second.send(TLogRejoinReply{ true }); - throw; } + } catch (...) { + for (auto it = lastReply.begin(); it != lastReply.end(); ++it) + it->second.send(TLogRejoinReply{ true }); + throw; } - - ACTOR static Future lockTLog(UID myID, Reference>> tlog) { - TraceEvent("TLogLockStarted", myID).detail("TLog", tlog->get().id()); - loop { - choose { - when(TLogLockResult data = - wait(tlog->get().present() - ? 
brokenPromiseToNever(tlog->get().interf().lock.getReply()) - : Never())) { - TraceEvent("TLogLocked", myID).detail("TLog", tlog->get().id()).detail("End", data.end); - return data; - } - when(wait(tlog->onChange())) {} - } - } - } - - // FIXME: disabled during merge, update and use in epochEnd() - /* - static void lockMinimalTLogSet(const UID& dbgid, const DBCoreState& prevState, - const std::vector>>>& logServers, - const std::vector>>& logFailed, - vector>* tLogReply ) { - // Invariant: tLogReply[i] must correspond to the tlog stored as logServers[i]. - ASSERT(tLogReply->size() == prevState.tLogLocalities.size()); - ASSERT(logFailed.size() == tLogReply->size()); - - // For any given index, only one of the following will be true. - auto locking_completed = [&logFailed, tLogReply](int index) { - const auto& entry = tLogReply->at(index); - return !logFailed[index]->get() && entry.isValid() && entry.isReady() && !entry.isError(); - }; - auto locking_failed = [&logFailed, tLogReply](int index) { - const auto& entry = tLogReply->at(index); - return logFailed[index]->get() || (entry.isValid() && entry.isReady() && entry.isError()); - }; - auto locking_pending = [&logFailed, tLogReply](int index) { - const auto& entry = tLogReply->at(index); - return !logFailed[index]->get() && (entry.isValid() && !entry.isReady()); - }; - auto locking_skipped = [&logFailed, tLogReply](int index) { - const auto& entry = tLogReply->at(index); - return !logFailed[index]->get() && !entry.isValid(); - }; - - auto can_obtain_quorum = [&prevState](std::function filter) { - LocalityGroup filter_true; - std::vector filter_false, unused; - for (int i = 0; i < prevState.tLogLocalities.size() ; i++) { - if (filter(i)) { - filter_true.add(prevState.tLogLocalities[i]); - } else { - filter_false.push_back(prevState.tLogLocalities[i]); - } - } - bool valid = filter_true.validate(prevState.tLogPolicy); - if (!valid && prevState.tLogWriteAntiQuorum > 0 ) { - valid = !validateAllCombinations(unused, 
filter_true, prevState.tLogPolicy, filter_false, - prevState.tLogWriteAntiQuorum, false); - } - return valid; - }; - - // Step 1: Verify that if all the failed TLogs come back, they can't form a quorum. - if (can_obtain_quorum(locking_failed)) { - TraceEvent(SevInfo, "MasterRecoveryTLogLockingImpossible", dbgid).log(); - return; - } - - // Step 2: It's possible for us to succeed, but we need to lock additional logs. - // - // First, we need an accurate picture of what TLogs we're capable of locking. We can't tell the - // difference between a temporarily failed TLog and a permanently failed TLog. Thus, we assume - // all failures are permanent, and manually re-issue lock requests if they rejoin. - for (int i = 0; i < logFailed.size(); i++) { - const auto& r = tLogReply->at(i); - TEST(locking_failed(i) && (r.isValid() && !r.isReady())); // A TLog failed with a pending request. - // The reboot_a_tlog BUGGIFY below should cause the above case to be hit. - if (locking_failed(i)) { - tLogReply->at(i) = Future(); - } - } - - // We're trying to paritition the set of old tlogs into two sets, L and R, such that: - // (1). R does not validate the policy - // (2). |R| is as large as possible - // (3). L contains all the already-locked TLogs - // and then we only issue lock requests to TLogs in L. This is safe, as R does not have quorum, - // so no commits may occur. It does not matter if L forms a quorum or not. - // - // We form these sets by starting with L as all machines and R as the empty set, and moving a - // random machine from L to R until (1) or (2) no longer holds as true. Code-wise, L is - // [0..end-can_omit), and R is [end-can_omit..end), and we move a random machine via randomizing - // the order of the tlogs. Choosing a random machine was verified to generate a good-enough - // result to be interesting intests sufficiently frequently that we don't need to try to - // calculate the exact optimal solution. 
- std::vector> tlogs; - for (int i = 0; i < prevState.tLogLocalities.size(); i++) { - tlogs.emplace_back(prevState.tLogLocalities[i], i); - } - deterministicRandom()->randomShuffle(tlogs); - // Rearrange the array such that things that the left is logs closer to being locked, and - // the right is logs that can't be locked. This makes us prefer locking already-locked TLogs, - // which is how we respect the decisions made in the previous execution. - auto idx_to_order = [&locking_completed, &locking_failed, &locking_pending, &locking_skipped](int index) { - bool complete = locking_completed(index); - bool pending = locking_pending(index); - bool skipped = locking_skipped(index); - bool failed = locking_failed(index); - - ASSERT( complete + pending + skipped + failed == 1 ); - - if (complete) return 0; - if (pending) return 1; - if (skipped) return 2; - if (failed) return 3; - - ASSERT(false); // Programmer error. - return -1; - }; - std::sort(tlogs.begin(), tlogs.end(), - // TODO: Change long type to `auto` once toolchain supports C++17. - [&idx_to_order](const std::pair& lhs, const std::pair& rhs) { - return idx_to_order(lhs.second) < idx_to_order(rhs.second); - }); - - // Indexes that aren't in the vector are the ones we're considering omitting. Remove indexes until - // the removed set forms a quorum. 
- int can_omit = 0; - std::vector to_lock_indexes; - for (auto it = tlogs.cbegin() ; it != tlogs.cend() - 1 ; it++ ) { - to_lock_indexes.push_back(it->second); - } - auto filter = [&to_lock_indexes](int index) { - return std::find(to_lock_indexes.cbegin(), to_lock_indexes.cend(), index) == to_lock_indexes.cend(); - }; - while(true) { - if (can_obtain_quorum(filter)) { - break; - } else { - can_omit++; - ASSERT(can_omit < tlogs.size()); - to_lock_indexes.pop_back(); - } - } - - if (prevState.tLogReplicationFactor - prevState.tLogWriteAntiQuorum == 1) { - ASSERT(can_omit == 0); - } - // Our previous check of making sure there aren't too many failed logs should have prevented this. - ASSERT(!locking_failed(tlogs[tlogs.size()-can_omit-1].second)); - - // If we've managed to leave more tlogs unlocked than (RF-AQ), it means we've hit the case - // where the policy engine has allowed us to have multiple logs in the same failure domain - // with independant sets of data. This case will validated that no code is relying on the old - // quorum=(RF-AQ) logic, and now goes through the policy engine instead. - TEST(can_omit >= prevState.tLogReplicationFactor - prevState.tLogWriteAntiQuorum); // Locking a subset of the - TLogs while ending an epoch. 
const bool reboot_a_tlog = g_network->now() - g_simulator.lastConnectionFailure > - g_simulator.connectionFailuresDisableDuration && BUGGIFY && deterministicRandom()->random01() < 0.25; - TraceEvent(SevInfo, "MasterRecoveryTLogLocking", dbgid) - detail("Locks", tlogs.size() - can_omit) - detail("Skipped", can_omit) - detail("Replication", prevState.tLogReplicationFactor) - detail("Antiquorum", prevState.tLogWriteAntiQuorum) - detail("RebootBuggify", reboot_a_tlog); - for (int i = 0; i < tlogs.size() - can_omit; i++) { - const int index = tlogs[i].second; - Future& entry = tLogReply->at(index); - if (!entry.isValid()) { - entry = lockTLog( dbgid, logServers[index] ); - } - } - if (reboot_a_tlog) { - g_simulator.lastConnectionFailure = g_network->now(); - for (int i = 0; i < tlogs.size() - can_omit; i++) { - const int index = tlogs[i].second; - if (logServers[index]->get().present()) { - g_simulator.rebootProcess( - g_simulator.getProcessByAddress( - logServers[index]->get().interf().address()), - ISimulator::RebootProcess); - break; - } - } - } - // Intentionally leave `tlogs.size() - can_omit` .. `tlogs.size()` as !isValid() Futures. 
- }*/ - - template - static vector getReadyNonError(vector> const& futures) { - // Return the values of those futures which have (non-error) values ready - std::vector result; - for (auto& f : futures) - if (f.isReady() && !f.isError()) - result.push_back(f.get()); - return result; - } - - struct sort_by_end { - bool operator()(TLogLockResult const& a, TLogLockResult const& b) const { return a.end < b.end; } - }; -}; - -Future ILogSystem::recoverAndEndEpoch(Reference>> const& outLogSystem, - UID const& dbgid, - DBCoreState const& oldState, - FutureStream const& rejoins, - LocalityData const& locality, - bool* forceRecovery) { - return TagPartitionedLogSystem::recoverAndEndEpoch(outLogSystem, dbgid, oldState, rejoins, locality, forceRecovery); -} - -Reference ILogSystem::fromLogSystemConfig(UID const& dbgid, - struct LocalityData const& locality, - struct LogSystemConfig const& conf, - bool excludeRemote, - bool useRecoveredAt, - Optional>> addActor) { - if (conf.logSystemType == LogSystemType::empty) - return Reference(); - else if (conf.logSystemType == LogSystemType::tagPartitioned) - return TagPartitionedLogSystem::fromLogSystemConfig( - dbgid, locality, conf, excludeRemote, useRecoveredAt, addActor); - else - throw internal_error(); } -Reference ILogSystem::fromOldLogSystemConfig(UID const& dbgid, - struct LocalityData const& locality, - struct LogSystemConfig const& conf) { - if (conf.logSystemType == LogSystemType::empty) - return Reference(); - else if (conf.logSystemType == LogSystemType::tagPartitioned) - return TagPartitionedLogSystem::fromOldLogSystemConfig(dbgid, locality, conf); - else - throw internal_error(); -} +ACTOR Future TagPartitionedLogSystem::lockTLog( + UID myID, + Reference>> tlog) { -Reference ILogSystem::fromServerDBInfo(UID const& dbgid, - ServerDBInfo const& dbInfo, - bool useRecoveredAt, - Optional>> addActor) { - return fromLogSystemConfig(dbgid, dbInfo.myLocality, dbInfo.logSystemConfig, false, useRecoveredAt, addActor); + 
TraceEvent("TLogLockStarted", myID).detail("TLog", tlog->get().id()); + loop { + choose { + when(TLogLockResult data = wait( + tlog->get().present() ? brokenPromiseToNever(tlog->get().interf().lock.getReply()) + : Never())) { + TraceEvent("TLogLocked", myID).detail("TLog", tlog->get().id()).detail("End", data.end); + return data; + } + when(wait(tlog->onChange())) {} + } + } } diff --git a/fdbserver/TagPartitionedLogSystem.actor.h b/fdbserver/TagPartitionedLogSystem.actor.h new file mode 100644 index 00000000000..acab5297585 --- /dev/null +++ b/fdbserver/TagPartitionedLogSystem.actor.h @@ -0,0 +1,383 @@ +/* + * TagPartitionedLogSystem.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_TAGPARTITIONEDLOGSYSTEM_ACTOR_G_H) +#define FDBSERVER_TAGPARTITIONEDLOGSYSTEM_ACTOR_G_H +#include "fdbserver/TagPartitionedLogSystem.actor.g.h" +#elif !defined(FDBSERVER_TAGPARTITIONEDLOGSYSTEM_ACTOR_H) +#define FDBSERVER_TAGPARTITIONEDLOGSYSTEM_ACTOR_H + +#pragma once + +#include "fdbclient/SystemData.h" +#include "fdbrpc/Replication.h" +#include "fdbrpc/ReplicationUtils.h" +#include "fdbrpc/simulator.h" +#include "fdbserver/DBCoreState.h" +#include "fdbserver/Knobs.h" +#include "fdbserver/LogProtocolMessage.h" +#include "fdbserver/LogSystem.h" +#include "fdbserver/RecoveryState.h" +#include "fdbserver/ServerDBInfo.h" +#include "fdbserver/WaitFailure.h" +#include "flow/ActorCollection.h" + +#include "flow/actorcompiler.h" // This must be the last #include. + +// TagPartitionedLogSystem info in old epoch +struct OldLogData { + std::vector> tLogs; + int32_t logRouterTags; + int32_t txsTags; // The number of txsTags, which may change across generations. 
+ Version epochBegin, epochEnd; + std::set pseudoLocalities; + LogEpoch epoch; + + OldLogData() : logRouterTags(0), txsTags(0), epochBegin(0), epochEnd(0), epoch(0) {} + + // Constructor for T of OldTLogConf and OldTLogCoreData + template + explicit OldLogData(const T& conf) + : logRouterTags(conf.logRouterTags), txsTags(conf.txsTags), epochBegin(conf.epochBegin), epochEnd(conf.epochEnd), + pseudoLocalities(conf.pseudoLocalities), epoch(conf.epoch) { + tLogs.resize(conf.tLogs.size()); + for (int j = 0; j < conf.tLogs.size(); j++) { + auto logSet = makeReference(conf.tLogs[j]); + tLogs[j] = logSet; + } + } +}; + +struct LogLockInfo { + Version epochEnd; + bool isCurrent; + Reference logSet; + std::vector> replies; + + LogLockInfo() : epochEnd(std::numeric_limits::max()), isCurrent(false) {} +}; + +struct TagPartitionedLogSystem final : ILogSystem, ReferenceCounted { + const UID dbgid; + LogSystemType logSystemType; + std::vector> tLogs; // LogSets in different locations: primary, satellite, or remote + int expectedLogSets; + int logRouterTags; + int txsTags; + UID recruitmentID; + int repopulateRegionAntiQuorum; + bool stopped; + std::set pseudoLocalities; // Represent special localities that will be mapped to tagLocalityLogRouter + const LogEpoch epoch; + LogEpoch oldestBackupEpoch; + + // new members + std::map pseudoLocalityPopVersion; + Future rejoins; + Future recoveryComplete; + Future remoteRecovery; + Future remoteRecoveryComplete; + std::vector lockResults; + AsyncVar recoveryCompleteWrittenToCoreState; + bool remoteLogsWrittenToCoreState; + bool hasRemoteServers; + AsyncTrigger backupWorkerChanged; + std::set removedBackupWorkers; // Workers that are removed before setting them. + + Optional recoverAt; + Optional recoveredAt; + Version knownCommittedVersion; + Version backupStartVersion = invalidVersion; // max(tLogs[0].startVersion, previous epochEnd). 
+ LocalityData locality; + // For each currently running popFromLog actor, outstandingPops is + // (logID, tag)->(max popped version, durableKnownCommittedVersion). + // Why do we need durableKnownCommittedVersion? knownCommittedVersion gives the lower bound of what data + // will need to be copied into the next generation to restore the replication factor. + // Guess: It probably serves as a minimum version of what data should be on a TLog in the next generation and + // sending a pop for anything less than durableKnownCommittedVersion for the TLog will be absurd. + std::map, std::pair> outstandingPops; + + Optional>> addActor; + ActorCollection popActors; + std::vector oldLogData; // each element has the log info. in one old epoch. + AsyncTrigger logSystemConfigChanged; + + TagPartitionedLogSystem(UID dbgid, + LocalityData locality, + LogEpoch e, + Optional>> addActor = Optional>>()) + : dbgid(dbgid), logSystemType(LogSystemType::empty), expectedLogSets(0), logRouterTags(0), txsTags(0), + repopulateRegionAntiQuorum(0), stopped(false), epoch(e), oldestBackupEpoch(0), + recoveryCompleteWrittenToCoreState(false), remoteLogsWrittenToCoreState(false), hasRemoteServers(false), + locality(locality), addActor(addActor), popActors(false) {} + + void stopRejoins() final; + + void addref() final; + + void delref() final; + + std::string describe() const final; + + UID getDebugID() const final; + + void addPseudoLocality(int8_t locality); + + Tag getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const final; + + bool hasPseudoLocality(int8_t locality) const final; + + // Return the min version of all pseudoLocalities, i.e., logRouter and backupTag + Version popPseudoLocalityTag(Tag tag, Version upTo) final; + + static Future recoverAndEndEpoch(Reference>> const& outLogSystem, + UID const& dbgid, + DBCoreState const& oldState, + FutureStream const& rejoins, + LocalityData const& locality, + bool* forceRecovery); + + static Reference fromLogSystemConfig(UID const& dbgid, 
+ LocalityData const& locality, + LogSystemConfig const& lsConf, + bool excludeRemote, + bool useRecoveredAt, + Optional>> addActor); + + static Reference fromOldLogSystemConfig(UID const& dbgid, + LocalityData const& locality, + LogSystemConfig const& lsConf); + + // Convert TagPartitionedLogSystem to DBCoreState and override input newState as return value + void toCoreState(DBCoreState& newState) final; + + bool remoteStorageRecovered() final; + + Future onCoreStateChanged() final; + + void coreStateWritten(DBCoreState const& newState) final; + + Future onError() final; + + ACTOR static Future onError_internal(TagPartitionedLogSystem* self); + + ACTOR static Future pushResetChecker(Reference self, NetworkAddress addr); + + ACTOR static Future recordPushMetrics(Reference self, + Reference dist, + NetworkAddress addr, + Future in); + + Future push(Version prevVersion, + Version version, + Version knownCommittedVersion, + Version minKnownCommittedVersion, + LogPushData& data, + SpanID const& spanContext, + Optional debugID) final; + + Reference peekAll(UID dbgid, Version begin, Version end, Tag tag, bool parallelGetMore); + + Reference peekRemote(UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore); + + Reference peek(UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore) final; + + Reference peek(UID dbgid, + Version begin, + Optional end, + std::vector tags, + bool parallelGetMore) final; + + Reference peekLocal(UID dbgid, + Tag tag, + Version begin, + Version end, + bool useMergePeekCursors, + int8_t peekLocality = tagLocalityInvalid); + + Reference peekTxs(UID dbgid, + Version begin, + int8_t peekLocality, + Version localEnd, + bool canDiscardPopped) final; + + Reference peekSingle(UID dbgid, + Version begin, + Tag tag, + std::vector> history) final; + + // LogRouter or BackupWorker use this function to obtain a cursor for peeking tlogs of a generation (i.e., epoch). 
+ // Specifically, the epoch is determined by looking up "dbgid" in tlog sets of generations. + // The returned cursor can peek data at the "tag" from the given "begin" version to that epoch's end version or + // the recovery version for the latest old epoch. For the current epoch, the cursor has no end version. + Reference peekLogRouter(UID dbgid, Version begin, Tag tag) final; + + Version getKnownCommittedVersion() final; + + Future onKnownCommittedVersionChange() final; + + void popLogRouter(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality); + + void popTxs(Version upTo, int8_t popLocality) final; + + // pop 'tag.locality' type data up to the 'upTo' version + void pop(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality) final; + + // pop tag from log up to the version defined in self->outstandingPops[].first + ACTOR static Future popFromLog(TagPartitionedLogSystem* self, + Reference>> log, + Tag tag, + double time); + + ACTOR static Future getPoppedFromTLog(Reference>> log, Tag tag); + + ACTOR static Future getPoppedTxs(TagPartitionedLogSystem* self); + + Future getTxsPoppedVersion() final; + + ACTOR static Future confirmEpochLive_internal(Reference logSet, Optional debugID); + + // Returns success after confirming that pushes in the current epoch are still possible + Future confirmEpochLive(Optional debugID) final; + + Future endEpoch() final; + + // Call only after end_epoch() has successfully completed. Returns a new epoch immediately following this one. + // The new epoch is only provisional until the caller updates the coordinated DBCoreState. 
+ Future> newEpoch(RecruitFromConfigurationReply const& recr, + Future const& fRemoteWorkers, + DatabaseConfiguration const& config, + LogEpoch recoveryCount, + int8_t primaryLocality, + int8_t remoteLocality, + std::vector const& allTags, + Reference> const& recruitmentStalled) final; + + LogSystemConfig getLogSystemConfig() const final; + + Standalone getLogsValue() const final; + + Future onLogSystemConfigChange() final; + + Version getEnd() const final; + + Version getPeekEnd() const; + + void getPushLocations(VectorRef tags, std::vector& locations, bool allLocations) const final; + + bool hasRemoteLogs() const final; + + Tag getRandomRouterTag() const final; + + Tag getRandomTxsTag() const final; + + TLogVersion getTLogVersion() const final; + + int getLogRouterTags() const final; + + Version getBackupStartVersion() const final; + + std::map getOldEpochTagsVersionsInfo() const final; + + inline Reference getEpochLogSet(LogEpoch epoch) const; + + void setBackupWorkers(const std::vector& replies) final; + + bool removeBackupWorker(const BackupWorkerDoneRequest& req) final; + + LogEpoch getOldestBackupEpoch() const final; + + void setOldestBackupEpoch(LogEpoch epoch) final; + + ACTOR static Future monitorLog(Reference>> logServer, + Reference> failed); + + Optional> static getDurableVersion( + UID dbgid, + LogLockInfo lockInfo, + std::vector>> failed = std::vector>>(), + Optional lastEnd = Optional()); + + ACTOR static Future getDurableVersionChanged( + LogLockInfo lockInfo, + std::vector>> failed = std::vector>>()); + + ACTOR static Future epochEnd(Reference>> outLogSystem, + UID dbgid, + DBCoreState prevState, + FutureStream rejoinRequests, + LocalityData locality, + bool* forceRecovery); + + ACTOR static Future recruitOldLogRouters(TagPartitionedLogSystem* self, + vector workers, + LogEpoch recoveryCount, + int8_t locality, + Version startVersion, + std::vector tLogLocalities, + Reference tLogPolicy, + bool forRemote); + + static Version 
getMaxLocalStartVersion(const std::vector>& tLogs); + + static std::vector getLocalTags(int8_t locality, const std::vector& allTags); + + ACTOR static Future newRemoteEpoch(TagPartitionedLogSystem* self, + Reference oldLogSystem, + Future fRemoteWorkers, + DatabaseConfiguration configuration, + LogEpoch recoveryCount, + int8_t remoteLocality, + std::vector allTags); + + ACTOR static Future> newEpoch(Reference oldLogSystem, + RecruitFromConfigurationReply recr, + Future fRemoteWorkers, + DatabaseConfiguration configuration, + LogEpoch recoveryCount, + int8_t primaryLocality, + int8_t remoteLocality, + std::vector allTags, + Reference> recruitmentStalled); + + ACTOR static Future trackRejoins( + UID dbgid, + std::vector>>, Reference>> + logServers, + FutureStream rejoinRequests); + + ACTOR static Future lockTLog(UID myID, Reference>> tlog); + + template + static vector getReadyNonError(vector> const& futures); +}; + +template +vector TagPartitionedLogSystem::getReadyNonError(vector> const& futures) { + // Return the values of those futures which have (non-error) values ready + std::vector result; + for (auto& f : futures) + if (f.isReady() && !f.isError()) + result.push_back(f.get()); + return result; +} + +#include "flow/unactorcompiler.h" +#endif // FDBSERVER_TAGPARTITIONEDLOGSYSTEM_ACTOR_H \ No newline at end of file diff --git a/flow/aarch64/asmdefs.h b/flow/aarch64/asmdefs.h index 2224ccb5366..0c142984584 100644 --- a/flow/aarch64/asmdefs.h +++ b/flow/aarch64/asmdefs.h @@ -10,16 +10,14 @@ #if defined(__aarch64__) +// clang-format off + /* Branch Target Identitication support. */ -#define BTI_C hint 34 -#define BTI_J hint 36 +#define BTI_C hint 34 +#define BTI_J hint 36 /* Return address signing support (pac-ret). */ -#define PACIASP \ - hint 25; \ - .cfi_window_save -#define AUTIASP \ - hint 29; \ - .cfi_window_save +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save /* GNU_PROPERTY_AARCH64_* macros from elf.h. 
*/ #define FEATURE_1_AND 0xc0000000 @@ -27,18 +25,18 @@ #define FEATURE_1_PAC 2 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ -#define GNU_PROPERTY(type, value) \ - .section.note.gnu.property, "a"; \ - .p2align 3; \ - .word 4; \ - .word 16; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .word 0; \ - .text +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text /* If set then the GNU Property Note section will be added to mark objects to support BTI and PAC-RET. */ @@ -48,55 +46,57 @@ #if WANT_GNU_PROPERTY /* Add property note with supported features to all asm files. */ -GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI | FEATURE_1_PAC) +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #endif -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name, % function; \ - .align alignment; \ - name: \ - .cfi_startproc; \ - BTI_C; +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; #else #define END_FILE -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name, % function; \ - .align alignment; \ - name: \ - .cfi_startproc; +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; #endif -#define ENTRY(name) ENTRY_ALIGN(name, 6) +#define ENTRY(name) ENTRY_ALIGN(name, 6) -#define ENTRY_ALIAS(name) \ - .global name; \ - .type name, % function; \ - name: +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: -#define END(name) \ - .cfi_endproc; \ - .size name, .- name; +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; -#define L(l) .L##l +#define L(l) .L ## l #ifdef __ILP32__ -/* Sanitize padding bits of pointer arguments as per aapcs64 */ -#define PTR_ARG(n) 
mov w##n, w##n + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n #else #define PTR_ARG(n) #endif #ifdef __ILP32__ -/* Sanitize padding bits of size arguments as per aapcs64 */ -#define SIZE_ARG(n) mov w##n, w##n + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n #else #define SIZE_ARG(n) #endif +// clang-format on + #endif