forked from vmware-archive/docs-hd-staging
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PivotalHadoopEnhancements.html
868 lines (789 loc) · 58.6 KB
/
PivotalHadoopEnhancements.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<!-- Always force latest IE rendering engine or request Chrome Frame -->
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible">
<!-- REPLACE X WITH PRODUCT NAME -->
<title>Pivotal Hadoop Enhancements | Pivotal Docs</title>
<!-- Local CSS stylesheets -->
<link href="/stylesheets/master.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/breadcrumbs.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/search.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/portal-style.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/printable.css" media="print" rel="stylesheet" type="text/css" />
<!-- Confluence HTML stylesheet -->
<link href="/stylesheets/site-conf.css" media="screen,print" rel="stylesheet" type="text/css" />
<!-- Left-navigation code -->
<!-- http://www.designchemical.com/lab/jquery-vertical-accordion-menu-plugin/examples/# -->
<link href="/stylesheets/dcaccordion.css" rel="stylesheet" type="text/css" />
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script>
<script src="/javascripts/jquery.cookie.js" type="text/javascript"></script>
<script src="/javascripts/jquery.hoverIntent.minified.js" type="text/javascript"></script>
<script src="/javascripts/jquery.dcjqaccordion.2.7.min.js" type="text/javascript"></script>
<script type="text/javascript">
// Initialise the collapsible left-navigation accordion once the DOM is ready.
// Uses the dcjqaccordion plugin loaded above; state is persisted via cookie.
jQuery(function ($) {
  var accordionOptions = {
    eventType: 'click',   // expand/collapse on click rather than hover
    autoClose: true,      // opening one section closes the others
    saveState: true,      // remember open section across page loads (cookie)
    disableLink: false,   // parent links remain navigable
    speed: 'fast',
    classActive: 'test',
    showCount: false      // no child-item counters next to parents
  };
  $('#accordion-1').dcAccordion(accordionOptions);
});
</script>
<link href="/stylesheets/grey.css" rel="stylesheet" type="text/css" />
<!-- End left-navigation code -->
<script src="/javascripts/all.js" type="text/javascript"></script>
<link href="https://www.gopivotal.com/misc/favicon.ico" rel="shortcut icon">
<script type="text/javascript">
// Google Analytics (classic async ga.js snippet).
// Guarded by hostname so only the production docs site is tracked;
// staging/local copies of this page send no pageviews.
if (window.location.host === 'docs.gopivotal.com') {
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-39702075-1']);
_gaq.push(['_setDomainName', 'gopivotal.com']);
_gaq.push(['_trackPageview']);
// Inject ga.js asynchronously, choosing the host that matches the page
// protocol (ssl. for https, www. for http) to avoid mixed content.
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
}
</script>
</head>
<body class="pivotalcf pivotalcf_getstarted pivotalcf_getstarted_index">
<div class="viewport">
<div class="mobile-navigation--wrapper mobile-only">
<div class="navigation-drawer--container">
<div class="navigation-item-list">
<div class="navbar-link active">
<a href="http://gopivotal.com">
Home
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/paas">
PaaS
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/big-data">
Big Data
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/agile">
Agile
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/support">
Help & Support
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/products">
Products
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/solutions">
Solutions
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/partners">
Partners
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
</div>
</div>
<div class="mobile-nav">
<div class="nav-icon js-open-nav-drawer">
<i class="icon-reorder"></i>
</div>
<div class="header-center-icon">
<a href="http://gopivotal.com">
<div class="icon icon-pivotal-logo-mobile"></div>
</a>
</div>
</div>
</div>
<div class='wrap'>
<script src="//use.typekit.net/clb0qji.js" type="text/javascript"></script>
<script type="text/javascript">
// Start Typekit web-font loading (kit script included just above).
// The empty catch is deliberate best-effort: a Typekit outage or a
// blocked request must not break the rest of the page's scripts.
try {
Typekit.load();
} catch (e) {
}
</script>
<script type="text/javascript">
// Relax this page's origin to the registrable parent domain so frames and
// scripts served from sibling subdomains (docs./www.gopivotal.com) can
// interact with it.
// NOTE(review): document.domain is deprecated and ignored by modern
// browsers when origin-keyed agent clustering is on — confirm whether any
// cross-subdomain embedding still relies on this.
document.domain = "gopivotal.com";
</script>
<script type="text/javascript">
// Load the Source Sans Pro family via Google's WebFont loader.
WebFontConfig = {
google: { families: [ 'Source+Sans+Pro:300italic,400italic,600italic,300,400,600:latin' ] }
};
(function() {
// Inject webfont.js asynchronously, matching the page protocol so the
// request is never blocked as mixed content.
var wf = document.createElement('script');
wf.src = ('https:' == document.location.protocol ? 'https' : 'http') +
'://ajax.googleapis.com/ajax/libs/webfont/1/webfont.js';
wf.type = 'text/javascript';
// Fix: 'async' is a boolean DOM property — assign true, not the string
// 'true' (the original only worked because a non-empty string is truthy).
wf.async = true;
var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(wf, s);
})();
</script>
<div id="search-dropdown-box">
<div class="search-dropdown--container js-search-dropdown">
<div class="container-fluid">
<div class="close-menu-large"><img src="http://www.gopivotal.com/sites/all/themes/gopo13/images/icon-close.png" /></div>
<div class="search-form--container">
<div class="form-search">
<div class='gcse-search'></div>
<script src="http://www.google.com/jsapi" type="text/javascript"></script>
<script src="/javascripts/cse.js" type="text/javascript"></script>
</div>
</div>
</div>
</div>
</div>
<header class="navbar desktop-only" id="nav">
<div class="navbar-inner">
<div class="container-fluid">
<div class="pivotal-logo--container">
<a class="pivotal-logo" href="http://gopivotal.com"><span></span></a>
</div>
<ul class="nav pull-right">
<li class="navbar-link">
<a href="http://www.gopivotal.com/paas" id="paas-nav-link">PaaS</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/big-data" id="big-data-nav-link">BIG DATA</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/agile" id="agile-nav-link">AGILE</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/oss" id="oss-nav-link">OSS</a>
</li>
<li class="nav-search">
<a class="js-search-input-open" id="click-to-search"><span></span></a>
</li>
</ul>
</div>
<a href="http://www.gopivotal.com/contact">
<img id="get-started" src="http://www.gopivotal.com/sites/all/themes/gopo13/images/get-started.png">
</a>
</div>
</header>
<div class="main-wrap">
<div class="container-fluid">
<!-- Google CSE Search Box -->
<div id='docs-search'>
<gcse:search></gcse:search>
</div>
<div id='all-docs-link'>
<a href="http://docs.gopivotal.com/">All Documentation</a>
</div>
<div class="container">
<div id="sub-nav" class="nav-container">
<!-- Collapsible left-navigation-->
<ul class="accordion" id="accordion-1">
<!-- REPLACE <li/> NODES-->
<li>
<a href="index.html">Home</a></li>
<li>
<a href="PivotalHD.html">Pivotal HD 2.0.1</a>
<ul>
<li>
<a href="PHDEnterprise2.0.1ReleaseNotes.html">PHD Enterprise 2.0.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="PHDInstallationandAdministration.html">PHD Installation and Administration</a>
<ul>
<li>
<a href="OverviewofPHD.html">Overview of PHD</a>
</li>
</ul>
<ul>
<li>
<a href="InstallationOverview.html">Installation Overview</a>
</li>
</ul>
<ul>
<li>
<a href="PHDInstallationChecklist.html">PHD Installation Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingPHDUsingtheCLI.html">Installing PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradeChecklist.html">Upgrade Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradingPHDUsingtheCLI.html">Upgrading PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="AdministeringPHDUsingtheCLI.html">Administering PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="PHDFAQFrequentlyAskedQuestions.html">PHD FAQ (Frequently Asked Questions)</a>
</li>
</ul>
<ul>
<li>
<a href="PHDTroubleshooting.html">PHD Troubleshooting</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="StackandToolsReference.html">Stack and Tools Reference</a>
<ul>
<li>
<a href="OverviewofApacheStackandPivotalComponents.html">Overview of Apache Stack and Pivotal Components</a>
</li>
</ul>
<ul>
<li>
<a href="ManuallyInstallingPivotalHD2.0Stack.html">Manually Installing Pivotal HD 2.0 Stack</a>
</li>
</ul>
<ul>
<li>
<a href="ManuallyUpgradingPivotalHDStackfrom1.1.1to2.0.html">Manually Upgrading Pivotal HD Stack from 1.1.1 to 2.0</a>
</li>
</ul>
<ul>
<li>
<a href="PivotalHadoopEnhancements.html">Pivotal Hadoop Enhancements</a>
</li>
</ul>
<ul>
<li>
<a href="Security.html">Security</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="PivotalCommandCenter.html">Pivotal Command Center 2.2.1</a>
<ul>
<li>
<a href="PCC2.2.1ReleaseNotes.html">PCC 2.2.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="PCCUserGuide.html">PCC User Guide</a>
<ul>
<li>
<a href="PCCOverview.html">PCC Overview</a>
</li>
</ul>
<ul>
<li>
<a href="PCCInstallationChecklist.html">PCC Installation Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingPCC.html">Installing PCC</a>
</li>
</ul>
<ul>
<li>
<a href="UsingPCC.html">Using PCC</a>
</li>
</ul>
<ul>
<li>
<a href="CreatingaYUMEPELRepository.html">Creating a YUM EPEL Repository</a>
</li>
</ul>
<ul>
<li>
<a href="CommandLineReference.html">Command Line Reference</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="PivotalHAWQ.html">Pivotal HAWQ 1.2.0</a>
<ul>
<li>
<a href="HAWQ1.2.0.1ReleaseNotes.html">HAWQ 1.2.0.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQInstallationandUpgrade.html">HAWQ Installation and Upgrade</a>
<ul>
<li>
<a href="PreparingtoInstallHAWQ.html">Preparing to Install HAWQ</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingHAWQ.html">Installing HAWQ</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingtheHAWQComponents.html">Installing the HAWQ Components</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradingHAWQandComponents.html">Upgrading HAWQ and Components</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQConfigurationParameterReference.html">HAWQ Configuration Parameter Reference</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="HAWQAdministration.html">HAWQ Administration</a>
<ul>
<li>
<a href="HAWQOverview.html">HAWQ Overview</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQQueryProcessing.html">HAWQ Query Processing</a>
</li>
</ul>
<ul>
<li>
<a href="UsingHAWQtoQueryData.html">Using HAWQ to Query Data</a>
</li>
</ul>
<ul>
<li>
<a href="ConfiguringClientAuthentication.html">Configuring Client Authentication</a>
</li>
</ul>
<ul>
<li>
<a href="KerberosAuthentication.html">Kerberos Authentication</a>
</li>
</ul>
<ul>
<li>
<a href="ExpandingtheHAWQSystem.html">Expanding the HAWQ System</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQInputFormatforMapReduce.html">HAWQ InputFormat for MapReduce</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQFilespacesandHighAvailabilityEnabledHDFS.html">HAWQ Filespaces and High Availability Enabled HDFS</a>
</li>
</ul>
<ul>
<li>
<a href="SQLCommandReference.html">SQL Command Reference</a>
</li>
</ul>
<ul>
<li>
<a href="ManagementUtilityReference.html">Management Utility Reference</a>
</li>
</ul>
<ul>
<li>
<a href="ClientUtilityReference.html">Client Utility Reference</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQServerConfigurationParameters.html">HAWQ Server Configuration Parameters</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQEnvironmentVariables.html">HAWQ Environment Variables</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQDataTypes.html">HAWQ Data Types</a>
</li>
</ul>
<ul>
<li>
<a href="SystemCatalogReference.html">System Catalog Reference</a>
</li>
</ul>
<ul>
<li>
<a href="hawq_toolkitReference.html">hawq_toolkit Reference</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="PivotalExtensionFrameworkPXF.html">Pivotal Extension Framework (PXF)</a>
<ul>
<li>
<a href="PXFInstallationandAdministration.html">PXF Installation and Administration</a>
</li>
</ul>
<ul>
<li>
<a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a>
</li>
</ul>
</div><!--end of sub-nav-->
<h3 class="title-container">Pivotal Hadoop Enhancements</h3>
<div class="content">
<!-- Python script replaces main content -->
<div id="main"><div style="visibility:hidden; height:2px;">Pivotal Product Documentation : Pivotal Hadoop Enhancements</div><div class="wiki-content group" id="main-content">
<p>Pivotal HD is a full Apache Hadoop distribution with Pivotal add-ons and a native integration with the Pivotal Greenplum database.</p><p><style type="text/css">/*<![CDATA[*/
div.rbtoc1400035785721 {padding: 0px;}
div.rbtoc1400035785721 ul {list-style: disc;margin-left: 0px;}
div.rbtoc1400035785721 li {margin-left: 0px;padding-left: 0px;}
/*]]>*/</style><div class="toc-macro rbtoc1400035785721">
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-HDFSOff-ClusterClientRackAwareness">HDFS Off-Cluster Client Rack Awareness</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-Usage">Usage</a></li>
</ul>
</li>
<li><a href="#PivotalHadoopEnhancements-Vaidya">Vaidya</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-Overview">Overview</a></li>
<li><a href="#PivotalHadoopEnhancements-InstallingVaidyaFiles">Installing Vaidya Files</a></li>
<li><a href="#PivotalHadoopEnhancements-EnablingVaidya">Enabling Vaidya</a></li>
<li><a href="#PivotalHadoopEnhancements-DisablingVaidya">Disabling Vaidya</a></li>
<li><a href="#PivotalHadoopEnhancements-UsingVaidyatoAnalyzeJobs">Using Vaidya to Analyze Jobs</a></li>
<li><a href="#PivotalHadoopEnhancements-VaidyaConfigurationRules">Vaidya Configuration Rules</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-DisablingaRule">Disabling a Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-ChangingtheImportanceofaRule">Changing the Importance of a Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-ChangingSuccessThreshold">Changing Success Threshold</a></li>
<li><a href="#PivotalHadoopEnhancements-ChangingInputParameters">Changing Input Parameters</a></li>
<li><a href="#PivotalHadoopEnhancements-Other">Other</a></li>
<li><a href="#PivotalHadoopEnhancements-AddingaNewRule">Adding a New Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-CreatingaJavaBinaryforaNewRule">Creating a Java Binary for a New Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-CreatingXMLConfigurationForaNewRule">Creating XML Configuration For a New Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-Deployingfiles">Deploying files</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#PivotalHadoopEnhancements-HVETopologyAwareness">HVE Topology Awareness</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-TopologyAwarenessConfigurationandVerification">Topology Awareness Configuration and Verification</a></li>
</ul>
</li>
<li><a href="#PivotalHadoopEnhancements-HVEElasticity">HVE Elasticity</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-Overview.1">Overview</a></li>
<li><a href="#PivotalHadoopEnhancements-FunctionList">Function List</a></li>
<li><a href="#PivotalHadoopEnhancements-Configuration">Configuration</a></li>
<li><a href="#PivotalHadoopEnhancements-CommandLineInterfaceforYARNcluster">Command Line Interface for YARN cluster</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-ListallCLIs">List all CLIs</a></li>
<li><a href="#PivotalHadoopEnhancements-ListNodes">List Nodes</a></li>
<li><a href="#PivotalHadoopEnhancements-GetNodeStatus">Get Node Status</a></li>
<li><a href="#PivotalHadoopEnhancements-Setnoderesourcecapacity">Set node resource capacity</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div></p><p> </p><p><span class="confluence-anchor-link" id="PivotalHadoopEnhancements-HDFSAwareness"></span></p><h2 id="PivotalHadoopEnhancements-HDFSOff-ClusterClientRackAwareness">HDFS Off-Cluster Client Rack Awareness</h2><p><strong>HDFS rack awareness</strong></p><p>HDFS rack awareness is a key feature to achieve localized I/O (locality).</p><p>With respect to read and write separately, HDFS has:</p><ul><li>BlockPlacementPolicy for write locality: namenode will look up network topology and construct a list of chosen nodes (pipeline) for requesting a block to locate, based on algorithms provided by a BlockPlacementPolicy.</li><li>Block pseudo distance sort for read locality: when reading a block, after obtaining all the located blocks, namenode sorts these located blocks based on their topological distance from the client. The closer nodes get higher priority for read.</li></ul><p>Both operations need to reference network topology, which is managed by the rack awareness feature. The rack awareness feature includes:</p><ul><li>A topology resolving framework: when datanodes register themselves on a namenode, that namenode will resolve their network location using their host name or ip, using DNSToSwitchMapping. This is a pluggable component that allows users to define their own topology based on their network layout. The most commonly used DNSToSwitchMapping is ScriptBasedMapping, which calls a shell script.</li><li>An in-memory topology tree: all registered datanodes' network locations are kept in a topology tree.</li></ul><p><strong>Problem: Ignored off-cluster clients</strong></p><p>The problem of the current implementation is that it does not support off-cluster clients. 
The figure below is an example of off-cluster clients:</p><p><img class="confluence-embedded-image" data-image-src="attachments/63901467/69468516.png" src="attachments/63901467/69468516.png"/></p><p>In this figure, node <strong>dn1</strong> is a datanode and its network location is /d1/r1, and so on for <strong>dn2</strong> and <strong>dn3</strong>. Node <strong>client0</strong> is an off-cluster node, which means there is no datanode deployed on <strong>client0</strong>. In this case, <strong>client0</strong> has no chance to register itself in the topology tree of the namenode. Therefore, both read and write operations select random nodes even though <strong>dn1</strong> is closer (more preferable) than either <strong>dn2</strong> or <strong>dn3</strong>. This problem will cause performance issues in the following cases:</p><ul><li>When a mapreduce cluster is not exactly co-located: some mapreduce clusters share the same hdfs cluster with other mapreduce clusters, or in some cases a mapreduce cluster will cover several hdfs clusters. In those cases, a large portion of I/O will be off-cluster client operations, which cannot benefit from localized I/O.</li><li>When a physical cluster is not dedicated to Hadoop: a physical cluster might not be dedicated to Hadoop and other supporting systems, such as data loading tools, might share the same cluster. In that case, the data loading tool can not benefit from localized I/O, even if the tool and hdfs shares the same rack/data center. The problem could be even more common in virtualized environments.</li></ul><p><strong>Solution: Design</strong></p><p>To tackle this problem, we changed the logic in the block placement policy and the block pseudo distance sort. We also resolved the network location of the client.</p><p><strong>Resolving client location</strong> <br/> Resolving the client location: we reused the framework that resolves datanodes. 
However, since we did not add client network locations into the topology tree (as explained below), we have to cache client locations to avoid unnecessary resolve operations.</p><p>As a result, we introduced two LRU caches:</p><ul><li>A black list for those clients that have no valid location or whose locations do not share the same rack with any datanode.</li><li>A white list, opposite to the black list, for those clients that are not datanodes but share the same rack with at least one datanode.</li></ul><p>Referring to the diagram of ignored off-cluster clients, the table below lists some examples of location cache.</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd" colspan="3"><p>Location Cache Examples</p></td></tr><tr><td class="confluenceTd"><p><strong>HostName</strong></p></td><td class="confluenceTd"><p><strong>Location</strong></p></td><td class="confluenceTd"><p><strong>Cache</strong></p></td></tr><tr><td class="confluenceTd"><p>client1</p></td><td class="confluenceTd"><p>d1/r1</p></td><td class="confluenceTd"><p>white list</p></td></tr><tr><td class="confluenceTd"><p>client2</p></td><td class="confluenceTd"><p>d2/r1</p></td><td class="confluenceTd"><p>black list</p></td></tr><tr><td class="confluenceTd"><p>client3</p></td><td class="confluenceTd"><p>null</p></td><td class="confluenceTd"><p>black list</p></td></tr></tbody></table></div><p><br class="atl-forced-newline"/> The size of the LRU cache is configurable, so you can limit the memory usage of namenode.</p><p><strong>Block placement policy</strong></p><p>The tables below demonstrate how the BlockPlacementPolicy has been changed to support non-datanode clients.</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd" colspan="2"><p>Former block placement algorithm</p></td></tr><tr><td class="confluenceTd"><p><strong>Replica</strong></p></td><td class="confluenceTd"><p><strong>Rule</strong></p></td></tr><tr><td 
class="confluenceTd"><p>1</p></td><td class="confluenceTd"><p>Client Local</p></td></tr><tr><td class="confluenceTd"><p>2</p></td><td class="confluenceTd"><p>Random node whose rack is different from replica 1</p></td></tr><tr><td class="confluenceTd"><p>3</p></td><td class="confluenceTd"><p>Random node who share the same rack with replica 2</p></td></tr><tr><td class="confluenceTd"><p>>=4</p></td><td class="confluenceTd"><p>Random node</p></td></tr></tbody></table></div><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd" colspan="2"><p>Changed block placement algorithm</p></td></tr><tr><td class="confluenceTd"><p><strong>Replica</strong></p></td><td class="confluenceTd"><p><strong>Rule</strong></p></td></tr><tr><td class="confluenceTd"><p>1</p></td><td class="confluenceTd"><p>Client Local if client is datanode, or a random node that shares the same rack with client, if client is not a datanode</p></td></tr><tr><td class="confluenceTd"><p>2</p></td><td class="confluenceTd"><p>Random node whose rack is different from replica 1</p></td></tr><tr><td class="confluenceTd"><p>3</p></td><td class="confluenceTd"><p>Random node who shares the same rack with replica 2</p></td></tr><tr><td class="confluenceTd"><p>>=4</p></td><td class="confluenceTd"><p>Random node</p></td></tr></tbody></table></div><h3 id="PivotalHadoopEnhancements-Usage">Usage</h3><p>The client rack aware feature is disabled by default. To enable, add the following to the <code>hdfs-site.xml</code> file:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><properties>
<property>
<name>dfs.rackawareness.with.client</name>
<value>true</value>
</property>
</properties>
<properties>
<property>
<name>dfs.rackawareness.with.client.blacklist.size</name>
<description>Black list size of client cache, 5000 by default.</description>
<value>5000</value>
</property>
</properties>
<properties>
<property>
<name>dfs.rackawareness.with.client.cache.size</name>
<description>White list size of the client cache; best set equal to
the size of the cluster. 2000 by default.</description>
<value>2000</value>
</property>
</properties></pre>
</div></div><p><br/> Note that you need to restart DFS after changing the configuration.</p><p><span class="confluence-anchor-link" id="PivotalHadoopEnhancements-Vaidya"></span></p><h2 id="PivotalHadoopEnhancements-Vaidya">Vaidya</h2><h3 id="PivotalHadoopEnhancements-Overview">Overview</h3><p>Vaidya is a diagnostic tool installed with PHD for Map/Reduce jobs. After a job is executed successfully, it uses a job history log and the job configuration information to identify any performance or scalability problems with the job. Upon execution, it provides a job analysis report indicating specific problems with the job, along with the remedy to correct them. The report element includes, "rule title", "rule description", "rule importance", "rule severity", "reference details" and "remedy/prescription" to rectify the problem. The "rule severity", is a product of rule impact and the rule importance.</p><p><strong>Note: </strong>The Vaidya tool does <em>not</em> analyze failed jobs, either for performance or scalability problems, nor for the reason for failures.</p><p>The Vaidya tool includes diagnostic rules (also referred to as "tests") where each rule analyzes a specific problem with the M/R job. A diagnostic rule is written as a Java class and captures the logic of how to detect a specific problem condition with the M/R job. Each diagnostic rule uses the job history log and job configuration information provided to it using a standard structured interface. 
The standard interface allows administrators and developers to independently add more diagnostic rules in the Vaidya tool.</p><h3 id="PivotalHadoopEnhancements-InstallingVaidyaFiles">Installing Vaidya Files</h3><p>By default, Vaidya files are installed at:</p><ul><li>The Vaidya JAR library is installed into <code>/usr/lib/gphd/hadoop-mapreduce/</code></li><li>The Vaidya default configuration file is installed into <code>/etc/gphd/hadoop/conf/</code></li></ul><h3 id="PivotalHadoopEnhancements-EnablingVaidya">Enabling Vaidya</h3><p> </p><p>On the history server node, go to the PHD configuration folder (by default, <code>/etc/gphd/hadoop/conf</code>), and add the following lines into the file <code>mapred-site.xml</code>.</p><p>Restart the job history server service to ensure the change takes effect.</p><p><strong>mapred-site.xml</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<name>mapreduce.vaidya.enabled</name>
<value>true</value>
</property>
<property>
<name>mapreduce.vaidya.jarfiles</name>
<value>/usr/lib/gphd/hadoop-mapreduce/hadoop-vaidya.jar</value>
</property>
<property>
<name>mapreduce.vaidya.testconf.file</name>
<value>/etc/gphd/hadoop/conf/postex_diagnosis_tests.xml</value>
</property></pre>
</div></div><h3 id="PivotalHadoopEnhancements-DisablingVaidya">Disabling Vaidya</h3><p>To disable Vaidya:</p><p>Set the property <code>mapreduce.vaidya.enabled</code> value to be <code>false</code>, or remove these lines from <code>mapred-site.xml</code>.</p><ul><li>The value of property mapreduce.vaidya.enabled should be changed to point to the correct jar file you installed. By default, this is<code>/usr/lib/gphd/hadoop-mapreduce/hadoop-vaidya-<HADOOP_PHD_VERSION>.jar</code>.</li><li>Restart the job history server service to ensure the change takes effect.</li></ul><h3 id="PivotalHadoopEnhancements-UsingVaidyatoAnalyzeJobs">Using Vaidya to Analyze Jobs</h3><ol><li>Ensure your job history server service is running.</li><li>Successfully run a map-reduce job for Vaidya to analyze.</li><li>Open the following URL in a web browser: <code> <code>http://<historyserver_host>:<historyserver_port>/jobhistory<br/> </code> </code><p>Where:</p><ul><li><code><historyserver_host></code> refers to the host name or IP address of the machine where you run job history server service.</li><li><code><historyserver_port></code> refers to the HTTP port job history server web where the UI listens. By default, this value is 19888. Your browser should show you the job history server UI page.</li></ul><code> <br/> </code></li><li>You will see a list of jobs that have run, including the most recent job. Click the job id of any job in this list, and you should see the detailed information for the job.</li><li><p>On the left side of the navigation area, there should be a link called <strong>Vaidya report</strong> under the navigation item <strong>Job</strong>. 
Click the <strong>Vaidya report</strong> link and Vaidya will analyze the job for you and show a report.</p></li></ol><h3 id="PivotalHadoopEnhancements-VaidyaConfigurationRules">Vaidya Configuration Rules</h3><p>After you installed Vaidya with PHD, rules configuration is installed as a <code>postex_diagnosis_tests.xml</code> XML file in <code>/etc/gphd/hadoop/conf</code></p><p>You can find all rules to be run on a selected job in this XML file, where each rule is defined as an XML <code>PostExPerformanceDiagnosisTests/DiagnosticTest</code> element, for example:</p><p>A rule in <code>postex_diagnosis_tests.xml</code></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><DiagnosticTest>
<Title><![CDATA[Balanced Reduce Partitioning]]></Title>
<ClassName>
<![CDATA[org.apache.hadoop.vaidya.postexdiagnosis.tests.BalancedReducePartitioning]]></ClassName>
<Description><![CDATA[This rule tests as to how well the input to reduce tasks is balanced]]></Description>
<Importance><![CDATA[High]]></Importance>
<SuccessThreshold><![CDATA[0.40]]></SuccessThreshold>
<Prescription><![CDATA[advice]]></Prescription>
<InputElement>
<PercentReduceRecords><![CDATA[85]]></PercentReduceRecords>
</InputElement>
</DiagnosticTest></pre>
</div></div><p><br/> The <code>Title</code> and <code>Description</code> elements provide a brief summary about what this rule is doing.</p><p>By editing <code>postex_diagnosis_tests.xml</code>, you can configure the rules.</p><p><strong>Notes</strong>:</p><ul><li>Remember to back up the original configuration file before editing it; an invalid XML configuration file may cause Vaidya to behave incorrectly.</li><li>Before you start editing rules, you should have background knowledge about XML syntax and how XML represents data (for example, what the CDATA element represents).</li></ul><h4 id="PivotalHadoopEnhancements-DisablingaRule">Disabling a Rule</h4><p>Comment out or remove the entire DiagnosticTest element.</p><h4 id="PivotalHadoopEnhancements-ChangingtheImportanceofaRule">Changing the Importance of a Rule</h4><p>Importance indicates how important a rule is relative to other rules in the same set. You can change the importance value by editing the Importance element in the XML file. The importance level serves as a factor that is multiplied with the impact value returned by each rule.</p><p>There are three values valid for this attribute: Low, Medium and High; their corresponding values are: 0.33, 0.66 and 0.99.</p><p>In the displayed Vaidya report, there is a value named Severity for each rule. A severity level is the result of multiplying the impact value (returned by the rule) and the importance factor (defined in the XML file).</p><p>For example, if a rule returns an impact of 0.5 and its importance is marked as Medium, then its severity is 0.5 * 0.66 = 0.33.</p><h4 id="PivotalHadoopEnhancements-ChangingSuccessThreshold">Changing Success Threshold</h4><p>Each rule calculates a value between 0 and 1 (inclusively) to indicate how healthy a job is according to the specified rule; this value is called impact. The smaller the impact is (that is, closer to 0), the healthier the job is.</p><p>To give a more straightforward result, you can set a threshold for each rule. 
Therefore, a rule whose impact value is larger than the threshold will be marked as "failed"; otherwise, it is marked as "passed".</p><p>Note that the threshold is compared with the impact value, rather than severity (which means making a rule less important will not make a failed rule succeed).</p><p>You can change the threshold value by editing the SuccessThreshold element in the XML file.</p><h4 id="PivotalHadoopEnhancements-ChangingInputParameters">Changing Input Parameters</h4><p>Some rules might need additional input parameters to complete their logic. You can specify additional parameters by editing/adding elements under the InputElement element of each rule.</p><h4 id="PivotalHadoopEnhancements-Other">Other</h4><p>For a full explanation and instructions about the meaning of each XML element, as well as how to change them, refer to the Apache's Official <a class="external-link" href="https://hadoop.apache.org/docs/stable1/vaidya.html" rel="nofollow">Vaidya Guide </a> for more information.</p><h4 id="PivotalHadoopEnhancements-AddingaNewRule">Adding a New Rule</h4><p>A Vaidya rule consists of the following two parts:</p><ul><li>A java class that consists of the logic of the rule</li><li>A paragraph of XML in the configuration file</li></ul><h4 id="PivotalHadoopEnhancements-CreatingaJavaBinaryforaNewRule">Creating a Java Binary for a New Rule</h4><p><strong>Important</strong>: This section assumes a working knowledge of how to write, compile, and package Java code.</p><ol><li>From where you installed PHD, download the correct <code>hadoop-vaidya-<HADOOP_PHD_VERSION>.jar</code> file (which you specified in<code> mapred-site.xml</code>) to your development machine, if you plan on writing Java code on another machine than the one where you installed PHD. 
(This is a typical case.)</li><li><p>Create a java file with an IDE or editor, which defines a class that extends the <code>org.apache.hadoop.vaidya.DiagnosticTest</code> class:<br/> <strong>myrule.java</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">package com.greenplum.vaidya.rules;
import org.apache.hadoop.vaidya.DiagnosticTest;
import org.apache.hadoop.vaidya.statistics.job.JobStatistics;
public class MyRule extends DiagnosticTest {
@Override
public String getReferenceDetails() {
return "";
}
@Override
public String getPrescription() {
return "";
}
@Override
public double evaluate(JobStatistics jobStatistics) {
return 0.5;
}
}</pre>
</div></div></li><li><p>Edit the three methods <code>getReferenceDetails</code>, <code>getPrescription</code> and <code>evaluate</code> to construct the logic. The <code>evaluate</code> method should return a <strong>double</strong> value between 0.0 and 1.0 that represents the impact as the analysis result.</p><ul><li><code>getPrescription</code> method should return some text providing user suggestions/remedies about how to optimize your Map/Reduce configuration accordingly.</li><li><code>getReferenceDetails</code> method should return some text indicating the meaningful counters and their values which can help you to diagnose your Map/Reduce configuration accordingly.</li></ul></li><li><p>Compile the Java class and package the compiled class into a jar file, for example, <code>myrule.jar</code>. Note that you need to put the Vaidya jar file you just downloaded into your class path to make your code compile.</p></li></ol><h4 id="PivotalHadoopEnhancements-CreatingXMLConfigurationForaNewRule">Creating XML Configuration For a New Rule</h4><p>Add a <code>DiagnosticTest</code> element into the <code>postex_diagnosis_tests.xml</code> file (the file you set in <code>mapred-site.xml</code> file), according to the sample given in the configuration part. Ensure the value of <code>ClassName</code> element is set to be the full class name of the java rule class you just created.</p><h4 id="PivotalHadoopEnhancements-Deployingfiles">Deploying files</h4><ol><li>Upload the packaged jar file (<code>myrule.jar</code> for example) to the node where you installed PHD job tracker, and store it in a folder where hadoop service has the permission to read and load it. 
We recommend you place it under <code>/usr/lib/gphd/hadoop-mapreduce/lib/</code>.</li><li><p>Edit <code>mapred-site.xml</code>, append the jar file you just uploaded to the <code>mapred.vaidya.jarfiles</code> or <code>mapreduce.vaidya.jarfiles</code> property value, for example:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">mapred-site.xml
<property>
<name>mapreduce.vaidya.jarfiles</name>
<value>/usr/lib/gphd/hadoop-mapreduce/hadoop-vaidya.jar:/usr/lib/gphd/hadoop-mapreduce/lib/myrule.jar</value>
</property></pre>
</div></div></li></ol><p><strong>Important</strong>:</p><ul><li>Do not remove the default Vaidya jar file from this property; Vaidya needs this property to load basic Vaidya classes to make it run.</li><li>Multiple jar files are separated by different separator characters on different platforms. On the Linux/Unix platform, the ":" character should be used. You can look at the <code>File.pathSeparator</code> attribute of your Java platform to verify it.</li><li>To make your settings take effect, restart the job history server service.</li></ul><h2 id="PivotalHadoopEnhancements-HVETopologyAwareness">HVE Topology Awareness</h2><p>Hadoop Virtualization Extensions (HVE) allow Hadoop clusters implemented on virtualized infrastructure full awareness of the topology on which they are running, thus enhancing the reliability and performance of these clusters.</p><p>HVE should be enabled in the following situations:</p><ul><li>When there is more than one Hadoop VM per physical host in virtualized environments.</li><li>When Datanodes and NodeManagers/TaskTrackers exist in separate virtual machines in virtualized environments, in order to achieve graceful scaling of the compute component of the Hadoop cluster.</li><li>When there is a topology layer between host and rack (e.g. chassis), which can affect the failure/locality group between hosts, in non-virtualized environments.</li></ul><h3 id="PivotalHadoopEnhancements-TopologyAwarenessConfigurationandVerification">Topology Awareness Configuration and Verification</h3><p><strong>Sample Setup</strong></p><p>This setup has 2 logical racks, 2 physical hosts (installed by ESXi and managed by vCenter) per rack, and 2 DN/NM (VM in ESXi) nodes per host. 
There is also one NameNode/ResourceManager and a client node that can be used to start jobs.</p><p>In this setup, each DN/NM node has 4 vCPUs, 16G memory, and 200G (Non-SSD) disks.</p><p>The NameNode and ResourceManager are installed on another dedicated VM with 4vCPU, 4G Memory and 100G disks.</p><p>Node Distribution on Hosts:</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd"><strong>Rack 1</strong></td><td class="confluenceTd"><strong>Host 1</strong></td><td class="confluenceTd"><strong>NameNode and ResourceManager</strong></td><td class="confluenceTd"><strong>DN1</strong></td></tr><tr><td class="confluenceTd"> </td><td class="confluenceTd"><strong>Host 2</strong></td><td class="confluenceTd"><strong>DN2</strong></td><td class="confluenceTd"><strong>DN3</strong></td></tr><tr><td class="confluenceTd"><strong>Rack 2</strong></td><td class="confluenceTd"><strong>Host 3</strong></td><td class="confluenceTd"><strong>DN4</strong></td><td class="confluenceTd"><strong>DN5</strong></td></tr><tr><td class="confluenceTd"> </td><td class="confluenceTd"><strong>Host 4</strong></td><td class="confluenceTd"><strong>DN6</strong></td><td class="confluenceTd"><strong>DN7</strong></td></tr></tbody></table></div><p><strong>Enable topology awareness (Hadoop V2)</strong></p><ol><li><p>Add the following line to <code>core-site.xml</code>:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<name>topology.script.file.name</name>
<value>/hadoop/hadoop-smoke/etc/hadoop/topology.sh</value> <!-- point to topology.sh location. -->
</property>
<property>
<name>net.topology.impl</name>
<value>org.apache.hadoop.net.NetworkTopologyWithNodeGroup</value>
<description> The default implementation of NetworkTopology which is classic three layer one.
</description>
</property>
<property>
<name>net.topology.nodegroup.aware</name>
<value>true</value>
<description> By default, network topology is not aware of nodegroup layer.
</description>
</property>
<property>
<name>dfs.block.replicator.classname</name>
<value>org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyWithNodeGroup</value>
<description> The default implementation of ReplicationTargetChooser.
</description>
</property></pre>
</div></div></li><li><p>Add the following line to <code>yarn-site.xml</code>:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<description>The class to use as scheduled requests.</description>
<name>yarn.resourcemanager.scheduled.requests.class</name>
<value>org.apache.hadoop.mapreduce.v2.app.rm.ScheduledRequestsWithNodeGroup</value>
</property>
<property>
<description> The boolean value to identify if the cluster is deployed on an environment which needs an additional layer (node group) between node and rack for network topology.
</description>
<name>net.topology.with.nodegroup</name>
<value>true</value>
</property>
<property>
<description>The class to use as AbstractSchedulerElementsFactory in RM scheduler.</description>
<name>yarn.resourcemanager.scheduler.elements.factory.impl</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerElementsFactoryWithNodeGroup</value>
</property></pre>
</div></div><p> </p><p><strong>Topology.data sample:</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">[root@namenode enable]# cat topology.data
10.111.57.223(VM IP) /Rack1/NodeGroup1
10.111.57.224 /Rack1/NodeGroup1
10.111.57.225 /Rack1/NodeGroup2
10.111.57.226 /Rack2/NodeGroup1
10.111.57.227 /Rack2/NodeGroup1
10.111.57.228 /Rack2/NodeGroup2
10.111.57.229 /Rack2/NodeGroup2</pre>
</div></div><p><strong>Topology.sh sample:</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">[root@namenode enable]# cat topology.sh
#! /bin/bash
# Topology script: maps each node name/IP passed as an argument to its
# network location (rack/nodegroup) by looking it up in topology.data.
HADOOP_CONF=/hadoop/hadoop-smoke/etc/hadoop
# this is the location of topology.data
while [ $# -gt 0 ] ; do
nodeArg=$1
# Re-open the mapping file on stdin for each argument.
exec< ${HADOOP_CONF}/topology.data
result=""
while read line ; do
# Split the line into words: ar[0] is the node, ar[1] is its location.
ar=( $line )
if [ "${ar[0]}" = "$nodeArg" ] ; then
result="${ar[1]}"
fi
done
shift
# Emit the default rack when the node is not listed in topology.data.
if [ -z "$result" ] ; then
echo -n "/default/rack "
else
echo -n "$result "
fi
done</pre>
</div></div></li><li><p>Verify HVE is enabled:</p>Run the TestDFSIO script:</li></ol><p style="margin-left: 30.0px;">The output is as follows:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">1)HVE enabled:
Job Counters
Launched map tasks=100
Launched reduce tasks=1
Data-local map tasks=26
NODEGROUP_LOCAL_MAPS=49
Rack-local map tasks=25
2)HVE disabled:
Job Counters
Launched map tasks=100
Launched reduce tasks=1
Data-local map tasks=20
Rack-local map tasks=80</pre>
</div></div><h2 id="PivotalHadoopEnhancements-HVEElasticity">HVE Elasticity</h2><p>HVE Elastic Resource Extension enables the adaption of MapReduce tasks to changing resources on nodes/clusters where Hadoop clusters are deployed to virtualized environments, by sharing resources with VMs from other clusters or applications.</p><h3 id="PivotalHadoopEnhancements-Overview.1">Overview</h3><p>Currently, the Hadoop resource model is static at the node level, assuming the node resources are not changed while the cluster is running. This design and implementation are based on an assumption that all cluster resources are dedicated for Hadoop MapReduce jobs, so they are fully available at all times. This assumption does not hold when users want to deploy multiple applications on the same cluster, e.g. deploying HBase and MapReduce on the same HDFS cluster. In particular, in an era of cloud computing, it is common for Hadoop clusters to be deployed on virtualized environments by sharing resource with VMs from other clusters or applications.</p><p>The HVE elastic resource feature addresses scenarios in which nodes' resources are possibly changed, so that scheduling of MapReduce tasks on these nodes can adapted to changing resources.</p><p>With this feature, APIs (CLI and JMX interface) and script tools are provided to get/set resources (memory, v-cores) on Hadoop cluster nodes for MR jobs.</p><h3 id="PivotalHadoopEnhancements-FunctionList">Function List</h3><p>Below are functionalities included in this elastic feature:<br/>Function List</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh">Function</th><th class="confluenceTh">Description</th></tr><tr><td class="confluenceTd"><p>Configuration</p></td><td class="confluenceTd">Enable/disable elastic resource feature on Hadoop cluster by specifying a configuration property when starting MR cluster.</td></tr><tr><td class="confluenceTd">List nodes' status</td><td class="confluenceTd"><p>List 
the status of all the nodes or nodes specified by user.</p><p>The node status includes its memory resource, v-core resource, hostname, health status, etc.</p></td></tr><tr><td class="confluenceTd" colspan="1">Set resource capacity in Node Manager node</td><td class="confluenceTd" colspan="1">Set resource capacity (memory, v-cores) of Node Manager to a node specified by user via CLI or JMX interface.</td></tr></tbody></table></div><p> </p><h3 id="PivotalHadoopEnhancements-Configuration">Configuration</h3><p>To enable elastic resources, make the following changes to the Hadoop configuration.</p><p>In <code>yarn-site.xml</code>, add the following property to enable the elastic resource feature:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<name>yarn.dynamic.resource.enable</name>
<value>true</value>
</property></pre>
</div></div><h3 id="PivotalHadoopEnhancements-CommandLineInterfaceforYARNcluster">Command Line Interface for YARN cluster</h3><h4 id="PivotalHadoopEnhancements-ListallCLIs">List all CLIs</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn rmadmin
[-refreshQueues]
[-refreshNodes]
[-refreshUserToGroupsMappings]
[-refreshSuperUserGroupsConfiguration]
[-refreshAdminAcls]
[-refreshServiceAcl]
[-getGroups [username]]
[-updateNodeResource [NodeID][MemSize][Cores]]
[-help [cmd]]
yarn node
-all Works with -list to list all nodes.
-list List all running nodes. Supports optional use of
-states to filter nodes based on node state, all -all
to list all nodes.
-states <States> Works with -list to filter nodes based on input
comma-separated list of node states.
-status <NodeId> Prints the status report of the node.</pre>
</div></div><h4 id="PivotalHadoopEnhancements-ListNodes">List Nodes</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -list -all</pre>
</div></div><h4 id="PivotalHadoopEnhancements-GetNodeStatus">Get Node Status</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -status <NodeId></pre>
</div></div><h4 id="PivotalHadoopEnhancements-Setnoderesourcecapacity">Set node resource capacity</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn rmadmin -updateNodeResource [NodeID][MemSize][Cores]</pre>
</div></div><p><strong>Example</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeHeader panelHeader pdl" style="border-bottom-width: 1px;"><b>List all nodes</b></div><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -list -all
Total Nodes:8
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
hdsh2-a172.lss.emc.com:37804 RUNNING hdsh2-a172.lss.emc.com:8042 0
hdsh2-a173.lss.emc.com:45310 RUNNING hdsh2-a173.lss.emc.com:8042 0
hdsh2-a159.lss.emc.com:60596 RUNNING hdsh2-a159.lss.emc.com:8042 0
hdsh2-a158.lss.emc.com:51694 RUNNING hdsh2-a158.lss.emc.com:8042 0
hdsh2-a157.lss.emc.com:37348 RUNNING hdsh2-a157.lss.emc.com:8042 0
hdsh2-a174.lss.emc.com:33263 RUNNING hdsh2-a174.lss.emc.com:8042 0
hdsh2-a160.lss.emc.com:34460 RUNNING hdsh2-a160.lss.emc.com:8042 0
hdsh2-a171.lss.emc.com:49469 RUNNING hdsh2-a171.lss.emc.com:8042 0</pre>
</div></div><div class="code panel pdl" style="border-width: 1px;"><div class="codeHeader panelHeader pdl" style="border-bottom-width: 1px;"><b>Get node status</b></div><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -status hdsh2-a172.lss.emc.com:37804
Node Report :
Node-Id : hdsh2-a172.lss.emc.com:37804
Rack : /dc/rc2
Node-State : RUNNING
Node-Http-Address : hdsh2-a172.lss.emc.com:8042
Last-Health-Update : Wed 15/Jan/14 03:38:08:402CST
Health-Report :
Containers : 0
Memory-Used : 0MB
Memory-Capacity : 2048MB
CPU-Used : 0 vcores
CPU-Capacity : 2 vcores</pre>
</div></div><div class="code panel pdl" style="border-width: 1px;"><div class="codeHeader panelHeader pdl" style="border-bottom-width: 1px;"><b>Set node resource capacity</b></div><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn rmadmin -updateNodeResource hdsh2-a172.lss.emc.com:37804 2048 2</pre>
</div></div>
</div></div>
</div><!-- end of content-->
</div><!-- end of container -->
</div><!--end of container-fluid-->
</div><!--end of main-wrap-->
<div class="site-footer desktop-only">
<div class="container-fluid">
<div class="site-footer-links">
<span class="version"><a href='/'>Pivotal Documentation</a></span>
<span>©
<script>
// Render the current year into the copyright notice.
document.write(new Date().getFullYear());
</script>
<a href='http://gopivotal.com'>Pivotal Software</a> Inc. All Rights Reserved.
</span>
</div>
</div>
</div>
<script type="text/javascript">
(function() {
  // Guard so Munchkin.init fires exactly once, whichever load event wins.
  var initialized = false;
  function startMunchkin() {
    if (initialized) {
      return;
    }
    initialized = true;
    Munchkin.init('625-IUJ-009');
  }
  // Inject the Munchkin tracking script asynchronously, matching the
  // page's protocol, and initialize once it finishes loading.
  var loader = document.createElement('script');
  loader.type = 'text/javascript';
  loader.async = true;
  loader.src = document.location.protocol + '//munchkin.marketo.net/munchkin.js';
  // Legacy IE load-detection path.
  loader.onreadystatechange = function() {
    if (this.readyState == 'complete' || this.readyState == 'loaded') {
      startMunchkin();
    }
  };
  loader.onload = startMunchkin;
  document.getElementsByTagName('head')[0].appendChild(loader);
})();
</script>
</div><!--end of viewport-->
<div id="scrim"></div>
</body>
</html>