forked from vmware-archive/docs-hd-staging
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PivotalHadoopEnhancements.html
868 lines (789 loc) · 58.6 KB
/
PivotalHadoopEnhancements.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<!-- Always force latest IE rendering engine or request Chrome Frame -->
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible">
<!-- REPLACE X WITH PRODUCT NAME -->
<title>Pivotal Hadoop Enhancements | Pivotal Docs</title>
<!-- Local CSS stylesheets -->
<link href="/stylesheets/master.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/breadcrumbs.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/search.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/portal-style.css" media="screen,print" rel="stylesheet" type="text/css" />
<link href="/stylesheets/printable.css" media="print" rel="stylesheet" type="text/css" />
<!-- Confluence HTML stylesheet -->
<link href="/stylesheets/site-conf.css" media="screen,print" rel="stylesheet" type="text/css" />
<!-- Left-navigation code -->
<!-- http://www.designchemical.com/lab/jquery-vertical-accordion-menu-plugin/examples/# -->
<link href="/stylesheets/dcaccordion.css" rel="stylesheet" type="text/css" />
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script>
<script src="/javascripts/jquery.cookie.js" type="text/javascript"></script>
<script src="/javascripts/jquery.hoverIntent.minified.js" type="text/javascript"></script>
<script src="/javascripts/jquery.dcjqaccordion.2.7.min.js" type="text/javascript"></script>
<script type="text/javascript">
// Initialise the collapsible left-navigation accordion once the DOM is ready.
// Uses the dcjqaccordion plugin loaded above; state is persisted via cookie.
jQuery(function ($) {
  var accordionOptions = {
    eventType: 'click',   // expand/collapse on click rather than hover
    autoClose: true,      // opening one section closes the others
    saveState: true,      // remember open section across page loads (cookie)
    disableLink: false,   // parent links remain navigable
    speed: 'fast',
    classActive: 'test',
    showCount: false      // no child-item counters next to parents
  };
  $('#accordion-1').dcAccordion(accordionOptions);
});
</script>
<link href="/stylesheets/grey.css" rel="stylesheet" type="text/css" />
<!-- End left-navigation code -->
<script src="/javascripts/all.js" type="text/javascript"></script>
<link href="https://www.gopivotal.com/misc/favicon.ico" rel="shortcut icon">
<script type="text/javascript">
// Google Analytics (classic async ga.js snippet).
// Guarded by hostname so only the production docs site is tracked;
// staging/local copies of this page send no pageviews.
if (window.location.host === 'docs.gopivotal.com') {
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-39702075-1']);
_gaq.push(['_setDomainName', 'gopivotal.com']);
_gaq.push(['_trackPageview']);
// Inject ga.js asynchronously, choosing the host that matches the page
// protocol (ssl. for https, www. for http) to avoid mixed content.
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
}
</script>
</head>
<body class="pivotalcf pivotalcf_getstarted pivotalcf_getstarted_index">
<div class="viewport">
<div class="mobile-navigation--wrapper mobile-only">
<div class="navigation-drawer--container">
<div class="navigation-item-list">
<div class="navbar-link active">
<a href="http://gopivotal.com">
Home
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/paas">
PaaS
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/big-data">
Big Data
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/agile">
Agile
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/support">
Help & Support
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/products">
Products
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/solutions">
Solutions
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
<div class="navbar-link">
<a href="http://gopivotal.com/partners">
Partners
<i class="icon-chevron-right pull-right"></i>
</a>
</div>
</div>
</div>
<div class="mobile-nav">
<div class="nav-icon js-open-nav-drawer">
<i class="icon-reorder"></i>
</div>
<div class="header-center-icon">
<a href="http://gopivotal.com">
<div class="icon icon-pivotal-logo-mobile"></div>
</a>
</div>
</div>
</div>
<div class='wrap'>
<script src="//use.typekit.net/clb0qji.js" type="text/javascript"></script>
<script type="text/javascript">
// Start Typekit web-font loading (kit script included just above).
// The empty catch is deliberate best-effort: a Typekit outage or a
// blocked request must not break the rest of the page's scripts.
try {
Typekit.load();
} catch (e) {
}
</script>
<script type="text/javascript">
// Relax this page's origin to the registrable parent domain so frames and
// scripts served from sibling subdomains (docs./www.gopivotal.com) can
// interact with it.
// NOTE(review): document.domain is deprecated and ignored by modern
// browsers when origin-keyed agent clustering is on — confirm whether any
// cross-subdomain embedding still relies on this.
document.domain = "gopivotal.com";
</script>
<script type="text/javascript">
// Load the Source Sans Pro family via Google's WebFont loader.
WebFontConfig = {
google: { families: [ 'Source+Sans+Pro:300italic,400italic,600italic,300,400,600:latin' ] }
};
(function() {
// Inject webfont.js asynchronously, matching the page protocol so the
// request is never blocked as mixed content.
var wf = document.createElement('script');
wf.src = ('https:' == document.location.protocol ? 'https' : 'http') +
'://ajax.googleapis.com/ajax/libs/webfont/1/webfont.js';
wf.type = 'text/javascript';
// Fix: 'async' is a boolean DOM property — assign true, not the string
// 'true' (the original only worked because a non-empty string is truthy).
wf.async = true;
var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(wf, s);
})();
</script>
<div id="search-dropdown-box">
<div class="search-dropdown--container js-search-dropdown">
<div class="container-fluid">
<div class="close-menu-large"><img src="http://www.gopivotal.com/sites/all/themes/gopo13/images/icon-close.png" /></div>
<div class="search-form--container">
<div class="form-search">
<div class='gcse-search'></div>
<script src="http://www.google.com/jsapi" type="text/javascript"></script>
<script src="/javascripts/cse.js" type="text/javascript"></script>
</div>
</div>
</div>
</div>
</div>
<header class="navbar desktop-only" id="nav">
<div class="navbar-inner">
<div class="container-fluid">
<div class="pivotal-logo--container">
<a class="pivotal-logo" href="http://gopivotal.com"><span></span></a>
</div>
<ul class="nav pull-right">
<li class="navbar-link">
<a href="http://www.gopivotal.com/paas" id="paas-nav-link">PaaS</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/big-data" id="big-data-nav-link">BIG DATA</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/agile" id="agile-nav-link">AGILE</a>
</li>
<li class="navbar-link">
<a href="http://www.gopivotal.com/oss" id="oss-nav-link">OSS</a>
</li>
<li class="nav-search">
<a class="js-search-input-open" id="click-to-search"><span></span></a>
</li>
</ul>
</div>
<a href="http://www.gopivotal.com/contact">
<img id="get-started" src="http://www.gopivotal.com/sites/all/themes/gopo13/images/get-started.png">
</a>
</div>
</header>
<div class="main-wrap">
<div class="container-fluid">
<!-- Google CSE Search Box -->
<div id='docs-search'>
<gcse:search></gcse:search>
</div>
<div id='all-docs-link'>
<a href="http://docs.gopivotal.com/">All Documentation</a>
</div>
<div class="container">
<div id="sub-nav" class="nav-container">
<!-- Collapsible left-navigation-->
<ul class="accordion" id="accordion-1">
<!-- REPLACE <li/> NODES-->
<li>
<a href="index.html">Home</a></li>
<li>
<a href="PivotalHD.html">Pivotal HD 2.0.1</a>
<ul>
<li>
<a href="PHDEnterprise2.0.1ReleaseNotes.html">PHD Enterprise 2.0.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="PHDInstallationandAdministration.html">PHD Installation and Administration</a>
<ul>
<li>
<a href="OverviewofPHD.html">Overview of PHD</a>
</li>
</ul>
<ul>
<li>
<a href="InstallationOverview.html">Installation Overview</a>
</li>
</ul>
<ul>
<li>
<a href="PHDInstallationChecklist.html">PHD Installation Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingPHDUsingtheCLI.html">Installing PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradeChecklist.html">Upgrade Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradingPHDUsingtheCLI.html">Upgrading PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="AdministeringPHDUsingtheCLI.html">Administering PHD Using the CLI</a>
</li>
</ul>
<ul>
<li>
<a href="PHDFAQFrequentlyAskedQuestions.html">PHD FAQ (Frequently Asked Questions)</a>
</li>
</ul>
<ul>
<li>
<a href="PHDTroubleshooting.html">PHD Troubleshooting</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="StackandToolsReference.html">Stack and Tools Reference</a>
<ul>
<li>
<a href="OverviewofApacheStackandPivotalComponents.html">Overview of Apache Stack and Pivotal Components</a>
</li>
</ul>
<ul>
<li>
<a href="ManuallyInstallingPivotalHD2.0Stack.html">Manually Installing Pivotal HD 2.0 Stack</a>
</li>
</ul>
<ul>
<li>
<a href="ManuallyUpgradingPivotalHDStackfrom1.1.1to2.0.html">Manually Upgrading Pivotal HD Stack from 1.1.1 to 2.0</a>
</li>
</ul>
<ul>
<li>
<a href="PivotalHadoopEnhancements.html">Pivotal Hadoop Enhancements</a>
</li>
</ul>
<ul>
<li>
<a href="Security.html">Security</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="PivotalCommandCenter.html">Pivotal Command Center 2.2.1</a>
<ul>
<li>
<a href="PCC2.2.1ReleaseNotes.html">PCC 2.2.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="PCCUserGuide.html">PCC User Guide</a>
<ul>
<li>
<a href="PCCOverview.html">PCC Overview</a>
</li>
</ul>
<ul>
<li>
<a href="PCCInstallationChecklist.html">PCC Installation Checklist</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingPCC.html">Installing PCC</a>
</li>
</ul>
<ul>
<li>
<a href="UsingPCC.html">Using PCC</a>
</li>
</ul>
<ul>
<li>
<a href="CreatingaYUMEPELRepository.html">Creating a YUM EPEL Repository</a>
</li>
</ul>
<ul>
<li>
<a href="CommandLineReference.html">Command Line Reference</a>
</li>
</ul>
</li>
</ul>
</li>
<li>
<a href="PivotalHAWQ.html">Pivotal HAWQ 1.2.0</a>
<ul>
<li>
<a href="HAWQ1.2.0.1ReleaseNotes.html">HAWQ 1.2.0.1 Release Notes</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQInstallationandUpgrade.html">HAWQ Installation and Upgrade</a>
<ul>
<li>
<a href="PreparingtoInstallHAWQ.html">Preparing to Install HAWQ</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingHAWQ.html">Installing HAWQ</a>
</li>
</ul>
<ul>
<li>
<a href="InstallingtheHAWQComponents.html">Installing the HAWQ Components</a>
</li>
</ul>
<ul>
<li>
<a href="UpgradingHAWQandComponents.html">Upgrading HAWQ and Components</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQConfigurationParameterReference.html">HAWQ Configuration Parameter Reference</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="HAWQAdministration.html">HAWQ Administration</a>
<ul>
<li>
<a href="HAWQOverview.html">HAWQ Overview</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQQueryProcessing.html">HAWQ Query Processing</a>
</li>
</ul>
<ul>
<li>
<a href="UsingHAWQtoQueryData.html">Using HAWQ to Query Data</a>
</li>
</ul>
<ul>
<li>
<a href="ConfiguringClientAuthentication.html">Configuring Client Authentication</a>
</li>
</ul>
<ul>
<li>
<a href="KerberosAuthentication.html">Kerberos Authentication</a>
</li>
</ul>
<ul>
<li>
<a href="ExpandingtheHAWQSystem.html">Expanding the HAWQ System</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQInputFormatforMapReduce.html">HAWQ InputFormat for MapReduce</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQFilespacesandHighAvailabilityEnabledHDFS.html">HAWQ Filespaces and High Availability Enabled HDFS</a>
</li>
</ul>
<ul>
<li>
<a href="SQLCommandReference.html">SQL Command Reference</a>
</li>
</ul>
<ul>
<li>
<a href="ManagementUtilityReference.html">Management Utility Reference</a>
</li>
</ul>
<ul>
<li>
<a href="ClientUtilityReference.html">Client Utility Reference</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQServerConfigurationParameters.html">HAWQ Server Configuration Parameters</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQEnvironmentVariables.html">HAWQ Environment Variables</a>
</li>
</ul>
<ul>
<li>
<a href="HAWQDataTypes.html">HAWQ Data Types</a>
</li>
</ul>
<ul>
<li>
<a href="SystemCatalogReference.html">System Catalog Reference</a>
</li>
</ul>
<ul>
<li>
<a href="hawq_toolkitReference.html">hawq_toolkit Reference</a>
</li>
</ul>
</li>
</ul>
<ul>
<li>
<a href="PivotalExtensionFrameworkPXF.html">Pivotal Extension Framework (PXF)</a>
<ul>
<li>
<a href="PXFInstallationandAdministration.html">PXF Installation and Administration</a>
</li>
</ul>
<ul>
<li>
<a href="PXFExternalTableandAPIReference.html">PXF External Table and API Reference</a>
</li>
</ul>
</div><!--end of sub-nav-->
<h3 class="title-container">Pivotal Hadoop Enhancements</h3>
<div class="content">
<!-- Python script replaces main content -->
<div id="main"><div style="visibility:hidden; height:2px;">Pivotal Product Documentation : Pivotal Hadoop Enhancements</div><div class="wiki-content group" id="main-content">
<p>Pivotal HD is a full Apache Hadoop distribution with Pivotal add-ons and a native integration with the Pivotal Greenplum database.</p><p><style type="text/css">/*<![CDATA[*/
div.rbtoc1400035785721 {padding: 0px;}
div.rbtoc1400035785721 ul {list-style: disc;margin-left: 0px;}
div.rbtoc1400035785721 li {margin-left: 0px;padding-left: 0px;}
/*]]>*/</style><div class="toc-macro rbtoc1400035785721">
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-HDFSOff-ClusterClientRackAwareness">HDFS Off-Cluster Client Rack Awareness</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-Usage">Usage</a></li>
</ul>
</li>
<li><a href="#PivotalHadoopEnhancements-Vaidya">Vaidya</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-Overview">Overview</a></li>
<li><a href="#PivotalHadoopEnhancements-InstallingVaidyaFiles">Installing Vaidya Files</a></li>
<li><a href="#PivotalHadoopEnhancements-EnablingVaidya">Enabling Vaidya</a></li>
<li><a href="#PivotalHadoopEnhancements-DisablingVaidya">Disabling Vaidya</a></li>
<li><a href="#PivotalHadoopEnhancements-UsingVaidyatoAnalyzeJobs">Using Vaidya to Analyze Jobs</a></li>
<li><a href="#PivotalHadoopEnhancements-VaidyaConfigurationRules">Vaidya Configuration Rules</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-DisablingaRule">Disabling a Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-ChangingtheImportanceofaRule">Changing the Importance of a Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-ChangingSuccessThreshold">Changing Success Threshold</a></li>
<li><a href="#PivotalHadoopEnhancements-ChangingInputParameters">Changing Input Parameters</a></li>
<li><a href="#PivotalHadoopEnhancements-Other">Other</a></li>
<li><a href="#PivotalHadoopEnhancements-AddingaNewRule">Adding a New Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-CreatingaJavaBinaryforaNewRule">Creating a Java Binary for a New Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-CreatingXMLConfigurationForaNewRule">Creating XML Configuration For a New Rule</a></li>
<li><a href="#PivotalHadoopEnhancements-Deployingfiles">Deploying files</a></li>
</ul>
</li>
</ul>
</li>
<li><a href="#PivotalHadoopEnhancements-HVETopologyAwareness">HVE Topology Awareness</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-TopologyAwarenessConfigurationandVerification">Topology Awareness Configuration and Verification</a></li>
</ul>
</li>
<li><a href="#PivotalHadoopEnhancements-HVEElasticity">HVE Elasticity</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-Overview.1">Overview</a></li>
<li><a href="#PivotalHadoopEnhancements-FunctionList">Function List</a></li>
<li><a href="#PivotalHadoopEnhancements-Configuration">Configuration</a></li>
<li><a href="#PivotalHadoopEnhancements-CommandLineInterfaceforYARNcluster">Command Line Interface for YARN cluster</a>
<ul class="toc-indentation">
<li><a href="#PivotalHadoopEnhancements-ListallCLIs">List all CLIs</a></li>
<li><a href="#PivotalHadoopEnhancements-ListNodes">List Nodes</a></li>
<li><a href="#PivotalHadoopEnhancements-GetNodeStatus">Get Node Status</a></li>
<li><a href="#PivotalHadoopEnhancements-Setnoderesourcecapacity">Set node resource capacity</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div></p><p> </p><p><span class="confluence-anchor-link" id="PivotalHadoopEnhancements-HDFSAwareness"></span></p><h2 id="PivotalHadoopEnhancements-HDFSOff-ClusterClientRackAwareness">HDFS Off-Cluster Client Rack Awareness</h2><p><strong>HDFS rack awareness</strong></p><p>HDFS rack awareness is a key feature to achieve localized I/O (locality).</p><p>With respect to read and write separately, HDFS has:</p><ul><li>BlockPlacementPolicy for write locality: namenode will look up network topology and construct a list of chosen nodes (pipeline) for requesting a block to locate, based on algorithms provided by a BlockPlacementPolicy.</li><li>Block pseudo distance sort for read locality: when reading a block, after obtaining all the located blocks, namenode sorts these located blocks based on their topological distance from the client. The closer nodes get higher priority for read.</li></ul><p>Both operations need to reference network topology, which is managed by the rack awareness feature. The rack awareness feature includes:</p><ul><li>A topology resolving framework: when datanodes register themselves on a namenode, that namenode will resolve their network location using their host name or ip, using DNSToSwitchMapping. This is a pluggable component that allows users to define their own topology based on their network layout. The most commonly used DNSToSwitchMapping is ScriptBasedMapping, which calls a shell script.</li><li>An in-memory topology tree: all registered datanodes' network locations are kept in a topology tree.</li></ul><p><strong>Problem: Ignored off-cluster clients</strong></p><p>The problem of the current implementation is that it does not support off-cluster clients. 
The figure below is an example of off-cluster clients:</p><p><img class="confluence-embedded-image" data-image-src="attachments/63901467/69468516.png" src="attachments/63901467/69468516.png"/></p><p>In this figure, node <strong>dn1</strong> is a datanode and its network location is /d1/r1, and so on for <strong>dn2</strong> and <strong>dn3</strong>. Node <strong>client0</strong> is an off-cluster node, which means there is no datanode deployed on <strong>client0</strong>. In this case, <strong>client0</strong> has no chance to register itself in the topology tree of the namenode. Therefore, both read and write operations select random nodes even though <strong>dn1</strong> is closer (more preferable) than either <strong>dn2</strong> or <strong>dn3</strong>. This problem will cause performance issues in the following cases:</p><ul><li>When a mapreduce cluster is not exactly co-located: some mapreduce clusters share the same hdfs cluster with other mapreduce clusters, or in some cases a mapreduce cluster will cover several hdfs clusters. In those cases, a large portion of I/O will be off-cluster client operations, which cannot benefit from localized I/O.</li><li>When a physical cluster is not dedicated to Hadoop: a physical cluster might not be dedicated to Hadoop and other supporting systems, such as data loading tools, might share the same cluster. In that case, the data loading tool can not benefit from localized I/O, even if the tool and hdfs shares the same rack/data center. The problem could be even more common in virtualized environments.</li></ul><p><strong>Solution: Design</strong></p><p>To tackle this problem, we changed the logic in the block placement policy and the block pseudo distance sort. We also resolved the network location of the client.</p><p><strong>Resolving client location</strong> <br/> Resolving the client location: we reused the framework that resolves datanodes. 
However, since we did not add client network locations into the topology tree (as explained below), we have to cache client locations to avoid unnecessary resolve operations.</p><p>As a result, we introduced two LRU caches:</p><ul><li>A black list for those clients that have no valid location or whose locations do not share the same rack with any datanode.</li><li>A white list, opposite to the black list, for those clients that are not datanodes but share the same rack with at least one datanode.</li></ul><p>Referring to the diagram of ignored off-cluster clients, the table below lists some examples of location cache.</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd" colspan="3"><p>Location Cache Examples</p></td></tr><tr><td class="confluenceTd"><p><strong>HostName</strong></p></td><td class="confluenceTd"><p><strong>Location</strong></p></td><td class="confluenceTd"><p><strong>Cache</strong></p></td></tr><tr><td class="confluenceTd"><p>client1</p></td><td class="confluenceTd"><p>d1/r1</p></td><td class="confluenceTd"><p>white list</p></td></tr><tr><td class="confluenceTd"><p>client2</p></td><td class="confluenceTd"><p>d2/r1</p></td><td class="confluenceTd"><p>black list</p></td></tr><tr><td class="confluenceTd"><p>client3</p></td><td class="confluenceTd"><p>null</p></td><td class="confluenceTd"><p>black list</p></td></tr></tbody></table></div><p><br class="atl-forced-newline"/> The size of the LRU cache is configurable, so you can limit the memory usage of namenode.</p><p><strong>Block placement policy</strong></p><p>The tables below demonstrate how the BlockPlacementPolicy has been changed to support non-datanode clients.</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd" colspan="2"><p>Former block placement algorithm</p></td></tr><tr><td class="confluenceTd"><p><strong>Replica</strong></p></td><td class="confluenceTd"><p><strong>Rule</strong></p></td></tr><tr><td 
class="confluenceTd"><p>1</p></td><td class="confluenceTd"><p>Client Local</p></td></tr><tr><td class="confluenceTd"><p>2</p></td><td class="confluenceTd"><p>Random node whose rack is different from replica 1</p></td></tr><tr><td class="confluenceTd"><p>3</p></td><td class="confluenceTd"><p>Random node who share the same rack with replica 2</p></td></tr><tr><td class="confluenceTd"><p>>=4</p></td><td class="confluenceTd"><p>Random node</p></td></tr></tbody></table></div><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd" colspan="2"><p>Changed block placement algorithm</p></td></tr><tr><td class="confluenceTd"><p><strong>Replica</strong></p></td><td class="confluenceTd"><p><strong>Rule</strong></p></td></tr><tr><td class="confluenceTd"><p>1</p></td><td class="confluenceTd"><p>Client Local if client is datanode, or a random node that shares the same rack with client, if client is not a datanode</p></td></tr><tr><td class="confluenceTd"><p>2</p></td><td class="confluenceTd"><p>Random node whose rack is different from replica 1</p></td></tr><tr><td class="confluenceTd"><p>3</p></td><td class="confluenceTd"><p>Random node who shares the same rack with replica 2</p></td></tr><tr><td class="confluenceTd"><p>>=4</p></td><td class="confluenceTd"><p>Random node</p></td></tr></tbody></table></div><h3 id="PivotalHadoopEnhancements-Usage">Usage</h3><p>The client rack aware feature is disabled by default. To enable, add the following to the <code>hdfs-site.xml</code> file:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><properties>
<property>
<name>dfs.rackawareness.with.client</name>
<value>true</value>
</property>
</properties>
<properties>
<property>
<name>dfs.rackawareness.with.client.blacklist.size</name>
<description>Black list size of client cache, 5000 by default.</description>
<value>5000</value>
</property>
</properties>
<properties>
<property>
<name>dfs.rackawareness.with.client.cache.size</name>
<description>White list size of the client cache; best set equal to
the size of the cluster. 2000 by default.</description>
<value>2000</value>
</property>
</properties></pre>
</div></div><p><br/> Note that you need to restart DFS after changing the configuration.</p><p><span class="confluence-anchor-link" id="PivotalHadoopEnhancements-Vaidya"></span></p><h2 id="PivotalHadoopEnhancements-Vaidya">Vaidya</h2><h3 id="PivotalHadoopEnhancements-Overview">Overview</h3><p>Vaidya is a diagnostic tool installed with PHD for Map/Reduce jobs. After a job is executed successfully, it uses a job history log and the job configuration information to identify any performance or scalability problems with the job. Upon execution, it provides a job analysis report indicating specific problems with the job, along with the remedy to correct them. The report element includes, "rule title", "rule description", "rule importance", "rule severity", "reference details" and "remedy/prescription" to rectify the problem. The "rule severity", is a product of rule impact and the rule importance.</p><p><strong>Note: </strong>The Vaidya tool does <em>not</em> analyze failed jobs, either for performance or scalability problems, nor for the reason for failures.</p><p>The Vaidya tool includes diagnostic rules (also referred to as "tests") where each rule analyzes a specific problem with the M/R job. A diagnostic rule is written as a Java class and captures the logic of how to detect a specific problem condition with the M/R job. Each diagnostic rule uses the job history log and job configuration information provided to it using a standard structured interface. 
The standard interface allows administrators and developers to independently add more diagnostic rules in the Vaidya tool.</p><h3 id="PivotalHadoopEnhancements-InstallingVaidyaFiles">Installing Vaidya Files</h3><p>By default, Vaidya files are installed at:</p><ul><li>The Vaidya JAR library is installed into <code>/usr/lib/gphd/hadoop-mapreduce/</code></li><li>The Vaidya default configuration file is installed into <code>/etc/gphd/hadoop/conf/</code></li></ul><h3 id="PivotalHadoopEnhancements-EnablingVaidya">Enabling Vaidya</h3><p> </p><p>On the history server node, go to the PHD configuration folder (by default, <code>/etc/gphd/hadoop/conf</code>), and add the following lines into the file <code>mapred-site.xml</code>.</p><p>Restart the job history server service to ensure the change takes effect.</p><p><strong>mapred-site.xml</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<name>mapreduce.vaidya.enabled</name>
<value>true</value>
</property>
<property>
<name>mapreduce.vaidya.jarfiles</name>
<value>/usr/lib/gphd/hadoop-mapreduce/hadoop-vaidya.jar</value>
</property>
<property>
<name>mapreduce.vaidya.testconf.file</name>
<value>/etc/gphd/hadoop/conf/postex_diagnosis_tests.xml</value>
</property></pre>
</div></div><h3 id="PivotalHadoopEnhancements-DisablingVaidya">Disabling Vaidya</h3><p>To disable Vaidya:</p><p>Set the property <code>mapreduce.vaidya.enabled</code> value to be <code>false</code>, or remove these lines from <code>mapred-site.xml</code>.</p><ul><li>The value of property mapreduce.vaidya.enabled should be changed to point to the correct jar file you installed. By default, this is<code>/usr/lib/gphd/hadoop-mapreduce/hadoop-vaidya-<HADOOP_PHD_VERSION>.jar</code>.</li><li>Restart the job history server service to ensure the change takes effect.</li></ul><h3 id="PivotalHadoopEnhancements-UsingVaidyatoAnalyzeJobs">Using Vaidya to Analyze Jobs</h3><ol><li>Ensure your job history server service is running.</li><li>Successfully run a map-reduce job for Vaidya to analyze.</li><li>Open the following URL in a web browser: <code> <code>http://<historyserver_host>:<historyserver_port>/jobhistory<br/> </code> </code><p>Where:</p><ul><li><code><historyserver_host></code> refers to the host name or IP address of the machine where you run job history server service.</li><li><code><historyserver_port></code> refers to the HTTP port job history server web where the UI listens. By default, this value is 19888. Your browser should show you the job history server UI page.</li></ul><code> <br/> </code></li><li>You will see a list of jobs that have run, including the most recent job. Click the job id of any job in this list, and you should see the detailed information for the job.</li><li><p>On the left side of the navigation area, there should be a link called <strong>Vaidya report</strong> under the navigation item <strong>Job</strong>. 
Click the <strong>Vaidya report</strong> link and Vaidya will analyze the job for you and show a report.</p></li></ol><h3 id="PivotalHadoopEnhancements-VaidyaConfigurationRules">Vaidya Configuration Rules</h3><p>After you installed Vaidya with PHD, rules configuration is installed as a <code>postex_diagnosis_tests.xml</code> XML file in <code>/etc/gphd/hadoop/conf</code></p><p>You can find all rules to be run on a selected job in this XML file, where each rule is defined as an XML <code>PostExPerformanceDiagnosisTests/DiagnosticTest</code> element, for example:</p><p>A rule in <code>postex_diagnosis_tests.xml</code></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><DiagnosticTest>
<Title><![CDATA[Balanced Reduce Partitioning]]></Title>
<ClassName>
<![CDATA[org.apache.hadoop.vaidya.postexdiagnosis.tests.BalancedReducePartitioning]]></ClassName>
<Description><![CDATA[This rule tests as to how well the input to reduce tasks is balanced]]></Description>
<Importance><![CDATA[High]]></Importance>
<SuccessThreshold><![CDATA[0.40]]></SuccessThreshold>
<Prescription><![CDATA[advice]]></Prescription>
<InputElement>
<PercentReduceRecords><![CDATA[85]]></PercentReduceRecords>
</InputElement>
</DiagnosticTest></pre>
</div></div><p><br/> The <code>Title</code> and <code>Description</code> elements provide a brief summary about what this rule is doing.</p><p>By editing <code>postex_diagnosis_tests.xml</code>, you can configure the rules.</p><p><strong>Notes</strong>:</p><ul><li>Remember to back up the original configuration file before editing it; an invalid XML configuration file may cause Vaidya to behave incorrectly.</li><li>Before you start editing rules, you should have background knowledge about XML syntax and how XML represents data (for example, what the CDATA element represents).</li></ul><h4 id="PivotalHadoopEnhancements-DisablingaRule">Disabling a Rule</h4><p>Comment out or remove the entire DiagnosticTest element.</p><h4 id="PivotalHadoopEnhancements-ChangingtheImportanceofaRule">Changing the Importance of a Rule</h4><p>Importance indicates how important a rule is relative to other rules in the same set. You can change the importance value by editing the Importance element in the XML file. The importance level serves as a factor that is multiplied with the impact value returned by each rule.</p><p>There are three values valid for this attribute: Low, Medium and High; their corresponding values are: 0.33, 0.66 and 0.99.</p><p>In the displayed Vaidya report, there is a value named Severity for each rule. A severity level is the result of multiplying the impact value (returned by the rule) and the importance factor (defined in the XML file).</p><p>For example, if a rule returns an impact of 0.5 and its importance is marked as Medium, then its severity is 0.5 * 0.66 = 0.33.</p><h4 id="PivotalHadoopEnhancements-ChangingSuccessThreshold">Changing Success Threshold</h4><p>Each rule calculates a value between 0 and 1 (inclusively) to indicate how healthy a job is according to the specified rule; this value is called impact. The smaller the impact is (that is, closer to 0), the healthier the job is.</p><p>To give a more straightforward result, you can set a threshold for each rule. 
Therefore, a rule whose impact value is larger than the threshold will be marked as "failed"; otherwise, it is marked as "passed".</p><p>Note that the threshold is compared with the impact value, rather than severity (which means making a rule less important will not make a failed rule succeed).</p><p>You can change the threshold value by editing the SuccessThreshold element in the XML file.</p><h4 id="PivotalHadoopEnhancements-ChangingInputParameters">Changing Input Parameters</h4><p>Some rules might need additional input parameters to complete their logic. You can specify additional parameters by editing/adding elements under the InputElement element of each rule.</p><h4 id="PivotalHadoopEnhancements-Other">Other</h4><p>For a full explanation and instructions about the meaning of each XML element, as well as how to change them, refer to the Apache's Official <a class="external-link" href="https://hadoop.apache.org/docs/stable1/vaidya.html" rel="nofollow">Vaidya Guide </a> for more information.</p><h4 id="PivotalHadoopEnhancements-AddingaNewRule">Adding a New Rule</h4><p>A Vaidya rule consists of the following two parts:</p><ul><li>A java class that consists of the logic of the rule</li><li>A paragraph of XML in the configuration file</li></ul><h4 id="PivotalHadoopEnhancements-CreatingaJavaBinaryforaNewRule">Creating a Java Binary for a New Rule</h4><p><strong>Important</strong>: This section assumes a working knowledge of how to write, compile, and package Java code.</p><ol><li>From where you installed PHD, download the correct <code>hadoop-vaidya-<HADOOP_PHD_VERSION>.jar</code> file (which you specified in<code> mapred-site.xml</code>) to your development machine, if you plan on writing Java code on another machine than the one where you installed PHD. 
(This is a typical case.)</li><li><p>Create a java file with an IDE or editor, which defines a class that extends the <code>org.apache.hadoop.vaidya.DiagnosticTest</code> class:<br/> <strong>myrule.java</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">package com.greenplum.vaidya.rules;
import org.apache.hadoop.vaidya.DiagnosticTest;
import org.apache.hadoop.vaidya.statistics.job.JobStatistics;
public class MyRule extends DiagnosticTest {
@Override
public String getReferenceDetails() {
return "";
}
@Override
public String getPrescription() {
return "";
}
@Override
public double evaluate(JobStatistics jobStatistics) {
return 0.5;
}
}</pre>
</div></div></li><li><p>Edit the three methods <code>getReferenceDetails</code>, <code>getPrescription</code> and <code>evaluate</code> to construct the logic. The <code>evaluate</code> method should return a <strong>double</strong> value between 0.0 and 1.0 that represents the impact as the analysis result.</p><ul><li><code>getPrescription</code> method should return some text providing user suggestions/remedies about how to optimize your Map/Reduce configuration accordingly.</li><li><code>getReferenceDetails</code> method should return some text indicating the meaningful counters and their values which can help you to diagnose your Map/Reduce configuration accordingly.</li></ul></li><li><p>Compile the Java class and package the compiled class into a jar file, for example, <code>myrule.jar</code>. Note that you need to put the Vaidya jar file you just downloaded into your class path to make your code compile.</p></li></ol><h4 id="PivotalHadoopEnhancements-CreatingXMLConfigurationForaNewRule">Creating XML Configuration For a New Rule</h4><p>Add a <code>DiagnosticTest</code> element into the <code>postex_diagnosis_tests.xml</code> file (the file you set in <code>mapred-site.xml</code> file), according to the sample given in the configuration part. Ensure the value of <code>ClassName</code> element is set to be the full class name of the java rule class you just created.</p><h4 id="PivotalHadoopEnhancements-Deployingfiles">Deploying files</h4><ol><li>Upload the packaged jar file (<code>myrule.jar</code> for example) to the node where you installed PHD job tracker, and store it in a folder where hadoop service has the permission to read and load it. 
We recommend you place it under <code>/usr/lib/gphd/hadoop-mapreduce/lib/</code>.</li><li><p>Edit <code>mapred-site.xml</code>, append the jar file you just uploaded to the <code>mapred.vaidya.jarfiles</code> or <code>mapreduce.vaidya.jarfiles</code> property value, for example:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">mapred-site.xml
<property>
<name>mapreduce.vaidya.jarfiles</name>
<value>/usr/lib/gphd/hadoop-mapreduce/hadoop-vaidya.jar:/usr/lib/gphd/hadoop-mapreduce/lib/myrule.jar</value>
</property></pre>
</div></div></li></ol><p><strong>Important</strong>:</p><ul><li>Do not remove the default Vaidya jar file from this property; Vaidya needs this property to load basic Vaidya classes to make it run.</li><li>Multiple jar files are separated by different separator characters on different platforms. On the Linux/Unix platform, the ":" character should be used. You can look at the <code>File.pathSeparator</code> attribute of your Java platform to verify it.</li><li>To make your settings take effect, restart the job history server service.</li></ul><h2 id="PivotalHadoopEnhancements-HVETopologyAwareness">HVE Topology Awareness</h2><p>Hadoop Virtualization Extensions (HVE) allow Hadoop clusters implemented on virtualized infrastructure full awareness of the topology on which they are running, thus enhancing the reliability and performance of these clusters.</p><p>HVE should be enabled in the following situations:</p><ul><li>When there is more than one Hadoop VM per physical host in virtualized environments.</li><li>When Datanodes and NodeManagers/TaskTrackers exist in separate virtual machines in virtualized environments, in order to achieve graceful scaling of the compute component of the Hadoop cluster.</li><li>When there is a topology layer between host and rack (e.g. chassis), which can affect the failure/locality group between hosts, in non-virtualized environments.</li></ul><h3 id="PivotalHadoopEnhancements-TopologyAwarenessConfigurationandVerification">Topology Awareness Configuration and Verification</h3><p><strong>Sample Setup</strong></p><p>This setup has 2 logical racks, 2 physical hosts (installed by ESXi and managed by vCenter) per rack, and 2 DN/NM (VM in ESXi) nodes per host. 
There is also one NameNode/ResourceManager and a client node that can be used to start jobs.</p><p>In this setup, each DN/NM node has 4 vCPUs, 16G memory, and 200G (Non-SSD) disks.</p><p>The NameNode and ResourceManager are installed on another dedicated VM with 4vCPU, 4G Memory and 100G disks.</p><p>Node Distribution on Hosts:</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><td class="confluenceTd"><strong>Rack 1</strong></td><td class="confluenceTd"><strong>Host 1</strong></td><td class="confluenceTd"><strong>NameNode and ResourceManager</strong></td><td class="confluenceTd"><strong>DN1</strong></td></tr><tr><td class="confluenceTd"> </td><td class="confluenceTd"><strong>Host 2</strong></td><td class="confluenceTd"><strong>DN2</strong></td><td class="confluenceTd"><strong>DN3</strong></td></tr><tr><td class="confluenceTd"><strong>Rack 2</strong></td><td class="confluenceTd"><strong>Host 3</strong></td><td class="confluenceTd"><strong>DN4</strong></td><td class="confluenceTd"><strong>DN5</strong></td></tr><tr><td class="confluenceTd"> </td><td class="confluenceTd"><strong>Host 4</strong></td><td class="confluenceTd"><strong>DN6</strong></td><td class="confluenceTd"><strong>DN7</strong></td></tr></tbody></table></div><p><strong>Enable topology awareness (Hadoop V2)</strong></p><ol><li><p>Add the following line to <code>core-site.xml</code>:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<name>topology.script.file.name</name>
<value>/hadoop/hadoop-smoke/etc/hadoop/topology.sh</value> <!-- point to topology.sh location. -->
</property>
<property>
<name>net.topology.impl</name>
<value>org.apache.hadoop.net.NetworkTopologyWithNodeGroup</value>
<description> The default implementation of NetworkTopology which is classic three layer one.
</description>
</property>
<property>
<name>net.topology.nodegroup.aware</name>
<value>true</value>
<description> By default, network topology is not aware of nodegroup layer.
</description>
</property>
<property>
<name>dfs.block.replicator.classname</name>
<value>org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyWithNodeGroup</value>
<description> The default implementation of ReplicationTargetChooser.
</description>
</property></pre>
</div></div></li><li><p>Add the following line to <code>yarn-site.xml</code>:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<description>The class to use as scheduled requests.</description>
<name>yarn.resourcemanager.scheduled.requests.class</name>
<value>org.apache.hadoop.mapreduce.v2.app.rm.ScheduledRequestsWithNodeGroup</value>
</property>
<property>
<description> The boolean value to identify if the cluster is deployed on an environment which needs an additional layer (node group) between node and rack for network topology.
</description>
<name>net.topology.with.nodegroup</name>
<value>true</value>
</property>
<property>
<description>The class to use as AbstractSchedulerElementsFactory in RM scheduler.</description>
<name>yarn.resourcemanager.scheduler.elements.factory.impl</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerElementsFactoryWithNodeGroup</value>
</property></pre>
</div></div><p> </p><p><strong>Topology.data sample:</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">[root@namenode enable]# cat topology.data
10.111.57.223(VM IP) /Rack1/NodeGroup1
10.111.57.224 /Rack1/NodeGroup1
10.111.57.225 /Rack1/NodeGroup2
10.111.57.226 /Rack2/NodeGroup1
10.111.57.227 /Rack2/NodeGroup1
10.111.57.228 /Rack2/NodeGroup2
10.111.57.229 /Rack2/NodeGroup2</pre>
</div></div><p><strong>Topology.sh sample:</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">[root@namenode enable]# cat topology.sh
#! /bin/bash
# Topology script: maps each node name/IP passed as an argument to its
# network location (rack/nodegroup) by looking it up in topology.data.
HADOOP_CONF=/hadoop/hadoop-smoke/etc/hadoop
# this is the location of topology.data
while [ $# -gt 0 ] ; do
nodeArg=$1
# Re-open the mapping file on stdin for each argument.
exec< ${HADOOP_CONF}/topology.data
result=""
while read line ; do
# Split the line into words: ar[0] is the node, ar[1] is its location.
ar=( $line )
if [ "${ar[0]}" = "$nodeArg" ] ; then
result="${ar[1]}"
fi
done
shift
# Emit the default rack when the node is not listed in topology.data.
if [ -z "$result" ] ; then
echo -n "/default/rack "
else
echo -n "$result "
fi
done</pre>
</div></div></li><li><p>Verify HVE is enabled:</p>Run the TestDFSIO script:</li></ol><p style="margin-left: 30.0px;">The output is as follows:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">1)HVE enabled:
Job Counters
Launched map tasks=100
Launched reduce tasks=1
Data-local map tasks=26
NODEGROUP_LOCAL_MAPS=49
Rack-local map tasks=25
2)HVE disabled:
Job Counters
Launched map tasks=100
Launched reduce tasks=1
Data-local map tasks=20
Rack-local map tasks=80</pre>
</div></div><h2 id="PivotalHadoopEnhancements-HVEElasticity">HVE Elasticity</h2><p>HVE Elastic Resource Extension enables the adaption of MapReduce tasks to changing resources on nodes/clusters where Hadoop clusters are deployed to virtualized environments, by sharing resources with VMs from other clusters or applications.</p><h3 id="PivotalHadoopEnhancements-Overview.1">Overview</h3><p>Currently, the Hadoop resource model is static at the node level, assuming the node resources are not changed while the cluster is running. This design and implementation are based on an assumption that all cluster resources are dedicated for Hadoop MapReduce jobs, so they are fully available at all times. This assumption does not hold when users want to deploy multiple applications on the same cluster, e.g. deploying HBase and MapReduce on the same HDFS cluster. In particular, in an era of cloud computing, it is common for Hadoop clusters to be deployed on virtualized environments by sharing resource with VMs from other clusters or applications.</p><p>The HVE elastic resource feature addresses scenarios in which nodes' resources are possibly changed, so that scheduling of MapReduce tasks on these nodes can adapted to changing resources.</p><p>With this feature, APIs (CLI and JMX interface) and script tools are provided to get/set resources (memory, v-cores) on Hadoop cluster nodes for MR jobs.</p><h3 id="PivotalHadoopEnhancements-FunctionList">Function List</h3><p>Below are functionalities included in this elastic feature:<br/>Function List</p><div class="table-wrap"><table class="confluenceTable"><tbody><tr><th class="confluenceTh">Function</th><th class="confluenceTh">Description</th></tr><tr><td class="confluenceTd"><p>Configuration</p></td><td class="confluenceTd">Enable/disable elastic resource feature on Hadoop cluster by specifying a configuration property when starting MR cluster.</td></tr><tr><td class="confluenceTd">List nodes' status</td><td class="confluenceTd"><p>List 
the status of all the nodes or nodes specified by user.</p><p>The node status includes its memory resource, v-core resource, hostname, health status, etc.</p></td></tr><tr><td class="confluenceTd" colspan="1">Set resource capacity in Node Manager node</td><td class="confluenceTd" colspan="1">Set resource capacity (memory, v-cores) of Node Manager to a node specified by user via CLI or JMX interface.</td></tr></tbody></table></div><p> </p><h3 id="PivotalHadoopEnhancements-Configuration">Configuration</h3><p>To enable elastic resources, make the following changes to the Hadoop configuration.</p><p>In <code>yarn-site.xml</code>, add the following property to enable the elastic resource feature:</p><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;"><property>
<name>yarn.dynamic.resource.enable</name>
<value>true</value>
</property></pre>
</div></div><h3 id="PivotalHadoopEnhancements-CommandLineInterfaceforYARNcluster">Command Line Interface for YARN cluster</h3><h4 id="PivotalHadoopEnhancements-ListallCLIs">List all CLIs</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn rmadmin
[-refreshQueues]
[-refreshNodes]
[-refreshUserToGroupsMappings]
[-refreshSuperUserGroupsConfiguration]
[-refreshAdminAcls]
[-refreshServiceAcl]
[-getGroups [username]]
[-updateNodeResource [NodeID][MemSize][Cores]]
[-help [cmd]]
yarn node
-all Works with -list to list all nodes.
-list List all running nodes. Supports optional use of
-states to filter nodes based on node state, all -all
to list all nodes.
-states <States> Works with -list to filter nodes based on input
comma-separated list of node states.
-status <NodeId> Prints the status report of the node.</pre>
</div></div><h4 id="PivotalHadoopEnhancements-ListNodes">List Nodes</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -list -all</pre>
</div></div><h4 id="PivotalHadoopEnhancements-GetNodeStatus">Get Node Status</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -status <NodeId></pre>
</div></div><h4 id="PivotalHadoopEnhancements-Setnoderesourcecapacity">Set node resource capacity</h4><div class="code panel pdl" style="border-width: 1px;"><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn rmadmin -updateNodeResource [NodeID][MemSize][Cores]</pre>
</div></div><p><strong>Example</strong></p><div class="code panel pdl" style="border-width: 1px;"><div class="codeHeader panelHeader pdl" style="border-bottom-width: 1px;"><b>List all nodes</b></div><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -list -all
Total Nodes:8
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
hdsh2-a172.lss.emc.com:37804 RUNNING hdsh2-a172.lss.emc.com:8042 0
hdsh2-a173.lss.emc.com:45310 RUNNING hdsh2-a173.lss.emc.com:8042 0
hdsh2-a159.lss.emc.com:60596 RUNNING hdsh2-a159.lss.emc.com:8042 0
hdsh2-a158.lss.emc.com:51694 RUNNING hdsh2-a158.lss.emc.com:8042 0
hdsh2-a157.lss.emc.com:37348 RUNNING hdsh2-a157.lss.emc.com:8042 0
hdsh2-a174.lss.emc.com:33263 RUNNING hdsh2-a174.lss.emc.com:8042 0
hdsh2-a160.lss.emc.com:34460 RUNNING hdsh2-a160.lss.emc.com:8042 0
hdsh2-a171.lss.emc.com:49469 RUNNING hdsh2-a171.lss.emc.com:8042 0</pre>
</div></div><div class="code panel pdl" style="border-width: 1px;"><div class="codeHeader panelHeader pdl" style="border-bottom-width: 1px;"><b>Get node status</b></div><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn node -status hdsh2-a172.lss.emc.com:37804
Node Report :
Node-Id : hdsh2-a172.lss.emc.com:37804
Rack : /dc/rc2
Node-State : RUNNING
Node-Http-Address : hdsh2-a172.lss.emc.com:8042
Last-Health-Update : Wed 15/Jan/14 03:38:08:402CST
Health-Report :
Containers : 0
Memory-Used : 0MB
Memory-Capacity : 2048MB
CPU-Used : 0 vcores
CPU-Capacity : 2 vcores</pre>
</div></div><div class="code panel pdl" style="border-width: 1px;"><div class="codeHeader panelHeader pdl" style="border-bottom-width: 1px;"><b>Set node resource capacity</b></div><div class="codeContent panelContent pdl">
<pre class="theme: Confluence; brush: java; gutter: false" style="font-size:12px;">yarn rmadmin -updateNodeResource hdsh2-a172.lss.emc.com:37804 2048 2</pre>
</div></div>
</div></div>
</div><!-- end of content-->
</div><!-- end of container -->
</div><!--end of container-fluid-->
</div><!--end of main-wrap-->
<div class="site-footer desktop-only">
<div class="container-fluid">
<div class="site-footer-links">
<span class="version"><a href='/'>Pivotal Documentation</a></span>
<span>©
<script>
// Render the current year into the copyright notice.
document.write(new Date().getFullYear());
</script>
<a href='http://gopivotal.com'>Pivotal Software</a> Inc. All Rights Reserved.
</span>
</div>
</div>
</div>
<script type="text/javascript">
(function() {
  // Guard so Munchkin.init fires exactly once, whichever load event wins.
  var initialized = false;
  function startMunchkin() {
    if (initialized) {
      return;
    }
    initialized = true;
    Munchkin.init('625-IUJ-009');
  }
  // Inject the Munchkin tracking script asynchronously, matching the
  // page's protocol, and initialize once it finishes loading.
  var loader = document.createElement('script');
  loader.type = 'text/javascript';
  loader.async = true;
  loader.src = document.location.protocol + '//munchkin.marketo.net/munchkin.js';
  // Legacy IE load-detection path.
  loader.onreadystatechange = function() {
    if (this.readyState == 'complete' || this.readyState == 'loaded') {
      startMunchkin();
    }
  };
  loader.onload = startMunchkin;
  document.getElementsByTagName('head')[0].appendChild(loader);
})();
</script>
</div><!--end of viewport-->
<div id="scrim"></div>
</body>
</html>