In [101]:
### Authenticate to Ambari

#### Python requirements
import difflib
import getpass
import json
import requests
import sys
import time

#### Change these to fit your Ambari configuration
ambari_protocol = 'http'
ambari_server = '208.72.157.66'
ambari_port = 8080
ambari_user = 'admin'
#cluster = 'Sandbox'

#### Above input gives us http://user:pass@hostname:port/api/v1/
api_url = ambari_protocol + '://' + ambari_server + ':' + str(ambari_port)

#### Prompt for password & build the HTTP session
ambari_pass = getpass.getpass()
s = requests.Session()
s.auth = (ambari_user, ambari_pass)
s.headers.update({'X-Requested-By':'seanorama'})

#### Authenticate & verify authentication
r = s.get(api_url + '/api/v1/clusters')
assert r.status_code == 200
print("You are authenticated to Ambari!")

········
You are authenticated to Ambari!


In [102]:
### Set cluster based on existing cluster
    
cluster = r.json()['items'][0]['Clusters']['cluster_name']
cluster

'Sandbox'

# Configure YARN Capacity Scheduler

In [103]:
## Get current configuration tag
r = s.get(api_url + '/api/v1/clusters/' + cluster + '?fields=Clusters/desired_configs/capacity-scheduler')
assert r.status_code == 200
tag = r.json()['Clusters']['desired_configs']['capacity-scheduler']['tag']

## Get current configuration
r = s.get(api_url + '/api/v1/clusters/' + cluster + '/configurations?type=capacity-scheduler&tag=' + tag)
assert r.status_code == 200
print(json.dumps(r.json(), indent=2))

## Update config
config_old = r.json()['items'][0]
config_new = r.json()['items'][0]

#### Make your changes here
config_new['properties']['yarn.scheduler.capacity.root.default.capacity'] = '50'
config_new['properties']['yarn.scheduler.capacity.root.queues'] = 'default,hiveserver'
config_new['properties']['yarn.scheduler.capacity.root.hiveserver.capacity'] = '50'
config_new['properties']['yarn.scheduler.capacity.root.hiveserver.hive1.capacity'] = '50'
config_new['properties']['yarn.scheduler.capacity.root.hiveserver.hive1.user-limit-factor'] = '4'
config_new['properties']['yarn.scheduler.capacity.root.hiveserver.hive2.capacity'] = '50'
config_new['properties']['yarn.scheduler.capacity.root.hiveserver.hive2.user-limit-factor'] = '4'
config_new['properties']['yarn.scheduler.capacity.root.hiveserver.queues'] = 'hive1,hive2'

{
  "items": [
    {
      "version": 1,
      "type": "capacity-scheduler",
      "Config": {
        "cluster_name": "Sandbox"
      },
      "properties": {
        "yarn.scheduler.capacity.root.default.user-limit-factor": "1",
        "yarn.scheduler.capacity.maximum-applications": "10000",
        "yarn.scheduler.capacity.root.default.state": "RUNNING",
        "yarn.scheduler.capacity.root.default.maximum-am-resource-percent": "0.5",
        "yarn.scheduler.capacity.root.default.maximum-capacity": "100",
        "yarn.scheduler.capacity.root.accessible-node-labels.default.maximum-capacity": "-1",
        "yarn.scheduler.capacity.root.default.capacity": "100",
        "yarn.scheduler.capacity.root.accessible-node-labels": "*",
        "yarn.scheduler.capacity.root.accessible-node-labels.default.capacity": "-1",
        "yarn.scheduler.capacity.root.queues": "default",
        "yarn.scheduler.capacity.root.acl_administer_queue": "*",
        "yarn.scheduler.capacity.default.minimum

In [104]:
#### Show the differences
a = json.dumps(config_old, indent=2).splitlines(1)
b = json.dumps(config_new, indent=2).splitlines(1)

for line in difflib.unified_diff(a, b):
     sys.stdout.write(line)  

--- 
+++ 
@@ -7,22 +7,28 @@
   "properties": {
     "yarn.scheduler.capacity.root.default.user-limit-factor": "1",
     "yarn.scheduler.capacity.maximum-applications": "10000",
-    "yarn.scheduler.capacity.root.default.state": "RUNNING",
-    "yarn.scheduler.capacity.root.default.maximum-am-resource-percent": "0.5",
+    "yarn.scheduler.capacity.root.hiveserver.hive2.capacity": "50",
     "yarn.scheduler.capacity.root.default.maximum-capacity": "100",
     "yarn.scheduler.capacity.root.accessible-node-labels.default.maximum-capacity": "-1",
-    "yarn.scheduler.capacity.root.default.capacity": "100",
     "yarn.scheduler.capacity.root.accessible-node-labels": "*",
-    "yarn.scheduler.capacity.root.accessible-node-labels.default.capacity": "-1",
-    "yarn.scheduler.capacity.root.queues": "default",
+    "yarn.scheduler.capacity.root.default.maximum-am-resource-percent": "0.5",
+    "yarn.scheduler.capacity.root.hiveserver.capacity": "50",
     "yarn.scheduler.capacity.root.acl_admini

In [105]:
#### Manipulate the document to match the format Ambari expects

#### Adds new configuration tag, deletes fields, and wraps in appropriate json
config_new['tag'] = 'version' + str(int(round(time.time() * 1000000000)))
del config_new['Config']
del config_new['href']
del config_new['version']
config_new = {"Clusters": {"desired_config": config_new}}

print(json.dumps(config_new, indent=2))

{
  "Clusters": {
    "desired_config": {
      "type": "capacity-scheduler",
      "properties": {
        "yarn.scheduler.capacity.root.default.user-limit-factor": "1",
        "yarn.scheduler.capacity.maximum-applications": "10000",
        "yarn.scheduler.capacity.root.hiveserver.hive2.capacity": "50",
        "yarn.scheduler.capacity.root.default.maximum-capacity": "100",
        "yarn.scheduler.capacity.root.accessible-node-labels.default.maximum-capacity": "-1",
        "yarn.scheduler.capacity.root.accessible-node-labels": "*",
        "yarn.scheduler.capacity.root.default.maximum-am-resource-percent": "0.5",
        "yarn.scheduler.capacity.root.hiveserver.capacity": "50",
        "yarn.scheduler.capacity.root.acl_administer_queue": "*",
        "yarn.scheduler.capacity.root.default.acl_submit_applications": "*",
        "yarn.scheduler.capacity.root.default-node-label-expression": " ",
        "yarn.scheduler.capacity.node-locality-delay": "40",
        "yarn.scheduler.capaci

In [106]:
body = config_new
r = s.put(api_url + '/api/v1/clusters/' + cluster, data=json.dumps(body))

print(r.url)
print(r.status_code)
assert r.status_code == 200
print("Configuration changed successfully!")
print(json.dumps(r.json(), indent=2))

http://208.72.157.66:8080/api/v1/clusters/Sandbox
200
Configuration changed successfully!
{
  "resources": [
    {
      "service_name": "YARN",
      "group_name": null,
      "service_config_version_note": null,
      "service_config_version": 2,
      "group_id": null,
      "configurations": [
        {
          "version": 2,
          "type": "capacity-scheduler",
          "configAttributes": {},
          "clusterName": "Sandbox",
          "configs": {
            "yarn.scheduler.capacity.root.default.user-limit-factor": "1",
            "yarn.scheduler.capacity.maximum-applications": "10000",
            "yarn.scheduler.capacity.root.hiveserver.hive2.capacity": "50",
            "yarn.scheduler.capacity.root.default.maximum-capacity": "100",
            "yarn.scheduler.capacity.root.accessible-node-labels.default.maximum-capacity": "-1",
            "yarn.scheduler.capacity.root.accessible-node-labels": "*",
            "yarn.scheduler.capacity.root.default.maximum-am-resourc

# Configure YARN Site

In [107]:
## Get current configuration tag
r = s.get(api_url + '/api/v1/clusters/' + cluster + '?fields=Clusters/desired_configs/yarn-site')
assert r.status_code == 200
tag = r.json()['Clusters']['desired_configs']['yarn-site']['tag']

## Get current configuration
r = s.get(api_url + '/api/v1/clusters/' + cluster + '/configurations?type=yarn-site&tag=' + tag)
assert r.status_code == 200
print(json.dumps(r.json(), indent=2))

## Update config
config_old = r.json()['items'][0]
config_new = r.json()['items'][0]

#### Make your changes here
config_new['properties']['yarn.nodemanager.resource.memory-mb'] = '4096'
config_new['properties']['yarn.scheduler.minimum-allocation-mb'] = '512'
config_new['properties']['yarn.scheduler.maximum-allocation-mb'] = '4096'
config_new['properties']['yarn.resourcemanager.scheduler.monitor.enable'] = 'true'
config_new['properties']['yarn.resourcemanager.scheduler.monitor.policies'] = 'org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy'
config_new['properties']['yarn.resourcemanager.monitor.capacity.preemption.monitoring_interval'] = '1000'
config_new['properties']['yarn.resourcemanager.monitor.capacity.preemption.max_wait_before_kill'] = '5000'
config_new['properties']['yarn.resourcemanager.monitor.capacity.preemption.total_preemption_per_round'] = '0.4'

{
  "items": [
    {
      "version": 1,
      "type": "yarn-site",
      "Config": {
        "cluster_name": "Sandbox"
      },
      "properties": {
        "yarn.timeline-service.address": "sandbox.hortonworks.com:10200",
        "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor",
        "yarn.scheduler.maximum-allocation-mb": "2250",
        "yarn.nodemanager.container-monitor.interval-ms": "3000",
        "yarn.nodemanager.log-aggregation.compression-type": "gz",
        "yarn.resourcemanager.address": "sandbox.hortonworks.com:8050",
        "yarn.nodemanager.delete.debug-delay-sec": "0",
        "yarn.nodemanager.vmem-pmem-ratio": "10",
        "yarn.timeline-service.ttl-enable": "true",
        "yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size": "10000",
        "yarn.nodemanager.bind-host": "0.0.0.0",
        "yarn.nodemanager.remote-app-log-dir": "/app-logs",
        "yarn.resourcemanager.s

In [108]:
#### Show the differences
a = json.dumps(config_old, indent=2).splitlines(1)
b = json.dumps(config_new, indent=2).splitlines(1)

for line in difflib.unified_diff(a, b):
     sys.stdout.write(line)  

--- 
+++ 
@@ -7,7 +7,7 @@
   "properties": {
     "yarn.timeline-service.address": "sandbox.hortonworks.com:10200",
     "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor",
-    "yarn.scheduler.maximum-allocation-mb": "2250",
+    "yarn.scheduler.maximum-allocation-mb": "4096",
     "yarn.nodemanager.container-monitor.interval-ms": "3000",
     "yarn.nodemanager.log-aggregation.compression-type": "gz",
     "yarn.resourcemanager.address": "sandbox.hortonworks.com:8050",
@@ -17,6 +17,7 @@
     "yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size": "10000",
     "yarn.nodemanager.bind-host": "0.0.0.0",
     "yarn.nodemanager.remote-app-log-dir": "/app-logs",
+    "yarn.resourcemanager.scheduler.monitor.enable": "true",
     "yarn.resourcemanager.scheduler.address": "sandbox.hortonworks.com:8030",
     "yarn.nodemanager.vmem-check-enabled": "false",
     "yarn.nodemanager.health-checker.script.timeout-ms":

In [109]:
#### Manipulate the document to match the format Ambari expects

#### Adds new configuration tag, deletes fields, and wraps in appropriate json
config_new['tag'] = 'version' + str(int(round(time.time() * 1000000000)))
del config_new['Config']
del config_new['href']
del config_new['version']
config_new = {"Clusters": {"desired_config": config_new}}

print(json.dumps(config_new, indent=2))


{
  "Clusters": {
    "desired_config": {
      "type": "yarn-site",
      "properties": {
        "yarn.timeline-service.address": "sandbox.hortonworks.com:10200",
        "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor",
        "yarn.scheduler.maximum-allocation-mb": "4096",
        "yarn.nodemanager.container-monitor.interval-ms": "3000",
        "yarn.nodemanager.log-aggregation.compression-type": "gz",
        "yarn.resourcemanager.address": "sandbox.hortonworks.com:8050",
        "yarn.nodemanager.delete.debug-delay-sec": "0",
        "yarn.nodemanager.vmem-pmem-ratio": "10",
        "yarn.timeline-service.ttl-enable": "true",
        "yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size": "10000",
        "yarn.nodemanager.bind-host": "0.0.0.0",
        "yarn.nodemanager.remote-app-log-dir": "/app-logs",
        "yarn.resourcemanager.scheduler.monitor.enable": "true",
        "yarn.resourcemana

In [110]:
body = config_new
r = s.put(api_url + '/api/v1/clusters/' + cluster, data=json.dumps(body))

print(r.url)
print(r.status_code)
assert r.status_code == 200
print("Configuration changed successfully!")
print(json.dumps(r.json(), indent=2))

http://208.72.157.66:8080/api/v1/clusters/Sandbox
200
Configuration changed successfully!
{
  "resources": [
    {
      "service_name": "YARN",
      "group_name": null,
      "service_config_version_note": null,
      "service_config_version": 3,
      "group_id": null,
      "configurations": [
        {
          "version": 2,
          "type": "yarn-site",
          "configAttributes": {},
          "clusterName": "Sandbox",
          "configs": {
            "yarn.timeline-service.address": "sandbox.hortonworks.com:10200",
            "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor",
            "yarn.scheduler.maximum-allocation-mb": "4096",
            "yarn.nodemanager.container-monitor.interval-ms": "3000",
            "yarn.nodemanager.log-aggregation.compression-type": "gz",
            "yarn.resourcemanager.address": "sandbox.hortonworks.com:8050",
            "yarn.node-labels.fs-store.root-dir": "/system/ya

# Hive Configuration

In [111]:
#### Get current configuration tag

r = s.get(api_url + '/api/v1/clusters/' + cluster + '?fields=Clusters/desired_configs/hive-site')
assert r.status_code == 200
tag = r.json()['Clusters']['desired_configs']['hive-site']['tag']

### Get current configuration
r = s.get(api_url + '/api/v1/clusters/' + cluster + '/configurations?type=hive-site&tag=' + tag)
assert r.status_code == 200

#### Change the configuration
config_old = r.json()['items'][0]
config_new = r.json()['items'][0]

#### The configurations you want to change
config_new['properties']['hive.execution.engine'] = 'tez'
config_new['properties']['hive.heapsize'] = '512'
config_new['properties']['hive.tez.container.size'] = '512'
config_new['properties']['hive.tez.java.opts'] = "-server -Xmx200m -Djava.net.preferIPv4Stack=true"
config_new['properties']['hive.support.concurrency'] = 'true'
config_new['properties']['hive.txn.manager'] = 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager'
config_new['properties']['hive.compactor.initiator.on'] = 'true'
config_new['properties']['hive.compactor.worker.threads'] = '2'
config_new['properties']['hive.enforce.bucketing'] = 'true'
config_new['properties']['hive.exec.dynamic.partition.mode'] = 'nonstrict'
config_new['properties']['hive.execution.engine'] = 'tez'
config_new['properties']['hive.server2.tez.initialize.default.sessions'] = 'true'
config_new['properties']['hive.server2.tez.default.queues'] = 'hive1,hive2'
config_new['properties']['hive.server2.tez.sessions.per.default.queue'] = '1'
config_new['properties']['hive.server2.enable.doAs'] = 'false'
config_new['properties']['hive.vectorized.groupby.maxentries'] = '10240'
config_new['properties']['hive.vectorized.groupby.flush.percent'] = '0.1'

In [112]:
#### Show the differences

a = json.dumps(config_old, indent=2).splitlines(1)
b = json.dumps(config_new, indent=2).splitlines(1)

for line in difflib.unified_diff(a, b):
     sys.stdout.write(line)  

--- 
+++ 
@@ -27,7 +27,7 @@
     "hive.map.aggr": "true",
     "hive.tez.cpu.vcores": "-1",
     "hive.exec.max.dynamic.partitions.pernode": "2000",
-    "hive.server2.tez.default.queues": "default",
+    "hive.server2.tez.default.queues": "hive1,hive2",
     "hive.mapjoin.optimized.hashtable": "true",
     "hive.compactor.check.interval": "300s",
     "hive.merge.smallfiles.avgsize": "16000000",
@@ -35,9 +35,9 @@
     "hive.compactor.delta.num.threshold": "10",
     "javax.jdo.option.ConnectionURL": "jdbc:mysql://sandbox.hortonworks.com/hive?createDatabaseIfNotExist=true",
     "hive.limit.pushdown.memory.usage": "0.04",
-    "hive.vectorized.groupby.maxentries": "100000",
+    "hive.vectorized.groupby.maxentries": "10240",
     "hive.auto.convert.sortmerge.join": "true",
-    "hive.compactor.worker.threads": "0",
+    "hive.compactor.worker.threads": "2",
     "hive.server2.transport.mode": "binary",
     "hive.exec.orc.compression.strategy": "SPEED",
     "hive.server2.tez.sessions.

In [113]:
#### Manipulate the document to match the format Ambari expects

#### Adds new configuration tag, deletes fields, and wraps in appropriate json
config_new['tag'] = 'version' + str(int(round(time.time() * 1000000000)))
del config_new['Config']
del config_new['href']
del config_new['version']
config_new = {"Clusters": {"desired_config": config_new}}

print(json.dumps(config_new, indent=2))


{
  "Clusters": {
    "desired_config": {
      "type": "hive-site",
      "properties": {
        "hive.tez.dynamic.partition.pruning": "true",
        "hive.server2.use.SSL": "false",
        "hive.vectorized.groupby.flush.percent": "0.1",
        "hive.merge.tezfiles": "false",
        "hive.metastore.sasl.enabled": "false",
        "hive.stats.dbclass": "fs",
        "hive.exec.parallel.thread.number": "8",
        "hive.compactor.delta.pct.threshold": "0.1f",
        "hive.user.install.directory": "/user/",
        "hive.exec.dynamic.partition.mode": "nonstrict",
        "hive.exec.dynamic.partition": "true",
        "hive.metastore.failure.retries": "24",
        "hive.exec.submitviachild": "false",
        "hive.mapjoin.bucket.cache.size": "10000",
        "hive.enforce.sortmergebucketmapjoin": "true",
        "hive.vectorized.execution.enabled": "true",
        "hive_metastore_user_passwd": "hive",
        "hive.security.authorization.enabled": "false",
        "hive.exec.failu

In [114]:
body = config_new
r = s.put(api_url + '/api/v1/clusters/' + cluster, data=json.dumps(body))

print(r.url)
print(r.status_code)
assert r.status_code == 200
print("Configuration changed successfully!")
print(json.dumps(r.json(), indent=2))

http://208.72.157.66:8080/api/v1/clusters/Sandbox
200
Configuration changed successfully!
{
  "resources": [
    {
      "service_name": "HIVE",
      "group_name": null,
      "service_config_version_note": null,
      "service_config_version": 3,
      "group_id": null,
      "configurations": [
        {
          "version": 3,
          "type": "hive-site",
          "configAttributes": {},
          "clusterName": "Sandbox",
          "configs": {
            "hive.tez.dynamic.partition.pruning": "true",
            "hive.map.aggr": "true",
            "hive.server2.thrift.max.worker.threads": "500",
            "hive.metastore.sasl.enabled": "false",
            "hive.stats.dbclass": "fs",
            "hive.server2.allow.user.substitution": "true",
            "hive.compactor.delta.pct.threshold": "0.1f",
            "hive.user.install.directory": "/user/",
            "hive.exec.submitviachild": "false",
            "hive.exec.dynamic.partition.mode": "nonstrict",
            "h