In [None]:

1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.
import xml.etree.ElementTree as ET

def read_hadoop_config(file_path):
    """Parse a Hadoop XML configuration file and print selected core settings.

    Every ``<property>`` element directly under the document root is read as
    a ``<name>``/``<value>`` pair. Three well-known keys are then reported:
    ``fs.defaultFS``, ``dfs.replication`` and ``yarn.resourcemanager.address``.

    Args:
        file_path: Filename or file object accepted by ``ElementTree.parse``.

    Returns:
        dict: The three reported keys mapped to their configured value, or
        ``None`` when the key is absent from the file (or has no value).

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(file_path).getroot()
    components = {}

    # `prop` instead of `property` so the builtin is not shadowed.
    for prop in root.findall('property'):
        name_element = prop.find('name')
        name = (name_element.text or '').strip() if name_element is not None else ''
        if not name:
            # A property without a usable name cannot be looked up; skip it
            # instead of crashing on `None.text`.
            continue
        value_element = prop.find('value')
        # A missing <value> is recorded as None rather than raising.
        components[name] = value_element.text if value_element is not None else None

    reported_keys = ("fs.defaultFS", "dfs.replication", "yarn.resourcemanager.address")
    core_components = {key: components.get(key) for key in reported_keys}

    print("Core Components of Hadoop Configuration:")
    for key, value in core_components.items():
        print(f"{key}: {value}")

    return core_components

In [None]:
2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.
from hdfs import InsecureClient

def calculate_total_file_size(hdfs_url, directory):
    """Sum the sizes of the entries directly inside an HDFS directory.

    Only the entries returned by ``client.list`` for ``directory`` are
    summed; the function does not descend into subdirectories.

    Args:
        hdfs_url: Base URL of the WebHDFS endpoint passed to ``InsecureClient``.
        directory: HDFS path to inspect.

    Returns:
        int: Total of the ``length`` fields of the listed entries. ``0`` when
        the path does not exist. When the path is itself a file, that file's
        own ``length``.
    """
    client = InsecureClient(hdfs_url)
    # strict=False makes a missing path return None instead of raising.
    status = client.status(directory, strict=False)
    if not status:
        return 0

    # Listing a plain file is an error, so report its own size directly.
    if status.get('type') == 'FILE':
        return status.get('length', 0)

    total_size = 0
    # With status=True each entry is a (name, status_dict) tuple, not a dict.
    for _name, file_status in client.list(directory, status=True):
        total_size += file_status['length']

    return total_size

# Example usage:
# hdfs_url = 'http://localhost:50070'
# directory = '/user/hadoop/data'
# print(f"Total file size: {calculate_total_file_size(hdfs_url, directory)} bytes")

In [None]:
3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.
from collections import Counter
from mrjob.job import MRJob

class MRWordFreqCount(MRJob):
    """MapReduce job that counts occurrences of each lowercased word."""

    def mapper(self, _, line):
        """Emit ``(lowercased token, 1)`` for each whitespace-separated token of ``line``."""
        yield from ((token.lower(), 1) for token in line.split())

    def reducer(self, word, counts):
        """Emit ``word`` together with the sum of its partial counts."""
        total = sum(counts)
        yield word, total

def top_n_words(input_file, n):
    """Run the word-count job on ``input_file`` and print the ``n`` most frequent words.

    Args:
        input_file: Path handed to ``MRWordFreqCount`` as its only argument.
        n: Number of entries to print, most frequent first.
    """
    frequencies = Counter()
    mr_job = MRWordFreqCount(args=[input_file])

    # The job output is consumed while the runner context is still open.
    with mr_job.make_runner() as job_runner:
        job_runner.run()
        output_pairs = mr_job.parse_output(job_runner.cat_output())
        for token, occurrences in output_pairs:
            frequencies[token] += occurrences

    for token, total in frequencies.most_common(n):
        print(f"{token}: {total}")

# Example usage:
# top_n_words('large_text_file.txt', 10)

In [None]:
4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.
import requests
import json

def check_hadoop_health_status(namenode_url, timeout=10):
    """Print the NameNode state and the live/dead DataNode counts from JMX.

    Two JMX beans are queried on the NameNode's HTTP endpoint:
    ``NameNodeStatus`` (for ``State``) and ``FSNamesystem`` (for
    ``NumLiveDataNodes`` / ``NumDeadDataNodes``).

    Args:
        namenode_url: Base URL of the NameNode web endpoint, without a
            trailing slash.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict: ``{'namenode_state', 'live_datanodes', 'dead_datanodes'}`` with
        the values that were printed.

    Raises:
        IndexError: If a queried bean is absent from the JMX response.
        Exception: Whatever ``requests`` raises on connection failure,
            timeout, or a non-success HTTP status.
    """
    def _first_bean(query):
        # Without a timeout an unreachable NameNode would block the call forever.
        response = requests.get(f'{namenode_url}/jmx?qry={query}', timeout=timeout)
        # Fail on HTTP errors here rather than on a confusing JSON/KeyError later.
        response.raise_for_status()
        beans = response.json().get('beans') or []
        if not beans:
            raise IndexError(f"JMX bean not found: {query}")
        return beans[0]

    health_status = _first_bean('Hadoop:service=NameNode,name=NameNodeStatus')['State']
    print(f"NameNode Health Status: {health_status}")

    namesystem = _first_bean('Hadoop:service=NameNode,name=FSNamesystem')
    live_datanodes = namesystem['NumLiveDataNodes']
    dead_datanodes = namesystem['NumDeadDataNodes']

    print(f"Live DataNodes: {live_datanodes}")
    print(f"Dead DataNodes: {dead_datanodes}")

    return {
        'namenode_state': health_status,
        'live_datanodes': live_datanodes,
        'dead_datanodes': dead_datanodes,
    }

# Example usage:
# check_hadoop_health_status('http://localhost:50070')

In [None]:
5. Develop a Python program that lists all the files and directories in a specific HDFS path.
from hdfs import InsecureClient

def list_hdfs_directory(hdfs_url, directory):
    """Print the names of the entries found under ``directory`` on HDFS.

    Args:
        hdfs_url: Base URL handed to ``InsecureClient``.
        directory: HDFS path whose entries are listed.
    """
    # The listing is fetched before the header is printed, so a failing
    # lookup produces no partial output.
    entries = InsecureClient(hdfs_url).list(directory)
    print(f"Files and directories in '{directory}':")
    for entry in entries:
        print(entry)

# Example usage:
# list_hdfs_directory('http://localhost:50070', '/user/hadoop')

In [None]:
6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.
import requests

def analyze_storage_utilization(namenode_url, timeout=10):
    """Report the live DataNodes with the highest and lowest storage utilization.

    The ``LiveNodes`` attribute of the NameNode's ``NameNodeInfo`` JMX bean is
    decoded as JSON, and each node is ranked by ``usedSpace / capacity``.

    Args:
        namenode_url: Base URL of the NameNode web endpoint, without a
            trailing slash.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        tuple | None: ``(highest, lowest)`` where each item is a
        ``(node, usedSpace, capacity)`` tuple, or ``None`` when no live
        DataNodes are reported.
    """
    # Imported locally so the function does not rely on another cell's import.
    import json

    response = requests.get(
        f'{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo',
        timeout=timeout,
    )
    response.raise_for_status()
    # LiveNodes is itself a JSON-encoded string inside the JSON response.
    datanodes = json.loads(response.json()['beans'][0]['LiveNodes'])

    storage_info = [
        (node, details['usedSpace'], details['capacity'])
        for node, details in datanodes.items()
    ]

    if not storage_info:
        # Nothing to rank; indexing [0] / [-1] below would raise IndexError.
        print("No live DataNodes reported.")
        return None

    def _utilization(entry):
        _node, used, capacity = entry
        # A node reporting zero capacity is ranked as 0% used instead of
        # triggering a ZeroDivisionError.
        return used / capacity if capacity else 0.0

    storage_info.sort(key=_utilization, reverse=True)

    print("DataNodes with highest storage utilization:")
    print(storage_info[0])

    print("DataNodes with lowest storage utilization:")
    print(storage_info[-1])

    return storage_info[0], storage_info[-1]

# Example usage:
# analyze_storage_utilization('http://localhost:50070')

In [None]:
7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.
import requests

def submit_hadoop_job(yarn_url, job_details, poll_interval=5, timeout=30):
    """Submit an application to the YARN ResourceManager and wait for it to end.

    The ResourceManager's submit endpoint replies without a response body, so
    the application ID cannot be read from the submit response. If
    ``job_details`` does not already carry an ``application-id``, one is
    requested from ``/ws/v1/cluster/apps/new-application`` and added to the
    submitted payload.

    Args:
        yarn_url: Base URL of the ResourceManager, without a trailing slash.
        job_details: Dict with the application submission body. It is not
            modified; a copy carrying the application ID is sent.
        poll_interval: Seconds to sleep between status polls.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict: The ``app`` object of the last status response, i.e. the one in
        which the state was ``FINISHED``, ``FAILED`` or ``KILLED``.
    """
    # Local import keeps this cell runnable on its own.
    import time

    apps_url = f'{yarn_url}/ws/v1/cluster/apps'

    app_id = job_details.get('application-id')
    if not app_id:
        new_app_response = requests.post(f'{apps_url}/new-application', timeout=timeout)
        new_app_response.raise_for_status()
        app_id = new_app_response.json()['application-id']
    # Copy so the caller's dict is left untouched.
    payload = {**job_details, 'application-id': app_id}

    response = requests.post(apps_url, json=payload, timeout=timeout)
    response.raise_for_status()
    print(f"Job submitted with Application ID: {app_id}")

    # Monitor progress
    terminal_states = ('FINISHED', 'FAILED', 'KILLED')
    while True:
        status_response = requests.get(f'{apps_url}/{app_id}', timeout=timeout)
        status_response.raise_for_status()
        app = status_response.json()['app']
        print(f"Job {app_id} Status: {app['state']}")
        if app['state'] in terminal_states:
            break
        # Pause between polls instead of hammering the ResourceManager.
        time.sleep(poll_interval)

    # Retrieve output
    if app['finalStatus'] == 'SUCCEEDED':
        print(f"Job {app_id} completed successfully")
        # Retrieve output logic can be implemented here

    return app

In [None]:
8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.
import requests

def submit_hadoop_job_with_resources(yarn_url, job_details, resource_requirements,
                                     poll_interval=5, timeout=30):
    """Submit a YARN application with resource requirements and track its usage.

    The ResourceManager's submit endpoint replies without a response body, so
    the application ID cannot be read from the submit response. If
    ``job_details`` does not already carry an ``application-id``, one is
    requested from ``/ws/v1/cluster/apps/new-application`` and added to the
    submitted payload. While the application is running, the
    resource-related fields of each status response are printed.

    Args:
        yarn_url: Base URL of the ResourceManager, without a trailing slash.
        job_details: Dict with the application submission body. It is not
            modified; a copy is sent.
        resource_requirements: Value sent under the payload's ``resource``
            key, e.g. ``{"vCores": 4, "memory": 4096}``.
        poll_interval: Seconds to sleep between status polls.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict: The ``app`` object of the last status response, i.e. the one in
        which the state was ``FINISHED``, ``FAILED`` or ``KILLED``.
    """
    # Local import keeps this cell runnable on its own.
    import time

    apps_url = f'{yarn_url}/ws/v1/cluster/apps'

    app_id = job_details.get('application-id')
    if not app_id:
        new_app_response = requests.post(f'{apps_url}/new-application', timeout=timeout)
        new_app_response.raise_for_status()
        app_id = new_app_response.json()['application-id']
    # Build a new payload rather than writing 'resource' into the caller's dict.
    payload = {**job_details, 'application-id': app_id, 'resource': resource_requirements}

    response = requests.post(apps_url, json=payload, timeout=timeout)
    response.raise_for_status()
    print(f"Job submitted with Application ID: {app_id}")

    # Resource-related fields of the REST "app" object; .get() tolerates
    # ResourceManager versions that omit some of them.
    usage_fields = ('allocatedMB', 'allocatedVCores', 'runningContainers',
                    'memorySeconds', 'vcoreSeconds')
    terminal_states = ('FINISHED', 'FAILED', 'KILLED')

    # Monitor resource usage and progress
    while True:
        status_response = requests.get(f'{apps_url}/{app_id}', timeout=timeout)
        status_response.raise_for_status()
        app = status_response.json()['app']
        print(f"Job {app_id} Status: {app['state']}")
        if app['state'] in terminal_states:
            break

        resource_usage = {field: app.get(field) for field in usage_fields}
        print(f"Resource Usage: {resource_usage}")
        # Pause between polls instead of hammering the ResourceManager.
        time.sleep(poll_interval)

    # Retrieve output
    if app['finalStatus'] == 'SUCCEEDED':
        print(f"Job {app_id} completed successfully")

    return app

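# Example usage (job_details must be defined first: a dict holding the
# application submission body to send to the ResourceManager):
# resource_requirements = {"vCores": 4, "memory": 4096}
# submit_hadoop_job_with_resources('http://localhost:8088', job_details, resource_requirements)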