**1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.**

In [1]:
import configparser

def read_hadoop_config(config_file):
    """Print the core Hadoop components named in an INI-style config file.

    The file is parsed with ``configparser`` and may contain ``[core-site]``,
    ``[hdfs-site]`` and ``[yarn-site]`` sections whose option names mirror the
    Hadoop property names read below.  Sections or options that are absent
    are skipped; when nothing is found (including when the file does not
    exist) nothing is printed.

    Args:
        config_file: Path to the configuration file.
    """
    config = configparser.ConfigParser()
    config.read(config_file)

    # Collect first so the header is printed exactly once, and only when
    # there is at least one component to list under it.
    components = []

    if 'core-site' in config:
        # fs.defaultFS has the form "<scheme>://<authority>"; report the part
        # after the scheme separator.
        default_fs = config['core-site'].get('fs.defaultFS', '').split('://')
        if len(default_fs) > 1:
            components.append(f"NameNode: {default_fs[1]}")

    if 'hdfs-site' in config:
        # The address is "<host>:<port>"; only the host is reported.
        secondary = config['hdfs-site'].get(
            'dfs.namenode.secondary.http-address', ''
        ).split(':')
        if len(secondary) > 1:
            components.append(f"SecondaryNameNode: {secondary[0]}")

    if 'yarn-site' in config:
        yarn = config['yarn-site']

        resource_manager = yarn.get('yarn.resourcemanager.hostname', '')
        if resource_manager:
            components.append(f"ResourceManager: {resource_manager}")

        aux_key = 'yarn.nodemanager.aux-services'
        try:
            has_node_manager = yarn.getboolean(aux_key)
        except ValueError:
            # The option holds something other than a boolean word (for
            # example a service name); treat any non-empty value as "set"
            # instead of crashing.
            has_node_manager = bool(yarn.get(aux_key, '').strip())
        if has_node_manager:
            components.append("NodeManager")

    if components:
        print("Core Components of Hadoop:")
        for component in components:
            print(f" - {component}")

if __name__ == '__main__':
    # Entry point when run as the main program: parse the file below and
    # print whichever Hadoop components it names.
    config_file = '/path/to/hadoop.conf'  # Specify the path to your Hadoop configuration file
    read_hadoop_config(config_file)


**2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.**

In [2]:
from snakebite.client import Client

def calculate_hdfs_directory_size(directory, host='localhost', port=9000, recurse=False):
    """Return the summed ``length`` of the entries listed under an HDFS directory.

    Args:
        directory: HDFS path whose listing is summed.
        host: NameNode host to connect to.
        port: NameNode RPC port to connect to.
        recurse: When true, ask snakebite to list the directory recursively
            so that entries in nested directories are included.  The default
            (false) only considers the directory's direct children.

    Returns:
        The sum of the ``length`` field of every listed entry.
    """
    client = Client(host, port)

    # client.ls() takes a list of paths and yields one dict per entry.
    return sum(
        entry['length']
        for entry in client.ls([directory], recurse=recurse)
    )

if __name__ == '__main__':
    # Entry point when run as the main program: total the sizes listed for
    # one HDFS directory and report the result.
    hdfs_directory = '/user/hadoop/data'  # Specify the HDFS directory path
    size = calculate_hdfs_directory_size(hdfs_directory)
    print(f"Total file size in HDFS directory '{hdfs_directory}': {size} bytes")


**3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.**

In [3]:
import string

def find_top_n_frequent_word(file, N):
    """Return the N most frequent words in a text file.

    Words are compared case-insensitively, ASCII punctuation is removed, and
    any run of whitespace separates words, so blank lines and repeated
    spaces never produce empty "words".

    Args:
        file: Path of the text file to read.
        N: Number of entries to keep (applied as a slice bound, so ``None``
            keeps every word).

    Returns:
        A dict mapping word -> count, ordered from most to least frequent.
        Words with equal counts keep the order in which they first appeared.
    """
    # Imported locally so the function does not depend on imports elsewhere.
    from collections import Counter

    # Build the punctuation-stripping table once instead of once per line.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    counts = Counter()

    # The file is streamed line by line, so it is never held in memory whole.
    with open(file, "r") as text:
        for line in text:
            # "Map" step: normalise the line into lowercase, punctuation-free words.
            words = line.lower().translate(strip_punctuation).split()
            # "Reduce" step: fold this line's words into the running totals.
            counts.update(words)

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:N])

#### 4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.


In [None]:
import requests

def _fetch_jmx_state(url, timeout):
    """Return the 'State' of the first JMX bean at *url*, or a short reason why it is unavailable."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()['beans'][0]['State']
    except requests.exceptions.RequestException as exc:
        # Connection refused, timeout, HTTP error status, and similar.
        return f"UNREACHABLE ({exc})"
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        # The endpoint answered, but not with the expected bean layout.
        return f"UNKNOWN (unexpected JMX response: {exc!r})"


def check_hadoop_cluster_health(
    namenode_url='http://<namenode-host>:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo',
    datanode_url='http://<datanode-host>:50075/jmx?qry=Hadoop:service=DataNode,name=DataNodeInfo',
    timeout=10,
):
    """Print the state reported by a NameNode and a DataNode JMX endpoint.

    A node that cannot be reached, or whose response lacks the expected
    fields, is reported as such instead of aborting the check, so one failing
    node does not hide the status of the other.

    Args:
        namenode_url: JMX query URL of the NameNode.
        datanode_url: JMX query URL of the DataNode.
        timeout: Seconds to wait for each HTTP request.
    """
    # NOTE(review): both queries are assumed to return a bean that carries a
    # 'State' field; confirm against the JMX output of your Hadoop version.
    namenode_status = _fetch_jmx_state(namenode_url, timeout)
    datanode_status = _fetch_jmx_state(datanode_url, timeout)

    # Print health status
    print(f"NameNode Health: {namenode_status}")
    print(f"DataNode Health: {datanode_status}")

if __name__ == '__main__':
    # Run the health check only when this code is the main program.
    check_hadoop_cluster_health()


#### 5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
from snakebite.client import Client

def list_hdfs_path(path, host='localhost', port=9000):
    """Print the path of every entry listed under an HDFS path.

    Args:
        path: HDFS path to list.
        host: NameNode host to connect to.
        port: NameNode RPC port to connect to.
    """
    client = Client(host, port)

    # client.ls() takes a list of paths and yields one dict per entry.
    for entry in client.ls([path]):
        print(entry['path'])

if __name__ == '__main__':
    # Entry point when run as the main program: print the listing of one HDFS path.
    hdfs_path = '/user/hadoop/data'  # Specify the HDFS path
    list_hdfs_path(hdfs_path)

#### 6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [6]:
import requests

def analyze_data_node_storage_utilization(
    datanodes_url='http://<namenode-host>:50070/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState-*',
    timeout=10,
):
    """Print which DataNodes report the highest and lowest storage capacity.

    Args:
        datanodes_url: JMX query URL that returns the DataNode beans.
        timeout: Seconds to wait for the HTTP request.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(datanodes_url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    datanodes = response.json()['beans']

    print("DataNode Storage Utilization Analysis:")

    if not datanodes:
        # Without this guard the report would show meaningless sentinel
        # values for the highest and lowest capacity.
        print("No DataNodes reported.")
        return

    # NOTE(review): each bean is assumed to nest its figures under
    # 'FSDatasetState' and its identity under 'DatanodeInfo' -> 'DatanodeID';
    # verify this layout against the actual JMX response.
    states = [bean['FSDatasetState'] for bean in datanodes]

    # max()/min() return the first of several equal candidates, so ties go to
    # the DataNode that appears first in the response.
    largest = max(states, key=lambda state: state['Capacity'])
    smallest = min(states, key=lambda state: state['Capacity'])

    highest_storage = largest['Capacity']
    highest_storage_datanode = largest['DatanodeInfo']['DatanodeID']
    lowest_storage = smallest['Capacity']
    lowest_storage_datanode = smallest['DatanodeInfo']['DatanodeID']

    print(f"Highest Storage Capacity: {highest_storage} bytes ({highest_storage_datanode})")
    print(f"Lowest Storage Capacity: {lowest_storage} bytes ({lowest_storage_datanode})")

if __name__ == '__main__':
    # Run the storage analysis only when this code is the main program.
    analyze_data_node_storage_utilization()


#### 7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.


In [7]:
import requests
import time

def submit_hadoop_job(jar_path, main_class, input_path, output_path,
                      resourcemanager_url='http://<resourcemanager-host>:8088',
                      poll_interval=5):
    """Submit a Hadoop job through the YARN ResourceManager REST API and poll it.

    The function first requests a new application ID, then submits the
    application, and finally polls the application's status until it reaches
    a terminal state, printing progress along the way.

    Args:
        jar_path: Path of the job JAR passed to ``hadoop jar``.
        main_class: Main class of the job.
        input_path: Input path handed to the job.
        output_path: Output path handed to the job.
        resourcemanager_url: Base URL (scheme, host and port) of the
            ResourceManager web service.
        poll_interval: Seconds to wait between status checks.
    """
    apps_url = f'{resourcemanager_url}/ws/v1/cluster/apps'

    # Step 1: ask the ResourceManager for a fresh application ID.
    response = requests.post(f'{apps_url}/new-application')
    if response.status_code != 200:
        print("Failed to submit the Hadoop job.")
        return

    application_id = response.json()['application-id']
    print(f"Hadoop job submitted. Application ID: {application_id}")

    # Define the Hadoop job parameters
    data = {
        "application-id": application_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": f"hadoop jar {jar_path} {main_class} {input_path} {output_path}"
            },
            "memory": 1024,
            "vcores": 1
        },
        "unmanaged-AM": False,
        "max-app-attempts": 2,
        # The REST API spells the core count "vCores".
        "resource": {
            "memory": 1024,
            "vCores": 1
        },
        "priority": 0,
        "queue": "default",
        "timeout": 0,
        "attempt-failures-validity-interval": -1,
        "keep-containers-across-application-attempts": False,
        "application-type": "MAPREDUCE"
    }

    # Step 2: applications are submitted with a POST to the apps collection;
    # the ResourceManager acknowledges with 202 Accepted.
    response = requests.post(apps_url, json=data)
    if response.status_code != 202:
        print("Failed to submit the Hadoop job request.")
        return

    print("Hadoop job request submitted successfully.")

    # Step 3: poll the application until it reaches a terminal state.
    job_status_url = f'{apps_url}/{application_id}'
    while True:
        response = requests.get(job_status_url)
        if response.status_code == 200:
            job_info = response.json()['app']
            state = job_info['state']

            if state == 'FINISHED':
                print("Hadoop job completed.")
                break
            if state in ('FAILED', 'KILLED'):
                # These states are terminal as well; without this check the
                # loop would poll forever.
                print(f"Hadoop job ended in state {state}.")
                break

            progress = job_info['progress']
            print(f"Job progress: {progress}")

        time.sleep(poll_interval)  # Wait before checking job status again

if __name__ == '__main__':
    # Entry point when run as the main program: submit one job with the
    # example values below and follow it until the function returns.
    jar_path = '/path/to/hadoop-job.jar'  # Specify the path to your Hadoop job JAR file
    main_class = 'com.example.hadoop.JobMainClass'  # Specify the main class of your Hadoop job
    input_path = '/user/hadoop/input'  # Specify the HDFS input path
    output_path = '/user/hadoop/output'  # Specify the HDFS output path

    submit_hadoop_job(jar_path, main_class, input_path, output_path)


#### 8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.

In [8]:
import requests
import time

def submit_hadoop_job(jar_path, main_class, input_path, output_path, memory, vcores,
                      resourcemanager_url='http://<resourcemanager-host>:8088',
                      poll_interval=5):
    """Submit a Hadoop job with explicit resource requirements and track its usage.

    The function requests a new application ID from the YARN ResourceManager,
    submits the application with the requested memory and cores, and then
    polls its status until it reaches a terminal state, printing progress and
    the currently allocated resources on every check.

    Args:
        jar_path: Path of the job JAR passed to ``hadoop jar``.
        main_class: Main class of the job.
        input_path: Input path handed to the job.
        output_path: Output path handed to the job.
        memory: Memory to request, in MB.
        vcores: Number of virtual cores to request.
        resourcemanager_url: Base URL (scheme, host and port) of the
            ResourceManager web service.
        poll_interval: Seconds to wait between status checks.
    """
    apps_url = f'{resourcemanager_url}/ws/v1/cluster/apps'

    # Step 1: ask the ResourceManager for a fresh application ID.
    response = requests.post(f'{apps_url}/new-application')
    if response.status_code != 200:
        print("Failed to submit the Hadoop job.")
        return

    application_id = response.json()['application-id']
    print(f"Hadoop job submitted. Application ID: {application_id}")

    # Define the Hadoop job parameters
    data = {
        "application-id": application_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": f"hadoop jar {jar_path} {main_class} {input_path} {output_path}"
            },
            "resource": {
                "memory": memory,
                "vcores": vcores
            }
        },
        # The REST API spells the core count "vCores".
        "resource": {
            "memory": memory,
            "vCores": vcores
        },
        "application-type": "MAPREDUCE"
    }

    # Step 2: applications are submitted with a POST to the apps collection;
    # the ResourceManager acknowledges with 202 Accepted.
    response = requests.post(apps_url, json=data)
    if response.status_code != 202:
        print("Failed to submit the Hadoop job request.")
        return

    print("Hadoop job request submitted successfully.")

    # Step 3: poll the application until it reaches a terminal state.
    job_status_url = f'{apps_url}/{application_id}'
    while True:
        response = requests.get(job_status_url)
        if response.status_code == 200:
            job_info = response.json()['app']
            state = job_info['state']

            if state == 'FINISHED':
                print("Hadoop job completed.")
                break
            if state in ('FAILED', 'KILLED'):
                # These states are terminal as well; without this check the
                # loop would poll forever.
                print(f"Hadoop job ended in state {state}.")
                break

            progress = job_info['progress']
            allocated_memory = job_info['allocatedMB']
            allocated_vcores = job_info['allocatedVCores']
            print(f"Job progress: {progress}")
            print(f"Allocated resources: Memory = {allocated_memory} MB, vCores = {allocated_vcores}")

        time.sleep(poll_interval)  # Wait before checking job status again

if __name__ == '__main__':
    # Entry point when run as the main program: submit one job with the
    # example paths and resource requirements below.
    jar_path = '/path/to/hadoop-job.jar'  # Specify the path to your Hadoop job JAR file
    main_class = 'com.example.hadoop.JobMainClass'  # Specify the main class of your Hadoop job
    input_path = '/user/hadoop/input'  # Specify the HDFS input path
    output_path = '/user/hadoop/output'  # Specify the HDFS output path
    memory = 2048  # Specify the required memory in MB
    vcores = 2  # Specify the required number of vCores

    submit_hadoop_job(jar_path, main_class, input_path, output_path, memory, vcores)


#### 9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.

In [9]:
from mrjob.job import MRJob
import time

class MapReduceJob(MRJob):
    """mrjob job that adds up the lengths of all input lines under one key."""

    def configure_args(self):
        """Register the job's extra ``--split-size`` command-line option."""
        super().configure_args()
        self.add_passthru_arg(
            '--split-size',
            type=int,
            help='Input split size in bytes',
        )

    def mapper(self, _, line):
        """Emit the length of one input line under the shared ``None`` key."""
        line_length = len(line)
        yield None, line_length

    def reducer(self, key, values):
        """Emit the key together with the sum of every length seen for it."""
        total_length = sum(values)
        yield key, total_length

if __name__ == '__main__':
    split_sizes = [10, 100, 1000]  # Specify the different input split sizes in bytes

    # NOTE(review): MapReduceJob only declares the --split-size option; nothing
    # visible in its mapper or reducer reads it, so confirm that the value
    # really changes how the input is split before comparing the timings.
    for split_size in split_sizes:
        # perf_counter() is a monotonic, high-resolution clock, so the measured
        # interval is not distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Run the MapReduce job with the specified input split size
        job = MapReduceJob(args=['-r', 'local', '--split-size', str(split_size), 'input.txt'])
        with job.make_runner() as runner:
            runner.run()

            # Print the job execution time
            execution_time = time.perf_counter() - start_time
            print(f"Input Split Size: {split_size} bytes")
            print(f"Job Execution Time: {execution_time} seconds")
