1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.

In [None]:
import configparser

# Specify the path to the Hadoop configuration file.
# Hadoop site files (core-site.xml, hdfs-site.xml, ...) are XML documents;
# hadoop-env.sh is a shell script and holds no 'fs.defaultFS' property, so
# core-site.xml is the file that has to be read here.
hadoop_conf_file = '/path/to/hadoop/conf/core-site.xml'


def read_hadoop_properties(conf_file):
    """Return the properties of a Hadoop XML configuration file as a dict.

    Args:
        conf_file: Path to a Hadoop ``*-site.xml`` file, i.e. an XML document
            containing ``<property>`` elements that each hold a ``<name>``
            and a ``<value>`` child.

    Returns:
        A dict mapping each property name to its value, both stripped of
        surrounding whitespace. The value is ``None`` when a property has no
        ``<value>`` child. Properties without a name are skipped.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Imported locally so the helper does not rely on the cell's import line.
    import xml.etree.ElementTree as ET

    properties = {}
    for prop in ET.parse(conf_file).getroot().iter('property'):
        name = prop.findtext('name')
        if not name or not name.strip():
            continue
        value = prop.findtext('value')
        properties[name.strip()] = value.strip() if value is not None else None
    return properties


if __name__ == '__main__':
    # Retrieve the core components from the configuration file
    core_components = read_hadoop_properties(hadoop_conf_file).get('fs.defaultFS')

    # Display the core components
    print("Core Components of Hadoop:")
    print(core_components)



2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.


In [None]:
pip install hdfs

from hdfs import InsecureClient

def calculate_total_file_size(hdfs_url, directory):
    """Return the combined size, in bytes, of the entries directly inside an HDFS directory.

    Args:
        hdfs_url: Base URL passed to ``InsecureClient``.
        directory: HDFS path of the directory to measure.

    Returns:
        The sum of the ``length`` field of every entry listed directly under
        ``directory``. Sub-directories are not descended into.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # With status=True, list() returns (name, status) tuples rather than bare
    # status dicts, so each entry has to be unpacked before reading 'length'.
    entries = client.list(directory, status=True)

    return sum(status['length'] for _name, status in entries)


3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import heapq

class TopNWords(MRJob):
    """Two-step MapReduce job that outputs the N most frequent words.

    Step 1 counts the occurrences of every lower-cased, whitespace-separated
    token. Step 2 gathers all ``(count, word)`` pairs under a single key and
    keeps the N largest. N is supplied through the ``--top-n`` option so the
    job does not depend on a module-level global being defined.
    """

    def configure_args(self):
        """Register the ``--top-n`` pass-through option."""
        super().configure_args()
        self.add_passthru_arg(
            '--top-n',
            type=int,
            default=10,
            help='Number of most frequent words to output (default: 10)'
        )

    def mapper_get_words(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated token of ``line``."""
        for word in line.split():
            yield word.lower(), 1

    def combiner_count_words(self, word, counts):
        """Pre-aggregate the counts of ``word`` before they reach the reducer."""
        yield word, sum(counts)

    def reducer_count_words(self, word, counts):
        """Emit ``(count, word)`` under the single key ``None`` for the next step."""
        yield None, (sum(counts), word)

    def reducer_find_top_n_words(self, _, word_counts):
        """Emit the ``--top-n`` pairs with the highest counts as ``(word, count)``."""
        top_n = heapq.nlargest(self.options.top_n, word_counts)
        for count, word in top_n:
            yield word, count

    def steps(self):
        """Return the two steps: word counting, then top-N selection."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_top_n_words)
        ]


if __name__ == '__main__':
    N = 10  # Specify the value of N for the top N words
    input_file = 'path/to/large_text_file.txt'  # Replace with the path to your large text file
    # N is handed to the job as an option instead of being read as a global.
    mr_job = TopNWords(args=[input_file, f'--top-n={N}'])
    with mr_job.make_runner() as runner:
        runner.run()
        for word, count in mr_job.parse_output(runner.cat_output()):
            print(f"{word}: {count}")


4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.


In [None]:
import requests

# Hadoop cluster configuration
namenode_host = 'namenode_hostname'
namenode_port = 50070
# NOTE(review): no longer queried below (DataNode health is taken from the
# NameNode's view of the cluster); kept in case other code reads it.
datanode_port = 50075


def fetch_jmx_bean(host, port, bean_name, timeout=10):
    """Return the first JMX bean matching ``bean_name``, or ``None`` if there is none.

    Args:
        host: Host name of the Hadoop daemon's web server.
        port: HTTP port of the daemon's web server.
        bean_name: Value for the ``qry`` parameter of the ``/jmx`` endpoint.
        timeout: Seconds to wait for the HTTP response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    url = f"http://{host}:{port}/jmx?qry={bean_name}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    beans = response.json().get('beans', [])
    return beans[0] if beans else None


# Check NameNode health status
namenode_bean = fetch_jmx_bean(
    namenode_host, namenode_port, 'Hadoop:service=NameNode,name=NameNodeStatus')
namenode_status = namenode_bean.get('State', 'UNKNOWN') if namenode_bean else 'UNKNOWN'

# Check DataNode health status.
# The DataNodeInfo bean exposes no 'State' attribute, so the NameNode's
# FSNamesystemState bean is used instead: it reports how many DataNodes the
# NameNode currently considers live and dead.
fs_state_bean = fetch_jmx_bean(
    namenode_host, namenode_port, 'Hadoop:service=NameNode,name=FSNamesystemState')
if fs_state_bean:
    live_count = fs_state_bean.get('NumLiveDataNodes', 'unknown')
    dead_count = fs_state_bean.get('NumDeadDataNodes', 'unknown')
    datanode_status = f"{live_count} live, {dead_count} dead"
else:
    datanode_status = 'UNKNOWN'

# Display the health status
print("NameNode Status:", namenode_status)
print("DataNode Status:", datanode_status)


5. Develop a Python program that lists all the files and directories in a specific HDFS path.


In [None]:
from hdfs import InsecureClient

def list_hdfs_path(hdfs_url, hdfs_path):
    """Print every file and directory located directly under an HDFS path.

    Each entry is printed on its own line, prefixed with ``[DIR]`` for
    directories and ``[FILE]`` for everything else.

    Args:
        hdfs_url: Base URL passed to ``InsecureClient``.
        hdfs_path: HDFS directory whose entries are listed.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # Strip a trailing slash so the joined paths below do not contain '//'.
    base_path = hdfs_path.rstrip('/')

    # With status=True, list() returns (name, status) tuples. The status dict
    # carries the entry type but no 'path' key, so the full path is rebuilt
    # from the listed directory and the entry name.
    for name, status in client.list(hdfs_path, status=True):
        label = 'DIR' if status['type'] == 'DIRECTORY' else 'FILE'
        print(f"[{label}] {base_path}/{name}")


if __name__ == '__main__':
    # Entry point: choose the directory and the cluster, then print the listing.
    hdfs_path = '/your/hdfs/path'  # Replace with the specific HDFS path
    hdfs_url = 'http://localhost:50070'  # Replace with the HDFS URL
    list_hdfs_path(hdfs_url=hdfs_url, hdfs_path=hdfs_path)


6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.


In [None]:
import requests

def analyze_datanode_storage(hadoop_web_url):
    """Print the live DataNodes with the highest and lowest storage capacity.

    Per-node figures are read from the ``LiveNodes`` attribute of the
    NameNode's ``NameNodeInfo`` JMX bean, which covers every live DataNode.
    The DataNode-only ``FSDatasetState`` bean is not served by the NameNode
    web endpoint and describes a single node at best.

    Args:
        hadoop_web_url: Base URL of the NameNode web interface,
            e.g. ``http://localhost:9870``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Imported locally so the function does not rely on the cell's import line.
    import json

    # Retrieve DataNode information from Hadoop's REST API
    url = f"{hadoop_web_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    beans = response.json().get('beans', [])

    # LiveNodes is itself a JSON document serialized into a string attribute,
    # mapping each DataNode to a dict of its statistics.
    live_nodes = json.loads(beans[0].get('LiveNodes', '{}')) if beans else {}
    if not live_nodes:
        print("No live DataNodes reported by the NameNode.")
        return

    def capacity_of(item):
        return item[1]['capacity']

    # Identify the nodes with the highest and lowest storage capacity
    highest_host, highest_info = max(live_nodes.items(), key=capacity_of)
    lowest_host, lowest_info = min(live_nodes.items(), key=capacity_of)

    # Print the results
    print("DataNode with the highest storage capacity:")
    print(f"Hostname: {highest_host}")
    print(f"Storage Capacity: {highest_info['capacity']} bytes")

    print("\nDataNode with the lowest storage capacity:")
    print(f"Hostname: {lowest_host}")
    print(f"Storage Capacity: {lowest_info['capacity']} bytes")


if __name__ == '__main__':
    # Entry point: run the storage analysis against the configured web endpoint.
    hadoop_web_url = 'http://localhost:9870'  # Replace with the Hadoop Web URL
    analyze_datanode_storage(hadoop_web_url=hadoop_web_url)


7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.


In [None]:
import requests
import time

def submit_hadoop_job(resource_manager_url, job_file):
    """Obtain a new YARN application ID and submit a MapReduce application under it.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service,
            e.g. ``http://localhost:8088``.
        job_file: Local path of the job file sent in the upload request.

    Returns:
        The application ID issued by the ResourceManager.

    Raises:
        requests.RequestException: On connection failure or timeout, or when
            the new-application request or the submission itself is answered
            with a non-success HTTP status.
        OSError: If ``job_file`` cannot be opened.
    """
    request_timeout = 30  # seconds; keeps an unresponsive ResourceManager from hanging the script

    # Ask the ResourceManager for a new application ID
    submit_url = f"{resource_manager_url}/ws/v1/cluster/apps/new-application"
    response = requests.post(submit_url, timeout=request_timeout)
    response.raise_for_status()
    app_id = response.json()['application-id']

    # Upload the job file
    # NOTE(review): the ResourceManager REST API documentation lists no
    # ".../upload" resource. The call is kept as a best-effort step, but a
    # failure is now reported instead of being silently ignored.
    upload_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}/upload"
    with open(job_file, 'rb') as file:
        upload_response = requests.post(upload_url, files={'job': file}, timeout=request_timeout)
    if not upload_response.ok:
        print(f"Warning: job file upload returned HTTP {upload_response.status_code}")

    # Submit the job
    submit_job_url = f"{resource_manager_url}/ws/v1/cluster/apps"
    headers = {'Content-Type': 'application/json'}
    data = {
        "application-id": app_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": "hadoop jar job.jar"
            }
        },
        "application-type": "MAPREDUCE"
    }
    submit_response = requests.post(
        submit_job_url, headers=headers, json=data, timeout=request_timeout)
    # A rejected submission must not be reported to the caller as a success.
    submit_response.raise_for_status()

    return app_id

def monitor_job_progress(resource_manager_url, app_id, poll_interval=5):
    """Poll the ResourceManager until the application reaches a terminal state.

    The current state is printed after every poll that finds the application
    still running.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service.
        app_id: ID of the application to watch.
        poll_interval: Seconds to sleep between two polls.

    Returns:
        The terminal state: ``'FINISHED'``, ``'FAILED'`` or ``'KILLED'``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    status_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}"

    # Monitor the job progress
    while True:
        response = requests.get(status_url, timeout=30)
        response.raise_for_status()
        app_status = response.json()['app']['state']

        if app_status in {'FINISHED', 'FAILED', 'KILLED'}:
            # Returned so callers can tell success from failure.
            return app_status

        print(f"Job status: {app_status}")
        time.sleep(poll_interval)

def retrieve_job_output(resource_manager_url, app_id):
    """Print the final status the ResourceManager recorded for an application.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service.
        app_id: ID of the application to look up.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Retrieve the job output.
    # The final status is a field of the application resource itself; the
    # ResourceManager REST API has no ".../reports" sub-resource.
    output_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}"
    response = requests.get(output_url, timeout=30)
    response.raise_for_status()
    output = response.json()['app']['finalStatus']

    print("Job output:", output)


if __name__ == '__main__':
    # Entry point: submit the job, wait for it to end, then show its result.
    job_file = 'path/to/hadoop/job.jar'  # Replace with the path to your Hadoop job file
    resource_manager_url = 'http://localhost:8088'  # Replace with the ResourceManager URL

    # Submission hands back the application ID used by the later calls.
    app_id = submit_hadoop_job(resource_manager_url=resource_manager_url, job_file=job_file)
    print("Job submitted. Application ID:", app_id)

    # Blocks until the application leaves its running states.
    monitor_job_progress(resource_manager_url=resource_manager_url, app_id=app_id)
    print("Job completed.")

    retrieve_job_output(resource_manager_url=resource_manager_url, app_id=app_id)


8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.


In [None]:
import requests
import time

def submit_hadoop_job(resource_manager_url, job_file, num_containers, container_memory, container_vcores):
    """Obtain a new YARN application ID and submit a MapReduce application with resource requirements.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service,
            e.g. ``http://localhost:8088``.
        job_file: Local path of the job file sent in the upload request.
        num_containers: Accepted for interface compatibility; no request
            built here uses it.
        container_memory: Memory requirement written into the ``resource``
            sections of the requests, in MB.
        container_vcores: vCores requirement written into the ``resource``
            sections of the requests.

    Returns:
        The application ID issued by the ResourceManager.

    Raises:
        requests.RequestException: On connection failure or timeout, or when
            the new-application request or the submission itself is answered
            with a non-success HTTP status.
        OSError: If ``job_file`` cannot be opened.
    """
    request_timeout = 30  # seconds; keeps an unresponsive ResourceManager from hanging the script
    resource_spec = {
        "memory": container_memory,
        "vCores": container_vcores
    }

    # Ask the ResourceManager for a new application ID
    submit_url = f"{resource_manager_url}/ws/v1/cluster/apps/new-application"
    response = requests.post(submit_url, timeout=request_timeout)
    response.raise_for_status()
    app_id = response.json()['application-id']

    # Upload the job file
    # NOTE(review): the ResourceManager REST API documentation lists no
    # ".../upload" resource. The call is kept as a best-effort step, but a
    # failure is now reported instead of being silently ignored.
    upload_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}/upload"
    with open(job_file, 'rb') as file:
        upload_response = requests.post(upload_url, files={'job': file}, timeout=request_timeout)
    if not upload_response.ok:
        print(f"Warning: job file upload returned HTTP {upload_response.status_code}")

    # Set the resource requirements
    # NOTE(review): the documented PUT operations on an application are its
    # state, queue and priority sub-resources, so this request is presumably
    # rejected. It is kept best-effort, with the outcome reported.
    set_resources_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}"
    headers = {'Content-Type': 'application/json'}
    data = {
        "resource": dict(resource_spec),
        "application-id": app_id
    }
    resources_response = requests.put(
        set_resources_url, headers=headers, json=data, timeout=request_timeout)
    if not resources_response.ok:
        print(f"Warning: setting resources returned HTTP {resources_response.status_code}")

    # Submit the job
    submit_job_url = f"{resource_manager_url}/ws/v1/cluster/apps"
    headers = {'Content-Type': 'application/json'}
    data = {
        "application-id": app_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": "hadoop jar job.jar"
            },
            "resource": dict(resource_spec),
            "localResources": {
                "entry": [
                    {
                        "key": "job.jar",
                        "value": {
                            "resource": f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}/job"
                        }
                    }
                ]
            }
        },
        "application-type": "MAPREDUCE",
        "resource": dict(resource_spec)
    }
    submit_response = requests.post(
        submit_job_url, headers=headers, json=data, timeout=request_timeout)
    # A rejected submission must not be reported to the caller as a success.
    submit_response.raise_for_status()

    return app_id

def monitor_job_progress(resource_manager_url, app_id, poll_interval=5):
    """Poll an application until it ends, printing its state and resource usage.

    The aggregate usage figures (``memorySeconds``, ``vcoreSeconds``) are
    fields of the application resource itself. The entries returned by the
    ``appattempts`` sub-resource carry no ``resourceUsage`` section, so one
    request per poll is enough.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service.
        app_id: ID of the application to watch.
        poll_interval: Seconds to sleep between two polls.

    Returns:
        The terminal state: ``'FINISHED'``, ``'FAILED'`` or ``'KILLED'``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    status_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}"

    # Monitor the job progress
    while True:
        response = requests.get(status_url, timeout=30)
        response.raise_for_status()
        app = response.json()['app']

        app_status = app['state']
        memory_used = app['memorySeconds']
        vcores_used = app['vcoreSeconds']

        if app_status in {'FINISHED', 'FAILED', 'KILLED'}:
            break

        print(f"Job status: {app_status}")
        print(f"Memory Used: {memory_used} MB-seconds")
        print(f"vCores Used: {vcores_used} vCore-seconds")
        time.sleep(poll_interval)

    # The last poll already holds the totals of the finished application.
    print(f"Final Memory Used: {memory_used} MB-seconds")
    print(f"Final vCores Used: {vcores_used} vCore-seconds")

    return app_status


if __name__ == '__main__':
    # Entry point: describe the job and its resource needs, submit it, then watch it.
    resource_manager_url = 'http://localhost:8088'  # Replace with the ResourceManager URL
    job_file = 'path/to/hadoop/job.jar'  # Replace with the path to your Hadoop job file

    # Resource requirements passed along with the submission.
    container_vcores = 1  # Specify the vCores requirement per container
    container_memory = 1024  # Specify the memory requirement per container in MB
    num_containers = 2  # Specify the number of containers required

    app_id = submit_hadoop_job(
        resource_manager_url=resource_manager_url,
        job_file=job_file,
        num_containers=num_containers,
        container_memory=container_memory,
        container_vcores=container_vcores,
    )
    print("Job submitted. Application ID:", app_id)

    # Blocks until the application leaves its running states.
    monitor_job_progress(resource_manager_url=resource_manager_url, app_id=app_id)
    print("Job completed.")


9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import time

class WordCountJob(MRJob):
    """Single-step word-count job with a configurable maximum input split size."""

    def configure_args(self):
        """Add ``--input-split-size`` (in MB) on top of the inherited arguments."""
        super().configure_args()
        self.add_passthru_arg(
            '--input-split-size',
            type=int,
            default=64,
            help='Size of each input split in MB (default: 64)'
        )

    def mapper(self, _, line):
        """Emit ``(lower-cased token, 1)`` for each whitespace-separated token."""
        yield from ((token.lower(), 1) for token in line.split())

    def combiner(self, word, counts):
        """Emit the partial sum of the counts seen for ``word``."""
        partial_total = sum(counts)
        yield word, partial_total

    def reducer(self, word, counts):
        """Emit the summed count for ``word``."""
        total = sum(counts)
        yield word, total

    def steps(self):
        """Return the single step, with the split size converted from MB to bytes."""
        max_split_bytes = self.options.input_split_size * 1024 * 1024
        word_count_step = MRStep(
            mapper=self.mapper,
            combiner=self.combiner,
            reducer=self.reducer,
            jobconf={
                'mapreduce.input.fileinputformat.split.maxsize': str(max_split_bytes)
            }
        )
        return [word_count_step]


if __name__ == '__main__':
    # Input file path
    input_file = 'path/to/input/file.txt'

    # List of input split sizes to compare
    split_sizes = [32, 64, 128]

    for split_size in split_sizes:
        print(f"Comparing input split size: {split_size} MB")

        # Start the timer (perf_counter is monotonic, so the measured
        # duration is unaffected by system clock adjustments)
        start_time = time.perf_counter()

        # Run the WordCount job with the specified input split size.
        # MRJob.run() is a classmethod that takes no arguments and reads
        # sys.argv, so the job is built with explicit args and executed
        # through its runner instead.
        job = WordCountJob(args=[input_file, f'--input-split-size={split_size}'])
        with job.make_runner() as runner:
            runner.run()

        # Calculate and print the execution time
        execution_time = time.perf_counter() - start_time
        print(f"Execution time: {execution_time} seconds")
        print("--------------------------------------")
