In [None]:
#1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.


In [None]:
import configparser

import os
import xml.etree.ElementTree as ET

# Specify the path to the Hadoop configuration file (typically core-site.xml)
config_file_path = '/path/to/hadoop/config/file.xml'

# Property names reported as the "core components" of the installation
core_property_names = ('fs.defaultFS', 'hadoop.tmp.dir')


def read_hadoop_properties(path):
    """Parse a Hadoop ``*-site.xml`` file into a ``{name: value}`` dict.

    Hadoop configuration files are XML documents of the form
    ``<configuration><property><name>..</name><value>..</value></property>``,
    which ``configparser`` (an INI parser) cannot read.

    Args:
        path: Filesystem path of the XML configuration file.

    Returns:
        dict mapping each property name to its value (whitespace stripped;
        a property without a ``<value>`` element maps to ``''``).

    Raises:
        OSError: if the file cannot be opened.
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    properties = {}
    for prop in ET.parse(path).getroot().iter('property'):
        name = prop.findtext('name')
        if name:
            properties[name.strip()] = (prop.findtext('value') or '').strip()
    return properties


# Extract and display the core components of Hadoop
print("Core Components of Hadoop:")
if os.path.isfile(config_file_path):
    hadoop_properties = read_hadoop_properties(config_file_path)
    core_components = tuple(hadoop_properties.get(name) for name in core_property_names)
    for name, component in zip(core_property_names, core_components):
        # A property absent from the file must not crash the string concatenation
        print("- " + (component if component is not None else name + " (not set)"))
else:
    core_components = ()
    print("- configuration file not found: " + config_file_path)


In [None]:
2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.


In [None]:
from hdfs import InsecureClient

def calculate_directory_size(hdfs_url, hdfs_path, recursive=False, client=None):
    """Return the combined size, in bytes, of the files in an HDFS directory.

    Args:
        hdfs_url: WebHDFS URL of the NameNode; used to build an
            ``InsecureClient`` when ``client`` is not supplied.
        hdfs_path: HDFS directory whose files are summed.
        recursive: When True, files in nested sub-directories are included.
            Defaults to False (only direct children are counted).
        client: Optional pre-built client exposing ``list(path, status=True)``;
            lets callers reuse a connection or substitute a stub.

    Returns:
        int: total of the ``length`` field of every entry of type ``FILE``.
    """
    import posixpath  # HDFS paths always use forward slashes, whatever the local OS

    if client is None:
        client = InsecureClient(hdfs_url)

    total_size = 0
    # With status=True each entry is a (name, status_dict) pair, not a bare dict
    for name, status in client.list(hdfs_path, status=True):
        if status['type'] == 'FILE':
            total_size += status['length']
        elif recursive and status['type'] == 'DIRECTORY':
            total_size += calculate_directory_size(
                hdfs_url, posixpath.join(hdfs_path, name),
                recursive=True, client=client)

    return total_size

# Example usage
# NOTE(review): 50070 is the Hadoop 2.x NameNode web port; Hadoop 3.x uses 9870 - confirm for your cluster
hdfs_url = 'http://localhost:50070'  # Replace with your HDFS NameNode URL
hdfs_path = '/path/to/hdfs/directory'  # Replace with your HDFS directory path

# Actually invoke the function so the example produces a result
total_size = calculate_directory_size(hdfs_url, hdfs_path)
print("Total file size in", hdfs_path, "is", total_size, "bytes")


In [None]:
3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import heapq


class TopNWords(MRJob):
    """Two-step MapReduce job that emits the N most frequent words.

    Step 1 counts words and lets every reducer task keep only its local top N
    candidates. Step 2 funnels all candidates to one key so that a single
    reducer can pick the global top N; without it, a run with several reducer
    tasks would emit up to N results per task.

    The limit is read from the module-level name ``N``, which must be defined
    before the job runs.
    """

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated, lower-cased token."""
        for word in line.strip().lower().split():
            yield word, 1

    def combiner(self, word, counts):
        """Pre-aggregate the counts of one word on the mapper side."""
        yield word, sum(counts)

    def reducer_init(self):
        """Start each reducer task with an empty min-heap of (count, word)."""
        self.topN = []

    def reducer(self, word, counts):
        """Record the word's total and keep the heap at no more than N entries."""
        heapq.heappush(self.topN, (sum(counts), word))
        if len(self.topN) > N:
            # The heap root is the smallest count, i.e. the weakest candidate
            heapq.heappop(self.topN)

    def reducer_final(self):
        """Send this task's local candidates to one shared key for step 2."""
        for count, word in self.topN:
            yield None, (count, word)

    def reducer_top_n(self, _, count_word_pairs):
        """Pick the global top N from all candidates, highest count first."""
        for count, word in heapq.nlargest(N, count_word_pairs):
            yield word, count

    def steps(self):
        """Word count with local top-N, followed by the global top-N selection."""
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer_init=self.reducer_init,
                   reducer=self.reducer,
                   reducer_final=self.reducer_final),
            MRStep(reducer=self.reducer_top_n)
        ]


# Example usage
N = 10  # Specify the number of top frequent words you want to extract
input_file = 'path/to/large_text_file.txt'  # Replace with the path to your large text file

if __name__ == '__main__':
    # MRJob.run() accepts no arguments (it reads sys.argv), so build the job
    # with explicit args and drive it through a runner instead.
    job = TopNWords(args=[input_file])
    with job.make_runner() as runner:
        runner.run()
        for word, count in job.parse_output(runner.cat_output()):
            print(word, count)


In [None]:
4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
from collections import Counter
import heapq


class TopNWords(MRJob):
    """Two-step MapReduce job that emits the ``--top-n`` most frequent words.

    Step 1 counts words and lets every reducer task keep only its local top-N
    candidates. Step 2 funnels all candidates to one key so that a single
    reducer can pick the global top N; without it, a run with several reducer
    tasks would emit up to N results per task.

    NOTE(review): the exercise heading above this cell asks for a
    NameNode/DataNode health check; this class is a parameterised variant of
    the word-count job from exercise 3 - confirm which was intended.
    """

    def configure_args(self):
        """Register the ``-n/--top-n`` option (default 10) as a pass-through arg."""
        super(TopNWords, self).configure_args()
        self.add_passthru_arg('-n', '--top-n', default=10, type=int, help='Number of top frequent words')

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated, lower-cased token."""
        for word in line.strip().lower().split():
            yield word, 1

    def combiner(self, word, counts):
        """Pre-aggregate the counts of one word on the mapper side."""
        yield word, sum(counts)

    def reducer_init(self):
        """Start each reducer task with an empty min-heap of (count, word)."""
        self.topN = []

    def reducer(self, word, counts):
        """Record the word's total and keep the heap at no more than top_n entries."""
        heapq.heappush(self.topN, (sum(counts), word))
        if len(self.topN) > self.options.top_n:
            # The heap root is the smallest count, i.e. the weakest candidate
            heapq.heappop(self.topN)

    def reducer_final(self):
        """Send this task's local candidates to one shared key for step 2."""
        for count, word in self.topN:
            yield None, (count, word)

    def reducer_top_n(self, _, count_word_pairs):
        """Pick the global top N from all candidates, highest count first."""
        for count, word in heapq.nlargest(self.options.top_n, count_word_pairs):
            yield word, count

    def steps(self):
        """Word count with local top-N, followed by the global top-N selection."""
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer_init=self.reducer_init,
                   reducer=self.reducer,
                   reducer_final=self.reducer_final),
            MRStep(reducer=self.reducer_top_n)
        ]


# Example usage
# Entry point when the cell/script is executed directly: runs the TopNWords job.
# NOTE(review): presumably MRJob.run() takes the input path(s) and the
# -n/--top-n option registered in configure_args from the command line - confirm.
if __name__ == '__main__':
    TopNWords.run()


In [None]:
5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
import pyarrow.hdfs as hdfs

# Specify the HDFS path
hdfs_path = '/path/to/hdfs/directory'  # Replace with your HDFS path

# Connect to HDFS
hdfs_client = hdfs.connect(host='localhost', port=9000)  # Replace with your HDFS NameNode host and port

try:
    # List files and directories.
    # detail=True is required: by default ls() returns plain path strings,
    # which cannot be indexed with ['name'].
    file_info = hdfs_client.ls(hdfs_path, detail=True)

    # Print file and directory names
    print("Files and Directories in HDFS Path:", hdfs_path)
    for info in file_info:
        print(info['name'])
finally:
    # Close HDFS connection even when the listing fails
    hdfs_client.close()


In [None]:
6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.


In [None]:
from hdfs import InsecureClient

# NOTE(review): the exercise heading above this cell asks for DataNode storage
# utilisation; this cell only lists an HDFS directory - confirm which was intended.

# Specify the HDFS URL and path
hdfs_url = 'http://localhost:50070'  # Replace with your HDFS NameNode URL
hdfs_path = '/path/to/hdfs/directory'  # Replace with your HDFS path

# Create an HDFS client
client = InsecureClient(hdfs_url)

# List files and directories; with status=True each entry is a (name, status) pair
file_info = client.list(hdfs_path, status=True)

# Print file and directory names
print("Files and Directories in HDFS Path:", hdfs_path)
for name, status in file_info:
    print(name)


In [None]:
7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.


In [None]:
import requests
import time

# YARN ResourceManager URL
resource_manager_url = 'http://<resource-manager-host>:8088'
request_timeout = 30  # seconds; keeps a dead ResourceManager from hanging the cell

# Ask the ResourceManager for an application ID; IDs are allocated by the
# cluster and cannot be invented by the client.
new_application_url = f'{resource_manager_url}/ws/v1/cluster/apps/new-application'
response = requests.post(new_application_url, timeout=request_timeout)
response.raise_for_status()
application_id = response.json()['application-id']

# Submit a Hadoop job
job_submission_url = f'{resource_manager_url}/ws/v1/cluster/apps'
headers = {'Content-Type': 'application/json'}
data = {
    'application-id': application_id,
    'application-name': 'MyHadoopJob',
    'am-container-spec': {
        'commands': {
            'command': 'hadoop jar my_job.jar input_path output_path'
        }
    },
    'application-type': 'MAPREDUCE'
}
response = requests.post(job_submission_url, headers=headers, json=data, timeout=request_timeout)

# A successful submission is answered with 202 Accepted and an empty body
if response.status_code != 202:
    print('Failed to submit Hadoop job.')
    print('Response:', response.text)
    raise SystemExit(1)
print('Hadoop job submitted successfully.')
print('Application ID:', application_id)

# Monitor the job until it reaches a terminal state
job_status_url = f'{resource_manager_url}/ws/v1/cluster/apps/{application_id}'
terminal_states = ('FINISHED', 'FAILED', 'KILLED')
while True:
    response = requests.get(job_status_url, timeout=request_timeout)
    response.raise_for_status()
    app_info = response.json()['app']
    print('Job status:', app_info['state'], '- progress:', app_info.get('progress'))
    if app_info['state'] in terminal_states:
        break
    time.sleep(5)

# Report the outcome; the job's data output itself is written by the job to
# its output_path and is not returned by the ResourceManager.
print('Final status:', app_info.get('finalStatus'))
print('Tracking URL:', app_info.get('trackingUrl'))


In [None]:
8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.


In [None]:
import requests
import time

# YARN ResourceManager URL
resource_manager_url = 'http://<resource-manager-host>:8088'
request_timeout = 30  # seconds; keeps a dead ResourceManager from hanging the cell

# Ask the ResourceManager for an application ID; IDs are allocated by the
# cluster and cannot be invented by the client.
new_application_url = f'{resource_manager_url}/ws/v1/cluster/apps/new-application'
response = requests.post(new_application_url, timeout=request_timeout)
if response.status_code != 200:
    print('Failed to submit Hadoop job.')
    print('Response:', response.text)
    raise SystemExit(1)
application_id = response.json()['application-id']

# Submit a Hadoop job
job_submission_url = f'{resource_manager_url}/ws/v1/cluster/apps'
headers = {'Content-Type': 'application/json'}
data = {
    'application-id': application_id,
    'application-name': 'MyHadoopJob',
    'am-container-spec': {
        'commands': {
            'command': 'hadoop jar my_job.jar input_path output_path'
        }
    },
    # 'resource' is a top-level field of the submission body, not part of
    # 'am-container-spec'
    'resource': {
        'memory': 2048,  # Specify the desired memory in MB
        'vCores': 2      # Specify the desired number of virtual cores
    },
    'application-type': 'MAPREDUCE'
}
response = requests.post(job_submission_url, headers=headers, json=data, timeout=request_timeout)
if response.status_code == 202:
    # 202 Accepted carries no JSON body, so report the ID obtained above
    print('Hadoop job submitted successfully.')
    print('Application ID:', application_id)
else:
    print('Failed to submit Hadoop job.')
    print('Response:', response.text)
    raise SystemExit(1)

# Monitor job progress and track resource usage
job_status_url = f'{resource_manager_url}/ws/v1/cluster/apps/{application_id}'
while True:
    response = requests.get(job_status_url, timeout=request_timeout)
    if response.status_code != 200:
        print('Failed to get job status.')
        print('Response:', response.text)
        raise SystemExit(1)

    app_info = response.json()['app']
    status = app_info['state']
    final_status = app_info.get('finalStatus')
    tracking_url = app_info.get('trackingUrl')
    # The app object reports resources as flat fields; .get() tolerates
    # ResourceManager versions that omit some of them.
    allocated_resources = {
        'memoryMB': app_info.get('allocatedMB'),
        'vCores': app_info.get('allocatedVCores'),
        'containers': app_info.get('runningContainers'),
    }
    used_resources = {
        'memorySeconds': app_info.get('memorySeconds'),
        'vcoreSeconds': app_info.get('vcoreSeconds'),
    }

    if status == 'FINISHED':
        # FINISHED only means the application ended; finalStatus says how
        if final_status in (None, 'SUCCEEDED'):
            print('Hadoop job completed successfully.')
            break
        print('Hadoop job failed or killed.')
        print('Final status:', final_status)
        raise SystemExit(1)
    elif status == 'FAILED' or status == 'KILLED':
        print('Hadoop job failed or killed.')
        print('Final status:', status)
        raise SystemExit(1)
    else:
        print('Job status:', status)
        print('Tracking URL:', tracking_url)
        print('Allocated Resources:', allocated_resources)
        print('Used Resources:', used_resources)
    time.sleep(5)


In [None]:
9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.



In [None]:
import subprocess
import time

# Define the Hadoop streaming jar file
hadoop_streaming_jar = '/path/to/hadoop-streaming.jar'  # Replace with the actual path to the Hadoop streaming jar file

# Define the input file path
input_file = '/path/to/input/file'  # Replace with the actual path to the input file

# Define the output directory path
output_dir = '/path/to/output/directory'  # Replace with the actual path to the output directory

# Define the different input split sizes to test
split_sizes = [64, 128, 256]  # Replace with the desired split sizes in MB

# Run the MapReduce job for each split size
for split_size in split_sizes:
    split_bytes = str(split_size * 1024 * 1024)

    # Pin both bounds: the effective split is max(minsize, min(maxsize, blocksize)),
    # so maxsize alone can never produce splits larger than the HDFS block size.
    config_cmd = [
        'hadoop', 'jar', hadoop_streaming_jar,
        '-D', 'mapreduce.input.fileinputformat.split.minsize=' + split_bytes,
        '-D', 'mapreduce.input.fileinputformat.split.maxsize=' + split_bytes,
        '-input', input_file,
        '-output', output_dir + '_' + str(split_size),
        '-mapper', 'mapper.py',
        '-reducer', 'reducer.py',
        '-file', 'mapper.py',
        '-file', 'reducer.py',
    ]

    # Execute the MapReduce job; perf_counter is monotonic, unlike time.time
    start_time = time.perf_counter()
    return_code = subprocess.call(config_cmd)
    end_time = time.perf_counter()

    # Calculate and print the execution time
    execution_time = end_time - start_time
    if return_code != 0:
        # A failed run's duration is not a valid data point for the comparison
        print(f'Split Size: {split_size} MB, job failed with exit code {return_code} after {execution_time} seconds')
        continue
    print(f'Split Size: {split_size} MB, Execution Time: {execution_time} seconds')
