### Hadoop Assignment 

### 1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.

In [None]:
import xml.etree.ElementTree as ET

def read_hadoop_config(filename):
    """Collect the file-system entry-point properties from a Hadoop XML config.

    Every ``<property>`` element in the document is inspected; a property is
    kept when its name contains ``fs.defaultFS`` or ``dfs.nameservices``.

    Args:
        filename: Path of the XML configuration file (e.g. ``core-site.xml``).

    Returns:
        A list of ``(name, value)`` tuples with surrounding whitespace removed.
        The list is empty when nothing matches, when the file does not exist,
        or when it is not well-formed XML (a message is printed in the last
        two cases).
    """
    try:
        root = ET.parse(filename).getroot()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    except ET.ParseError:
        print(f"Error: Invalid XML format in '{filename}'.")
        return []

    core_components = []
    for property_elem in root.findall('.//property'):
        # findtext() yields None for a missing child and "" for an empty
        # element such as <value/>, so .strip() below can never hit None.
        name = property_elem.findtext('name')
        value = property_elem.findtext('value')
        if name is None or value is None:
            continue

        name = name.strip()
        if 'fs.defaultFS' in name or 'dfs.nameservices' in name:
            core_components.append((name, value.strip()))

    return core_components

if __name__ == "__main__":
    # Replace with the actual path to your Hadoop configuration file
    hadoop_config_file = "path/to/your/core-site.xml"
    core_components = read_hadoop_config(hadoop_config_file)

    # An empty list covers both "nothing matched" and "file could not be read".
    if not core_components:
        print("No core components found in the Hadoop configuration file.")
    else:
        print("Core Components of Hadoop:")
        for name, value in core_components:
            print(f"{name}: {value}")


### 2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.

In [None]:
from hdfs import InsecureClient

def get_hdfs_directory_size(hdfs_url, hdfs_directory):
    """Sum the sizes of the files located directly inside an HDFS directory.

    Sub-directories are not descended into; only entries whose status type is
    ``FILE`` contribute to the total.

    Args:
        hdfs_url: WebHDFS base URL of the NameNode.
        hdfs_directory: HDFS path of the directory to measure.

    Returns:
        The total size in bytes, or ``None`` when the listing fails (the error
        is printed).
    """
    try:
        hdfs_client = InsecureClient(hdfs_url)

        # With status=True the client returns (name, status_dict) pairs, not
        # plain dicts, so each entry has to be unpacked before reading
        # 'length'.
        entries = hdfs_client.list(hdfs_directory, status=True)

        return sum(
            status['length']
            for _name, status in entries
            if status['type'] == 'FILE'
        )

    except Exception as e:
        print(f"Error: {e}")
        return None

if __name__ == "__main__":
    # Replace with the HDFS NameNode URL
    hdfs_url = "http://localhost:9870"
    # Replace with the directory you want to calculate the size for
    hdfs_directory = "/path/to/hdfs_directory"

    total_size = get_hdfs_directory_size(hdfs_url, hdfs_directory)

    # None means the lookup failed and the error was already reported.
    if total_size is not None:
        print(f"Total file size in '{hdfs_directory}': {total_size} bytes")


### 3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.

In [None]:
import re
from collections import Counter
import multiprocessing

def mapper(text_chunk):
    """Map step: count the words found in one chunk of lines.

    Each line is lower-cased and split into runs of word characters.

    Args:
        text_chunk: Iterable of text lines.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    tokenizer = re.compile(r'\b\w+\b')
    return Counter(
        token
        for line in text_chunk
        for token in tokenizer.findall(line.lower())
    )

def reducer(word_counts):
    """Reduce step: merge the per-chunk word counts into a single tally.

    Args:
        word_counts: Iterable of ``Counter`` objects produced by the mappers.

    Returns:
        A new ``Counter`` holding the summed counts of every word.
    """
    totals = Counter()
    for partial in word_counts:
        totals.update(partial)
    return totals

def get_top_n_words(file_path, n):
    """Return the ``n`` most frequent words of a text file.

    The file's lines are split into chunks, each chunk is counted by
    ``mapper`` in a worker process, and the partial counts are merged by
    ``reducer``.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        n: Number of top entries to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent; empty when the file contains no lines.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    if not lines:
        return []

    # Ceiling division keeps the number of chunks at or below the CPU count;
    # max(1, ...) guards against a zero step in range() when the file has
    # fewer lines than there are CPUs.
    workers = multiprocessing.cpu_count()
    chunk_size = max(1, -(-len(lines) // workers))
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]

    # The context manager releases the worker processes even if map() raises.
    with multiprocessing.Pool() as pool:
        word_counts = pool.map(mapper, chunks)

    merged_word_count = reducer(word_counts)
    return merged_word_count.most_common(n)

if __name__ == "__main__":
    file_path = "large_text_file.txt"
    N = 10

    # A missing input file is reported through the same message as an empty
    # result instead of ending the script with a traceback.
    try:
        top_words = get_top_n_words(file_path, N)
    except FileNotFoundError:
        top_words = []

    if top_words:
        print(f"Top {N} most frequent words:")
        for word, count in top_words:
            print(f"{word}: {count}")
    else:
        print("No data or file not found.")


### 4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.

In [None]:
import requests

def check_namenode_health(namenode_url, timeout=10):
    """Read the NameNode state from its JMX endpoint.

    Args:
        namenode_url: Base HTTP URL of the NameNode web interface.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``State`` attribute of the first bean returned by the
        ``NameNodeStatus`` query, or ``None`` when the request fails or the
        response does not have the expected structure (a message is printed).
    """
    try:
        response = requests.get(
            f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus",
            timeout=timeout,  # without a timeout an unreachable host blocks forever
        )
        response.raise_for_status()
        return response.json()['beans'][0]['State']

    except requests.exceptions.RequestException as e:
        print(f"Error while connecting to the NameNode: {e}")
    except (ValueError, LookupError) as e:
        # Body was not JSON, 'beans' was empty, or an expected key was absent.
        print(f"Unexpected response from the NameNode: {e!r}")
    return None

def check_datanode_health(datanode_url, timeout=10):
    """Read the live-DataNode count from a JMX endpoint.

    NOTE(review): ``NumLiveDataNodes`` appears to be a NameNode-side metric
    (published under ``FSNamesystemState``); a DataNode's ``FSDatasetState``
    bean probably does not carry it, in which case this function reports an
    unexpected response and returns ``None``. Confirm which endpoint/bean is
    intended before relying on the result.

    Args:
        datanode_url: Base HTTP URL of the node's web interface.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``NumLiveDataNodes`` attribute of the first returned bean, or
        ``None`` when the request fails or the attribute is not present (a
        message is printed).
    """
    try:
        response = requests.get(
            f"{datanode_url}/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState",
            timeout=timeout,  # without a timeout an unreachable host blocks forever
        )
        response.raise_for_status()
        return response.json()['beans'][0]['NumLiveDataNodes']

    except requests.exceptions.RequestException as e:
        print(f"Error while connecting to the DataNode: {e}")
    except (ValueError, LookupError) as e:
        # Body was not JSON, 'beans' was empty, or the attribute was absent.
        print(f"Unexpected response from the DataNode: {e!r}")
    return None

if __name__ == "__main__":
    # Replace with your Hadoop NameNode URL
    hadoop_namenode_url = "http://namenode-hostname:50070"
    # Replace with your Hadoop DataNode URL
    hadoop_datanode_url = "http://datanode-hostname:50075"

    # Each check returns None after printing its own error, so only
    # successful results are reported here.
    namenode_state = check_namenode_health(hadoop_namenode_url)
    if namenode_state is not None:
        print(f"NameNode State: {namenode_state}")

    live_datanodes = check_datanode_health(hadoop_datanode_url)
    if live_datanodes is not None:
        print(f"Live DataNodes: {live_datanodes}")


### 5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
from hdfs import InsecureClient

def list_hdfs_path(hdfs_url, hdfs_path):
    """Print every file and directory found directly under an HDFS path.

    Args:
        hdfs_url: WebHDFS base URL of the NameNode.
        hdfs_path: HDFS directory whose entries are listed.

    Returns:
        None. The listing (or an error message) is written to stdout.
    """
    try:
        hdfs_client = InsecureClient(hdfs_url)

        # With status=True the client returns (name, status_dict) pairs; the
        # name is relative to hdfs_path.
        entries = hdfs_client.list(hdfs_path, status=True)

        print(f"Files and Directories in HDFS path '{hdfs_path}':")
        parent = hdfs_path.rstrip('/')
        for name, status in entries:
            file_type = "File" if status['type'] == 'FILE' else "Directory"
            print(f"{file_type}: {parent}/{name}")

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    # Replace with the HDFS NameNode URL
    hdfs_url = "http://localhost:9870"
    # Replace with the specific HDFS path you want to list
    hdfs_path = "/path/to/hdfs_directory"

    # The function prints the listing itself; there is no return value to use.
    list_hdfs_path(hdfs_url, hdfs_path)


### 6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [None]:
import requests

def get_datanodes_info(hadoop_datanode_url, timeout=10):
    """Fetch the ``DataNodeInfo`` JMX bean of a DataNode.

    Args:
        hadoop_datanode_url: Base HTTP URL of the DataNode web interface.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first bean of the JMX response as a dict, or ``None`` when the
        request fails or the response contains no bean (a message is printed).
    """
    try:
        response = requests.get(
            f"{hadoop_datanode_url}/jmx?qry=Hadoop:service=DataNode,name=DataNodeInfo",
            timeout=timeout,  # without a timeout an unreachable host blocks forever
        )
        response.raise_for_status()
        return response.json()['beans'][0]

    except requests.exceptions.RequestException as e:
        print(f"Error while connecting to the DataNode: {e}")
    except (ValueError, LookupError) as e:
        # Body was not JSON or the 'beans' list was missing/empty.
        print(f"Unexpected response from the DataNode: {e!r}")
    return None

def analyze_storage_utilization(datanode_info):
    """Derive total, used and remaining capacity from a DataNode info mapping.

    NOTE(review): assumes the mapping has a ``'StorageInfo'`` entry holding
    numeric ``'capacity'`` and ``'remaining'`` values - confirm against the
    actual JMX output.

    Args:
        datanode_info: Mapping describing one DataNode, or a falsy value.

    Returns:
        ``(total_capacity, used_capacity, remaining_capacity)``, where used is
        total minus remaining; ``None`` when ``datanode_info`` is falsy.

    Raises:
        KeyError: If an expected key is absent.
    """
    if not datanode_info:
        return None

    storage = datanode_info['StorageInfo']
    total = storage['capacity']
    remaining = storage['remaining']
    return total, total - remaining, remaining

if __name__ == "__main__":
    hadoop_datanode_url = "http://datanode-hostname:50075"  # Replace with your Hadoop DataNode URL

    datanode_info = get_datanodes_info(hadoop_datanode_url)

    if datanode_info:
        # A bean without the expected storage fields is reported as an
        # analysis failure rather than crashing the script.
        try:
            utilization = analyze_storage_utilization(datanode_info)
        except (KeyError, TypeError):
            utilization = None

        # Compare against None, not truthiness: 0 bytes used or 0 bytes
        # remaining are valid readings and must still be displayed.
        if utilization is not None:
            total_capacity, used_capacity, remaining_capacity = utilization
            print("DataNode Storage Utilization:")
            print(f"Total Capacity: {total_capacity} bytes")
            print(f"Used Capacity: {used_capacity} bytes")
            print(f"Remaining Capacity: {remaining_capacity} bytes")
        else:
            print("Failed to analyze storage utilization.")
    else:
        print("No data or DataNode not found.")


### 7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.

In [None]:
import requests
from hdfs import InsecureClient

def submit_hadoop_job(resourcemanager_url, job_name, jar_path, main_class, input_path, output_path):
    """Submit a Hadoop job through the YARN ResourceManager REST API.

    Submission is a two-step exchange: the ResourceManager is first asked for
    a fresh application ID, then the application description is posted under
    that ID.

    Args:
        resourcemanager_url: Base HTTP URL of the ResourceManager.
        job_name: Display name of the application.
        jar_path: Path of the job JAR passed to ``yarn jar``.
        main_class: Fully qualified main class of the job.
        input_path: Input path argument of the job.
        output_path: Output path argument of the job.

    Returns:
        The application ID assigned by the ResourceManager, or ``None`` when
        the submission is rejected or the ResourceManager cannot be reached
        (a message is printed).
    """
    apps_url = f"{resourcemanager_url}/ws/v1/cluster/apps"
    headers = {"Content-Type": "application/json"}

    try:
        # Step 1: application IDs are allocated by the ResourceManager and
        # cannot be invented by the client.
        new_app = requests.post(f"{apps_url}/new-application", timeout=30)
        if new_app.status_code != 200:
            print(f"Failed to submit Hadoop job. Status Code: {new_app.status_code}")
            return None
        application_id = new_app.json()["application-id"]

        # Step 2: describe the application using the hyphenated field names
        # of the submission API.
        payload = {
            "application-id": application_id,
            "application-name": job_name,
            "am-container-spec": {
                "commands": {
                    "command": f"yarn jar {jar_path} {main_class} {input_path} {output_path}"
                }
            },
            "max-app-attempts": 1,
            "resource": {
                "vCores": 1,
                "memory": 512
            }
        }
        response = requests.post(apps_url, json=payload, headers=headers, timeout=30)

    except requests.exceptions.RequestException as e:
        print(f"Error while connecting to the ResourceManager: {e}")
        return None
    except (ValueError, LookupError) as e:
        print(f"Unexpected response from the ResourceManager: {e!r}")
        return None

    # A successful submission is acknowledged with 202 and an empty body, so
    # the body must not be parsed as JSON.
    if response.status_code == 202:
        print("Hadoop job submitted successfully.")
        return application_id

    print(f"Failed to submit Hadoop job. Status Code: {response.status_code}")
    return None

def monitor_job_progress(resourcemanager_url, application_id, timeout=10):
    """Look up the current state of a YARN application.

    Args:
        resourcemanager_url: Base HTTP URL of the ResourceManager.
        application_id: ID of the application to query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value of ``app.state`` from the ResourceManager's response, or
        ``None`` when the request fails or the response lacks that field (a
        message is printed).
    """
    try:
        response = requests.get(
            f"{resourcemanager_url}/ws/v1/cluster/apps/{application_id}",
            timeout=timeout,  # a hung ResourceManager must not block the caller forever
        )
        response.raise_for_status()
        return response.json()['app']['state']

    except requests.exceptions.RequestException as e:
        print(f"Error while monitoring job progress: {e}")
    except (ValueError, LookupError) as e:
        # Body was not JSON or did not contain app/state.
        print(f"Unexpected response while monitoring job progress: {e!r}")
    return None

def download_output_files(hdfs_url, output_path, local_output_dir):
    """Copy a job's output from HDFS to the local file system.

    Args:
        hdfs_url: WebHDFS base URL of the NameNode.
        output_path: HDFS path to download.
        local_output_dir: Local destination path.

    Returns:
        None. Any failure is caught and reported on stdout.
    """
    try:
        InsecureClient(hdfs_url).download(output_path, local_output_dir)
    except Exception as e:
        print(f"Error while downloading output files from HDFS: {e}")

if __name__ == "__main__":
    # Imported here because this cell's import list does not include time.
    import time

    resourcemanager_url = "http://resourcemanager-hostname:8088"  # Replace with your YARN ResourceManager URL
    hdfs_url = "http://namenode-hostname:9870"  # Replace with your HDFS NameNode URL

    job_name = "MyHadoopJob"
    jar_path = "/path/to/hadoop_job.jar"  # Replace with the path to your Hadoop job JAR file
    main_class = "com.example.HadoopJobMain"  # Replace with the main class of your Hadoop job
    input_path = "/path/to/input_data"  # Replace with the HDFS path of your input data
    output_path = "/path/to/output_data"  # Replace with the HDFS path where you want to save the output

    application_id = submit_hadoop_job(resourcemanager_url, job_name, jar_path, main_class, input_path, output_path)

    if application_id:
        while True:
            job_state = monitor_job_progress(resourcemanager_url, application_id)
            if job_state == "FINISHED":
                print("Hadoop job completed successfully.")
                break
            elif job_state in ["FAILED", "KILLED", "UNKNOWN"]:
                print(f"Hadoop job failed or terminated. Job State: {job_state}")
                break
            elif job_state is None:
                # The status lookup failed and already printed its error;
                # stop instead of retrying forever.
                break

            # Pause between polls so the ResourceManager is not flooded.
            time.sleep(10)

        # Only a finished job has complete output worth downloading.
        if job_state == "FINISHED":
            download_output_files(hdfs_url, output_path, "local_output_directory")


### 8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.

In [None]:
import requests
import time

def submit_hadoop_job(resourcemanager_url, job_name, jar_path, main_class, input_path, output_path, num_containers, container_memory, container_vcores):
    """Submit a Hadoop job with explicit resource requirements via YARN's REST API.

    Submission is a two-step exchange: the ResourceManager is first asked for
    a fresh application ID, then the application description is posted under
    that ID.

    Args:
        resourcemanager_url: Base HTTP URL of the ResourceManager.
        job_name: Display name of the application.
        jar_path: Path of the job JAR passed to ``yarn jar``.
        main_class: Fully qualified main class of the job.
        input_path: Input path argument of the job.
        output_path: Output path argument of the job.
        num_containers: Accepted for interface compatibility but not sent;
            the submission request has no container-count field (only the
            application master's own resources are described).
        container_memory: Memory in MB requested for the application master
            container.
        container_vcores: Virtual cores requested for the application master
            container.

    Returns:
        The application ID assigned by the ResourceManager, or ``None`` when
        the submission is rejected or the ResourceManager cannot be reached
        (a message is printed).
    """
    apps_url = f"{resourcemanager_url}/ws/v1/cluster/apps"
    headers = {"Content-Type": "application/json"}

    try:
        # Step 1: application IDs are allocated by the ResourceManager and
        # cannot be invented by the client.
        new_app = requests.post(f"{apps_url}/new-application", timeout=30)
        if new_app.status_code != 200:
            print(f"Failed to submit Hadoop job. Status Code: {new_app.status_code}")
            return None
        application_id = new_app.json()["application-id"]

        # Step 2: describe the application using the hyphenated field names
        # of the submission API.
        payload = {
            "application-id": application_id,
            "application-name": job_name,
            "am-container-spec": {
                "commands": {
                    "command": f"yarn jar {jar_path} {main_class} {input_path} {output_path}"
                }
            },
            "max-app-attempts": 1,
            "resource": {
                "vCores": container_vcores,
                "memory": container_memory
            },
            "application-type": "YARN",
            "queue": "default"
        }
        response = requests.post(apps_url, json=payload, headers=headers, timeout=30)

    except requests.exceptions.RequestException as e:
        print(f"Error while connecting to the ResourceManager: {e}")
        return None
    except (ValueError, LookupError) as e:
        print(f"Unexpected response from the ResourceManager: {e!r}")
        return None

    # A successful submission is acknowledged with 202 and an empty body, so
    # the body must not be parsed as JSON.
    if response.status_code == 202:
        print("Hadoop job submitted successfully.")
        return application_id

    print(f"Failed to submit Hadoop job. Status Code: {response.status_code}")
    return None

def get_job_status(resourcemanager_url, application_id, timeout=10):
    """Look up the current state of a YARN application.

    Args:
        resourcemanager_url: Base HTTP URL of the ResourceManager.
        application_id: ID of the application to query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value of ``app.state`` from the ResourceManager's response, or
        ``None`` when the request fails or the response lacks that field (a
        message is printed).
    """
    try:
        response = requests.get(
            f"{resourcemanager_url}/ws/v1/cluster/apps/{application_id}",
            timeout=timeout,  # a hung ResourceManager must not block the caller forever
        )
        response.raise_for_status()
        return response.json()['app']['state']

    except requests.exceptions.RequestException as e:
        print(f"Error while getting job status: {e}")
    except (ValueError, LookupError) as e:
        # Body was not JSON or did not contain app/state.
        print(f"Unexpected response while getting job status: {e!r}")
    return None

if __name__ == "__main__":
    resourcemanager_url = "http://resourcemanager-hostname:8088"  # Replace with your YARN ResourceManager URL

    job_name = "MyHadoopJob"
    jar_path = "/path/to/hadoop_job.jar"  # Replace with the path to your Hadoop job JAR file
    main_class = "com.example.HadoopJobMain"  # Replace with the main class of your Hadoop job
    input_path = "/path/to/input_data"  # Replace with the HDFS path of your input data
    output_path = "/path/to/output_data"  # Replace with the HDFS path where you want to save the output
    num_containers = 1
    container_memory = 1024  # Memory in MB for each container
    container_vcores = 1

    application_id = submit_hadoop_job(resourcemanager_url, job_name, jar_path, main_class, input_path, output_path, num_containers, container_memory, container_vcores)

    if application_id:
        while True:
            job_status = get_job_status(resourcemanager_url, application_id)
            print(f"Job Status: {job_status}")

            # None means the status lookup itself failed (its error was
            # already printed); stop rather than poll forever.
            if job_status is None or job_status in ["FINISHED", "FAILED", "KILLED"]:
                break

            time.sleep(10)  # Wait for 10 seconds before checking the status again


### 9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.

In [None]:
import subprocess
import time

def submit_mapreduce_job(input_path, split_size):
    """Run the MapReduce job with a fixed input split size and time it.

    Both the minimum and the maximum split size are set to ``split_size``.
    Output is written to ``output_<split_size>`` so that runs with different
    split sizes do not collide on an already existing output directory.

    NOTE(review): the ``-D`` options are placed before the job's own
    arguments, where Hadoop's generic option parsing expects them; they only
    take effect if the job's main class uses that parsing - confirm.

    Args:
        input_path: Input path handed to the job.
        split_size: Split size in bytes.

    Returns:
        Wall-clock duration of the ``hadoop jar`` command in seconds, or
        ``None`` when the command cannot be started or exits with a non-zero
        status (a message is printed).
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters from being re-interpreted.
    cmd = [
        "hadoop", "jar", "path/to/hadoop_job.jar", "com.example.HadoopJobMain",
        "-D", f"mapreduce.input.fileinputformat.split.minsize={split_size}",
        "-D", f"mapreduce.input.fileinputformat.split.maxsize={split_size}",
        str(input_path),
        f"output_{split_size}",
    ]

    try:
        start_time = time.perf_counter()
        subprocess.run(cmd, check=True)
        return time.perf_counter() - start_time

    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing `hadoop` executable, which the shell
        # previously reported as a non-zero exit status.
        print(f"Error while submitting the MapReduce job: {e}")
        return None

if __name__ == "__main__":
    # Replace with the HDFS path of your input data
    input_path = "/path/to/input_data"
    # Different input split sizes in bytes (64, 128 and 256 MiB)
    split_sizes = [mib * 1024 * 1024 for mib in (64, 128, 256)]

    for split_size in split_sizes:
        print(f"Running job with split size: {split_size} bytes")
        execution_time = submit_mapreduce_job(input_path, split_size)

        # A failed run has already reported its error; move on to the next size.
        if execution_time is None:
            continue
        print(f"Execution Time: {execution_time:.2f} seconds\n")
