### Hive Assignment

### 1.  Write a Python program that uses the HiveQL language to create a table named "Employees" with columns for "id," "name," and "salary."

In [None]:
from pyhive import hive

def create_employees_table():
    """Create the Hive table ``Employees`` (id, name, salary) if it is missing.

    Connects to HiveServer2 through PyHive, runs a single
    ``CREATE TABLE IF NOT EXISTS`` statement and prints the outcome.

    Returns:
        None. Success or failure is reported on stdout; any exception is
        caught and printed rather than propagated.
    """
    create_table_query = '''
        CREATE TABLE IF NOT EXISTS Employees (
            id INT,
            name STRING,
            salary FLOAT
        )
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(create_table_query)
            # No commit: Hive DDL takes effect immediately and PyHive's
            # Connection.commit() is documented as a no-op.
        finally:
            # Release the session even when the statement fails; an error
            # raised by close() itself is still reported by the handler below.
            conn.close()

        print("Table 'Employees' created successfully.")

    except Exception as e:
        print(f"Error while creating the table: {e}")

if __name__ == "__main__":
    create_employees_table()


### 2.  Create a Python program that retrieves records from a Hive table named "Customers" where the age is greater than 30.

In [None]:
from pyhive import hive

def retrieve_customers_with_age_greater_than_30():
    """Print every row of the Hive table ``Customers`` whose ``age`` exceeds 30.

    Returns:
        None. Rows (or a "no records" notice) are printed to stdout; any
        exception is caught and printed rather than propagated.
    """
    retrieve_query = '''
        SELECT * FROM Customers
        WHERE age > 30
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(retrieve_query)
            # Materialise the rows before the session is closed.
            result = cursor.fetchall()
        finally:
            # Release the session even when the query or the fetch fails.
            conn.close()

        if result:
            print("Customers with age greater than 30:")
            for row in result:
                print(row)
        else:
            print("No records found for customers with age greater than 30.")

    except Exception as e:
        print(f"Error while retrieving records: {e}")

if __name__ == "__main__":
    retrieve_customers_with_age_greater_than_30()


### 3.  Write a Python script that sorts records in descending order based on the "timestamp" column in a Hive table named "Logs."

In [None]:
from pyhive import hive

def sort_logs_by_timestamp_desc():
    """Print all rows of the Hive table ``Logs``, newest ``timestamp`` first.

    Returns:
        None. Rows (or a "no records" notice) are printed to stdout; any
        exception is caught and printed rather than propagated.
    """
    # `timestamp` is a reserved word in HiveQL (Hive 1.2+), so the column
    # name must be quoted with backticks to be parsed as an identifier.
    sort_query = '''
        SELECT * FROM Logs
        ORDER BY `timestamp` DESC
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(sort_query)
            # Materialise the rows before the session is closed.
            sorted_result = cursor.fetchall()
        finally:
            # Release the session even when the query or the fetch fails.
            conn.close()

        if sorted_result:
            print("Logs sorted by timestamp in descending order:")
            for row in sorted_result:
                print(row)
        else:
            print("No records found in the 'Logs' table.")

    except Exception as e:
        print(f"Error while sorting records: {e}")

if __name__ == "__main__":
    sort_logs_by_timestamp_desc()


### 4.  Write a Python program that connects to a Hive server using PyHive library and retrieves all records from a table named "Products".

In [None]:
from pyhive import hive

def retrieve_all_products():
    """Print every row of the Hive table ``Products``.

    Returns:
        None. Rows (or a "no records" notice) are printed to stdout; any
        exception is caught and printed rather than propagated.
    """
    retrieve_query = '''
        SELECT * FROM Products
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(retrieve_query)
            # Materialise the rows before the session is closed.
            result = cursor.fetchall()
        finally:
            # Release the session even when the query or the fetch fails.
            conn.close()

        if result:
            print("All records from the 'Products' table:")
            for row in result:
                print(row)
        else:
            print("No records found in the 'Products' table.")

    except Exception as e:
        print(f"Error while retrieving records: {e}")

if __name__ == "__main__":
    retrieve_all_products()


### 5.  Write a Python script that calculates the average salary of employees from a Hive table named "Employees".

In [None]:
from pyhive import hive

def calculate_average_salary():
    """Print the mean of ``salary`` over the Hive table ``Employees``.

    The aggregate is computed server-side with ``AVG(salary)`` and printed
    with two decimal places.

    Returns:
        None. The average (or a "no salary data" notice) is printed to
        stdout; any exception is caught and printed rather than propagated.
    """
    avg_salary_query = '''
        SELECT AVG(salary) as average_salary FROM Employees
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(avg_salary_query)
            result = cursor.fetchone()
        finally:
            # Release the session even when the query or the fetch fails.
            conn.close()

        # A missing row, or a None in its first column, is reported as
        # "no salary data" instead of being formatted.
        if result and result[0] is not None:
            average_salary = result[0]
            print(f"Average salary of employees: {average_salary:.2f}")
        else:
            print("No salary data found in the 'Employees' table.")

    except Exception as e:
        print(f"Error while calculating the average salary: {e}")

if __name__ == "__main__":
    calculate_average_salary()



### 6.  Implement a Python program that uses Hive partitioning to create a partitioned table named "Sales_Data" based on the "year" and "month" columns.

In [None]:
from pyhive import hive

def create_partitioned_sales_table():
    """Create the Hive table ``Sales_Data`` partitioned by ``year`` and ``month``.

    Returns:
        None. Success or failure is reported on stdout; any exception is
        caught and printed rather than propagated.
    """
    # Partition columns are declared ONLY in PARTITIONED BY. Hive exposes
    # them as columns automatically and rejects a definition that also lists
    # them among the regular columns ("Column repeated in partitioning
    # columns").
    create_table_query = '''
        CREATE TABLE IF NOT EXISTS Sales_Data (
            sale_id INT,
            sale_amount FLOAT
        )
        PARTITIONED BY (year INT, month INT)
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(create_table_query)
            # No commit: Hive DDL takes effect immediately and PyHive's
            # Connection.commit() is documented as a no-op.
        finally:
            # Release the session even when the statement fails.
            conn.close()

        print("Partitioned table 'Sales_Data' created successfully.")

    except Exception as e:
        print(f"Error while creating the partitioned table: {e}")

if __name__ == "__main__":
    create_partitioned_sales_table()


### 7.  Develop a Python script that adds a new column named "email" of type string to an existing Hive table named "Employees."

In [None]:
from pyhive import hive

def add_email_column_to_employees():
    """Add a ``STRING`` column named ``email`` to the Hive table ``Employees``.

    Returns:
        None. Success or failure is reported on stdout; any exception is
        caught and printed rather than propagated.
    """
    add_column_query = '''
        ALTER TABLE Employees
        ADD COLUMNS (email STRING)
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(add_column_query)
            # No commit: Hive DDL takes effect immediately and PyHive's
            # Connection.commit() is documented as a no-op.
        finally:
            # Release the session even when the statement fails.
            conn.close()

        print("Column 'email' added to the 'Employees' table successfully.")

    except Exception as e:
        print(f"Error while adding the column: {e}")

if __name__ == "__main__":
    add_email_column_to_employees()


### 8.  Create a Python program that performs an inner join between two Hive tables, "Orders" and "Customers," based on a common column.

In [None]:
from pyhive import hive

def perform_inner_join():
    """Print the inner join of ``Orders`` and ``Customers`` on ``customer_id``.

    Each printed row holds ``order_id``, ``order_date``, ``customer_id`` and
    ``customer_name`` in that order, as selected by the query.

    Returns:
        None. Rows (or a "no matching records" notice) are printed to
        stdout; any exception is caught and printed rather than propagated.
    """
    join_query = '''
        SELECT o.order_id, o.order_date, c.customer_id, c.customer_name
        FROM Orders o
        INNER JOIN Customers c ON o.customer_id = c.customer_id
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(join_query)
            # Materialise the rows before the session is closed.
            result = cursor.fetchall()
        finally:
            # Release the session even when the query or the fetch fails.
            conn.close()

        if result:
            print("Inner join results:")
            for row in result:
                print(row)
        else:
            print("No matching records found in the joined result.")

    except Exception as e:
        print(f"Error while performing the inner join: {e}")

if __name__ == "__main__":
    perform_inner_join()


### 9.  Implement a Python program that uses the Hive SerDe library to process JSON data stored in a Hive table named "User_Activity_Logs."

In [None]:
import json
from pyhive import hive

def process_json_data():
    """Fetch and decode the JSON payloads stored in ``User_Activity_Logs``.

    Selects the ``user_activity_json`` column, decodes each value with
    ``json.loads`` on the client and prints the resulting Python object.
    Rows whose payload is NULL or not valid JSON are reported and skipped so
    that one bad record does not stop the remaining rows from being shown.

    NOTE(review): the decoding happens client-side; the query itself does not
    reference a Hive SerDe. Whether the table is backed by a JSON SerDe cannot
    be told from here - confirm against the table definition.

    Returns:
        None. Output goes to stdout; any other exception is caught and
        printed rather than propagated.
    """
    select_query = '''
        SELECT user_activity_json
        FROM User_Activity_Logs
    '''

    try:
        conn = hive.connect(host='your_hive_server_hostname', port=10000, database='default', auth='NONE')
        try:
            cursor = conn.cursor()
            cursor.execute(select_query)
            # Materialise the rows before the session is closed.
            result = cursor.fetchall()
        finally:
            # Release the session even when the query or the fetch fails.
            conn.close()

        if result:
            print("JSON data from the 'User_Activity_Logs' table:")
            for row in result:
                json_data = row[0]
                # json.loads(None) raises TypeError, so NULL payloads are
                # skipped explicitly.
                if json_data is None:
                    print("Skipping row with NULL JSON data.")
                    continue
                try:
                    parsed_data = json.loads(json_data)
                except json.JSONDecodeError as e:
                    print(f"Skipping row with invalid JSON data: {e}")
                    continue
                print(parsed_data)
        else:
            print("No JSON data found in the 'User_Activity_Logs' table.")

    except Exception as e:
        print(f"Error while processing JSON data: {e}")

if __name__ == "__main__":
    process_json_data()
