'''
Create a Comprehensive Fact Table

Given two input files:
- employee.csv with columns: employee_id, department, salary
- employee_personal.csv with columns: employee_id, first_name, last_name, DOB, state, country

Write transformations to create employee_fact with columns:
- employee_id
- employee_full_name
- department
- salary
- Salary_Diff_to_reach_highest_sal
- DOB
- state
- country
- age
'''

In [26]:
import os, sys
from datetime import date
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, first, col

# Pin the interpreter used by Spark's Python workers and by the driver to the
# one running this script, so the two sides cannot end up on different Python
# versions.  (The driver variable was previously misspelled as
# 'PYSPARK_DRVIER_PYTHON', so it was never actually picked up.)
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Shared session used by every method of ComprehensiveFact below.
# getOrCreate() hands back an already-active session when there is one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('ComprehensiveFact')
    .getOrCreate()
)


class ComprehensiveFact:
    '''Build the employee fact table from employee and personal data.

    The same transformation is implemented twice: once as a Spark SQL query
    (``employeeFactSql``) and once with the DataFrame API
    (``employeeFactSpark``).  Both return the columns employee_id,
    employee_full_name, department, salary, salary_diff, DOB, state, country
    and age.  Every method uses the module-level ``spark`` session.
    '''

    def createEmployeeData(self):
        '''Return a sample DataFrame with employee_id, department and salary.'''
        data = [
            (1, 'HR', 15000),
            (2, 'IT', 18000),
            (3, 'HR', 20000),
            (4, 'IT', 25000),
            (5, 'ADMIN', 12000)
        ]

        columns = ['employee_id', 'department', 'salary']
        return spark.createDataFrame(data, columns)

    def createPersonalData(self):
        '''Return a sample DataFrame with the personal details per employee.

        Columns: employee_id, first_name, last_name, DOB, state, country.
        '''
        data = [
            (1, 'Rohit', 'Khanna', date(1995, 12, 10), 'Delhi', 'IN'),
            (2, 'Arjun', 'Rao', date(1993, 10, 10), 'Chennai', 'IN'),
            (3, 'Kuldeep', 'Nair', date(1994, 2, 20), 'Delhi', 'IN'),
            (4, 'Viraj', 'Khaskar', date(1995, 3, 19), 'Bengalore', 'IN'),
            (5, 'Aditya', 'Paul', date(1996, 6, 12), 'Mumbai', 'IN'),
        ]

        columns = ['employee_id', 'first_name', 'last_name', 'DOB', 'state', 'country']
        return spark.createDataFrame(data, columns)

    def employeeFactSql(self, employee_df, employee_personal_df):
        '''Build the fact table with a Spark SQL query.

        Registers the inputs as the temp views ``employee`` and
        ``employee_personal`` (replacing any existing views of those names)
        and inner-joins them on employee_id.

        Args:
            employee_df: DataFrame with employee_id, department, salary.
            employee_personal_df: DataFrame with employee_id, first_name,
                last_name, DOB, state, country.

        Returns:
            DataFrame with one row per employee present in both inputs.
        '''
        employee_df.createOrReplaceTempView('employee')
        employee_personal_df.createOrReplaceTempView('employee_personal')

        # Age is the number of *completed* years: a plain
        # year(current_date()) - year(DOB) overstates the age by one for
        # anyone whose birthday has not yet occurred this year.
        query = '''
        select ep.employee_id,
        concat(ep.first_name, ' ', ep.last_name) as employee_full_name,
        e.department,
        e.salary,
        (select max(salary) from employee) - e.salary as salary_diff,
        ep.DOB,
        ep.state,
        ep.country,
        cast(floor(months_between(current_date(), ep.DOB) / 12) as int) as age
        from employee_personal ep inner join employee e
        on ep.employee_id = e.employee_id
        '''
        return spark.sql(query)

    def employeeFactSpark(self, employee_df, employee_personal_df):
        '''Build the fact table with the DataFrame API.

        Produces the same columns as ``employeeFactSql``.

        Args:
            employee_df: DataFrame with employee_id, department, salary.
            employee_personal_df: DataFrame with employee_id, first_name,
                last_name, DOB, state, country.

        Returns:
            DataFrame with one row per employee present in both inputs.
        '''
        # Imported locally under a namespace so Spark's max() is used for the
        # aggregation without shadowing Python's builtin max at module level.
        from pyspark.sql import functions as F

        # Highest salary across all employees, collected to the driver once.
        max_salary = employee_df.agg(F.max('salary')).first()[0]

        # Completed years between DOB and today (see employeeFactSql).
        age = F.floor(
            F.months_between(F.current_date(), F.col('DOB')) / 12
        ).cast('int')

        merge_df = (
            employee_personal_df.alias('ep')
            .join(employee_df.alias('e'), on='employee_id', how='inner')
            .withColumn(
                'employee_full_name',
                F.concat(F.col('ep.first_name'), F.lit(' '), F.col('ep.last_name')),
            )
            .withColumn('salary_diff', F.lit(max_salary) - F.col('e.salary'))
            .withColumn('age', age)
        )

        result_df = merge_df.select(
            'employee_id',
            'employee_full_name',
            'department',
            'salary',
            'salary_diff',
            'DOB',
            'state',
            'country',
            'age',
        )

        return result_df
    
# Build the two sample inputs and print them.
obj = ComprehensiveFact()
employee_df = obj.createEmployeeData()
employee_personal_df = obj.createPersonalData()
for source_df in (employee_df, employee_personal_df):
    source_df.show()

# Run the SQL implementation first, then the DataFrame one, printing each result.
for build_fact in (obj.employeeFactSql, obj.employeeFactSpark):
    result_df = build_fact(employee_df, employee_personal_df)
    result_df.show()

+-----------+----------+------+
|employee_id|department|salary|
+-----------+----------+------+
|          1|        HR| 15000|
|          2|        IT| 18000|
|          3|        HR| 20000|
|          4|        IT| 25000|
|          5|     ADMIN| 12000|
+-----------+----------+------+

+-----------+----------+---------+----------+---------+-------+
|employee_id|first_name|last_name|       DOB|    state|country|
+-----------+----------+---------+----------+---------+-------+
|          1|     Rohit|   Khanna|1995-12-10|    Delhi|     IN|
|          2|     Arjun|      Rao|1993-10-10|  Chennai|     IN|
|          3|   Kuldeep|     Nair|1994-02-20|    Delhi|     IN|
|          4|     Viraj|  Khaskar|1995-03-19|Bengalore|     IN|
|          5|    Aditya|     Paul|1996-06-12|   Mumbai|     IN|
+-----------+----------+---------+----------+---------+-------+

+-----------+------------------+----------+------+-----------+----------+---------+-------+---+
|employee_id|employee_full_name|depart