# Install Dependencies

In [0]:
%%capture
!pip install pyspark pyarrow venv_pack

# Import Libraries

In [0]:
import os
import pyarrow as pa
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import venv_pack
import random

# Create the Spark Session: one function that packs the environment, configures YARN, and starts Spark

In [0]:
def get_spark_session(pack_venv = True):
    """
    Launches Spark Context using UMBC Big Data Cluster YARN and returns a Spark Session.

    The function optionally packs the current virtual environment into
    ``<user>.tar.gz``, points the local process at the Cloudera Hadoop/Spark
    installation through environment variables, builds a ``SparkConf`` that
    ships the packed environment to YARN, and finally creates (or reuses) a
    ``SparkSession``.

    Side effects: overwrites ``HADOOP_CONF_DIR``, ``YARN_CONF_DIR``,
    ``SPARK_HOME`` and ``PYSPARK_PYTHON`` in ``os.environ`` and, when
    ``pack_venv`` is true, (re)writes ``<user>.tar.gz`` in the current
    working directory.

    Args:
        pack_venv: When True (default) the virtual environment is packed with
            ``venv_pack`` before the session is created, replacing any existing
            archive. When False an archive from an earlier run is reused.

    Returns:
        The ``SparkSession`` returned by ``SparkSession.builder.getOrCreate()``.

    Raises:
        FileNotFoundError: If ``pack_venv`` is False and ``<user>.tar.gz`` does
            not exist in the current working directory.
    """
    import getpass

    # Resolve the user name once; fall back to getpass when USER is unset/empty
    # instead of failing with a bare KeyError.
    user = os.environ.get('USER') or getpass.getuser()
    packed_environment_file = f"{user}.tar.gz"

    # Pack Virtual Environment
    if pack_venv:
        print(f"Packing Virtual Environment: {packed_environment_file}")
        venv_pack.pack(output=packed_environment_file, force = True)
    elif not os.path.isfile(packed_environment_file):
        # Fail early with a clear message rather than deep inside YARN submission.
        raise FileNotFoundError(
            f"Packed environment '{packed_environment_file}' not found; "
            "call get_spark_session(pack_venv=True) to create it."
        )

    # Set local environment variables
    print("Setting Environment Variables")
    hadoop_conf_dir = "/opt/cloudera/parcels/CDH/lib/hadoop"
    # Path of the interpreter relative to the unpacked archive (aliased as <user> below).
    venv_python = f'{user}/bin/python3'
    os.environ['HADOOP_CONF_DIR'] = hadoop_conf_dir
    os.environ['YARN_CONF_DIR'] = hadoop_conf_dir
    os.environ['SPARK_HOME'] = "/opt/cloudera/parcels/CDH/lib/spark"
    os.environ['PYSPARK_PYTHON'] = venv_python

    # Create Spark Configuration
    print("Creating Spark Configuration")
    conf = SparkConf()
    conf.setMaster('yarn')

    # The Application Master and the executors need the same JVM settings, so
    # both are derived from a single JAVA_HOME value.
    java_home = '/usr/java/jdk1.8.0_181-cloudera'
    ld_library_path = ':'.join([
        '/opt/cloudera/parcels/CDH/lib64',
        f'{java_home}/jre/lib/amd64',
        f'{java_home}/jre/lib/amd64/server',
    ])
    for env_prefix in ('spark.yarn.appMasterEnv', 'spark.executorEnv'):
        conf.set(f'{env_prefix}.JAVA_HOME', java_home)
        conf.set(f'{env_prefix}.LD_LIBRARY_PATH', ld_library_path)

    # NOTE: the builder's appName() call below is applied after this conf, so
    # the session name is what ends up as spark.app.name.
    conf.setAppName(f'{user}_data603_spark')

    # Ship the packed environment; the "#<user>" fragment is the alias under
    # which the archive is made available to the YARN containers.
    conf.set('spark.yarn.dist.archives', f'{packed_environment_file}#{user}')

    # NOTE(review): these values are shell command strings, not executable
    # paths. They are kept unchanged to preserve current cluster behaviour --
    # confirm whether a plain interpreter path is what is actually intended.
    activate_and_run = f'source {user}/bin/activate && {venv_python}'
    for key in ('spark.pyspark.driver.python',
                'spark.yarn.appMasterEnv.PYSPARK_PYTHON',
                'spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON'):
        conf.set(key, activate_and_run)

    # Create SparkSession
    session_name = f"{user}_data603_spark_session"
    print(f"Creating Spark Session: {session_name}")
    spark = SparkSession.builder\
        .config(conf = conf)\
        .appName(session_name)\
        .getOrCreate()

    return spark