In [None]:
import datetime
import getpass
import os
from datetime import date
from datetime import timedelta

import pandas as pd
import requests
import snowflake
import sqlalchemy
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from sqlalchemy import create_engine

# Lift pandas' display limits so wide/long frames are rendered without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Use your own credentials.
# They are read from the environment variables SBD_USERNAME, SBD_USEREMAIL and
# SBD_PASSWORD so that real secrets never have to be typed into (and saved with)
# the notebook. When a variable is not set the original placeholder is used, so
# editing the fallback values in place keeps working as before.
username = os.environ.get('SBD_USERNAME', 'username')
useremail = os.environ.get('SBD_USEREMAIL', 'email')
password = os.environ.get('SBD_PASSWORD', 'SBDpassword')

## Snowflake connection helpers (the code below links to the Snowflake database; do not change it)

In [None]:
def get_oauth_token(svc_username=None, svc_password=None, timeout=30):
    """
    Retrieve the OAuth access token for SBD Snowflake from the SSO endpoint.

    Credentials passed as arguments are used as-is. Anything that is missing is
    taken from the environment variables ``service_account_id`` /
    ``service_account_password``; if those are not set either, the value is
    requested through an interactive prompt (the password prompt does not echo).

    Keywords:
    svc_username -- Username for which the token should be provided.
    svc_password -- Password corresponding to that user.
    timeout -- Seconds to wait for the SSO endpoint before giving up (default 30).

    Returns:
    The ``access_token`` string found in the JSON response.

    Raises:
    requests.HTTPError -- the SSO endpoint answered with an error status.
    requests.Timeout -- the SSO endpoint did not answer within ``timeout``.
    KeyError -- the JSON response carries no ``access_token`` entry.

    Example usage:
    # Explicit credentials.
    access_token = get_oauth_token("my_user", "my_password")
    # Credentials from the environment or, failing that, from a prompt.
    access_token = get_oauth_token()
    """

    username = svc_username
    password = svc_password
    if not (username and password):
        print("Username and password isn't provided. Requesting user...")
        # Keep whatever the caller did supply; only fill in what is missing.
        username = (
            username
            or os.environ.get("service_account_id")
            or input("Please Enter Service Account Username:  ")
        )
        password = (
            password
            or os.environ.get("service_account_password")
            or getpass.getpass("Please Enter Service Account Password:  ")
        )

    r = requests.post(
        "https://ssoprod.sbdinc.com/as/token.oauth2",
        data={
            "client_id": "Snowflake",
            "grant_type": "password",
            "username": username,
            "password": password,
            # NOTE(security): this client secret is committed in clear text. It is
            # left untouched so the request keeps working, but it should be rotated
            # and loaded from a secret store instead of living in the notebook.
            "client_secret": 'f9sq630wmLP6UjpSsOk7kTuP6xccCrSOC4YhE1VdTq3GCupqR7gjYcpuhEGRJ9e0',
            "scope": "session:role-any",
        },
        # Without a timeout a stalled SSO endpoint would hang the kernel forever.
        timeout=timeout,
    )
    r.raise_for_status()
    access_token = r.json()["access_token"]
    return access_token


In [None]:
def getsnowflakecursor(snowflakedatabase,
                       snowflakewarehouse,
                       snowflakeschema,
                       snowflakerole,
                       username,
                       userpassword
                      ):
    """
    Open an OAuth-authenticated Snowflake session and wrap it for SQLAlchemy.

    An access token is requested with ``get_oauth_token`` for the given user and
    then used to connect to the ``sbd_caspian.us-east-1`` account.

    Keywords:
    snowflakedatabase -- passed to the connector as ``database``.
    snowflakewarehouse -- passed to the connector as ``warehouse``.
    snowflakeschema -- passed to the connector as ``schema``.
    snowflakerole -- passed to the connector as ``role``.
    username -- user the OAuth token is requested for.
    userpassword -- password of that user.

    Returns:
    A tuple ``(connection, cursor, engine)``: the snowflake.connector connection,
    a cursor opened on it, and a SQLAlchemy engine whose ``creator`` hands back
    that very same connection object (so the engine and the cursor share one
    session).

    Raises:
    Whatever ``get_oauth_token``, ``snowflake.connector.connect``, ``cursor()``
    or ``create_engine`` raise. If a failure happens after the connection was
    opened, the connection is closed before the exception propagates.
    """
    # Imported here so that the ``connector`` submodule is guaranteed to be loaded;
    # the notebook's top-level ``import snowflake`` is not relied on for that.
    import snowflake.connector

    access_token = get_oauth_token(svc_username=username,
                                   svc_password=userpassword)

    activesnowflakeconnector = snowflake.connector.connect(
        account='sbd_caspian.us-east-1',
        authenticator='oauth',
        token=access_token,
        warehouse=snowflakewarehouse,
        database=snowflakedatabase,
        role=snowflakerole,
        schema=snowflakeschema,
    )
    try:
        activesnowflakecursor = activesnowflakeconnector.cursor()
        sqlalchemyengine = create_engine(
            "snowflake://sbd_caspian.us-east-1.snowflakecomputing.com",
            creator=lambda: activesnowflakeconnector,
        )
    except Exception:
        # Do not leave an orphaned session behind when the wrappers cannot be built.
        activesnowflakeconnector.close()
        raise

    return activesnowflakeconnector, activesnowflakecursor, sqlalchemyengine


In [None]:
def get_spark_configs(
    snowflakedatabase,
    snowflakewarehouse,
    snowflakeschema,
    snowflakerole,
    username,
    userpassword,
    useremail,
):
    """
    Assemble the ``sf*`` option dictionary for a Snowflake connection from Spark.

    A fresh OAuth token is requested through ``get_oauth_token`` for
    ``username`` / ``userpassword`` and stored under ``sfToken``; ``useremail``
    is stored under ``sfUser``. The remaining arguments are copied into the
    matching ``sfDatabase`` / ``sfWarehouse`` / ``sfSchema`` / ``sfRole`` entries.

    Returns:
    dict -- the options, in the order sfUrl, sfUser, sfAuthenticator, sfRole,
    sfDatabase, sfSchema, sfWarehouse, sfToken.
    """
    oauth_token = get_oauth_token(username, userpassword)
    return {
        "sfUrl": 'sbd_caspian.us-east-1.snowflakecomputing.com',
        "sfUser": useremail,
        "sfAuthenticator": "oauth",
        "sfRole": snowflakerole,
        "sfDatabase": snowflakedatabase,
        "sfSchema": snowflakeschema,
        "sfWarehouse": snowflakewarehouse,
        "sfToken": oauth_token,
    }

In [None]:
# Smoke-test the credentials by requesting a token. The trailing semicolon keeps
# the token out of the cell output, so it is not stored in the saved notebook.
get_oauth_token(username, password);

## You can modify the cell below (database, role, schema) before reading tables

In [None]:
# Open the Snowflake session; the returned engine is what the pd.read_sql calls
# in the following cells use.
activesnowflakeconnector, activesnowflakecursor, sqlalchemyengine = getsnowflakecursor(
    snowflakedatabase='PROD_EDW',
    snowflakewarehouse='DEV_AIDA_WH',
    snowflakeschema='DIMENSIONS',
    snowflakerole='OPERATIONS_CREATOR_RO',
    username=username,
    userpassword=password,
)

## Below is the code to generate the location master file

In [None]:
# Pull every column of the location dimension; the useful ones are picked in pandas afterwards.
querytorun = "SELECT * FROM PROD_EDW.DIMENSIONS.DIM_LOCATION"

In [None]:
# Run the location query through the shared engine and load the result into a DataFrame.
df1 = pd.read_sql(sql=querytorun, con=sqlalchemyengine)

In [None]:
# (rows, columns) of the raw DIM_LOCATION extract, shown as the cell output.
df1.shape

In [None]:
# Remove the columns that hold nothing but nulls and show what is left.
# NOTE(review): df1a is only inspected here; the next cell still selects from df1.
df1a = df1.dropna(axis='columns', how='all')
df1a.shape

In [None]:
# Keep only the attributes that describe a location, ordered by loc_id then loc_key.
# NOTE(review): 'systen_plnr_name' looks like a misspelling of "system"; it is kept
# verbatim because it has to match the column name delivered by the table — confirm.
col1 = [
    'loc_key', 'src_sys_key', 'loc_id',
    'src_rcrd_create_dte', 'src_rcrd_upd_dte', 'eff_dte',
    'loc_addr_key', 'cntct_phn_nbr',
    'loc_typ_cd', 'loc_name', 'loc_desc', 'loc_regn_cd',
    'systen_plnr_name',
]
df1b = (
    df1[col1]
    .sort_values(by=['loc_id', 'loc_key'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the trimmed and sorted location frame.
df1b.head()

In [None]:
# List the distinct source-system keys present in the location frame.
df1b['src_sys_key'].unique()

In [None]:
# plant_no is the text enclosed by the first two '~' of loc_key (NaN when loc_key
# has no such pair); plant_no2 is simply the second '~'-separated token.
# expand=False makes str.extract return a Series rather than a one-column frame.
df1b['plant_no'] = df1b['loc_key'].str.extract('~(.*?)~', expand=False)
df1b['plant_no2'] = df1b['loc_key'].str.split('~').str[1]

# Where the pattern found nothing, fall back to the second token of loc_key.
# fillna works on the real missing values, so this no longer depends on
# astype(str) turning NaN into the literal text 'nan'.
df1b['plant_no'] = df1b['plant_no'].fillna(df1b['plant_no2'])
df1b.head()

In [None]:
# Source systems in the order in which they should win when a plant_no occurs more
# than once: the column becomes an ordered categorical so sorting follows this list.
sortSAP = [
    'SAPE03', 'SAPC11', 'SAPP10', 'SAPSHP',
    'QADCH', 'QADAR', 'QADBR', 'QADPE',
    'JDAEDW', 'LEGACYWMS', 'LAWSONMAC',
]
df1b['src_sys_key'] = pd.Categorical(df1b['src_sys_key'], categories=sortSAP, ordered=True)

# After sorting, the first row of every plant_no is the one that is kept.
df1c = (
    df1b
    .sort_values(by=['src_sys_key', 'plant_no'])
    .drop_duplicates('plant_no')
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) once only one row per plant_no is left.
df1c.shape

In [None]:
# Restrict the master list to locations coming from these source systems.
selectSAP = [
    'SAPE03', 'SAPC11', 'SAPP10', 'SAPSHP',
    'QADCH', 'QADAR', 'QADBR', 'QADPE',
]
df1d = df1c.loc[df1c['src_sys_key'].isin(selectSAP)].reset_index(drop=True)

df1d.shape

In [None]:
# Address keys of the selected locations as a plain Python list.
# NOTE(review): `addess` is not referenced by any later cell visible here (the
# address query joins against DIM_LOCATION in SQL instead) — confirm whether
# it is still needed.
addess = df1d['loc_addr_key'].tolist()

## Read from the address table and merge it into the location table

In [None]:
# The address table is too large to read into a DataFrame efficiently, so it is
# joined to DIM_LOCATION on ADDR_NBR inside the SQL query instead.
# The join is used purely as a filter (only address columns are selected), so
# without DISTINCT every address row would come back once per matching location
# row and multiply the rows of the pandas merge that follows.
queryaddress = """
select distinct a.SRC_SYS_KEY
, a.EFF_DTE
, a.CITY_NAME
, a.REGN_LKEY
, a.CITY_PSTL_CD
, a.CNTRY_KEY
, a.CNTRY_DESC
, a.ADDR_1
, a.ADDR_GRP_LKEY
, a.ADDR_NBR
, a.MATCH_CD_NAME
from prod_edw.dimensions.dim_address a inner join prod_edw.dimensions.DIM_LOCATION l on a.ADDR_NBR = l.loc_addr_key
"""

df2 = pd.read_sql(queryaddress,sqlalchemyengine)
df2.head(2)

In [None]:
# Attach the address attributes to every selected location. Columns that exist on
# both sides keep the location's version; the address copy is tagged '_drop'.
df3 = pd.merge(df1d, df2, how = 'left', left_on = 'loc_addr_key', right_on = 'addr_nbr', suffixes=('', '_drop')).reset_index(drop = True)

# Discard only the columns that actually carry the '_drop' suffix — a plain
# substring test would also delete any real column whose name contains "drop".
df3 = df3.drop(columns=[col for col in df3.columns if str(col).endswith('_drop')])

df3.head(2)

In [None]:
# Columns that make up the location master file.
df3a = df3[[
    'loc_key', 'src_sys_key', 'loc_id', 'plant_no',
    'src_rcrd_upd_dte', 'eff_dte', 'loc_addr_key', 'loc_name', 'match_cd_name',
    'city_name', 'regn_lkey', 'city_pstl_cd', 'cntry_key', 'cntry_desc',
]].reset_index(drop=True)

# Remove duplicates: after sorting, the first row of each (loc_key, plant_no) pair is kept.
df3b = (
    df3a
    .sort_values(by=['loc_key', 'plant_no', 'loc_name', 'match_cd_name'])
    .drop_duplicates(['loc_key', 'plant_no'])
)

df3b.head()

In [None]:
# The file has already been saved to the shared folder, so this cell does not have
# to be run every time. Adjust the path to the actual user's folder before running.
pathloc = 'C:\\Users\\username\\Stanley Black & Decker\\Supply Chain Development - General\\Projects\\Data Collected\\SF data and sample code\\'
df3b.to_csv(path_or_buf=pathloc + '20230613_SFMaster_Location.csv', index=False)