In [39]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os
# Attach to (or reuse) a Spark session on the standalone master whose host name
# is read from the SPARK_CLUSTER environment variable; a missing variable
# raises KeyError. The master port is fixed at 7077.
spark_master_host = os.environ['SPARK_CLUSTER']
spark = (
    SparkSession.builder
    .master("spark://{sparkname}:7077".format(sparkname=spark_master_host))
    .getOrCreate()
)

In [9]:
# Denodo JDBC connection settings.
# Every value except the driver class can be overridden with an environment
# variable of the same name, so real hosts and credentials never need to be
# typed into (and saved with) the notebook. The literals are placeholder
# defaults identical to the previous hard-coded values.
DENODO_DRIVER = "com.denodo.vdp.jdbc.Driver"
DENODO_HOST = os.environ.get("DENODO_HOST", "your_host")
DENODO_USER = os.environ.get("DENODO_USER", "your_user")
DENODO_PASSWORD = os.environ.get("DENODO_PASSWORD", "")
DENODO_PORT = os.environ.get("DENODO_PORT", "9999")
DENODO_DB = os.environ.get("DENODO_DB", "your_db")

# JDBC URL used by spark.read.jdbc below.
# NOTE(review): queryTimeout=0 presumably disables the driver-side query
# timeout - confirm against the Denodo JDBC driver documentation.
denodo_jdbc_str = "jdbc:vdb://{host}:{port}/{database}?queryTimeout=0".format(
    host=DENODO_HOST,
    port=DENODO_PORT,
    database=DENODO_DB,
)

# Connection properties forwarded by Spark to the JDBC driver.
denodo_prop = {
    "user": DENODO_USER,
    "password": DENODO_PASSWORD,
    "driver": DENODO_DRIVER,
    "sslConnection": "false",
}

In [25]:
# Stand-alone extract of case communications for Honda/Acura cases whose
# communication date falls between 2020/01/29 and 2020/01/31. Four sources are
# UNIONed (feed items, case feed, e-mail messages, live-chat transcripts), each
# joined to crrs_case and filtered with the same method / subdivision /
# point-of-origin / body-text exclusions.
# No other cell visible in this notebook references this variable.
#
# Two details that matter:
#   * the outer SELECT must use the derived table's own column `body`; the
#     alias `fd` only exists inside the first UNION branch.
#   * every branch compares formatdate('yyyy/MM/dd', <timestamp>) against the
#     date literals, so all four sources apply the same day-level range.
denodo_join_query123 = """
SELECT DISTINCT rowno, current_date as load_date, case_number, Contention_date, vin, Case_id, body
FROM
(
SELECT rownum() as rowno, c.case_number, fd.created_date AS Contention_date, c.vin, fd.parent_id AS Case_id, fd.body as Body 
FROM crrs_feeditem fd JOIN crrs_case c ON fd.parent_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND fd.body IS NOT NULL
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (fd.body NOT LIKE ('%Outbound%') AND fd.body NOT LIKE ('%No campaign data found%') AND fd.body NOT LIKE ('%viewed Campaign Info%') AND fd.body NOT LIKE ('%Recall%'))
AND c.DIVISION IN ('Honda', 'Acura') 
AND formatdate('yyyy/MM/dd',fd.created_date) between '2020/01/29' and '2020/01/31'
UNION
select rownum() as rowno, c.case_number, cf.source_created_date AS Contention_date, c.vin, cf.parent_id AS Case_id, cf.body AS Body --cast(body as varchar(10000)) AS case_msg
from crrs_casefeed cf JOIN crrs_case c ON cf.parent_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND cf.body IS NOT NULL AND isdeleted = 0 
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (cf.body NOT LIKE ('%Outbound%') AND cf.body NOT LIKE ('%No campaign data found%') AND cf.body NOT LIKE ('%viewed Campaign Info%') AND cf.body NOT LIKE ('%Recall%'))
AND c.DIVISION IN ('Honda', 'Acura') 
AND formatdate('yyyy/MM/dd',cf.source_created_date) between '2020/01/29' and '2020/01/31'
UNION
select rownum() as rowno, c.case_number, em.message_timestamp AS Contention_date, c.vin, em.parent_id AS Case_id, cast(em.body_text as varchar(10000)) AS Body
from crrs_email_message em JOIN crrs_case c ON em.parent_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND em.body_text IS NOT NULL
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (em.body_text NOT LIKE ('%Outbound%') AND em.body_text NOT LIKE ('%No campaign data found%') AND em.body_text NOT LIKE ('%viewed Campaign Info%') AND em.body_text NOT LIKE ('%Recall%'))
AND c.DIVISION IN ('Honda', 'Acura') 
AND formatdate('yyyy/MM/dd',em.message_timestamp) between '2020/01/29' and '2020/01/31'
UNION
select rownum() as rowno, c.case_number,  ch.start_timestamp AS Contention_date, c.vin, c.case_id as Case_id, cast(ch.body_text as varchar(10000)) AS Body
from crrs_livechat_transcript ch JOIN crrs_case c ON ch.case_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND ch.body_text IS NOT NULL 
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (ch.body_text NOT LIKE ('%Outbound%') AND ch.body_text NOT LIKE ('%No campaign data found%') AND ch.body_text NOT LIKE ('%viewed Campaign Info%') AND ch.body_text NOT LIKE ('%Recall%'))
AND c.DIVISION IN ('Honda', 'Acura') 
AND formatdate('yyyy/MM/dd',ch.start_timestamp) between '2020/01/29' and '2020/01/31'
) t
WHERE case_number IS NOT NULL AND CASE_ID IS NOT NULL AND VIN IS NOT NULL AND BODY IS NOT NULL
ORDER BY case_number,  Contention_date;
"""

In [10]:
# Main extract: one row per (communication, issue) for qualifying cases.
# The inner derived table `fd` UNIONs four communication sources (feed items,
# case feed, e-mail messages, live-chat transcripts) dated 2020/01/30 to
# 2020/01/31, then joins them to case, vehicle, model feature, issue and
# record-type data. Columns selected as '' are empty placeholders.
#
# The final body filter chains its NOT LIKE tests with AND, matching the four
# inner branches. Chained with OR the test is true for any body that does not
# contain all four phrases at once, so it filtered nothing in practice.
#
# NOTE(review): `rowno` comes from rownum() and is later used in mod()
# partition predicates. Each predicate re-runs this query, so the read is only
# complete and duplicate-free if rownum() assigns the same numbers every run.
# TODO confirm with Denodo, or partition on a stable key instead.
denodo_join_query = """
SELECT DISTINCT fd.rowno, c.case_number AS Case_No, fd.Contention_date AS Contention_date, c.vin, fd.Case_id AS Case_id, fd.body as Body,
r.recordtype_name AS Case_RecordType_Nm,
c.updt_ts AS Case_Updt_Ts,
c.date_time_opened AS Case_Src_Cret_Ts,
c.date_time_closed AS Case_Closed_Ts,
c.last_reopen_date AS last_reopen_date,
c.vin AS Case_Vin_Id,
c.miles AS Case_Miles_Qty,
c.division_code AS Case_Division_Cd,
c.subdivision AS Case_Subdivision_Cd,
c.current_dealer_code AS Case_Current_Dealer_No,
CONCAT(c.current_dealer_code, c.division_code) AS Case_Mx_Dealer_No,
c.service_district_code AS Case_Service_District_Cd,
c.status AS Case_Status_Cd,
c.method AS Case_Method_Nm,
c.point_of_origin AS Case_Point_Of_Origin_Dc,
c.sales_zone AS Case_Sales_Zone_Nm,
c.service_zone AS Case_Service_Zone_Nm, 
c.subject AS Case_Subject_Txt,
'' AS Case_Customer_Nm,
c.city AS Case_City_Nm,
c.state AS Case_State_Cd,
c.zipcode AS Case_Zip_Cd,
'' AS Case_HasApprovedCheck_Flg,
'' AS Case_HasKrExec_Flg,
'' AS Case_IsMediation_Flg,
'' Case_Phone_Carrier_Nm,         
'' Case_Phone_Mfg_Nm,              
'' Case_Phone_Model_Nm,         
'' Case_Phone_Os_Nm, 
'' Case_Phone_Carrier_Other_Nm,          
'' Case_Other_Phone_Mfg_Nm, 
'' Case_Other_Phone_Model_Nm,           
'' Case_Other_Phone_Os_Nm,
'' Note_Updt_Ts,
'' PQ_Case_ID,  
'' Probing_Question_Id, 
'' PQ_RecordType_Nm, 
'' PQ_Updt_Ts,  
'' PQ_Incident_Ts,           
'' PQ_Incident_City_Nm,              
'' PQ_Incident_State_Cd,              
'' PQ_Incident_Brief_Dc,               
'' PQ_Customer_Request_Txt,
i.issue_id AS Issue_Id,
i.issue_name AS Issue_Nm,
i.updt_ts AS Issue_Updt_Ts,
i.source_create_date AS Issue_Src_Cret_Ts,
i.closed_date AS Issue_Closed_Ts,
i.issue_disposition AS Issue_Disposition_Nm,
i.type_1 AS Issue_Type1_Nm,
i.type_2 AS Issue_Type2_Nm,
i.status AS Issue_Status_Cd,
i.title AS Issue_Title_Dc,
i.resolution AS Issue_Resolution_Dc,
i.total_goodwill_amount AS Issue_Total_Goodwill_Amt,
i.affected_part AS Issue_Affected_Part_Cd,
i.symptom AS Issue_Symptom_Cd,
i.labor_code AS Issue_Labor_Cd_Nm,
i.tread_comp_code1 AS Issue_Tread_Comp1_Cd,
i.primary_part_no AS Issue_Primary_Part_No,
i.primary_part_description AS Issue_Primary_Part_Dc,
i.campaign_code AS Issue_Campaign_Cd,
i.campaign_description AS Issue_Campaign_Dc,
i.defect_code AS Issue_Defect_Cd,
v.retail_sales_date AS Rtl_Sls_Dt,
V.assembled_date AS Af_Off_Dt,              
'' Eng_Off_Dt,   
'' Trmsn_Off_Dt,
m.short_sales_model_code AS shrt_sls_mdl_cd,
m.model_year AS Model_Year,
m.model_name AS CMQ_Model,
m.model_name AS AH_Model,
m.destination_code AS Dest_Code,
m.factory_code AS Factory_Code,
v.product_division_code AS prod_div,
m.doors AS doors,
m.trim_type_code AS Trim_Type, 
m.engine_series AS engine_series,
m.grade_short AS grade_short, 
m.model_generation AS model_generation, 
m.transmission AS Trans_Type, 
m.transmission_series AS Trans_Serial_No, 
m.four_wheel_drive AS four_wheel_drive_flag, 
'' AS issue_check_req_status_cnt
FROM 
(
 
SELECT DISTINCT rowno, case_number, Contention_date, vin, Case_id, body
FROM
(
SELECT rownum() as rowno,c.case_number, fd.created_date AS Contention_date, c.vin, fd.parent_id AS Case_id, fd.body as Body 
FROM crrs_feeditem fd JOIN crrs_case c ON fd.parent_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND fd.body IS NOT NULL
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (fd.body NOT LIKE ('%Outbound%') AND fd.body NOT LIKE ('%No campaign data found%') AND fd.body NOT LIKE ('%viewed Campaign Info%') AND fd.body NOT LIKE ('%Recall%'))
AND ((c.DIVISION = 'Honda' AND c.YEAR BETWEEN 2017 AND 2020) OR (c.DIVISION = 'Acura' AND c.YEAR BETWEEN 2014 AND 2020))
AND formatdate('yyyy/MM/dd',fd.created_date) between '2020/01/30' and '2020/01/31'
 
UNION
 
select rownum() as rowno,c.case_number, cf.source_created_date AS Contention_date, c.vin, cf.parent_id AS Case_id, cf.body AS Body --cast(body as varchar(10000)) AS case_msg
from crrs_casefeed cf JOIN crrs_case c ON cf.parent_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND cf.body IS NOT NULL AND isdeleted = 0 
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (cf.body NOT LIKE ('%Outbound%') AND cf.body NOT LIKE ('%No campaign data found%') AND cf.body NOT LIKE ('%viewed Campaign Info%') AND cf.body NOT LIKE ('%Recall%'))
AND ((c.DIVISION = 'Honda' AND c.YEAR BETWEEN 2017 AND 2020) OR (c.DIVISION = 'Acura' AND c.YEAR BETWEEN 2014 AND 2020))
AND formatdate('yyyy/MM/dd',cf.source_created_date) between '2020/01/30' and '2020/01/31'
 
UNION
 
select rownum() as rowno,c.case_number, em.message_timestamp AS Contention_date, c.vin, em.parent_id AS Case_id, cast(em.body_text as varchar(10000)) AS Body
from crrs_email_message em JOIN crrs_case c ON em.parent_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND em.body_text IS NOT NULL
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (em.body_text NOT LIKE ('%Outbound%') AND em.body_text NOT LIKE ('%No campaign data found%') AND em.body_text NOT LIKE ('%viewed Campaign Info%') AND em.body_text NOT LIKE ('%Recall%'))
AND ((c.DIVISION = 'Honda' AND c.YEAR BETWEEN 2017 AND 2020) OR (c.DIVISION = 'Acura' AND c.YEAR BETWEEN 2014 AND 2020))
AND formatdate('yyyy/MM/dd',em.message_timestamp) between '2020/01/30' and '2020/01/31'
 
UNION
 
select rownum() as rowno,c.case_number,  ch.start_timestamp AS Contention_date, c.vin, c.case_id as Case_id, cast(ch.body_text as varchar(10000)) AS Body
from crrs_livechat_transcript ch JOIN crrs_case c ON ch.case_id = c.case_id
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND ch.body_text IS NOT NULL 
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (ch.body_text NOT LIKE ('%Outbound%') AND ch.body_text NOT LIKE ('%No campaign data found%') AND ch.body_text NOT LIKE ('%viewed Campaign Info%') AND ch.body_text NOT LIKE ('%Recall%'))
AND ((c.DIVISION = 'Honda' AND c.YEAR BETWEEN 2017 AND 2020) OR (c.DIVISION = 'Acura' AND c.YEAR BETWEEN 2014 AND 2020))
AND formatdate('yyyy/MM/dd',ch.start_timestamp) between '2020/01/30' and '2020/01/31'
 
) t
WHERE case_number IS NOT NULL AND CASE_ID IS NOT NULL AND VIN IS NOT NULL AND BODY IS NOT NULL
-- ORDER BY case_number,  Contention_date
 
) fd 
JOIN crrs_case c ON fd.Case_id = c.case_id
LEFT JOIN vehicle v ON c.vin = v.vin
LEFT JOIN model_mto_feature m ON m.mto_model_code = v.mto_model_code  AND m.mto_type_code = v.mto_type_code AND m.mto_option_code = v.mto_option_code
LEFT JOIN crrs_issue i ON c.case_id = i.case_id
LEFT JOIN crrs_recordtype r ON c.recordtype_id = r.recordtype_id
 
WHERE c.case_number IS NOT NULL AND c.case_id IS NOT NULL AND c.vin IS NOT NULL AND fd.body IS NOT NULL
AND c.method NOT IN ('Outreach', 'Pro-Active O/B', 'Robo-Call') 
AND c.subdivision NOT IN ('Alpha','Campaign','Marine Dealer Support','Marine Sales Support','MC Mediation','PCRM','PCRM - Torrance','PCRM Chino','PCRM-Chino','PCRM-Torrance')
AND c.point_of_origin NOT IN ('Outreach')
AND (fd.body NOT LIKE ('%Outbound%') AND fd.body NOT LIKE ('%No campaign data found%') AND fd.body NOT LIKE ('%viewed Campaign Info%') AND fd.body NOT LIKE ('%Recall%'))
AND ((c.DIVISION = 'Honda' AND c.YEAR BETWEEN 2017 AND 2020) OR (c.DIVISION = 'Acura' AND c.YEAR BETWEEN 2014 AND 2020));
"""


In [11]:
def qpreds(n):
    """Return the ``n`` SQL predicates that split a read by ``rowno`` modulo ``n``.

    Element ``k`` of the result (``k`` from 0 to ``n - 1``) is the text
    ``mod(rowno, n) = k``. The list is passed as ``predicates`` to
    ``spark.read.jdbc`` in this notebook.

    Args:
        n: number of predicates to generate; ``0`` yields an empty list.

    Returns:
        A list of ``n`` predicate strings.
    """
    template = "mod(rowno, {np}) = {modulus}"
    predicates = []
    for remainder in range(n):
        predicates.append(template.format(np=n, modulus=remainder))
    return predicates


In [12]:
# Read the Denodo query through JDBC. The query text is wrapped as a derived
# table aliased "test", and the 50 mod(rowno, 50) predicates from qpreds()
# are supplied so the read is split into one query per predicate.
joindf = spark.read.jdbc(
    url=denodo_jdbc_str,
    table='({sql}) test'.format(sql=denodo_join_query),
    predicates=qpreds(50),
    properties=denodo_prop,
)

# cache() only marks the frame for caching; the count() action below is what
# actually executes the read and fills the cache.
joindf.cache()
joindf.count()


11632

In [None]:
# Peek at the first row of the cached Denodo extract as a sanity check.
joindf.head()

In [40]:
# PostgreSQL JDBC connection settings.
# Each value can be overridden through the matching PSQL_* environment
# variable so that real hosts and credentials are not stored in the notebook;
# the literals are placeholder defaults equal to the previous hard-coded values.
PSQL_HOST = os.environ.get("PSQL_HOST", "your_host")
PSQL_PORT = os.environ.get("PSQL_PORT", "5432")
PSQL_DB = os.environ.get("PSQL_DB", "your_db")
PSQL_USER = os.environ.get("PSQL_USER", "your_username")
PSQL_PASSWORD = os.environ.get("PSQL_PASSWORD", "your_password")

# JDBC URL used for both the write and the read-back below.
urlPSQL = "jdbc:postgresql://{host}:{port}/{db}".format(
    host=PSQL_HOST,
    port=PSQL_PORT,
    db=PSQL_DB,
)

# Connection properties forwarded by Spark to the PostgreSQL JDBC driver.
propertiesPSQL = {
    "driver": "org.postgresql.Driver",
    "user": PSQL_USER,
    "password": PSQL_PASSWORD,
}


In [41]:
# Persist the Denodo extract to PostgreSQL. mode='overwrite' replaces any
# existing table of the same name.
joindf.write.jdbc(
    url=urlPSQL,
    table='cmbs_source_from_Jan30to31_2_4_2020',
    mode='overwrite',
    properties=propertiesPSQL,
)


In [42]:
def qpreds_psql(n):
    """Return ``n`` ``mod(rowno, n) = k`` predicates for the PostgreSQL read.

    Produces one predicate string per remainder ``k`` in ``0 .. n - 1``.
    The generated text is the same as that of ``qpreds`` defined earlier in
    this notebook.
    NOTE(review): the two helpers are duplicates and could be merged.

    Args:
        n: number of predicates to generate; ``0`` yields an empty list.

    Returns:
        A list of ``n`` predicate strings.
    """
    template = "mod(rowno, {np}) = {modulus}"
    return list(map(lambda remainder: template.format(np=n, modulus=remainder), range(n)))


In [46]:
# Read the table written above back from PostgreSQL, split into 25 queries by
# the mod(rowno, 25) predicates, and mark the resulting frame for caching.
cmbsdf = spark.read.jdbc(
    url=urlPSQL,
    table='({sql}) test'.format(sql="select * from cmbs_source_from_Jan30to31_2_4_2020"),
    predicates=qpreds_psql(25),
    properties=propertiesPSQL,
).cache()


In [47]:
# Run a count action on the frame read back from PostgreSQL. Its value is not
# displayed here because only the last expression of a cell is shown.
cmbsdf.count()
# List the column names of the frame read back from PostgreSQL.
cmbsdf.columns

['rowno',
 'case_no',
 'contention_date',
 'vin',
 'case_id',
 'body',
 'case_recordtype_nm',
 'case_updt_ts',
 'case_src_cret_ts',
 'case_closed_ts',
 'last_reopen_date',
 'case_vin_id',
 'case_miles_qty',
 'case_division_cd',
 'case_subdivision_cd',
 'case_current_dealer_no',
 'case_mx_dealer_no',
 'case_service_district_cd',
 'case_status_cd',
 'case_method_nm',
 'case_point_of_origin_dc',
 'case_sales_zone_nm',
 'case_service_zone_nm',
 'case_subject_txt',
 'case_customer_nm',
 'case_city_nm',
 'case_state_cd',
 'case_zip_cd',
 'case_hasapprovedcheck_flg',
 'case_haskrexec_flg',
 'case_ismediation_flg',
 'case_phone_carrier_nm',
 'case_phone_mfg_nm',
 'case_phone_model_nm',
 'case_phone_os_nm',
 'case_phone_carrier_other_nm',
 'case_other_phone_mfg_nm',
 'case_other_phone_model_nm',
 'case_other_phone_os_nm',
 'note_updt_ts',
 'pq_case_id',
 'probing_question_id',
 'pq_recordtype_nm',
 'pq_updt_ts',
 'pq_incident_ts',
 'pq_incident_city_nm',
 'pq_incident_state_cd',
 'pq_incident_b

In [48]:
# Row count of the PostgreSQL read-back; the recorded output (11632) equals the
# count recorded for the Denodo extract earlier in the notebook.
cmbsdf.count()

11632

In [49]:
# Collect the whole frame to the driver as a pandas DataFrame and write it to a
# CSV file in the current working directory (the pandas index is written too,
# since index=False is not passed).
cmbsdf.toPandas().to_csv('cmbs_source_from_Jan30to31_2_4_2020.csv')

In [None]:
# Preview five rows as a pandas DataFrame. limit(5) is applied in Spark so only
# those rows are collected to the driver, instead of converting the whole
# frame and discarding all but the first five.
cmbsdf.limit(5).toPandas()