In [1]:
import numpy as np
import csv
import decimal
import math
import os
import re
import time
import random
from pyspark.sql.functions import udf
from pyspark.sql.functions import isnan, when, count, col, to_date, row_number
import pyspark.sql.functions
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import LongType
from pyspark.sql.types import DateType
from pyspark.sql import HiveContext
from pyspark.storagelevel import StorageLevel

In [2]:
from pyspark.sql.types import *

from pyspark_llap.sql import HiveWarehouseBuilder
from pyspark_llap.sql.session import CreateTableBuilder, HiveWarehouseSessionImpl
hive = HiveWarehouseBuilder.session(spark).build()

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
spark.sparkContext._conf.getAll()

[('spark.eventLog.enabled', 'true'),
 ('spark.app.id', 'application_1678624282671_2622561'),
 ('spark.yarn.historyServer.address', 'http://hpchdp2i3.hpc.ford.com:18081'),
 ('spark.yarn.appMasterEnv.MKL_NUM_THREADS', '1'),
 ('spark.repl.local.jars',
  'file:///s/hadoop/user/jars/hive-warehouse-connector-assembly-hpchdp2.jar,file:///u/mpartha9/scoring-code-spark-api_2.4.3-0.0.22.jar,file:///u/mpartha9/64132f079417a607d5972c42-64119c37f60675e3f6972a3d.jar'),
 ('spark.yarn.am.extraLibraryPath',
  '/opt/cloudera/parcels/CDH-7.1.7-1.cdh7.1.7.p1000.24102687/lib/hadoop/lib/native'),
 ('spark.yarn.dist.pyFiles',
  'file:///opt/cloudera/parcels/CDH/lib/hive_warehouse_connector/pyspark_hwc-1.0.0.7.1.7.1000-141.zip'),
 ('spark.sql.hive.hiveserver2.jdbc.url.principal', 'hive/_HOST@HPC.FORD.COM'),
 ('spark.shuffle.io.serverThreads', '128'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.RM_HA_URLS',
  'hpchdp2.hpc.ford.com:8088,hpchdp2i4.hpc.ford.com:8088'),
 ('spark.seria

In [4]:
hive_context = HiveContext(spark)
hive_context.setConf("hive.exec.dynamic.partition", "true")
hive_context.setConf("hive.execution.engine","spark")
hive_context.setConf("hive.prewarm.enabled","true")
hive_context.setConf("hive.vectorized.execution.enabled","true")
hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

## VOMART Data Pull between 2003 to 2021 for FORD USA

In [5]:
#Read the parquet file
mpartha9_NA_data_1_external_p1=spark.read.parquet('mpartha9_vomart_2003_21_current_model.parquet')

In [6]:
mpartha9_NA_data_1_external_p1.createOrReplaceTempView("mpartha9_NA_data_1_external_p1")

In [7]:
print(mpartha9_NA_data_1_external_p1.count(), mpartha9_NA_data_1_external_p1.select("consumer_id").distinct().count())

66271637 41125090


In [8]:
mpartha9_NA_data_1_external_p1.printSchema()

root
 |-- vin: string (nullable = true)
 |-- consumer_id: decimal(11,0) (nullable = true)
 |-- vehicle_ownership_cycle_num: short (nullable = true)
 |-- vehicle_model: string (nullable = true)
 |-- vehicle_model_year: string (nullable = true)
 |-- acquisition_date: date (nullable = true)
 |-- acquisition_year: short (nullable = true)
 |-- vehicle_age: short (nullable = true)
 |-- fmcc_lease_purchase_ind: byte (nullable = true)
 |-- platform: string (nullable = true)
 |-- lifestage_value: string (nullable = true)
 |-- consumer_type_code: string (nullable = true)
 |-- contribution_margin: decimal(15,2) (nullable = true)



### NEW MODEL :   All NON-ZERO data records

In [9]:
#All NON-ZERO data records
Query2="""select vin, 
consumer_id,
vehicle_ownership_cycle_num,
trim(vehicle_model) as vehicle_model, 
int(vehicle_model_year) as vehicle_model_year, 
acquisition_date, 
acquisition_year,
vehicle_age, 
fmcc_lease_purchase_ind, 
trim(platform) as platform,
trim(lifestage_value) as lifestage_value,
cast(contribution_margin as decimal(18,2)) as contribution_margin
from mpartha9_NA_data_1_external_p1
where contribution_margin != 0
and consumer_type_code='I'
"""
mpartha9_NA_combo_all_nonzero_records_p2=spark.sql(Query2)
mpartha9_NA_combo_all_nonzero_records_p2.createOrReplaceTempView("mpartha9_NA_combo_all_nonzero_records_p2")

In [10]:
print(mpartha9_NA_combo_all_nonzero_records_p2.count(), mpartha9_NA_combo_all_nonzero_records_p2.select("consumer_id").distinct().count())

21352960 15086551


In [11]:
mpartha9_NA_combo_all_nonzero_records_p2.show(2)

+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model|vehicle_model_year|acquisition_date|acquisition_year|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin|
+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|1FTPW14554KD61923|    1601895|                          1|        F-150|              2004|      2004-08-16|            2004|         18|                      0|   TRUCK| Late Ownership|           12808.96|
|1FAFP33P44W191799|    1616104|                          1|        FOCUS|              2004|      2004-06-03|            2004|         18|                      0|     C

In [12]:
mpartha9_NA_combo_all_nonzero_records_p2.where(col('contribution_margin').isNull()).count()

0

### median of non zero CM values

In [13]:
import pyspark.sql.functions as F

In [14]:
# getting median of contribution_margin at model+model_year level
mpartha9_NA_nonzero_median_p3=mpartha9_NA_combo_all_nonzero_records_p2.groupBy(['vehicle_model', 'acquisition_year']).agg(F.expr('percentile_approx(contribution_margin, 0.5)').alias("contribution_margin"))

In [15]:
mpartha9_NA_nonzero_median_p3=mpartha9_NA_nonzero_median_p3.orderBy(['vehicle_model','acquisition_year'],ascending=True)

In [16]:
mpartha9_NA_nonzero_median_p3.show()

+-------------+----------------+-------------------+
|vehicle_model|acquisition_year|contribution_margin|
+-------------+----------------+-------------------+
|         null|            2006|           21710.85|
|         null|            2007|            5667.10|
|         null|            2008|            8983.44|
|         null|            2009|           12678.95|
|         null|            2010|           14351.79|
|         null|            2011|           11102.76|
|         null|            2012|           15431.36|
|         null|            2013|           11284.14|
|         null|            2014|            9878.81|
|         null|            2015|           10485.89|
|         null|            2016|           19728.37|
|         null|            2021|           21837.52|
|             |            2021|           18958.49|
|       BRONCO|            2021|           16587.41|
| BRONCO SPORT|            2020|            7192.67|
| BRONCO SPORT|            2021|            74

In [17]:
mpartha9_NA_nonzero_median_p3.printSchema()

root
 |-- vehicle_model: string (nullable = true)
 |-- acquisition_year: short (nullable = true)
 |-- contribution_margin: decimal(18,2) (nullable = true)



In [18]:
mpartha9_NA_nonzero_median_p3.count()

485

### Only zero CM  records

In [19]:
#only zero CM  records
Query3="""select vin, 
consumer_id,
vehicle_ownership_cycle_num,
trim(vehicle_model) as vehicle_model, 
int(vehicle_model_year) as vehicle_model_year, 
acquisition_date, 
acquisition_year,
vehicle_age, 
fmcc_lease_purchase_ind, 
trim(platform) as platform,
trim(lifestage_value) as lifestage_value,
cast(contribution_margin as decimal(18,2)) as contribution_margin_0
from mpartha9_NA_data_1_external_p1
where contribution_margin = 0
and consumer_type_code='I'
"""
mpartha9_NA_combo_all_zero_records_p3=spark.sql(Query3)
mpartha9_NA_combo_all_zero_records_p3.createOrReplaceTempView("mpartha9_NA_combo_all_zero_records_p3")

In [20]:
print(mpartha9_NA_combo_all_zero_records_p3.count(), mpartha9_NA_combo_all_zero_records_p3.select("consumer_id").distinct().count())

32738647 27023667


In [21]:
mpartha9_NA_combo_all_zero_records_p3.show(2)

+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+---------------------+
|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model|vehicle_model_year|acquisition_date|acquisition_year|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin_0|
+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+---------------------+
|1FTRW12W37KD37502|    2122507|                          1|        F-150|              2007|      2007-08-09|            2007|         15|                      0|   TRUCK| Late Ownership|                 0.00|
|1FMEU74E86UA38128|    2202795|                          1|     EXPLORER|              2006|      2006-02-28|            2006|         16|                      

In [22]:
mpartha9_NA_combo_all_zero_records_p3.where(col('contribution_margin_0').isNull()).count()

0

### Replace all zero CM records with median imputation - acc to vehicle model and model year

In [23]:
df_join = mpartha9_NA_combo_all_zero_records_p3.join(mpartha9_NA_nonzero_median_p3, on=['vehicle_model', 'acquisition_year'],how='inner') 

In [24]:
df_join.show(5)

+-------------+----------------+-----------------+-----------+---------------------------+------------------+----------------+-----------+-----------------------+--------+---------------+---------------------+-------------------+
|vehicle_model|acquisition_year|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model_year|acquisition_date|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin_0|contribution_margin|
+-------------+----------------+-----------------+-----------+---------------------------+------------------+----------------+-----------+-----------------------+--------+---------------+---------------------+-------------------+
|       FIESTA|            2020|3FADP4BJ2JM131803|10068785650|                          3|              2018|      2020-12-20|          4|                      0|     CAR|  Mid Ownership|                 0.00|            2635.67|
|       FIESTA|            2020|3FADP4FJ4CM187058|10106349650|                  

In [25]:
print(df_join.count(), df_join.select("consumer_id").distinct().count())

31047202 25721636


In [26]:
df_join.where(col('contribution_margin').isNull()).count()

0

In [27]:
# dropping unneccessary columns 
df_clean = df_join.drop('contribution_margin_0')

In [28]:
df_clean.show(5)

+-------------+----------------+-----------------+-----------+---------------------------+------------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|vehicle_model|acquisition_year|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model_year|acquisition_date|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin|
+-------------+----------------+-----------------+-----------+---------------------------+------------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|       FIESTA|            2020|3FADP4BJXDM186455| 1748874121|                          2|              2013|      2020-07-21|          9|                      0|     CAR| Late Ownership|            2635.67|
|       FIESTA|            2020|3FADP4BJ7CM102655| 7514011304|                          2|              2012|      2020-02-25|         11|                      0|     C

In [29]:
df_clean.createOrReplaceTempView("mpartha9_NA_df_clean")

In [30]:
df_clean.printSchema()

root
 |-- vehicle_model: string (nullable = true)
 |-- acquisition_year: short (nullable = true)
 |-- vin: string (nullable = true)
 |-- consumer_id: decimal(11,0) (nullable = true)
 |-- vehicle_ownership_cycle_num: short (nullable = true)
 |-- vehicle_model_year: integer (nullable = true)
 |-- acquisition_date: date (nullable = true)
 |-- vehicle_age: short (nullable = true)
 |-- fmcc_lease_purchase_ind: byte (nullable = true)
 |-- platform: string (nullable = true)
 |-- lifestage_value: string (nullable = true)
 |-- contribution_margin: decimal(18,2) (nullable = true)



In [31]:
df_clean_rearrange="""select vin, consumer_id, vehicle_ownership_cycle_num, vehicle_model, vehicle_model_year, acquisition_date, acquisition_year,
vehicle_age, fmcc_lease_purchase_ind, platform, lifestage_value, contribution_margin
from mpartha9_NA_df_clean
"""
df_clean_final=spark.sql(df_clean_rearrange)

In [32]:
print(df_clean_final.count(), df_clean_final.select("consumer_id").distinct().count())

31047202 25721636


In [33]:
df_clean_final.show(5)

+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model|vehicle_model_year|acquisition_date|acquisition_year|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin|
+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|3FADP4GX4HM147850|12992531150|                          1|       FIESTA|              2017|      2020-07-27|            2020|          2|                      0|     CAR|  Mid Ownership|            2635.67|
|3FADP4BJ0DM184696| 1711386320|                          2|       FIESTA|              2013|      2020-03-11|            2020|          9|                      0|     C

### APPEND NONZERO AND IMPUTED ZERO CM

In [34]:
## APPEND NONZERO AND IMPUTED ZERO CM
mpartha9_NA_df_append = mpartha9_NA_combo_all_nonzero_records_p2.union(df_clean_final)
mpartha9_NA_df_append.createOrReplaceTempView("mpartha9_NA_df_append")

In [35]:
print(mpartha9_NA_df_append.count(), mpartha9_NA_df_append.select("consumer_id").distinct().count())

52400162 36936249


In [36]:
mpartha9_NA_df_append.show(5)

+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model|vehicle_model_year|acquisition_date|acquisition_year|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin|
+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+
|1FTPW14554KD61923|    1601895|                          1|        F-150|              2004|      2004-08-16|            2004|         18|                      0|   TRUCK| Late Ownership|           12808.96|
|1FAFP33P44W191799|    1616104|                          1|        FOCUS|              2004|      2004-06-03|            2004|         18|                      0|     C

### Customers who bought a new vehicle in 2022 or later from Ford USA

In [None]:
#Save the original extract as parquet file
#mpartha9_vomart_2022_beyond.write.parquet("mpartha9_vomart_2022_beyond.parquet")

In [37]:
#Read the parquet file
mpartha9_vomart_2022_beyond=spark.read.parquet('mpartha9_vomart_2022_beyond.parquet')

In [38]:
mpartha9_vomart_2022_beyond.createOrReplaceTempView("mpartha9_vomart_2022_beyond")

In [39]:
print(mpartha9_vomart_2022_beyond.count(), mpartha9_vomart_2022_beyond.select("consumer_id").distinct().count())

2061970 1385035


### Only Retail

In [40]:
Query_retail="""select *
from mpartha9_vomart_2022_beyond
where consumer_type_code='I'
"""
mpartha9_vomart_2022_beyond_retail=spark.sql(Query_retail)
mpartha9_vomart_2022_beyond_retail.createOrReplaceTempView("mpartha9_vomart_2022_beyond_retail")

In [41]:
print(mpartha9_vomart_2022_beyond_retail.count(), mpartha9_vomart_2022_beyond_retail.select("consumer_id").distinct().count())

1266000 1196833


### Row Numbering based on Acquisition date

In [42]:
Query_row_numbering = """select *
from 
(select *,
ROW_NUMBER() OVER (PARTITION BY consumer_id ORDER BY acquisition_date) as ROW_numbering
from mpartha9_vomart_2022_beyond_retail
) a
"""
mpartha9_NA_row_numbering = spark.sql(Query_row_numbering)
mpartha9_NA_row_numbering.createOrReplaceTempView("mpartha9_NA_row_numbering")

In [43]:
print(mpartha9_NA_row_numbering.count(), mpartha9_NA_row_numbering.select("consumer_id").distinct().count())

1266000 1196833


In [44]:
mpartha9_NA_row_numbering.show(2)

+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+------------------+-------------------+-------------+
|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model|vehicle_model_year|acquisition_date|acquisition_year|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|consumer_type_code|contribution_margin|ROW_numbering|
+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+------------------+-------------------+-------------+
|1FTFW1E84NFB10664|   11936403|                          1|        F-150|              2022|      2022-05-25|            2022|          1|                      0|   TRUCK|Early Ownership|                 I|           18407.93|            1|
|1FT8W3DTXNEF02600|  100214113|     

### Only First Transaction in 2022 and beyond

In [45]:
Query_only_latest_transaction = """select consumer_id, acquisition_date, 
contribution_margin as Latest_CM
from mpartha9_NA_row_numbering
where ROW_numbering = 1
"""
mpartha9_NA_only_latest_transaction = spark.sql(Query_only_latest_transaction)
mpartha9_NA_only_latest_transaction.createOrReplaceTempView("mpartha9_NA_only_latest_transaction")

In [46]:
print(mpartha9_NA_only_latest_transaction.count(), mpartha9_NA_only_latest_transaction.select("consumer_id").distinct().count())

1196833 1196833


In [47]:
mpartha9_NA_only_latest_transaction.show(5)

+-----------+----------------+---------+
|consumer_id|acquisition_date|Latest_CM|
+-----------+----------------+---------+
|   11936403|      2022-05-25| 18407.93|
|  100214113|      2022-10-13| 27408.44|
|  101523550|      2022-06-27|  3670.71|
|  101817003|      2022-05-23| 11817.37|
|  110802703|      2023-03-08| 18169.33|
+-----------+----------------+---------+
only showing top 5 rows



### Match back beyond_2022 with consumer_ids before 2022

In [48]:
Match_query = """
select a.*, b.Latest_CM
from mpartha9_NA_df_append a
inner join mpartha9_NA_only_latest_transaction b
on a.consumer_id = b.consumer_id
"""
mpartha9_NA_Match_query = spark.sql(Match_query)
mpartha9_NA_Match_query.createOrReplaceTempView("mpartha9_NA_Match_query")

In [49]:
print(mpartha9_NA_Match_query.count(), mpartha9_NA_Match_query.select("consumer_id").distinct().count())

1579701 549396


In [50]:
mpartha9_NA_Match_query.show(5)

+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+---------+
|              vin|consumer_id|vehicle_ownership_cycle_num|vehicle_model|vehicle_model_year|acquisition_date|acquisition_year|vehicle_age|fmcc_lease_purchase_ind|platform|lifestage_value|contribution_margin|Latest_CM|
+-----------------+-----------+---------------------------+-------------+------------------+----------------+----------------+-----------+-----------------------+--------+---------------+-------------------+---------+
|1FTPW14554KD61923|    1601895|                          1|        F-150|              2004|      2004-08-16|            2004|         18|                      0|   TRUCK| Late Ownership|           12808.96| 34080.00|
|1FT7W2B67KEC85800|  108746650|                          1|        F-250|              2019|      2019-03-18|            2019|  

### Vehicle Age

In [51]:
Query_vehicle_age = """select consumer_id, avg(vehicle_age) as avg_vehicle_age
from mpartha9_NA_Match_query
group by consumer_id
order by consumer_id
"""
mpartha9_NA_veh_age = spark.sql(Query_vehicle_age)
mpartha9_NA_veh_age.createOrReplaceTempView("mpartha9_NA_veh_age")

In [52]:
print(mpartha9_NA_veh_age.count(), mpartha9_NA_veh_age.select("consumer_id").distinct().count())

549396 549396


### fmcc_lease_purchase_ind

In [53]:
Query_fmcc_lease_purchase_ind = """select consumer_id, sum(fmcc_lease_purchase_ind) as no_of_lease_purchases
from mpartha9_NA_Match_query
group by consumer_id
order by consumer_id
"""
mpartha9_NA_fmcc_lease_purchase_ind = spark.sql(Query_fmcc_lease_purchase_ind)
mpartha9_NA_fmcc_lease_purchase_ind.createOrReplaceTempView("mpartha9_NA_fmcc_lease_purchase_ind")

In [54]:
print(mpartha9_NA_fmcc_lease_purchase_ind.count(), mpartha9_NA_fmcc_lease_purchase_ind.select("consumer_id").distinct().count())


549396 549396


### Frequency

In [55]:
Query_freq = """select consumer_id, Count(acquisition_date) as freq_excluding_latest
from mpartha9_NA_Match_query
group by consumer_id
order by consumer_id
"""
mpartha9_freq = spark.sql(Query_freq)
mpartha9_freq.createOrReplaceTempView("mpartha9_freq")

In [56]:
print(mpartha9_freq.count(), mpartha9_freq.select("consumer_id").distinct().count())

549396 549396


### avg_contribution_margin_new

In [57]:
Query_monetary_CM = """select consumer_id, avg(contribution_margin) as avg_contribution_margin_new
from mpartha9_NA_Match_query
group by consumer_id
order by consumer_id
"""
mpartha9_monetary_CM = spark.sql(Query_monetary_CM)
mpartha9_monetary_CM.createOrReplaceTempView("mpartha9_monetary_CM")

In [58]:
print(mpartha9_monetary_CM.count(), mpartha9_monetary_CM.select("consumer_id").distinct().count())

549396 549396


### AVERAGE TIME TO PURCHASE (FORMERLY  RECENCY)

In [59]:
Query_Recency = """Select consumer_id, acquisition_date
,-((DateDiff(acquisition_date, Lag(acquisition_date) Over (Partition by consumer_id Order By acquisition_date desc)))/365) as moving_diff
FROM mpartha9_NA_Match_query
ORDER BY consumer_id
"""
mpartha9_Recency = spark.sql(Query_Recency)
mpartha9_Recency.createOrReplaceTempView("mpartha9_Recency")

In [60]:
Query_Recency_1 = """select consumer_id, avg(moving_diff) as AVERAGE_TIME_TO_PURCHASE
from mpartha9_Recency
group by consumer_id
"""
mpartha9_Recency_1 = spark.sql(Query_Recency_1)
mpartha9_Recency_1.createOrReplaceTempView("mpartha9_Recency_1")

In [61]:
Query_Recency_2 = """select consumer_id,
case when AVERAGE_TIME_TO_PURCHASE is NULL then 'Purchased_only_once'
	when (AVERAGE_TIME_TO_PURCHASE >=0 and AVERAGE_TIME_TO_PURCHASE<=3) then 'Repurchased_WITHIN_3_yrs'
	when (AVERAGE_TIME_TO_PURCHASE >3 and AVERAGE_TIME_TO_PURCHASE<=5) then 'Repurchased_BTWN_3_5_yrs'
	when AVERAGE_TIME_TO_PURCHASE >5 then 'Repurchased_AFTER_5_yrs'
	end as AVERAGE_TIME_TO_PURCHASE
from mpartha9_Recency_1
"""
mpartha9_Recency_2 = spark.sql(Query_Recency_2)
mpartha9_Recency_2.createOrReplaceTempView("mpartha9_Recency_2")

In [62]:
print(mpartha9_Recency_2.count(), mpartha9_Recency_2.select("consumer_id").distinct().count())

549396 549396


In [63]:
## Value Counts
mpartha9_Recency_2.groupBy('AVERAGE_TIME_TO_PURCHASE').count().show()

+------------------------+------+
|AVERAGE_TIME_TO_PURCHASE| count|
+------------------------+------+
|    Repurchased_WITHI...|193077|
|     Purchased_only_once|210257|
|    Repurchased_AFTER...| 65844|
|    Repurchased_BTWN_...| 80218|
+------------------------+------+



### COMBINED TABLE for all feature variables

In [64]:
Query_combined_table = """select a.consumer_id, a.freq_excluding_latest, 
b.avg_contribution_margin_new, 
c.AVERAGE_TIME_TO_PURCHASE,
e.avg_vehicle_age, 
g.no_of_lease_purchases,
h.Latest_CM
from mpartha9_freq a
inner join mpartha9_monetary_CM b
on a.consumer_id=b.consumer_id
inner join mpartha9_Recency_2 c
on a.consumer_id=c.consumer_id
inner join mpartha9_NA_veh_age e
on a.consumer_id=e.consumer_id
inner join mpartha9_NA_fmcc_lease_purchase_ind g
on a.consumer_id=g.consumer_id
inner join mpartha9_NA_only_latest_transaction h
on a.consumer_id=h.consumer_id
order by consumer_id
"""
mpartha9_NA_combined_table = spark.sql(Query_combined_table)
mpartha9_NA_combined_table.createOrReplaceTempView("mpartha9_NA_combined_table")

In [65]:
print(mpartha9_NA_combined_table.count(), mpartha9_NA_combined_table.select("consumer_id").distinct().count())

549396 549396


In [66]:
mpartha9_NA_combined_table.show(5)

+-----------+---------------------+---------------------------+------------------------+------------------+---------------------+---------+
|consumer_id|freq_excluding_latest|avg_contribution_margin_new|AVERAGE_TIME_TO_PURCHASE|   avg_vehicle_age|no_of_lease_purchases|Latest_CM|
+-----------+---------------------+---------------------------+------------------------+------------------+---------------------+---------+
|    1400494|                    2|               13661.405000|    Repurchased_AFTER...|              14.5|                    0| 14243.97|
|    1601895|                    1|               12808.960000|     Purchased_only_once|              18.0|                    0| 34080.00|
|    9248410|                    3|                9029.806667|    Repurchased_WITHI...|11.666666666666666|                    0| 12918.75|
|   10290303|                    1|               12511.120000|     Purchased_only_once|              19.0|                    0| 18497.77|
|   10796203|       

### Final X Variables  final formattings

In [67]:
Query_RFM_new_non_NaN = """select consumer_id, 
case when freq_excluding_latest=1 then 'One'
	when freq_excluding_latest=2 then 'Two'
	when freq_excluding_latest=3 then 'Three'
	else 'More_than_Three' end as Freq_purchase,
avg_contribution_margin_new as avg_imputed_contribution_margin,
AVERAGE_TIME_TO_PURCHASE,
avg_vehicle_age,
case when no_of_lease_purchases=0 then 'Zero' else 'One' end as lease_purchase_flag,
Latest_CM
from mpartha9_NA_combined_table
"""
mpartha9_NA_RFM_new_non_NaN = spark.sql(Query_RFM_new_non_NaN)
mpartha9_NA_RFM_new_non_NaN.createOrReplaceTempView("mpartha9_NA_RFM_new_non_NaN")

In [68]:
print(mpartha9_NA_RFM_new_non_NaN.count(), mpartha9_NA_RFM_new_non_NaN.select("consumer_id").distinct().count())

549396 549396


In [69]:
mpartha9_NA_RFM_new_non_NaN.show(5)

+-----------+-------------+-------------------------------+------------------------+------------------+-------------------+---------+
|consumer_id|Freq_purchase|avg_imputed_contribution_margin|AVERAGE_TIME_TO_PURCHASE|   avg_vehicle_age|lease_purchase_flag|Latest_CM|
+-----------+-------------+-------------------------------+------------------------+------------------+-------------------+---------+
|    1400494|          Two|                   13661.265000|    Repurchased_AFTER...|              14.5|               Zero| 14243.97|
|    1601895|          One|                   12808.960000|     Purchased_only_once|              18.0|               Zero| 34080.00|
|    9248410|        Three|                    9030.296667|    Repurchased_WITHI...|11.666666666666666|               Zero| 12918.75|
|   10290303|          One|                   12511.120000|     Purchased_only_once|              19.0|               Zero| 18497.77|
|   10796203|          One|                   10705.230000|   

In [70]:
mpartha9_NA_RFM_new_non_NaN.summary().show()

+-------+--------------------+---------------+-------------------------------+------------------------+------------------+-------------------+-----------------+
|summary|         consumer_id|  Freq_purchase|avg_imputed_contribution_margin|AVERAGE_TIME_TO_PURCHASE|   avg_vehicle_age|lease_purchase_flag|        Latest_CM|
+-------+--------------------+---------------+-------------------------------+------------------------+------------------+-------------------+-----------------+
|  count|              549396|         549396|                         549396|                  549396|            549396|             549396|           549396|
|   mean|    19536523888.3807|           null|               11051.4145604841|                    null| 8.018026746376963|               null|     16406.206497|
| stddev|1.348910050681828...|           null|              6633.128546825475|                    null| 4.670090680082621|               null|9262.167350226375|
|    min|             1400494|More

In [72]:
mpartha9_NA_RFM_new_non_NaN.toPandas().to_csv('/s/mpartha9/RETAIL_NEW_developed_model_performance_2022_beyond.csv', index=False)

In [None]:
#mpartha9_NA_RFM_new_non_NaN.write.format("orc").mode("overwrite").option("path", "/project/dz/collab/ICI_ANALYTICS/mpartha9_NA_RFM_new_non_NaN") \
#.saveAsTable('ici_analytics.mpartha9_Scoring_data_full_2003_21')

In [72]:
#mpartha9_NA_RFM_new_non_NaN.toPandas().to_csv('/s/mpartha9/SCORING_2003_21_full_data_15Feb.csv', index=False)

In [71]:
# Create a copy of original dataframe
original_df = mpartha9_NA_RFM_new_non_NaN

In [93]:
n_splits = 3

In [94]:
# Calculate count of each dataframe rows in the two df
each_len = original_df.count() // n_splits

In [95]:
# Splitting DataFrame
limited_df = original_df.limit(each_len).cache()

In [96]:
print(limited_df.count(), limited_df.select("consumer_id").distinct().count())

11377208 11377208


In [97]:
# Truncate the `original_df` to remove the contents fetched for `limited_df`
rest_df = original_df.join(limited_df, ['consumer_id'], 'left_anti')

In [99]:
print(rest_df.count(), rest_df.select("consumer_id").distinct().count())

22855805 23042462


In [100]:
new_len = rest_df.count() // 2

In [101]:
limited_df_1=rest_df.limit(new_len).cache()

In [102]:
print(limited_df_1.count(), limited_df_1.select("consumer_id").distinct().count())

11464463 11464463


In [103]:
# Truncate the `rest_df` to remove the contents fetched for `limited_df_1`
rest_df_1 = rest_df.join(limited_df_1, ['consumer_id'], 'left_anti')

In [104]:
print(rest_df_1.count(), rest_df_1.select("consumer_id").distinct().count())

11637411 11539275


In [105]:
limited_df.show(5)

+-----------+-------------+-------------------------------+------------------------+---------------+-------------------+
|consumer_id|Freq_purchase|avg_imputed_contribution_margin|AVERAGE_TIME_TO_PURCHASE|avg_vehicle_age|lease_purchase_flag|
+-----------+-------------+-------------------------------+------------------------+---------------+-------------------+
|  101058614|            2|                    7267.505000|    Repurchased_WITHI...|           12.0|                  0|
|  117704750|  More_than_3|                   14600.984000|    Repurchased_WITHI...|           11.2|                  0|
|  740219007|            1|                    5908.540000|     Purchased_only_once|           18.0|                  0|
| 1079734006|            1|                    5556.710000|     Purchased_only_once|           16.0|                  0|
| 1206229604|            1|                    5129.040000|     Purchased_only_once|           21.0|                  0|
+-----------+-------------+-----

In [106]:
limited_df_1.show(5)

+-----------+-------------+-------------------------------+------------------------+---------------+-------------------+
|consumer_id|Freq_purchase|avg_imputed_contribution_margin|AVERAGE_TIME_TO_PURCHASE|avg_vehicle_age|lease_purchase_flag|
+-----------+-------------+-------------------------------+------------------------+---------------+-------------------+
|    5502910|            1|                   10016.350000|     Purchased_only_once|           29.0|                  0|
|    7304810|            1|                    3793.010000|     Purchased_only_once|           19.0|                  0|
|    9203194|            1|                    3318.840000|     Purchased_only_once|           16.0|                  0|
|   11419109|            1|                    3318.840000|     Purchased_only_once|           17.0|                  0|
|   14700507|            1|                    2262.570000|     Purchased_only_once|           16.0|                  0|
+-----------+-------------+-----

In [None]:
rest_df_1.show(5)

In [None]:
limited_df.toPandas().to_csv('/s/mpartha9/SCORING_2003_21_full_data_15Feb_Part1.csv', index=False)

In [None]:
limited_df_1.toPandas().to_csv('/s/mpartha9/SCORING_2003_21_full_data_15Feb_Part2.csv', index=False)

In [None]:
rest_df_1.toPandas().to_csv('/s/mpartha9/SCORING_2003_21_full_data_15Feb_Part3.csv', index=False)