In [20]:
# Import Libraries from Pyspark
import pyspark
import re
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from datetime import datetime
import os
from IPython.display import Image
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import sha2
from pyspark.sql import SQLContext

In [21]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('demo_content').getOrCreate()

In [22]:
# Location of the hit-log dataset. The HIT_LOG_URI environment variable
# overrides the default so the notebook is not tied to one machine's layout.
hit_log_uri = os.environ.get('HIT_LOG_URI', '/home/jovyan/work/hashed_data.csv')

In [23]:
# Read the hit log as a comma-separated, ISO-8859-1 encoded CSV whose first
# row holds the column names.
df = spark.read.csv(
    hit_log_uri,
    sep=',',
    encoding='ISO-8859-1',
    header=True,
)

In [7]:
# Location of the component -> category mapping file. The COMP_MAPPING_URI
# environment variable overrides the default path.
comp_mapping_uri = os.environ.get('COMP_MAPPING_URI', '/home/jovyan/work/component_mapping.csv')

In [None]:
# Location of the component -> category mapping file. The COMP_MAPPING_URI
# environment variable overrides the default path.
# NOTE(review): this cell repeats the assignment made in the previous cell;
# one of the two can be deleted.
comp_mapping_uri = os.environ.get('COMP_MAPPING_URI', '/home/jovyan/work/component_mapping.csv')

In [8]:
#read the page category mapping file
df_compname_mapping = spark.read.csv(comp_mapping_uri, header=True)

In [10]:
# Pull the two mapping columns to the driver and build a plain
# {Component: Component Category} lookup dict from the collected rows.
dict_compename_mapping = dict(
    (row['Component'], row['Component Category'])
    for row in df_compname_mapping.select('Component', 'Component Category').collect()
)


In [11]:
def udf_wrapper(returntype):
    """
    Build a decorator that registers a Python function as a Spark UDF.

    returntype is forwarded to udf() as its returnType argument; the
    returned callable takes the function to wrap and returns the UDF.
    """
    return lambda func: udf(func, returnType=returntype)
        
@udf_wrapper(StringType())
def get_comp_category(comp_type):
    '''
    Look up the category of a component name.

    Returns the value stored in dict_compename_mapping for comp_type, or
    None when comp_type is None, is the literal string 'null', or is not
    a key of the mapping.
    '''
    if comp_type is None or comp_type == 'null':
        return None
    # dict.get already yields None for names missing from the mapping.
    return dict_compename_mapping.get(comp_type)

In [12]:
df=df.withColumn('comp_category', get_comp_category(col('evar26')))

In [13]:
# Keep only rows with hit_source '1' and exclude_hit '0' that carry a
# non-empty comp_category. The != '' comparison evaluates to NULL for NULL
# categories, so rows without a mapped category are dropped as well.
df_processed = df.filter(
    (col('hit_source') == '1')
    & (col('exclude_hit') == '0')
    & (col('comp_category') != '')
)

In [14]:
df_processed.select('hashed_visitor_id').distinct().count()

38335

In [16]:
# Number of retained hits per visitor, most active visitors first.
df_agg = (
    df_processed
    .groupBy('hashed_visitor_id')
    .count()
    .orderBy('count', ascending=False)
)

In [17]:
# Drop outlier visitors: keep those with more than 2 and fewer than 10000 hits.
df_filter = df_agg.filter((col('count') < 10000) & (col('count') > 2))

In [14]:
#final number of unique visitors 
df_filter.count()

23642

In [18]:
# Inner join on the visitor id keeps only the hits of visitors that passed
# the activity filter; df_filter's per-visitor 'count' column comes along.
df_processed = df_processed.join(df_filter, 'hashed_visitor_id', 'inner')

In [16]:
#final number of records for model 
df_processed.count()

567964

In [19]:
def udf_wrapper(returntype):
    """
    Return a decorator that converts a plain function into a Spark UDF
    whose returnType is `returntype`.
    """
    def _to_udf(fn):
        # Delegate to udf(), fixing the declared return type.
        return udf(fn, returnType=returntype)

    return _to_udf
        
        
     # sessionization
TIME_OUT = 30 # inactivity threshold in minutes; get_event_boundary flags a gap strictly greater than this as a session boundary


@udf_wrapper(IntegerType())
def get_event_boundary(time_diff):
    '''
    Flag whether a hit starts a new session.

    Returns 1 when time_diff is strictly greater than TIME_OUT, and 0
    otherwise, including when time_diff is None.
    '''
    if time_diff is not None and time_diff > TIME_OUT:
        return 1
    return 0

@udf_wrapper(FloatType())
def set_default_page_time(time_spent):
    '''
    Substitute a default duration when none is available.

    Returns time_spent unchanged unless it is None, in which case the
    default of 0.08 is returned.
    '''
    # 0.08 minutes is roughly 5 seconds.
    return 0.08 if time_spent is None else time_spent

In [18]:
def conversion(page_category):
    """Return 1 when page_category is exactly 'test_drive', otherwise 0."""
    return 1 if page_category == 'test_drive' else 0
    
conversion_udf = udf(conversion, IntegerType())


In [19]:
# Timestamp Conversion
# ts_pattern_1 is the format string used to parse the date_time column.
ts_pattern_1 = 'yyyy-MM-dd HH:mm:ss'
# NOTE(review): date_pattern_1 is not referenced anywhere else in the visible notebook.
date_pattern_1 = 'yyyy-MM-dd'

In [20]:
# Parse date_time into a timestamp column and mark each hit with
# conversion_status (1 for 'test_drive' pages, 0 otherwise).
df_processed = (
    df_processed
    .withColumn('date_time_ts', unix_timestamp(col('date_time'), ts_pattern_1).cast('timestamp'))
    .withColumn('conversion_status', conversion_udf('page_category'))
)
                     
    
                         
    
# Sessionization windows
# w0: every hit of one visitor, unordered.
w0 = Window.partitionBy('hashed_visitor_id')

# w1: one visitor's hits in chronological order.
w1 = Window.partitionBy('hashed_visitor_id').orderBy('date_time_ts')

# w2: hits of one visitor session, newest first. session_id only needs to
# exist by the time this window is applied.
w2 = Window.partitionBy('hashed_visitor_id', 'session_id').orderBy(col('date_time_ts').desc())

# w3: hits of one visitor session, oldest first.
w3 = Window.partitionBy('hashed_visitor_id', 'session_id').orderBy('date_time_ts')

# max and sum below are the pyspark.sql.functions aggregates brought in by
# the star import, not the Python builtins.
df_processed = (
    df_processed
    # Highest conversion_status seen for the visitor, repeated on every row.
    .withColumn('is_converted', max(col('conversion_status')).over(w0))
    # Minutes elapsed since the visitor's previous hit (NULL on the first hit).
    .withColumn('prev', lag(col('date_time_ts'), 1).over(w1))
    .withColumn('time_diff', (col('date_time_ts').cast('long') - col('prev').cast('long')) / 60.0)
    .withColumn('new_event_boundary', get_event_boundary(col('time_diff')))
    # Running total of boundary flags gives a per-visitor session number.
    .withColumn('session_id', sum(col('new_event_boundary')).over(w1))
    .drop('prev', 'time_diff', 'new_event_boundary')
)

                    
# hit order
df_processed = df_processed.withColumn('hit_rank_reversed', dense_rank().over(w2))



# Time spent: minutes until the next hit in the same session. The last hit
# of a session has no successor and receives set_default_page_time's default.
df_processed = (
    df_processed
    .withColumn('next', lead(col('date_time_ts'), 1).over(w3))
    .withColumn('time_diff', (col('next').cast('long') - col('date_time_ts').cast('long')) / 60.0)
    .withColumn('time_spent_in_mins', set_default_page_time(col('time_diff')))
)


In [21]:
#number of converted visitors
df_processed.filter(df_processed['is_converted']=='1').select('hashed_visitor_id').distinct().count()

173

In [22]:
# One row per visitor and one column per page_category value, each holding
# the visitor's hit count for that category as a float.
df_page_category_counts = (
    df_processed
    .groupby('hashed_visitor_id')
    .pivot('page_category')
    .agg(count('hashed_visitor_id').cast('float'))
)


In [23]:
excluded_page_category = ['test_drive']

In [24]:
# Leave out the pivot columns listed in excluded_page_category, then replace
# the nulls left by the pivot with 0.0.
filtered_columns = [
    name
    for name in df_page_category_counts.columns
    if name not in excluded_page_category
]
df_page_category_counts = df_page_category_counts.select(filtered_columns).fillna(0.0)

In [25]:
# New name for every pivot column: the original name prefixed with
# 'page_count_'. The visitor id column is left out and keeps its name.
replacements = dict(
    (c, 'page_count_{0}'.format(c))
    for c in df_page_category_counts.columns
    if c != 'hashed_visitor_id'
)

In [26]:
df_page_category_counts = df_page_category_counts.select([col(c).alias(replacements.get(c, c)) for c in df_page_category_counts.columns])


In [27]:
df_page_category_counts.columns

['hashed_visitor_id',
 'page_count_business_cars',
 'page_count_chat',
 'page_count_configurator',
 'page_count_connect_store',
 'page_count_connectivity',
 'page_count_contact-us',
 'page_count_dealer_locator',
 'page_count_design',
 'page_count_eco',
 'page_count_errors',
 'page_count_exterior',
 'page_count_feasibility_abort',
 'page_count_feasibility_conflict',
 'page_count_finance',
 'page_count_fleet_and_business',
 'page_count_home',
 'page_count_interior',
 'page_count_item_info',
 'page_count_moterization',
 'page_count_not_set',
 'page_count_offers',
 'page_count_others',
 'page_count_owner',
 'page_count_passengercars',
 'page_count_print_layer',
 'page_count_safety',
 'page_count_user_cars',
 'page_count_vehicle_search',
 'page_count_your_vehicle']

In [28]:
# Per-visitor aggregates: number of hits, number of distinct sessions, and
# the visitor-level conversion flag. max is the Spark aggregate here.
aggregations = [
    count(col('hashed_visitor_id')).alias('total_activity_count'),
    countDistinct(col('session_id')).alias('total_session_count'),
    max(col('is_converted')).alias('is_converted_visitor'),
]

In [29]:
df_aggregated_others = df_processed.select('hashed_visitor_id','is_converted','session_id').groupBy('hashed_visitor_id').agg(*aggregations)


In [30]:
df_aggregated_others.columns

['hashed_visitor_id',
 'total_activity_count',
 'total_session_count',
 'is_converted_visitor']

In [31]:
df_final =  df_page_category_counts.join(df_aggregated_others,'hashed_visitor_id', 'inner')

In [32]:
df_final.columns

['hashed_visitor_id',
 'page_count_business_cars',
 'page_count_chat',
 'page_count_configurator',
 'page_count_connect_store',
 'page_count_connectivity',
 'page_count_contact-us',
 'page_count_dealer_locator',
 'page_count_design',
 'page_count_eco',
 'page_count_errors',
 'page_count_exterior',
 'page_count_feasibility_abort',
 'page_count_feasibility_conflict',
 'page_count_finance',
 'page_count_fleet_and_business',
 'page_count_home',
 'page_count_interior',
 'page_count_item_info',
 'page_count_moterization',
 'page_count_not_set',
 'page_count_offers',
 'page_count_others',
 'page_count_owner',
 'page_count_passengercars',
 'page_count_print_layer',
 'page_count_safety',
 'page_count_user_cars',
 'page_count_vehicle_search',
 'page_count_your_vehicle',
 'total_activity_count',
 'total_session_count',
 'is_converted_visitor']

In [33]:
# Model features: every df_final column except the visitor key, the label
# and the two activity totals. The loop variable is deliberately not called
# `col`, so it cannot be confused with pyspark.sql.functions.col.
model_inputs = [
    name
    for name in df_final.columns
    if name not in ('hashed_visitor_id', 'is_converted_visitor',
                    'total_activity_count', 'total_session_count')
]

In [35]:
#creating the assemblerfor input variables 
assembler=VectorAssembler(inputCols=model_inputs,outputCol='features')


In [36]:
#creating dense vector represntation of input variables
output=assembler.transform(df_final)

In [37]:
#declaring the input dense vector and output variable
data=output.select('features','is_converted_visitor')


In [40]:
#build and train the ML model
rfc=RandomForestClassifier(labelCol='is_converted_visitor',featuresCol='features')

In [41]:
#fit the model on training data 
rf_model=rfc.fit(data)

In [42]:
# Pair each input column name with its importance; the importances are
# positional, in the same order as model_inputs.
feats = dict(zip(model_inputs, rf_model.featureImportances))

In [44]:
feats.keys()

dict_keys(['page_count_business_cars', 'page_count_chat', 'page_count_configurator', 'page_count_connect_store', 'page_count_connectivity', 'page_count_contact-us', 'page_count_dealer_locator', 'page_count_design', 'page_count_eco', 'page_count_errors', 'page_count_exterior', 'page_count_feasibility_abort', 'page_count_feasibility_conflict', 'page_count_finance', 'page_count_fleet_and_business', 'page_count_home', 'page_count_interior', 'page_count_item_info', 'page_count_moterization', 'page_count_not_set', 'page_count_offers', 'page_count_others', 'page_count_owner', 'page_count_passengercars', 'page_count_print_layer', 'page_count_safety', 'page_count_user_cars', 'page_count_vehicle_search', 'page_count_your_vehicle'])

In [45]:
feats.values()

dict_values([0.08560535807322521, 0.038186412870358219, 0.061054764966927542, 0.00032822995067791115, 0.0, 0.015912323438681049, 0.091563709989187614, 0.059257047393501738, 0.011369352477104612, 0.0, 0.016405569028626707, 0.015038239932216441, 0.018077448255833621, 0.17790596121686464, 0.018883744798244596, 0.072487453532855836, 0.0022112536484443844, 0.0096352193510687925, 0.067616303828090502, 0.0, 0.0, 0.061683447592672871, 0.046210913941203607, 0.07023411245215691, 0.011370812593419106, 0.0, 0.031115040641733129, 0.0030177171162483812, 0.014829562910656785])

In [47]:
results=sorted(feats.items(),key=lambda x:x[1],reverse=True)

In [59]:
final_results_page=[(key,str(imp)) for key,imp in results]
results_df = spark.createDataFrame(final_results_page, ['key','val'])

In [61]:
results_df.show(50,False)

+-------------------------------+-----------------+
|key                            |val              |
+-------------------------------+-----------------+
|page_count_finance             |0.177905961217   |
|page_count_dealer_locator      |0.0915637099892  |
|page_count_business_cars       |0.0856053580732  |
|page_count_home                |0.0724874535329  |
|page_count_passengercars       |0.0702341124522  |
|page_count_moterization        |0.0676163038281  |
|page_count_others              |0.0616834475927  |
|page_count_configurator        |0.0610547649669  |
|page_count_design              |0.0592570473935  |
|page_count_owner               |0.0462109139412  |
|page_count_chat                |0.0381864128704  |
|page_count_user_cars           |0.0311150406417  |
|page_count_fleet_and_business  |0.0188837447982  |
|page_count_feasibility_conflict|0.0180774482558  |
|page_count_exterior            |0.0164055690286  |
|page_count_contact-us          |0.0159123234387  |
|page_count_

In [62]:
def page_imp_model(model_inputs, data, seed=None):
    """
    Fit a random forest on `data` and rank `model_inputs` by feature importance.

    Parameters
    ----------
    model_inputs : list of str
        Names of the columns of `data` that are assembled into the
        'features' vector.
    data : DataFrame
        Frame containing every column in `model_inputs` plus the label
        column 'is_converted_visitor'. This argument is what gets
        assembled and fitted; no notebook-level frame is read.
    seed : int, optional
        Seed applied to the RandomForestClassifier so repeated runs give
        the same importances. When None the classifier's default is kept.

    Returns
    -------
    DataFrame
        Two string columns, 'key' (feature name) and 'val' (importance
        rendered with str()), ordered from most to least important. It is
        created through the notebook-level `spark` session.
    """
    # Pack the input columns into the single vector column the classifier reads.
    assembler = VectorAssembler(inputCols=model_inputs, outputCol='features')
    assembled = assembler.transform(data)

    # Only the feature vector and the label are needed for training.
    training = assembled.select('features', 'is_converted_visitor')

    rfc = RandomForestClassifier(labelCol='is_converted_visitor', featuresCol='features')
    if seed is not None:
        rfc.setSeed(seed)
    rf_model = rfc.fit(training)

    # Importances are positional: entry i belongs to model_inputs[i].
    importances = rf_model.featureImportances.toArray()
    feats = dict(zip(model_inputs, importances))

    results = sorted(feats.items(), key=lambda x: x[1], reverse=True)

    # Importances are stored as strings, matching the ['key', 'val'] schema.
    final_results_page = [(key, str(imp)) for key, imp in results]
    return spark.createDataFrame(final_results_page, ['key', 'val'])


In [None]:
page_imp=page_imp_model(model_inputs,df_final)