# 0. Scoring

In [12]:
import os
import json
import pickle
import glob
import gc

import calendar
from datetime import date, timedelta, datetime
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
# from pyspark.sql.functions import *
# from pyspark.sql.functions import udf, struct, count_distinct, from_unixtime

!pip install openpyxl -q
import openpyxl

In [3]:
# Random seed constant for the notebook.
SEED = 42
# Base s3a:// URI of the bucket; the parquet outputs written later in this
# notebook are placed under it.
s3a = 's3a://pvc-75e6242d-1f96-4903-a9eb-22fa28f5b73e'

In [4]:
# Absolute locations of the home directory and of the project checkout inside it.
home_repo = '/home/jovyan'
project_repo = f'{home_repo}/__RAYDTT'

# Switch the working directory to the project checkout so that the relative
# paths used later ('.access_s3.json', 'ray_data/*.parquet') resolve against it,
# then echo the resulting directory.
%cd {project_repo}
%pwd

/home/jovyan/__RAYDTT


'/home/jovyan/__RAYDTT'

In [4]:
def access_data(file_path):
    """Parse the JSON document stored at ``file_path`` and return its content."""
    with open(file_path) as handle:
        return json.load(handle)

# Load the S3 credentials from a JSON file given by a relative path (resolved
# against the current working directory).
access_s3_data = access_data('.access_s3.json')

# Print the JupyterHub service prefix of the running server; raises KeyError
# when the environment variable is not set.
print('user:', os.environ['JUPYTERHUB_SERVICE_PREFIX'])

def uiWebUrl(self):
    """Build the JupyterHub proxy path to the Spark jobs page.

    The port is extracted from the UI URL reported by the wrapped JVM
    SparkContext and combined with the ``JUPYTERHUB_SERVICE_PREFIX``
    environment variable as ``<prefix>proxy/<port>/jobs/``.
    """
    from urllib.parse import urlparse

    ui_port = urlparse(self._jsc.sc().uiWebUrl().get()).port
    prefix = os.environ['JUPYTERHUB_SERVICE_PREFIX']
    return f'{prefix}proxy/{ui_port}/jobs/'

# Replace SparkContext.uiWebUrl with a property that returns the JupyterHub
# proxy path built by uiWebUrl() above.
SparkContext.uiWebUrl = property(uiWebUrl)

# Spark settings, applied in the listed order.
# NOTE(review): with master 'local[*]' everything presumably runs inside the
# driver JVM, so the executor.* and dynamicAllocation.* entries are likely
# inert -- confirm before tuning them.
spark_settings = {
    'spark.master': 'local[*]',
    'spark.driver.memory': '40G',
    'spark.driver.maxResultSize': '32G',
    ############################################
    'spark.driver.memoryOverhead': '2G',
    'spark.executor.memory': '36G',  # 32G
    'spark.executor.memoryOverhead': '2G',
    'spark.executor.cores': '10',  # 8
    'spark.executor.instances': '2',
    'spark.dynamicAllocation.enabled': 'true',
    'spark.dynamicAllocation.minExecutors': '1',
    'spark.dynamicAllocation.maxExecutors': '50',
    ############################################
}

conf = SparkConf()
conf.setAll(list(spark_settings.items()))

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=conf)
# a second time in a live kernel raises "Cannot run multiple SparkContexts at
# once". When a context already exists it is returned as-is and `conf` is not
# re-applied.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# S3A connector settings: credentials come from the JSON loaded into
# access_s3_data; the remaining entries select the filesystem implementation,
# transfer sizes, thread pool size and the storage endpoint.
# NOTE(review): the endpoint uses plain http:// -- confirm whether
# https://storage.yandexcloud.net should be used instead.
s3a_settings = {
    'fs.s3a.access.key': access_s3_data['aws_access_key_id'],
    'fs.s3a.secret.key': access_s3_data['aws_secret_access_key'],
    'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
    'fs.s3a.multipart.size': '104857600',
    'fs.s3a.block.size': '33554432',
    'fs.s3a.threads.max': '256',
    'fs.s3a.endpoint': 'http://storage.yandexcloud.net',
    'fs.s3a.aws.credentials.provider':
        'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider',
}

hadoop_conf = spark._jsc.hadoopConfiguration()
for _key, _value in s3a_settings.items():
    hadoop_conf.set(_key, _value)

spark

# conf.set('spark.driver.memory', '32G')
# conf.set('spark.driver.maxResultSize', '4G')

user: /user/st054552/


In [6]:
# Local parquet exports to ingest. glob returns matches in arbitrary order, so
# the list is sorted to keep the order in which the files are later unioned
# identical between runs.
parquets = sorted(glob.glob('ray_data/*.parquet'))
len(parquets)

25

# sdf_init & target

In [None]:
# Columns read from every parquet file.
# (A disabled variant of this cell derived `purchase` and
# `subscription_start_date` from the `[server]_purchase` and
# `[server]_subscription_start_date` columns instead.)
columns = ['amplitude_id', 'event_time', 'purchase_type', 'subscription_date_start', 'paid']


def _load_events(path):
    """Read one parquet file and normalise its columns.

    Keeps only ``columns``; adds timestamp versions of ``event_time`` and
    ``subscription_date_start`` before overwriting the originals with their
    date part; casts ``paid`` to string.
    """
    frame = spark.read.parquet(path).select(columns)
    # The *_timestamp columns must be derived before the source columns are
    # replaced by their date-only versions.
    frame = frame.withColumn('event_time_timestamp', F.to_timestamp('event_time'))
    frame = frame.withColumn('event_time', F.to_date('event_time'))
    frame = frame.withColumn('paid', F.col('paid').cast('string'))
    frame = frame.withColumn(
        'subscription_date_start_timestamp', F.to_timestamp('subscription_date_start')
    )
    frame = frame.withColumn('subscription_date_start', F.to_date('subscription_date_start'))
    return frame


# Fold all files into a single DataFrame; stays None when `parquets` is empty.
sdf = None
for parquet in parquets:
    sdf_temp = _load_events(parquet)
    sdf = sdf_temp if sdf is None else sdf.unionByName(sdf_temp, allowMissingColumns=True)

In [None]:
# Print the column names and types of the unioned frame.
sdf.printSchema()

In [8]:
# Preview the first rows, truncating cell values longer than 200 characters.
sdf.show(truncate=200)

+------------+----------+-------------+-----------------------+-----+-----------------------+---------------------------------+
|amplitude_id|event_time|purchase_type|subscription_date_start| paid|   event_time_timestamp|subscription_date_start_timestamp|
+------------+----------+-------------+-----------------------+-----+-----------------------+---------------------------------+
|288086553546|2022-04-03|         null|                   null|false|2022-04-03 15:48:08.399|                             null|
|288086553546|2022-04-06|         null|                   null|false|2022-04-06 20:33:04.817|                             null|
|288086553546|2022-04-06|         null|                   null|false|2022-04-06 20:33:04.896|                             null|
|288086553546|2022-04-06|         null|                   null|false|2022-04-06 20:33:08.581|                             null|
|288086553546|2022-04-06|         null|                   null|false|2022-04-06 20:33:09.657|           

In [8]:
%%time
# Rows that carry a subscription start date and whose `paid` flag is either
# 'yes' or 'true'.
paid_subscriptions = sdf.filter(
    F.col('subscription_date_start').isNotNull()
    & F.col('paid').isin('yes', 'true')
)

# One row per amplitude_id: the earliest subscription start as a date, as a
# timestamp, and truncated to the first day of its month.
first_start = F.min('subscription_date_start')
target = paid_subscriptions.groupBy('amplitude_id').agg(
    first_start.alias('target_date'),
    F.min('subscription_date_start_timestamp').alias('target_date_timestamp'),
    F.trunc(first_start, 'month').alias('target_month'),
)

CPU times: user 5.64 ms, sys: 0 ns, total: 5.64 ms
Wall time: 168 ms


In [9]:
target.printSchema()

root
 |-- amplitude_id: long (nullable = true)
 |-- target_date: date (nullable = true)
 |-- target_date_timestamp: timestamp (nullable = true)
 |-- target_month: date (nullable = true)



In [11]:
%%time
# Write the unioned event frame to the bucket, replacing any existing output
# at that location.
file_path = f'{s3a}/bs_segments_new/sdf_init.parquet'
sdf.write.mode('overwrite').parquet(file_path)

CPU times: user 210 ms, sys: 125 ms, total: 335 ms
Wall time: 33min 14s


In [10]:
%%time
# Write the per-user target frame to the bucket, replacing any existing output
# at that location.
file_path = f'{s3a}/bs_segments_new/target.parquet'
target.write.mode('overwrite').parquet(file_path)

CPU times: user 102 ms, sys: 58.1 ms, total: 160 ms
Wall time: 15min 22s


In [110]:
# Mean of `predict_proba` over the rows where `pred` equals 1.
# NOTE(review): `catboost_pred_test_new` is not defined in the visible part of
# this notebook -- confirm where it comes from; on a fresh kernel this cell
# would raise NameError.
catboost_pred_test_new[catboost_pred_test_new.pred==1].predict_proba.mean()

0.3395352430839672

In [111]:
# Sum of `target` divided by the number of rows in X_test.
# NOTE(review): `X_test` is not defined in the visible part of this notebook --
# confirm where it comes from; on a fresh kernel this cell would raise NameError.
X_test.target.sum()/len(X_test)

0.007693638771679955