In [1]:
import re
import pandas as pd
import numpy as np

from datetime import datetime as dt

import os
import sys
 
# Cluster-specific locations of the Spark 2 distribution and the Python
# interpreter that both the driver and the executors should run.
spark_home = '/opt/cloudera/parcels/SPARK2/lib/spark2'
pyenv_python = '/opt/cloudera/parcels/PYENV.ZNO20008661/bin/python'

# Export everything PySpark reads from the environment in one go.
os.environ.update({
    'SPARK_HOME': spark_home,
    'PYSPARK_DRIVER_PYTHON': pyenv_python,
    'PYSPARK_PYTHON': pyenv_python,
    'LD_LIBRARY_PATH': '/opt/python/virtualenv/jupyter/lib',
})

# Make the bundled pyspark / py4j sources importable. Each entry is pushed to
# the front of sys.path in turn, so the py4j archive ends up ahead of 'python'.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))
from pyspark import SparkContext, SparkConf, HiveContext


# Spark configuration for the 'IskraComis' application (master "yarn-client").
# Written as a parenthesised chain so individual settings can carry comments.
conf = (
    SparkConf()
    .setAppName('IskraComis')
    .setMaster("yarn-client")
    .set('spark.dynamicAllocation.enabled', 'true')
    .set('hive.exec.dynamic.partition.mode', 'nonstrict')
    .set('spark.executor.memory', '20g')
    .set('spark.driver.memory', '20g')
    .set('spark.executor.cores', '2')
    .set('spark.executor.instances', '140')
    .set('spark.driver.maxResultSize', '10g')
    .set('spark.yarn.driver.memoryOverhead', '2g')
    .set('spark.port.maxRetries', '150')
    # NOTE(review): '.max.mb' looks like the legacy spelling of
    # 'spark.kryoserializer.buffer.max' - confirm it is still honoured.
    .set('spark.kryoserializer.buffer.max.mb', '512')
    # The key used to start with a stray double quote ('"spark.default...'),
    # so the intended parallelism setting was stored under a bogus name.
    .set('spark.default.parallelism', '1000')
    # Corrected from 'spark.ui.killEnable'; the documented property name
    # ends in 'Enabled'.
    .set('spark.ui.killEnabled', 'true')
)

# Print a timestamp before and after context creation so the time spent
# obtaining the Spark/Hive contexts is visible in the cell output.
startup_began = dt.now()
print('Start', startup_began)

# Reuse an already running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate(conf=conf)

# Start the HiveContext on top of the SparkContext.
hive = HiveContext(sc)

allocation_done = dt.now()
print('Allocated', allocation_done)

%matplotlib inline
# Notebook-wide pandas display settings: two-decimal floats and generous
# row/column limits, applied in the order listed.
for option_name, option_value in (
    ('display.float_format', '{:.2f}'.format),
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
):
    pd.set_option(option_name, option_value)

Start 2020-07-16 14:03:07.590311
Allocated 2020-07-16 14:03:50.189001


In [2]:
from pyspark.sql.types import TimestampType, DoubleType, IntegerType, StringType, DateType
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [3]:
from apyori import apriori

In [4]:
# Hive table that feeds the basket analysis below.
iskra_product_table = 'team_pricing.product_iskra_temp_20200715'
mon_iskra_prod = hive.table(iskra_product_table)

In [5]:
# Keep only rows whose flag_true_row equals 1.
flagged_rows = mon_iskra_prod.filter(col('flag_true_row') == 1)

# One row per (inn, min_mon) pair, with that pair's products gathered in a list.
baskets_by_inn_month = (
    flagged_rows
    .groupBy('inn', 'min_mon')
    .agg(collect_list('product').alias('items'))
)

# Flatten each product list into a single ", "-separated string column.
trainDf = baskets_by_inn_month.withColumn("items", concat_ws(", ", "items"))

In [6]:
%%time
# Pull the grouped baskets out of Spark into a local pandas DataFrame;
# the cell is timed because this call triggers the actual Spark computation.
df = trainDf.toPandas()

CPU times: user 43.2 s, sys: 8.13 s, total: 51.4 s
Wall time: 1min 8s


In [7]:
def _split_basket(basket):
    """Turn a comma-separated basket string into a list of item names.

    All spaces are removed before splitting on ','. Values that are not
    strings (for example a list produced by an earlier run of this cell)
    are returned unchanged, so re-running the cell does not wipe the column.
    """
    if isinstance(basket, str):
        return basket.replace(' ', '').split(',')
    return basket


# The previous `.str.replace(...).str.split(...)` chain returned NaN for every
# row once 'items' already held lists, so a second run of this cell silently
# destroyed the data. Mapping a type-aware helper keeps the cell idempotent.
df['items'] = df['items'].map(_split_basket)

In [8]:
# Plain Python list of baskets (one entry per row of df) for apriori().
records = df['items'].tolist()

In [9]:
# Thresholds handed to apriori(); values are unchanged.
# NOTE(review): confirm that apyori actually recognises `min_length` - it may
# be accepted but ignored.
apriori_kwargs = dict(
    min_support=0.0053,
    min_confidence=0.20,
    min_lift=1.5,
    min_length=2,
)
association_rules = apriori(records, **apriori_kwargs)

# Materialise the result so it can be counted and iterated more than once.
association_results = [relation for relation in association_rules]

In [10]:
# Number of entries returned by apriori().
relation_count = len(association_results)
print(relation_count)

17


In [11]:
def _flatten_relation_records(relation_records):
    """Flatten apriori results into one tuple per rule.

    Each record is indexed as (basket, support, ordered_statistics), and each
    ordered statistic as (antecedent, consequent, confidence, lift).

    The previous loop only read ``record[2][0]``, so when an itemset produced
    several ordered statistics every one after the first was silently dropped.
    Here every ordered statistic becomes its own row.

    Returns a list of tuples
    ``(basket, antecedent, consequent, support, confidence, lift)`` in which
    the three item collections are converted to plain lists.
    """
    rows = []
    for record in relation_records:
        basket = list(record[0])
        support = record[1]
        for statistic in record[2]:
            rows.append((
                basket,
                list(statistic[0]),
                list(statistic[1]),
                support,
                statistic[2],
                statistic[3],
            ))
    return rows


labels = ['basket', 'antecedent', 'consequent', 'support', 'confidence', 'lift']
results = _flatten_relation_records(association_results)
product_suggestion = pd.DataFrame.from_records(results, columns=labels)

In [12]:
# Expose the rule table under the name `rules`, which the export cell uses.
rules = pd.DataFrame(product_suggestion)

In [14]:
# Export the rule table to an Excel workbook (header row, no index column).
# The target directory is created first so the export also works on a fresh
# checkout where ./data does not exist yet.
rules_output_path = "./data/rules-inn-mon-as-bill.xlsx"
os.makedirs(os.path.dirname(rules_output_path), exist_ok=True)
rules.to_excel(rules_output_path, header=True, index=False)

