In [1]:
%matplotlib inline
import seaborn as sns

import findspark
import os
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark import sql
from pyspark import SparkConf
from pyspark import StorageLevel

from pyspark.sql import SQLContext, HiveContext

from pyspark.sql import Row
from pyspark.sql import DataFrameWriter
from pyspark.sql import DataFrameReader
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import GroupedData
from pyspark.sql.functions import UserDefinedFunction

from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation  import *


from functools import partial
import numpy as np

from datasu.auc import *
from datasu.dicts import *
from datasu.files import *
from datasu.pandas import *
from datasu.persist import *
from datasu.spark import *
from datasu.patsy import *


import pandas as pd



In [2]:
# Spark configuration for this session. Every setter returns the SparkConf
# itself, so the options are applied as one chain, in the original order.
# NOTE(review): 'spark.executor.max' does not look like a documented Spark
# property (compare 'spark.cores.max' / 'spark.executor.instances') -- confirm
# that it has any effect.
conf = (
    SparkConf()
    .set('spark.driver.memory', '12g')
    .set('spark.python.worker.memory', '2g')
    .set("spark.driver.maxResultSize", "5g")
    .set("spark.executor.max", 3)
    .set('spark.executor.memory', '5g')
    .set("spark.cores.max", 28)
    .set('spark.worker.cleanup.enabled', True)
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+UseCompressedOops')
    .setAppName('logr3')
)

# Display the effective key/value pairs.
conf.getAll()

[(u'spark.app.name', u'logr3'),
 (u'spark.master', u'spark://spark1.ea.lab:7077'),
 (u'spark.executor.max', u'3'),
 (u'spark.driver.memory', u'12g'),
 (u'spark.submit.pyFiles',
  u'/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.jars',
  u'file:/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,file:/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,file:/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.executor.memory', u'5g'),
 (u'spark.driver.maxResultSize', u'5g'),
 (u'spark.files',
  u'file:/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,file:/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,file:/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.serializer', u'org.apache.spark.serializer.KryoSerializer'),
 (u'spark.cores.max', u'28'),
 (u'spark.worker

In [35]:
# Stop a SparkContext left over from an earlier run so that the next cell can
# create a fresh one with the current `conf`.
try:
    sc.stop()
except NameError:
    # `sc` has not been created in this kernel yet (e.g. first run after a
    # restart). Any other failure of sc.stop() is a real error and is no
    # longer swallowed.
    print('spark context not exists')

In [4]:
# Start the Spark context from `conf` and wrap it in a SQLContext.
# The SQLContext is reachable under two names: `sqc` (used by most cells) and
# `sqlContext` (used by the snippets further down).
sc = pyspark.SparkContext(conf=conf)
sqlContext = sqc = pyspark.SQLContext(sc)

In [5]:
# Reusable DataFrameReader for CSV files via the spark-csv package:
# first line is a header, column types are inferred from the data.
csv_reader = (
    sqc.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
)

## Load data


In [6]:
# Root directory of the competition data on the local filesystem; the pandas
# CSV paths and the Spark data path are built from it by string concatenation,
# so the trailing slash is required.
base_data_path = '/home/ds/dev/data/Kagle-ValuesShoppers/'

### load spark

In [7]:
# Location of the Spark-side datasets, expressed as a file:// URI.
spark_data_path = 'file://'+ base_data_path + 'spark_data/'
# NOTE(review): not referenced again in the visible cells of this notebook.
transactions_name = 'transactions'

In [8]:
# Load the aggregated transaction history from CSV with the reader configured
# above (header row, inferred schema).
ddf_trans_agg_history = csv_reader.load(spark_data_path+'ddf_trans_agg_history', samplingRatio=None)

In [9]:
# Number of partitions of the loaded history DataFrame.
ddf_trans_agg_history.rdd.getNumPartitions()

400

In [10]:
# Row count of the history data; count() is an action, so this runs a Spark job
# over the whole dataset.
ddf_trans_agg_history.count()

105104330

In [11]:
# Rename `repeater_calc` to `repeater`, the label column name used by the offers
# data and by the RFormula defined later.
ddf_trans_agg_history = ddf_trans_agg_history.withColumnRenamed('repeater_calc', 'repeater')

In [12]:
# Column names of the history data after the rename.
ddf_trans_agg_history.columns

['customer_id',
 'category',
 'brand',
 'agg_customer_id_brand_customer_id_count',
 'agg_customer_id_brand_productsize_average',
 'agg_customer_id_brand_productsize_total',
 'agg_customer_id_brand_purchasequantity_average',
 'agg_customer_id_brand_purchasequantity_total',
 'agg_customer_id_brand_purchaseamount_average',
 'agg_customer_id_brand_purchaseamount_total',
 'agg_customer_id_category_customer_id_count',
 'agg_customer_id_category_productsize_average',
 'agg_customer_id_category_productsize_total',
 'agg_customer_id_category_purchasequantity_average',
 'agg_customer_id_category_purchasequantity_total',
 'agg_customer_id_category_purchaseamount_average',
 'agg_customer_id_category_purchaseamount_total',
 'repeater']

### load pandas

In [13]:
# Offer definitions, restricted to the columns used downstream.
df_coupons = pd.read_csv(os.path.join(base_data_path, 'offers'))[
    ['offer','category','company','brand','offervalue','quantity']
]

# Offer history for the labelled (train) and unlabelled (test / submission)
# customers; the `id` column is renamed to `customer_id` in both.
df_offers_ids = (
    pd.read_csv(os.path.join(base_data_path, 'trainHistory'))
    .rename(columns=dict(id='customer_id'))
)
df_offers_ids_submission = (
    pd.read_csv(os.path.join(base_data_path, 'testHistory'))
    .rename(columns=dict(id='customer_id'))
)

In [14]:
# Attach the offer attributes to each history row (inner join on `offer`) and
# fix the column order. The submission frame has the same layout minus the two
# target columns (`repeattrips`, `repeater`).
df_offers_all = df_offers_ids.merge(df_coupons, on='offer')[
    ['customer_id','chain','offer','market','category','company','brand',
     'offerdate','offervalue','quantity','repeattrips','repeater']
]

df_offers_all_submission = df_offers_ids_submission.merge(df_coupons, on='offer')[
    ['customer_id','chain','offer','market','category','company','brand',
     'offerdate','offervalue','quantity']
]

In [15]:
# Working folder for persisted variables and submissions. It is the same
# directory as `base_data_path`, so reuse that value instead of repeating the
# hard-coded absolute path in a second place.
main_folder = base_data_path
# Restore the pandas frames saved by an earlier notebook into this namespace
# (presumably a datasu.persist helper -- it prints what it loaded).
load_variables(path=main_folder+'working_data', variables=['df_offers__trans_aggs', 'df_offers_submission__trans_aggs']);


['df_offers_submission__trans_aggs.var', 'df_offers__trans_aggs.var']
loaded df_offers_submission__trans_aggs
loaded df_offers__trans_aggs


In [16]:
# Turn the restored pandas frames into Spark DataFrames and fill missing values
# with 0 (a numeric fill value only applies to numeric columns).
ddf_offers__trans_aggs = sqc.createDataFrame(df_offers__trans_aggs).fillna(0)
ddf_offers_submission__trans_aggs = sqc.createDataFrame(df_offers_submission__trans_aggs).fillna(0)
# Left disabled by the author:
# ddf_offers__trans_aggs = ddf_offers__trans_aggs.drop('repeattrips')

## prepare features

In [17]:
# Column groups used for feature preparation.
categorical_cols = ['chain','market','category','company','brand']
# Aggregate feature columns are the ones prefixed with 'agg_'. A plain prefix
# test replaces re.match("agg_*", c): that pattern means "agg" followed by zero
# or more underscores, i.e. it accepted any name starting with "agg", and it
# relied on `re` leaking in through a star import. list(...) keeps the result a
# real list on Python 3 as well.
agg_cols = list(filter(lambda c: c.startswith('agg_'), ddf_offers__trans_aggs.columns))
num_cols = ['offervalue','quantity']

# Convert the categorical columns of both offers frames to StringType
# (presumably a cast done by the datasu helper).
ddf_offers__trans_aggs = convert_columns_to_type(ddf_offers__trans_aggs, categorical_cols, StringType)
ddf_offers_submission__trans_aggs = convert_columns_to_type(ddf_offers_submission__trans_aggs, categorical_cols, StringType)

In [18]:
# Apply the same categorical type conversion to the history data.
# NOTE(review): categorical_cols also lists 'chain', 'market' and 'company',
# which are not among the history columns printed earlier -- confirm that
# convert_columns_to_type tolerates missing columns.
ddf_trans_agg_history = convert_columns_to_type(ddf_trans_agg_history, categorical_cols, StringType)
# Keep only the first character of `repeater`.
# NOTE(review): Spark's substring() position is 1-based; pos=0 presumably
# behaves like pos=1 here -- confirm.
ddf_trans_agg_history = ddf_trans_agg_history.withColumn('repeater', F.substring(ddf_trans_agg_history.repeater, 0,1))

# build pipes

In [None]:
# plus_categorical_expr = partial(plus_expr, expr='%s')
# categorical_cols_plus = plus_categorical_expr(categorical_cols)

# brand_interactions = "(%s):(%s)" % (plus_agg_columns_by_infix('brand',agg_cols), categorical_cols_plus)
# category_interactions = "(%s):(%s)" % (plus_agg_columns_by_infix('category',agg_cols), categorical_cols_plus)
# num_interactions = "(%s):(%s)" % (plus_expr(agg_cols), plus_expr(num_cols))

# R_expr = plus_expr([brand_interactions,category_interactions,num_interactions])
# R_expr

### build prepare pipe

In [19]:
# Formulas tried earlier, kept for reference:
# rf1 = RFormula(formula="repeater ~ . - repeattrips", featuresCol="features", labelCol="label")
# rf1 = RFormula(formula="repeater ~ agg_customer_id_brand_customer_id_count", featuresCol="features", labelCol="label")

# Current formula: interact `category` and `brand` with every other column,
# then remove the listed raw columns from the terms.
rf1 = RFormula(
    formula="repeater ~ category:. + brand:. -repeattrips -chain -offer -market -company -offerdate -offervalue -quantity",
    featuresCol="features",
    labelCol="label"
)

# Stage 1: raw columns -> (features, label).
pipe_transform1 = Pipeline(stages=[rf1])

# Stage 2: scale to unit standard deviation without centering.
std_scale = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
    withStd=True,
    withMean=False
)

# Degree-2 polynomial expansion; defined but not part of the active pipeline.
px = PolynomialExpansion(
    degree=2,
    inputCol="features_scaled",
    outputCol="features_poly"
)

# pipe_prepare1 = Pipeline(stages=[std_scale, px])
pipe_prepare1 = Pipeline(stages=[std_scale])

## Logistic regression

In [20]:
# Elastic-net regularised logistic regression on the scaled features.
lr1 = LogisticRegression(
    featuresCol="features_scaled",
    labelCol="label",
    predictionCol="prediction",
    regParam=0.1,
    elasticNetParam=0.3
)

# Further constructor arguments noted by the author, not applied here:
#   maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
#   thresholds=None, probabilityCol="probability",
#   rawPredictionCol="rawPrediction", standardization=True, weightCol=None

In [21]:
# Full modelling pipeline: the scaling pipeline followed by the logistic
# regression. It expects input that already has `features` and `label` columns.
pipe_lr1 = Pipeline(stages=[pipe_prepare1, lr1])

## Fit models

In [22]:
# The submission rows have no label; give them a constant placeholder so they
# share a schema with the training rows.
ddf_offers_submission__trans_aggs = ddf_offers_submission__trans_aggs.withColumn('repeater', F.lit('f'))

# Stack train and submission rows (submission columns reordered to match the
# training frame) and drop the customer id, so the feature transformer is fit
# on the rows of both sets.
ddf_offers_union__trans_aggs = (
    ddf_offers__trans_aggs
    .unionAll(ddf_offers_submission__trans_aggs.select(*ddf_offers__trans_aggs.columns))
    .drop('customer_id')
)

In [24]:
# Fit the RFormula pipeline on the union of train and submission offers.
model_transform2 = pipe_transform1.fit(ddf_offers_union__trans_aggs)

In [25]:
# Encode the labelled offers and keep only the model inputs.
ddf_data = (
    model_transform2
    .transform(ddf_offers__trans_aggs)
    .select('features', 'label')  # , 'repeattrips')
)

In [26]:
# 70/30 train/test split. A fixed seed makes the split -- and therefore every
# metric computed below -- reproducible across kernel restarts; the value
# itself is arbitrary.
ddf_data_train, ddf_data_test = ddf_data.randomSplit([0.7, 0.3], seed=42)


In [29]:
# Inspect one encoded training row (sparse feature vector plus label).
ddf_data_train.first()

Row(features=SparseVector(14369, {13: 1.0, 280: 1.0, 2154: 1.0, 3092: 1204821.0, 3549: 1.0, 4026: 1.0, 6081: 1.0, 7052: 1.5, 7072: 1.0, 7373: 1.0, 7670: 1.0, 9646: 1.0, 10317: 1204821.0, 10806: 1.0, 11233: 1.0, 13397: 1.0, 14079: 1.5, 14098: 1.0}), label=0.0)

In [None]:
# ddf_data_train.groupBy('label').count().collect()

In [30]:
# Fit scaling + logistic regression on the training split of the offers data.
model_lr2 = pipe_lr1.fit(ddf_data_train) #, params={'weightCol':"repeattrips"})


In [31]:
# Score the held-out split with the fitted pipeline.
# NOTE(review): pyspark param maps are keyed by Param objects (e.g.
# lr1.threshold); a plain string key such as 'threshold' may be silently
# ignored -- confirm that the 0.3 threshold actually takes effect.
ddf_data_test_res1 = model_lr2.transform(ddf_data_test, params={'threshold':0.3})



In [32]:
# Columns added by the scaler and the classifier.
ddf_data_test_res1.columns

['features',
 'label',
 'features_scaled',
 'rawPrediction',
 'probability',
 'prediction']

In [34]:
# Preview the first 10 scored rows.
ddf_data_test_res1.show(10)

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|label|     features_scaled|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|(14369,[1,39,575,...|  0.0|(14369,[1,39,575,...|[1.11769824504102...|[0.75356151497684...|       0.0|
|(14369,[1,39,552,...|  0.0|(14369,[1,39,552,...|[1.11769824504102...|[0.75356151497684...|       0.0|
|(14369,[5,118,111...|  1.0|(14369,[5,118,111...|[1.07549227353990...|[0.74563999050520...|       0.0|
|(14369,[5,118,108...|  0.0|(14369,[5,118,108...|[1.07549227353990...|[0.74563999050520...|       0.0|
|(14369,[1,39,572,...|  0.0|(14369,[1,39,572,...|[1.11769824504102...|[0.75356151497684...|       0.0|
|(14369,[1,39,546,...|  0.0|(14369,[1,39,546,...|[1.11769824504102...|[0.75356151497684...|       0.0|
|(14369,[1,39,549,...|  0.0|(14369,[1,39,549,...|[1.11769824504102...|[0.

In [33]:
# Collect the scored test rows to the driver as a pandas DataFrame.
# get_index_from_vector comes from a star import; it presumably builds a UDF that
# extracts one element of a vector column (here index 1 of `probability`) -- confirm.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# df_data_test_res1 may be undefined for the cells that follow.
df_data_test_res1 = ddf_data_test_res1.select(['rawPrediction', get_index_from_vector()('probability', F.lit(1)).alias('probability'), 'prediction', 'label']).toPandas()

KeyboardInterrupt: 

In [None]:
# Preview the collected predictions.
df_data_test_res1.head()

In [None]:
# Plot the AUC from the true labels and the extracted probability column
# (plot_auc comes from a star import, presumably datasu.auc).
plot_auc(df_data_test_res1.label, df_data_test_res1.probability.values)

### cross-validation

In [None]:
# Cross-validate the scaling + logistic-regression pipeline on the training
# split. The grid is empty, so a single (default) parameter map is evaluated.
grid = ParamGridBuilder().build()
# Uses the evaluator's default columns and metric.
evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=pipe_lr1, estimatorParamMaps=grid, evaluator=evaluator)

cvModel = cv.fit(ddf_data_train)
# Report the metric on the held-out split: scoring the rows the model was fit
# on (as was done before) gives an optimistic estimate.
evaluator.evaluate(cvModel.transform(ddf_data_test))

### Fit on the full transaction history (big data)

In [None]:
# The customer id is an identifier, not a feature; remove it before fitting.
ddf_trans_agg_history = ddf_trans_agg_history.drop('customer_id')

In [None]:
# Remaining columns of the history data.
ddf_trans_agg_history.columns

In [None]:
# Stack the history rows with the offers rows (train + submission union built
# earlier), selecting the offers columns in the history column order so both
# sides of unionAll line up.
ddf_trans_agg_history_union_offers = ddf_trans_agg_history.unionAll(ddf_offers_union__trans_aggs.select(*ddf_trans_agg_history.columns))
ddf_trans_agg_history_union_offers.columns

In [None]:
# Fit the RFormula pipeline on history + offers.
model_transform1 = pipe_transform1.fit(ddf_trans_agg_history_union_offers)

In [None]:
# Encode the history rows and keep only the model inputs.
# Note: this rebinds `ddf_data`, which previously held the encoded offers data.
ddf_data = (
    model_transform1
    .transform(ddf_trans_agg_history)
    .select('features', 'label')  # , 'repeattrips')
)

In [None]:
# Train on the whole encoded history; this rebinds `ddf_data_train`, replacing
# the 70% offers split created earlier.
ddf_data_train = ddf_data

In [None]:
# Use the labelled offers data, encoded with the history+offers transformer, as
# the evaluation set. Unlike `ddf_data`, no columns are dropped here.
ddf_data_test = model_transform1.transform(ddf_offers__trans_aggs)

In [None]:
# ddf_data_train, ddf_data_test = ddf_data.randomSplit([0.7, 0.3])

In [None]:
# Class balance of the `repeater` label over history + offers.
ddf_trans_agg_history_union_offers.groupBy('repeater').count().collect()

In [None]:
# Fit scaling + logistic regression on the encoded history data.
model_lr1 = pipe_lr1.fit(ddf_data_train) #, params={'weightCol':"repeattrips"})

In [None]:
# Score the labelled offers with the history-trained model.
# NOTE(review): pyspark param maps are keyed by Param objects (e.g.
# lr1.threshold); a plain string key such as 'threshold' may be silently
# ignored -- confirm that the 0.3 threshold actually takes effect.
ddf_data_test_res1 = model_lr1.transform(ddf_data_test, params={'threshold':0.3})

In [None]:
# Keep the prediction columns and the label, replacing the `probability` vector
# with the value at index 1 (get_index_from_vector comes from a star import and
# presumably builds the extracting UDF -- confirm).
# This rebinds ddf_data_test_res1 to its own projection, so re-running the cell
# operates on the already-reduced frame.
ddf_data_test_res1 = ddf_data_test_res1.select(['rawPrediction', get_index_from_vector()('probability', F.lit(1)).alias('probability'), 'prediction', 'label'])

In [None]:
# Collect all scored rows to the driver as a pandas DataFrame. The disabled line
# is the 10% sampled variant.
# df_data_test_res1 = ddf_data_test_res1.sample(False, fraction=0.1).toPandas()
df_data_test_res1 = ddf_data_test_res1.toPandas()

In [None]:
# Preview the collected predictions of the history-trained model.
df_data_test_res1.head()

In [None]:
# Plot the AUC of the history-trained model on the labelled offers.
plot_auc(df_data_test_res1.label, df_data_test_res1.probability.values)

### submission

In [None]:
# Encode the submission offers with model_transform2 (the transformer fit on the
# offers union only) and keep the id next to the features.
ddf_data_submission = model_transform2.transform(ddf_offers_submission__trans_aggs).select(['customer_id','features'])

In [None]:
# Score the submission rows.
# NOTE(review): this uses model_lr2 (fit on the 70% offers split), not model_lr1
# (fit on the history data), while the output file written later is named
# '..._LR1_bd' -- confirm which model is intended.
ddf_data_submission_res1 = model_lr2.transform(ddf_data_submission)

In [None]:
# Build the two-column submission frame (id, repeatProbability) and collect it
# to the driver; repeatProbability is the element at index 1 of `probability`.
df_data_submission_res1 = (
    ddf_data_submission_res1
    .select([
        F.col('customer_id').alias('id'),
        get_index_from_vector()('probability', F.lit(1)).alias('repeatProbability')
    ])
    .toPandas()
)

In [None]:
# Preview the submission rows before writing them out.
df_data_submission_res1.head()

In [None]:
# Write the submission as CSV under <main_folder>/submission/, without the
# pandas index column. The target file name carries no extension.
df_data_submission_res1.to_csv(path_or_buf=main_folder+'submission/'+'submission_spark18_LR1_bd', index=False)

## Other

In [None]:
# Scratch check of ParamGridBuilder: two base settings plus a 2x2 grid should
# expand to four parameter maps.
# Note: this rebinds `lr` in the notebook namespace.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
output = ParamGridBuilder() \
         .baseOn({lr.labelCol: 'l'}) \
         .baseOn([lr.predictionCol, 'p']) \
         .addGrid(lr.regParam, [1.0, 2.0]) \
         .addGrid(lr.maxIter, [1, 5]) \
         .build()

expected = [
         {lr.regParam: 1.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'},
         {lr.regParam: 2.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'},
         {lr.regParam: 1.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'},
         {lr.regParam: 2.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}]

# Both checks form the cell's final expression. Previously the length
# comparison was evaluated and discarded, so only the membership check was
# ever displayed.
len(output) == len(expected) and all(m in expected for m in output)


In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.linalg import Vectors

# Toy example of CrossValidator on a one-feature dataset: five (value, label)
# points repeated ten times.
# Note: this rebinds `lr`, `grid`, `evaluator`, `cv` and `cvModel`, which the
# cross-validation cell on the real data also uses.
dataset = sqlContext.createDataFrame(
    list(
        (Vectors.dense([value]), label)
        for value, label in [(0.0, 0.0), (0.4, 1.0), (0.5, 0.0), (0.6, 1.0), (1.0, 1.0)]
    ) * 10,
    ["features", "label"])

lr = LogisticRegression()
evaluator = BinaryClassificationEvaluator()
# Compare 0 versus 1 optimiser iterations.
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()

cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(dataset)
evaluator.evaluate(cvModel.transform(dataset))

### Experiments with RFormula

In [None]:
# Three-row toy frame for the RFormula experiments: numeric label `y`, numeric
# `x`, and two string categoricals `cat1` / `cat2`.
df = sqc.createDataFrame([
     (1.0, 1.0, "a","q"),
     (0.0, 2.0, "b","w"),
     (0.0, 3.0, "a","q"),    
 ], ["y", "x", "cat1","cat2"])
df.show()


In [None]:
def _fit_and_show(formula):
    """Fit an RFormula for `formula` on the toy `df`, show and return the result.

    Returns the (RFormula, transformed DataFrame) pair.
    """
    rformula = RFormula(formula=formula, featuresCol="features", labelCol="label")
    transformed = rformula.fit(df).transform(df)
    transformed.show()
    return rformula, transformed


# One categorical, two additive categoricals, a categorical:categorical
# interaction, and categorical:numeric interactions.
rf, df1 = _fit_and_show("y ~ cat1")
rf, df2 = _fit_and_show("y ~ cat1+cat2")
rf, df3 = _fit_and_show("y ~ cat1:cat2")
rf, df4 = _fit_and_show("y ~ cat1:x + cat2:x")

In [None]:
# Scale the single-categorical encoding (df1) to unit standard deviation without
# centering, and show the result next to the unscaled features.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

scalerModel = scaler.fit(df1)
scalerModel.transform(df1).show()

In [None]:
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors

# Two-row frame with label, weight and a one-dimensional feature vector (one
# dense, one empty sparse). This rebinds `df`, replacing the RFormula toy frame.
df = sc.parallelize([
     Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
     Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF()
df.show()

In [None]:

# `model` was never defined anywhere in the notebook, so this cell failed with
# NameError on a fresh run. Fit the current `lr` estimator on the two-row `df`
# built in the previous cell.
model = lr.fit(df)

# Score a single dense point. Only a cell's last expression is displayed, so the
# bare attribute lookups below produce no output.
test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF()
result = model.transform(test0).head()
result.prediction

result.probability

result.rawPrediction

# Score a single sparse point.
test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF()
model.transform(test1).head().prediction

# setParams is expected to reject positional arguments. Show the error message
# instead of leaving the notebook with a failing cell.
try:
    lr.setParams("vector")
except TypeError as err:
    print(err)