In [1]:
%matplotlib inline
import seaborn as sns

import findspark
import os
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark import sql
from pyspark import SparkConf
from pyspark import StorageLevel

from pyspark.sql import SQLContext, HiveContext

from pyspark.sql import Row
from pyspark.sql import DataFrameWriter
from pyspark.sql import DataFrameReader
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import GroupedData
from pyspark.sql.functions import UserDefinedFunction

from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation  import *


from functools import partial
import numpy as np

from datasu.auc import *
from datasu.dicts import *
from datasu.files import *
from datasu.pandas import *
from datasu.persist import *
from datasu.spark import *
from datasu.patsy import *


import pandas as pd



In [2]:
# Spark settings for this session, applied one by one in the order listed.
# NOTE(review): 'spark.executor.max' does not look like a documented Spark
# property (compare 'spark.cores.max' / 'spark.executor.cores') - confirm which
# setting was intended.
conf = SparkConf()
spark_settings = [
    ('spark.driver.memory', '12g'),
    ('spark.python.worker.memory', '2g'),
    ('spark.driver.maxResultSize', '5g'),
    ('spark.executor.max', 3),
    ('spark.executor.memory', '5g'),
    ('spark.cores.max', 28),
    ('spark.worker.cleanup.enabled', True),
    ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
    ('spark.kryoserializer.buffer.max', '196m'),
    ('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+UseCompressedOops'),
]
for setting_key, setting_value in spark_settings:
    conf.set(setting_key, setting_value)

conf.setAppName('mlp1')
# Last expression: display the effective configuration.
conf.getAll()

[(u'spark.master', u'spark://spark1.ea.lab:7077'),
 (u'spark.executor.max', u'3'),
 (u'spark.driver.memory', u'12g'),
 (u'spark.submit.pyFiles',
  u'/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.jars',
  u'file:/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,file:/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,file:/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.executor.memory', u'5g'),
 (u'spark.kryoserializer.buffer.max', u'196m'),
 (u'spark.driver.maxResultSize', u'5g'),
 (u'spark.files',
  u'file:/home/ds/.ivy2/jars/com.databricks_spark-csv_2.10-1.3.0.jar,file:/home/ds/.ivy2/jars/org.apache.commons_commons-csv-1.1.jar,file:/home/ds/.ivy2/jars/com.univocity_univocity-parsers-1.5.1.jar'),
 (u'spark.app.name', u'mlp1'),
 (u'spark.serializer', u'org.apache.spark.serializer.KryoSerializer'),

In [3]:
# Stop a SparkContext left over from an earlier run of this notebook so the
# next cell can create a fresh one. On a fresh kernel `sc` is not defined yet;
# that NameError is the only failure tolerated here - any other error raised
# by sc.stop() is allowed to surface instead of being silently swallowed.
try:
    sc.stop()
except NameError:
    print('spark context not exists')

spark context not exists


In [4]:
# Create the SparkContext from `conf` and wrap it in a SQLContext.
# `sqc` and `sqlContext` are two names for the same SQLContext object.
sc = pyspark.SparkContext(conf=conf)
sqlContext = sqc = pyspark.SQLContext(sc)

In [5]:
# Reusable reader for CSV files via the spark-csv data source: the first line
# is treated as a header and column types are inferred.
csv_reader = (
    sqc.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
)

## LOAD DATA


In [6]:
# Locations of the input data. `spark_data_path` is the same directory tree
# expressed as a file:// URI with a `spark_data/` suffix.
base_data_path = '/home/ds/dev/data/Kagle-ValuesShoppers/'
spark_data_path = 'file://{}spark_data/'.format(base_data_path)
transactions_name = 'transactions'

In [7]:
# Read the three small CSV files with pandas. The history files expose the
# customer key as `id`; it is renamed to `customer_id` on load.
coupon_columns = ['offer', 'category', 'company', 'brand', 'offervalue', 'quantity']
id_to_customer_id = {'id': 'customer_id'}

df_coupons = pd.read_csv(base_data_path + 'offers')[coupon_columns]
df_offers_ids = pd.read_csv(base_data_path + 'trainHistory').rename(columns=id_to_customer_id)
df_offers_ids_subm = pd.read_csv(base_data_path + 'testHistory').rename(columns=id_to_customer_id)
# df_trans_all = pd.read_csv(base_data_path+'transactions_reduced_category').rename(columns={'id': 'customer_id'})

In [8]:
# Attach the coupon attributes to each offer record (join on `offer`) and fix
# the column order. The training history additionally carries the outcome
# columns; the submission history does not.
offer_columns = ['customer_id', 'chain', 'offer', 'market', 'category', 'company',
                 'brand', 'offerdate', 'offervalue', 'quantity']
outcome_columns = ['repeattrips', 'repeater']

df_offers_hist = pd.merge(df_offers_ids, df_coupons, on=['offer'])[offer_columns + outcome_columns]
df_offers_subm = pd.merge(df_offers_ids_subm, df_coupons, on=['offer'])[offer_columns]

In [9]:
# Move both pandas frames into Spark. For the training history also derive
# `repeater_bool`, which is true exactly when `repeater` equals 't'.
ddf_offers_subm = sqc.createDataFrame(df_offers_subm)
ddf_offers_hist = sqc.createDataFrame(df_offers_hist).withColumn(
    'repeater_bool', F.col('repeater') == 't')

In [10]:
def _bool_to_double(flag):
    """Return 1.0 for a truthy value and 0.0 otherwise (None included)."""
    return 1.0 if flag else 0.0


# Spark UDF wrapper producing a DoubleType column.
bool_to_double = UserDefinedFunction(_bool_to_double, DoubleType(), 'bool_to_double')

In [11]:
# Add the numeric target column `label` (1.0 / 0.0) derived from `repeater_bool`.
ddf_offers_hist = ddf_offers_hist.withColumn('label', bool_to_double('repeater_bool'))

In [12]:
# Preview three rows to confirm `repeater_bool` and `label` were added.
ddf_offers_hist.show(3)

+-----------+-----+-------+------+--------+---------+-----+----------+----------+--------+-----------+--------+-------------+-----+
|customer_id|chain|  offer|market|category|  company|brand| offerdate|offervalue|quantity|repeattrips|repeater|repeater_bool|label|
+-----------+-----+-------+------+--------+---------+-----+----------+----------+--------+-----------+--------+-------------+-----+
|      86246|  205|1208251|    34|    2202|104460040| 3718|2013-04-24|       2.0|       1|          5|       t|         true|  1.0|
|   15753725|   17|1208251|     4|    2202|104460040| 3718|2013-04-24|       2.0|       1|          0|       f|        false|  0.0|
|   16535563|    4|1208251|     1|    2202|104460040| 3718|2013-04-27|       2.0|       1|          7|       t|         true|  1.0|
+-----------+-----+-------+------+--------+---------+-----+----------+----------+--------+-----------+--------+-------------+-----+
only showing top 3 rows



In [13]:
# Load the raw transactions file through the spark-csv reader.
# NOTE(review): `samplingRatio` is forwarded to the data source as a plain
# reader option - confirm that spark-csv actually honours it.
ddf_transactions = csv_reader.load(base_data_path+transactions_name, samplingRatio=0.02)
ddf_transactions.rdd.setName(transactions_name)

# The original cell also called `ddf_transactions.alias('transactions')` and
# threw the result away. DataFrames are immutable, so that statement never had
# any effect and has been removed rather than silently changing later cells.
ddf_transactions = ddf_transactions.withColumnRenamed('id','customer_id')

# Kept as the last expression so the partition count is actually displayed
# (mid-cell it was computed and discarded).
ddf_transactions.rdd.getNumPartitions()

In [14]:
# 1% / 99% random split of the transactions. A fixed seed (the same value the
# MLP below uses) makes the split repeatable across kernel restarts.
# Neither part is referenced later in the visible notebook: `ddf_trans` is set
# to the full frame in the next cell.
ddf_transactions_small, ddf_transactions_big = ddf_transactions.randomSplit([0.01,0.99], seed=1234)

In [15]:
# `ddf_trans` is the frame all feature engineering below reads from. It is the
# full `ddf_transactions` frame, not one of the random-split parts.
ddf_trans = ddf_transactions
# Label the underlying RDD; being the last expression, its repr is displayed.
ddf_trans.rdd.setName('ddf_trans')
# ddf_trans.cache()

ddf_trans MapPartitionsRDD[35] at javaToPython at NativeMethodAccessorImpl.java:-2

In [None]:
# Row count of the full transactions frame (runs a Spark job over the data).
ddf_trans.count()

## prepare features

#### aggregate transactions

In [16]:
# Name -> aggregate-function lookups.
# NOTE(review): neither dict is referenced again in the visible notebook; the
# partials in the next cell spell out their own `agg_funcs`.
summ_grouping = {'total':F.sum, 'average':F.avg }
count_grouping = {'count':F.count }

In [17]:
# Pre-configured aggregation builders; each is later called with the list of
# grouping columns as its single positional argument.
#   total_agg -> sum of `purchasequantity`
#   avg_agg   -> average of `productsize`
# `get_ddf_aggs` comes from the datasu helpers imported at the top.
_shared_agg_options = dict(prefix='agg_', cast_to='double')

total_agg = partial(
    get_ddf_aggs,
    agg_columns=['purchasequantity'],
    agg_funcs={'total': F.sum},
    **_shared_agg_options
)
avg_agg = partial(
    get_ddf_aggs,
    agg_columns=['productsize'],
    agg_funcs={'average': F.avg},
    **_shared_agg_options
)

In [18]:
# Per customer: total purchase quantity per `category`, pivoted into one
# sparse feature vector. The indexer used for the pivot is returned as well.
category_total_aggs = total_agg(['customer_id', 'category'])
ddf_category_pivot1, category_indexer = aggregate_and_pivot_into_vector(
    ddf_trans,
    id_column='customer_id',
    pivot_column='category',
    aggs=category_total_aggs,
    return_indexer=True
)

In [19]:
# Inspect one row: a `customer_id` plus a sparse `features` vector.
ddf_category_pivot1.first()

Row(customer_id=614738200, features=SparseVector(836, {0: 13.0, 1: 37.0, 2: 16.0, 3: 28.0, 4: 13.0, 5: 31.0, 6: 7.0, 7: 20.0, 9: 5.0, 10: 5.0, 11: 145.0, 12: 6.0, 13: 8.0, 14: 174.0, 15: 7.0, 16: 8.0, 17: 6.0, 20: 18.0, 21: 1.0, 22: 3.0, 23: 10.0, 24: 7.0, 25: 14.0, 26: 4.0, 29: 14.0, 30: 3.0, 31: 7.0, 32: 4.0, 33: 2.0, 35: 5.0, 36: 9.0, 37: 1.0, 38: 17.0, 39: 4.0, 40: 16.0, 41: 5.0, 42: 4.0, 43: 8.0, 44: 8.0, 45: 8.0, 46: 13.0, 47: 6.0, 48: 2.0, 49: 2.0, 50: 2.0, 51: 3.0, 52: 2.0, 53: 33.0, 54: 1.0, 55: 9.0, 56: 16.0, 57: 2.0, 58: 15.0, 59: 4.0, 61: 19.0, 63: 10.0, 65: 5.0, 67: 2.0, 68: 2.0, 69: 1.0, 70: 1.0, 71: 1.0, 73: 2.0, 74: 2.0, 76: 2.0, 79: 7.0, 80: 1.0, 85: 1.0, 88: 3.0, 89: 93.0, 90: 6.0, 91: 1.0, 92: 11.0, 93: 32.0, 94: 4.0, 96: 7.0, 97: 8.0, 98: 2.0, 99: 2.0, 100: 22.0, 102: 69.0, 103: 6.0, 104: 1.0, 105: 1.0, 106: 11.0, 107: 53.0, 108: 1.0, 111: 4.0, 112: 8.0, 116: 1.0, 117: 2.0, 118: 9.0, 119: 5.0, 121: 3.0, 126: 2.0, 127: 2.0, 130: 7.0, 134: 1.0, 135: 22.0, 137: 1.0, 13

In [20]:
# Per customer: average product size per `category`, pivoted into a vector.
# NOTE(review): this frame is not merged into the feature set used below.
category_avg_aggs = avg_agg(['customer_id', 'category'])
ddf_category_pivot2 = aggregate_and_pivot_into_vector(
    ddf_trans,
    id_column='customer_id',
    pivot_column='category',
    aggs=category_avg_aggs
)

In [21]:
# Per customer: total purchase quantity per `brand`, pivoted into one sparse
# feature vector. The indexer used for the pivot is returned as well.
brand_total_aggs = total_agg(['customer_id', 'brand'])
ddf_brand_pivot1, brand_indexer = aggregate_and_pivot_into_vector(
    ddf_trans,
    id_column='customer_id',
    pivot_column='brand',
    aggs=brand_total_aggs,
    return_indexer=True
)

In [22]:
# Preview two rows of the brand pivot (customer_id + sparse `features`).
ddf_brand_pivot1.show(2)

+-----------+--------------------+
|customer_id|            features|
+-----------+--------------------+
|  614738200|(35689,[0,1,2,3,4...|
| 4621031497|(35689,[0,1,2,3,4...|
+-----------+--------------------+
only showing top 2 rows



In [23]:
# Combine the category and brand vectors of each customer into a single
# `features` column, matching rows on `customer_id`.
ddf_pivot12 = merge_features(
    ddfs=[ddf_category_pivot1, ddf_brand_pivot1],
    join_column='customer_id',
    merge_column='features'
)

In [24]:
# Inspect one merged row. In the recorded outputs the merged vector has 36525
# entries, i.e. 836 (category) + 35689 (brand).
ddf_pivot12.first()

Row(customer_id=98468631, features=SparseVector(36525, {0: 15.0, 1: 23.0, 2: 35.0, 3: 9.0, 4: 57.0, 5: 5.0, 6: 4.0, 7: 37.0, 8: 5.0, 9: 1.0, 10: 2.0, 11: 16.0, 12: 11.0, 15: 7.0, 16: 4.0, 17: 4.0, 18: 5.0, 20: 33.0, 21: 2.0, 22: 6.0, 25: 2.0, 26: 2.0, 28: 3.0, 29: 4.0, 32: 4.0, 33: 4.0, 36: 3.0, 37: 1.0, 40: 13.0, 41: 7.0, 47: 1.0, 48: 9.0, 49: 2.0, 51: 1.0, 52: 1.0, 54: 43.0, 55: 5.0, 58: 8.0, 59: 1.0, 61: 2.0, 62: 1.0, 63: 2.0, 68: 16.0, 71: 2.0, 72: 36.0, 78: 1.0, 79: 1.0, 80: 3.0, 83: 8.0, 84: 1.0, 85: 2.0, 89: 152.0, 90: 17.0, 96: 16.0, 99: 3.0, 100: 6.0, 102: 4.0, 104: 1.0, 105: 2.0, 106: 13.0, 107: 3.0, 109: 17.0, 114: 1.0, 116: 2.0, 118: 4.0, 127: 1.0, 129: 2.0, 130: 1.0, 134: 5.0, 135: 1.0, 137: 2.0, 138: 12.0, 141: 1.0, 142: 4.0, 145: 1.0, 147: 3.0, 148: 2.0, 149: 6.0, 150: 3.0, 152: 5.0, 153: 1.0, 154: 1.0, 156: 3.0, 157: 2.0, 160: 7.0, 162: 6.0, 164: 4.0, 165: 6.0, 166: 1.0, 168: 7.0, 169: 1.0, 176: 3.0, 177: 3.0, 182: 1.0, 183: 1.0, 185: 4.0, 190: 2.0, 191: 3.0, 194: 1.0, 

#### Merge transaction features with offer history

In [25]:
# Re-display the offer history before joining it with the pivoted features.
ddf_offers_hist.show(2)

+-----------+-----+-------+------+--------+---------+-----+----------+----------+--------+-----------+--------+-------------+-----+
|customer_id|chain|  offer|market|category|  company|brand| offerdate|offervalue|quantity|repeattrips|repeater|repeater_bool|label|
+-----------+-----+-------+------+--------+---------+-----+----------+----------+--------+-----------+--------+-------------+-----+
|      86246|  205|1208251|    34|    2202|104460040| 3718|2013-04-24|       2.0|       1|          5|       t|         true|  1.0|
|   15753725|   17|1208251|     4|    2202|104460040| 3718|2013-04-24|       2.0|       1|          0|       f|        false|  0.0|
+-----------+-----+-------+------+--------+---------+-----+----------+----------+--------+-----------+--------+-------------+-----+
only showing top 2 rows



In [26]:
# ddf_offers_hist_indexed = category_indexer.transform(ddf_offers_hist)

In [27]:
# ddf_offers_hist_indexed = brand_indexer.transform(ddf_offers_hist_indexed)

In [28]:
# ddf_offers_hist_indexed.show(2)

In [29]:
# assembler = VectorAssembler(inputCols=['category_idx','brand_idx'], outputCol='features')
# ddf_offers_hist_indexed = assembler.transform(ddf_offers_hist_indexed)

In [30]:
# ddf_hist_features = merge_features(ddfs=[ddf_offers_hist_indexed, ddf_pivot12],                     
#                      join_column='customer_id', merge_column='features')

# Attach the per-customer feature vector to every offer-history row. The join
# is an inner join on `customer_id`, so offers whose customer has no row in
# `ddf_pivot12` are dropped.
ddf_hist_features = ddf_offers_hist.join(ddf_pivot12, on='customer_id', how='inner')

In [31]:
# Inspect one joined row: offer columns, `label`, and the `features` vector.
ddf_hist_features.first()

Row(customer_id=99642431, chain=15, offer=1197502, market=9, category=3203, company=106414464, brand=13474, offerdate=u'2013-03-25', offervalue=0.75, quantity=1, repeattrips=0, repeater=u'f', repeater_bool=False, label=0.0, features=SparseVector(36525, {0: 2.0, 1: 15.0, 2: 9.0, 3: 4.0, 4: 20.0, 5: 6.0, 6: 4.0, 7: 1.0, 8: 4.0, 9: 6.0, 10: 6.0, 11: 4.0, 14: 1.0, 16: 7.0, 19: 5.0, 23: 1.0, 24: 3.0, 25: 3.0, 26: 1.0, 27: 1.0, 28: 11.0, 29: 2.0, 30: 5.0, 31: 2.0, 39: 2.0, 41: 1.0, 42: 4.0, 43: 13.0, 45: 1.0, 47: 10.0, 49: 3.0, 51: 6.0, 53: 1.0, 55: 2.0, 56: 1.0, 58: 6.0, 59: 1.0, 61: 5.0, 63: 3.0, 64: 2.0, 67: 2.0, 68: 1.0, 69: 2.0, 77: 3.0, 78: 1.0, 81: 1.0, 87: 1.0, 89: 11.0, 95: 2.0, 98: 1.0, 104: 1.0, 105: 2.0, 107: 1.0, 109: 3.0, 110: 4.0, 111: 4.0, 112: 4.0, 113: 1.0, 119: 1.0, 124: 4.0, 131: 1.0, 135: 3.0, 137: 4.0, 138: 4.0, 139: 2.0, 140: 6.0, 144: 1.0, 145: 14.0, 149: 3.0, 152: 2.0, 157: 10.0, 160: 2.0, 161: 1.0, 162: 5.0, 164: 1.0, 170: 2.0, 173: 2.0, 176: 2.0, 181: 1.0, 184: 1.0

## Build pipelines

In [32]:
# Preparation pipeline: scale `features` to unit standard deviation without
# centering (withStd=True, withMean=False) and write `features_scaled`.
std_scale = StandardScaler(inputCol="features", outputCol="features_scaled", withStd=True, withMean=False)
pipe_prepare1 = Pipeline(stages=[std_scale])

In [33]:
# Logistic regression on the scaled features with elastic-net regularisation
# (regParam=0.01, elasticNetParam=0.3), wrapped in a single-stage pipeline.
# Note that it reads `features_scaled`, so its input must contain that column.
lr1 = LogisticRegression(featuresCol="features_scaled", labelCol="label", predictionCol="prediction", regParam=0.01, elasticNetParam=0.3)
pipe_lr1 = Pipeline(stages=[lr1])
# , 
#                          maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, 
#                          thresholds=None, probabilityCol="probability",  
#                          rawPredictionCol="rawPrediction", standardization=True, weightCol=None

In [34]:
# Random-forest alternative on the unscaled features.
# The original pointed labelCol at "repeater_str", a column that is never
# created anywhere in this notebook; every other estimator here trains on
# "label", so use that.
# NOTE(review): depending on the Spark version the label column may need class
# metadata (e.g. produced by StringIndexer) before a tree classifier accepts
# it - confirm before fitting this pipeline.
rf1 = RandomForestClassifier(labelCol="label", featuresCol="features", maxDepth=5, maxBins=50)

pipe_rf1 = Pipeline(stages=[rf1])

In [35]:
# Multilayer perceptron: the input layer width equals the size of the merged
# feature vector shown above (36525), followed by two hidden layers and a
# two-unit output layer for the binary label.
n_inputs, n_outputs = 36525, 2
layers = [n_inputs, 500, 100, n_outputs]

mlp1 = MultilayerPerceptronClassifier(
    labelCol="label",
    featuresCol="features_scaled",
    maxIter=100,
    layers=layers,
    blockSize=1280,
    seed=1234
)
pipe_mlp1 = Pipeline(stages=[mlp1])

In [36]:
# Choose which pipeline the "Fit models" section trains: here the MLP.
pipe = pipe_mlp1

## Fit models

In [37]:
# Fit the scaling pipeline on the joined offer-history features.
model_prepare1 = pipe_prepare1.fit(ddf_hist_features)

In [38]:
# Apply the fitted scaler and keep only the columns the models need.
ddf_data = (
    model_prepare1
    .transform(ddf_hist_features)
    .select('features', 'features_scaled', 'label')  # , 'repeattrips'
)

In [39]:
# 70% / 30% train/test split. A fixed seed (the same value `mlp1` uses) keeps
# the split, and therefore every metric below, repeatable across runs.
ddf_data_train, ddf_data_test = ddf_data.randomSplit([0.7, 0.3], seed=1234)


In [40]:
# Preview three training rows (raw vector, scaled vector, label).
ddf_data_train.show(3)

+--------------------+--------------------+-----+
|            features|     features_scaled|label|
+--------------------+--------------------+-----+
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  0.0|
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  0.0|
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  1.0|
+--------------------+--------------------+-----+
only showing top 3 rows



In [41]:
# Train the selected pipeline (`pipe`, set to the MLP above) on the training split.
model = pipe.fit(ddf_data_train)

In [42]:
# Number of rows in the training split.
ddf_data_train.count()

111776

In [49]:
# Mean of the 0/1 label, i.e. the share of positive examples in the training split.
ddf_data_train.agg(F.avg('label')).show()

+-------------------+
|         avg(label)|
+-------------------+
|0.27182042656742056|
+-------------------+



In [99]:
from numpy import random
from scipy import stats

In [142]:


# Random baseline predictor: ignores its argument and returns 1.0 with
# probability 0.27 and 0.0 with probability 0.73 (close to the label mean
# shown above). `random` is numpy's random module imported in the cell above.
# It is only referenced from the commented-out line in the next cell.
# NOTE(review): the draws are unseeded, so results would differ between runs.
randBern = F.UserDefinedFunction(lambda r: float(random.choice(a=[0.0, 1.0],p=[0.73, 0.27])) ,DoubleType())

In [155]:
# Baseline "model": predict 0.0 for every training row. The commented-out
# variant would use the random predictor `randBern` instead.
# r=ddf_data_train.withColumn('prediction', randBern(F.lit(1)))
r=ddf_data_train.withColumn('prediction', F.lit(0.0))

In [156]:
# Preview the baseline frame with its constant `prediction` column.
r.show(5)

+--------------------+--------------------+-----+----------+
|            features|     features_scaled|label|prediction|
+--------------------+--------------------+-----+----------+
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  0.0|       0.0|
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  0.0|       0.0|
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  1.0|       0.0|
|(36525,[1,2,3,4,5...|(36525,[1,2,3,4,5...|  0.0|       0.0|
|(36525,[0,1,2,3,4...|(36525,[0,1,2,3,4...|  0.0|       0.0|
+--------------------+--------------------+-----+----------+
only showing top 5 rows



In [157]:
# Sanity check: the mean of the constant baseline prediction.
r.agg(F.avg('prediction')).show()

+---------------+
|avg(prediction)|
+---------------+
|            0.0|
+---------------+



In [162]:
# evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC",rawPredictionCol="prediction", labelCol="label")

# Score the baseline predictions held in `r`.
# NOTE(review): in the recorded output both metrics print the same value,
# which equals 1 - avg(label); presumably these metric names are overall
# (accuracy-like) figures rather than per-class ones - confirm.
predictionAndLabels = r.select("prediction", "label")

for metric_name, caption in (("precision", "Precision:"), ("recall", "Recall:")):
    evaluator = MulticlassClassificationEvaluator(metricName=metric_name)
    print(caption + str(evaluator.evaluate(predictionAndLabels)))

Precision:0.728179573433
Recall:0.728179573433


In [124]:
# Score the held-out split with the fitted model. The original scored
# `ddf_data_train`, yet the result is stored as `df_data_test_res1` and plotted
# as an AUC curve; measuring on the data the model was trained on overstates
# its quality, so the 30% test split is used instead.
r = model.transform(ddf_data_test)

In [None]:
# Bring the scored rows to the driver as a pandas frame.
# `get_index_from_vector` comes from the datasu helpers; presumably it picks
# element 1 of the `probability` vector (the positive class) - confirm.
positive_probability = get_index_from_vector()('probability', F.lit(1)).alias('probability')
scored_columns = ['rawPrediction', positive_probability, 'prediction', 'label']
df_data_test_res1 = r.select(scored_columns).toPandas()

In [None]:
# Preview the scored rows.
df_data_test_res1.head()

In [None]:
# Plot the AUC curve of the positive-class probability against the true label
# (`plot_auc` comes from the datasu helpers).
plot_auc(df_data_test_res1.label, df_data_test_res1.probability.values)

In [None]:
# Stack training and submission offers into one frame (submission rows get a
# placeholder `repeater` of 'f' so the column sets line up), then drop the key.
# NOTE(review): `ddf_offers_submission__trans_aggs` and `ddf_offers__trans_aggs`
# are not defined anywhere in the visible notebook and this cell has no
# execution count - it will fail on a fresh kernel unless they are created first.
ddf_offers_submission__trans_aggs = ddf_offers_submission__trans_aggs.withColumn('repeater',F.lit('f'))
ddf_offers_union__trans_aggs = ddf_offers__trans_aggs.unionAll(
                            ddf_offers_submission__trans_aggs.select(*ddf_offers__trans_aggs.columns))

ddf_offers_union__trans_aggs = ddf_offers_union__trans_aggs.drop('customer_id')

In [None]:
# Fit the feature-transform pipeline on the union of training and submission rows.
# NOTE(review): `pipe_transform1` is not defined in the visible notebook.
model_transform2 = pipe_transform1.fit(ddf_offers_union__trans_aggs)

In [None]:
# Transform the training offers and keep only `features` and `label`.
# This rebinds `ddf_data`, replacing the frame built in the "Fit models" section.
ddf_data = model_transform2.transform(ddf_offers__trans_aggs)
ddf_data = ddf_data.select(['features','label']) #, 'repeattrips'])

In [None]:
# 70% / 30% train/test split of the re-derived `ddf_data`. A fixed seed (the
# same value `mlp1` uses) keeps the split repeatable across runs.
ddf_data_train, ddf_data_test = ddf_data.randomSplit([0.7, 0.3], seed=1234)


In [None]:
# ddf_data_train.groupBy('label').count().collect()

In [None]:
# Train the logistic-regression pipeline on the training split.
# NOTE(review): `lr1` reads `features_scaled`, but the `ddf_data` selected a few
# cells above keeps only `features` and `label` - confirm the transform
# pipeline provides the column `lr1` expects.
model_lr2 = pipe_lr1.fit(ddf_data_train) #, params={'weightCol':"repeattrips"})


In [None]:
# Score the test split with a decision threshold of 0.3.
# Param overrides must be keyed by Param objects; the original passed the
# plain string 'threshold', which does not address any parameter, so the
# override was never applied. Key it by the estimator's own Param instead.
ddf_data_test_res1 = model_lr2.transform(ddf_data_test, params={lr1.threshold: 0.3})

In [None]:
# List the columns the fitted pipeline added to the test frame.
ddf_data_test_res1.columns

In [None]:
# Bring the scored test rows to the driver as a pandas frame.
# `get_index_from_vector` comes from the datasu helpers; presumably it picks
# element 1 of the `probability` vector (the positive class) - confirm.
positive_probability = get_index_from_vector()('probability', F.lit(1)).alias('probability')
scored_columns = ['rawPrediction', positive_probability, 'prediction', 'label']
df_data_test_res1 = ddf_data_test_res1.select(scored_columns).toPandas()

In [None]:
# Preview the scored test rows.
df_data_test_res1.head()

In [None]:
# Plot the AUC curve for the logistic-regression scores on the test split.
plot_auc(df_data_test_res1.label, df_data_test_res1.probability.values)

### cross-validation

In [None]:
# Cross-validate the logistic-regression pipeline. No addGrid() calls are made,
# so no hyper-parameters are searched: cross-validation only re-fits
# `pipe_lr1` with its current settings.
grid = ParamGridBuilder().build()
evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=pipe_lr1, estimatorParamMaps=grid, evaluator=evaluator)

cvModel = cv.fit(ddf_data_train)
# Report the score on the held-out split. The original evaluated on
# `ddf_data_train`, the very data the model was fitted on, which overstates
# the result.
evaluator.evaluate(cvModel.transform(ddf_data_test))

### Fit on the full dataset

In [None]:
# Drop the customer key before modelling.
# NOTE(review): `ddf_trans_agg_history` is not defined in the visible notebook.
ddf_trans_agg_history = ddf_trans_agg_history.drop('customer_id')

In [None]:
# List the remaining columns.
ddf_trans_agg_history.columns

In [None]:
# Append the offer rows (training + submission union) to the history rows,
# projecting them onto the history frame's columns so the schemas match.
ddf_trans_agg_history_union_offers = ddf_trans_agg_history.unionAll(ddf_offers_union__trans_aggs.select(*ddf_trans_agg_history.columns))
ddf_trans_agg_history_union_offers.columns

In [None]:
# Fit the feature-transform pipeline on the combined history + offers frame.
# NOTE(review): `pipe_transform1` is not defined in the visible notebook.
model_transform1 = pipe_transform1.fit(ddf_trans_agg_history_union_offers)

In [None]:
# Transform the history rows and keep only `features` and `label`.
# This rebinds `ddf_data` once more.
ddf_data = model_transform1.transform(ddf_trans_agg_history)
ddf_data = ddf_data.select(['features','label']) #, 'repeattrips'])

In [None]:
# Train on all history rows (no random split in this section).
ddf_data_train = ddf_data

In [None]:
# Use the transformed offer rows as the test set.
ddf_data_test = model_transform1.transform(ddf_offers__trans_aggs)

In [None]:
# ddf_data_train, ddf_data_test = ddf_data.randomSplit([0.7, 0.3])

In [None]:
# Row counts per `repeater` value in the combined frame, collected to the driver.
ddf_trans_agg_history_union_offers.groupBy('repeater').count().collect()

In [None]:
# Train the logistic-regression pipeline on the full history set.
# NOTE(review): `lr1` reads `features_scaled`, but `ddf_data` above keeps only
# `features` and `label` - confirm the column exists before fitting.
model_lr1 = pipe_lr1.fit(ddf_data_train) #, params={'weightCol':"repeattrips"})

In [None]:
# Score the test set with a decision threshold of 0.3.
# Param overrides must be keyed by Param objects; the original passed the
# plain string 'threshold', which does not address any parameter, so the
# override was never applied. Key it by the estimator's own Param instead.
ddf_data_test_res1 = model_lr1.transform(ddf_data_test, params={lr1.threshold: 0.3})

In [None]:
# Reduce the scored frame to the columns needed for evaluation.
# `get_index_from_vector` comes from the datasu helpers; presumably it picks
# element 1 of the `probability` vector (the positive class) - confirm.
positive_probability = get_index_from_vector()('probability', F.lit(1)).alias('probability')
scored_columns = ['rawPrediction', positive_probability, 'prediction', 'label']
ddf_data_test_res1 = ddf_data_test_res1.select(scored_columns)

In [None]:
# Collect every scored row to the driver as a pandas frame. The commented-out
# variant would collect only a 10% sample instead.
# df_data_test_res1 = ddf_data_test_res1.sample(False, fraction=0.1).toPandas()
df_data_test_res1 = ddf_data_test_res1.toPandas()

In [None]:
# Preview the collected scores.
df_data_test_res1.head()

In [None]:
# Plot the AUC curve for the model trained on the full history set.
plot_auc(df_data_test_res1.label, df_data_test_res1.probability.values)

### submission

In [None]:
# Transform the submission offers, keeping the customer key and the features.
ddf_data_submission = model_transform2.transform(ddf_offers_submission__trans_aggs).select(['customer_id','features'])

In [None]:
# Score the submission rows with the logistic-regression model `model_lr2`.
ddf_data_submission_res1 = model_lr2.transform(ddf_data_submission)

In [None]:
# Build the submission table with columns `id` and `repeatProbability`.
# `get_index_from_vector` comes from the datasu helpers; presumably it picks
# element 1 of the `probability` vector (the positive class) - confirm.
submission_id = F.col('customer_id').alias('id')
repeat_probability = get_index_from_vector()('probability', F.lit(1)).alias('repeatProbability')
df_data_submission_res1 = ddf_data_submission_res1.select(
    [submission_id, repeat_probability]).toPandas()

In [None]:
# Preview the submission table.
df_data_submission_res1.head()

In [None]:
# Write the submission file without the pandas index.
# NOTE(review): `main_folder` is not defined anywhere in the visible notebook
# (the data directory above is called `base_data_path`) - confirm the intended
# output location before running.
df_data_submission_res1.to_csv(path_or_buf=main_folder+'submission/'+'submission_spark18_LR1_bd', index=False)

## Other

In [None]:
from pyspark.ml.classification import LogisticRegression
# ParamGridBuilder example: the two baseOn() calls pin labelCol and
# predictionCol, the two addGrid() calls span regParam x maxIter, so build()
# yields 2 * 2 = 4 param maps.
lr = LogisticRegression()
output = (
    ParamGridBuilder()
    .baseOn({lr.labelCol: 'l'})
    .baseOn([lr.predictionCol, 'p'])
    .addGrid(lr.regParam, [1.0, 2.0])
    .addGrid(lr.maxIter, [1, 5])
    .build()
)

expected = [
    {lr.regParam: 1.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'},
    {lr.regParam: 2.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'},
    {lr.regParam: 1.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'},
    {lr.regParam: 2.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'},
]

# The original left both checks as bare expressions: the first result was
# discarded and a mismatch could never fail the cell. Assert them instead.
assert len(output) == len(expected)
assert all([m in expected for m in output])


In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.linalg import Vectors
# Toy CrossValidator example on a 50-row, one-feature dataset (five distinct
# rows repeated ten times).
# NOTE: this cell rebinds the notebook-level names `lr`, `grid`, `evaluator`,
# `cv` and `cvModel` used in the cross-validation section above.
dataset = sqlContext.createDataFrame(
     [(Vectors.dense([0.0]), 0.0),
      (Vectors.dense([0.4]), 1.0),
      (Vectors.dense([0.5]), 0.0),
      (Vectors.dense([0.6]), 1.0),
      (Vectors.dense([1.0]), 1.0)] * 10,
     ["features", "label"])
lr = LogisticRegression()

# Two candidate settings for maxIter: 0 and 1.
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(dataset)
# Score of the selected model on the same toy dataset it was fitted on.
evaluator.evaluate(cvModel.transform(dataset))

### play with RFormula

In [None]:
# Three-row toy frame for the RFormula experiments: numeric `y` and `x`,
# string categoricals `cat1` and `cat2`.
toy_rows = [
    (1.0, 1.0, "a", "q"),
    (0.0, 2.0, "b", "w"),
    (0.0, 3.0, "a", "q"),
]
toy_columns = ["y", "x", "cat1", "cat2"]

df = sqc.createDataFrame(toy_rows, toy_columns)
df.show()


In [None]:
# Fit and display RFormula with four different formulas on the toy frame:
# a single categorical, two categoricals, their interaction, and the
# interaction of each categorical with the numeric column `x`.
formulas = [
    "y ~ cat1",
    "y ~ cat1+cat2",
    "y ~ cat1:cat2",
    "y ~ cat1:x + cat2:x",
]

rformula_outputs = []
for formula in formulas:
    rf = RFormula(formula=formula, featuresCol="features", labelCol="label")
    transformed = rf.fit(df).transform(df)
    transformed.show()
    rformula_outputs.append(transformed)

# Keep the individual results under their original names; `df1` is reused in
# the scaler cell below.
df1, df2, df3, df4 = rformula_outputs

In [None]:
# Scale the RFormula features of `df1` to unit standard deviation (no
# centering) and show the result with the added `scaledFeatures` column.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

scalerModel = scaler.fit(df1)
scalerModel.transform(df1).show()

In [None]:
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors

# Two-row toy frame with `label`, `weight` and a 1-dimensional `features`
# vector: one dense, one sparse with no stored entries.
# NOTE: this rebinds `df`, replacing the RFormula toy frame from above.
df = sc.parallelize([
     Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
     Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF()
df.show()

In [None]:

# Toy logistic-regression walk-through on the two-row `df` built in the
# previous cell. The original called the notebook-wide `model`, which is the
# MLP pipeline fitted on the 36525-dimensional `features_scaled` column and
# cannot score these 1-dimensional `features` rows. Fit a dedicated model here
# instead, under its own name so the real `model` is not overwritten.
toy_lr = LogisticRegression(maxIter=5, regParam=0.01)
toy_model = toy_lr.fit(df)

test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF()
result = toy_model.transform(test0).head()
# Printed explicitly: bare expressions in the middle of a cell are not shown.
print(result.prediction)
print(result.probability)
print(result.rawPrediction)

test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF()
print(toy_model.transform(test1).head().prediction)

# setParams() accepts keyword arguments only; a positional call raises
# TypeError. Catch and display it so the cell demonstrates the error without
# aborting a full notebook run.
try:
    toy_lr.setParams("vector")
except TypeError as err:
    print(err)