<https://www.kaggle.com/c/instacart-market-basket-analysis/data>

## 1. Read data

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook-wide Spark session. The hadoop-aws package is
# requested so that the S3 filesystem class referenced later in this notebook
# can be resolved.
spark = (
    SparkSession.builder
    .appName("Pyspark course")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")
    .getOrCreate()
)

In [2]:
import pyspark.sql.functions as F

In [3]:
import os
import configparser
# Profile (INI section) to read from the shared AWS credentials file.
aws_profile = "myaws"

# Parse ~/.aws/credentials and pull the key pair stored under that profile.
# ConfigParser.read() silently skips a missing file, so an absent file or
# profile only surfaces as an error from the .get() calls below.
credentials_path = os.path.expanduser("~/.aws/credentials")
config = configparser.ConfigParser()
config.read(credentials_path)
access_id = config.get(section=aws_profile, option="aws_access_key_id")
access_key = config.get(section=aws_profile, option="aws_secret_access_key")

In [4]:
# Register the s3n filesystem implementation and its credentials on the Hadoop
# configuration held by the running Spark context.
hadoop_conf = spark._jsc.hadoopConfiguration()
_s3n_settings = {
    "fs.s3n.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
    "fs.s3n.awsAccessKeyId": access_id,
    "fs.s3n.awsSecretAccessKey": access_key,
}
for _setting_name, _setting_value in _s3n_settings.items():
    hadoop_conf.set(_setting_name, _setting_value)

In [5]:
# Prior-order line items, read from the local CSV (first line is the header)
# and cached for reuse by the cells below.
# S3 alternative for the same file: "s3n://bartek-ml-course/order_products__prior.csv"
sdf = spark.read.csv("data/order_products__prior.csv", header=True).cache()

In [None]:
# Print the column names and types Spark assigned to the prior-order data.
# NOTE(review): the reader above passes neither a schema nor inferSchema, so
# the columns are presumably all strings — confirm against this output.
sdf.printSchema()

In [None]:
# Preview the first rows of the prior-order data (show() with default arguments).
sdf.show()

In [None]:
from pyspark.sql.types import StringType, IntegerType, StructType, StructField

# Explicit schema for the order_products__*.csv files: four integer columns.
# The last field was previously misspelled "reorderd"; it now matches the CSV
# header name "reordered", so applying this schema no longer renames the column.
# NOTE(review): this schema is not passed to any reader in the visible cells —
# pass it via spark.read.schema(schema_order_product) if typed columns are wanted.
schema_order_product = StructType([
    StructField("order_id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("add_to_cart_order", IntegerType()),
    StructField("reordered", IntegerType())
])

In [6]:
# Orders table, read from the local CSV (first line is the header) and cached.
# It is joined on order_id further down to obtain the user_id of each order.
# S3 alternative for the same file: "s3n://bartek-ml-course/orders.csv"
orders_sdf = spark.read.csv("data/orders.csv", header=True).cache()

In [None]:
# Print the column names and types Spark assigned to the orders table.
orders_sdf.printSchema()

## 2. Exploratory analysis

In [None]:
# Count how many rows each (order_id, product_id) pair has and display the
# pairs with the highest counts first.
pair_counts = sdf.groupBy("order_id", "product_id").agg(
    F.count("product_id").alias("times")
)
pair_counts.orderBy(F.col("times").desc()).show()

## 3. Create user-item list

In [7]:
from pyspark.sql import Window
def id2idx(id_sdf, id_col, idx_col):
    """Attach a contiguous, 1-based integer index to the ids in ``id_sdf``.

    Parameters
    ----------
    id_sdf : pyspark.sql.DataFrame
        DataFrame containing the column ``id_col``. Callers in this notebook
        pass a frame of distinct ids.
    id_col : str
        Name of the column holding the original identifiers.
    idx_col : str
        Name of the index column to add.

    Returns
    -------
    pyspark.sql.DataFrame
        ``id_sdf`` with an extra column ``idx_col`` holding the position of
        each id in the ordering of ``id_col``.

    Notes
    -----
    ``dense_rank`` is used instead of ``rank`` so that indices stay contiguous
    (1, 2, 3, ...) even if ``id_sdf`` contains repeated ids; for distinct ids
    both functions give the same result. Repeated ids receive the same index.

    The window has no ``partitionBy``, so Spark evaluates it over the whole
    DataFrame as a single group; this is fine for id lists but does not scale
    to very large inputs.
    """
    id_window = Window.orderBy(id_col)
    return id_sdf.withColumn(idx_col, F.dense_rank().over(id_window))

In [16]:
# Distinct product ids come from the prior-order lines, distinct user ids from
# the orders table.
all_product = sdf.select("product_id").distinct()
all_users = orders_sdf.select("user_id").distinct()

# Keep the first part of a 1.5% / 98.5% random split of the users as the
# working sample; the seed is fixed and the second part is discarded.
users_sample, _ = all_users.randomSplit(weights=[0.015, 0.985], seed=666)

In [17]:
# Number of users that ended up in the sample.
users_sample.count()

3149

In [18]:
# Lookup tables from the original ids to the integer indices that are used as
# the ALS user and item columns: sampled users only, but every product.
user_id_idx = id2idx(id_sdf=users_sample, id_col="user_id", idx_col="user_idx")
product_id_idx = id2idx(
    id_sdf=all_product, id_col="product_id", idx_col="product_idx"
)

In [19]:
## your code here
user_item_sdf = sdf.join(orders_sdf, on="order_id")\
    .select(F.col("user_id"), F.col("product_id"), F.lit(1).alias("rating"))\
    .join(user_id_idx, "user_id")\
    .join(product_id_idx, "product_id")\
    .cache()

## 4. Create ALS model

In [70]:
from pyspark.ml.recommendation import ALS

# ALS estimator in implicit-feedback mode with 5 latent factors, 20 iterations
# and a fixed seed. User and item columns are the integer index columns; the
# rating column is left at the estimator's default.
als = ALS(
    rank=5,
    maxIter=20,
    implicitPrefs=True,
    userCol="user_idx",
    itemCol="product_idx",
    seed=666,
)

In [71]:
# Fit the ALS estimator on the user-item table built in section 3.
als_model = als.fit(user_item_sdf)

## 5. Validate results

In [88]:
# Top recommendations for every user known to the model.
# NOTE(review): the variable is called rec5 but 10 items per user are requested;
# confirm which number is intended and align the name or the argument.
rec5 = als_model.recommendForAllUsers(10)

In [89]:
# Show the nested schema of the recommendation output.
rec5.printSchema()

root
 |-- user_idx: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- product_idx: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [91]:
# Map user_idx back to user_id and unnest the recommendation array into one
# row per (user, recommended item).
rec5_exploded = rec5.join(user_id_idx, on="user_idx").withColumn(
    "recommendation", F.explode("recommendations")
)

# Map product_idx back to product_id, drop the index column and cache the
# resulting (user_id, product_id) pairs.
rec5ids = (
    rec5_exploded
    .select("user_id", "recommendation.product_idx")
    .join(product_id_idx, on="product_idx")
    .drop("product_idx")
    .cache()
)

In [92]:
# Order lines of the "train" orders, read from the local CSV (first line is the
# header) and cached.
order_product_train_sdf = (
    spark.read
    .option("header", "true")
    .csv("data/order_products__train.csv")
    .cache()
)

# (user_id, product_id) pairs bought in those orders, flagged with last_buy = 1.
# Fixed: the join previously started from the undefined name
# `order_product_train`, which raises NameError on a fresh kernel; it now uses
# the DataFrame defined just above.
user_item_train_sdf = (
    order_product_train_sdf
    .join(orders_sdf, "order_id")
    .select(F.col("user_id"), F.col("product_id"), F.lit(1).alias("last_buy"))
    .cache()
)

In [93]:
# Left-join the recommendations with the train-order purchases: a recommended
# (user_id, product_id) pair that was bought gets last_buy = 1, while a pair
# with no match keeps a null last_buy.
rec5ids_val = rec5ids.join(user_item_train_sdf, on=["user_id", "product_id"], how="left")

In [94]:
# Count recommendation rows per last_buy value (1 = bought, null = not bought)
# and bring the small aggregate to pandas.
precision_df = rec5ids_val.groupBy("last_buy").count().toPandas()

In [95]:
# Share of recommendation rows in each last_buy group. The result is indexed by
# row position in precision_df, not by the last_buy value itself.
precision_df["count"] / precision_df["count"].sum()

0    0.948555
1    0.051445
Name: count, dtype: float64

## 6. Non-collaborative algorithm

In [96]:
# Preview the item factors learned by the model (an id column and a features
# vector per row, as the output below shows).
als_model.itemFactors.show()

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.010944766, -0...|
| 20|[-0.0060177636, 7...|
| 40|[-0.2809162, 0.41...|
| 60|[-0.0019884966, 0...|
| 70|[-0.09843452, 0.0...|
| 80|[-0.015406999, 0....|
|100|[5.365436E-4, 0.0...|
|110|[0.055715587, -2....|
|170|[-0.0025643965, -...|
|180|[-0.016432649, 0....|
|190|[-0.011314751, 0....|
|200|[0.0105908085, 0....|
|240|[-0.032621145, 0....|
|250|[-0.04655511, 0.0...|
|290|[0.084733084, -0....|
|380|[-0.16577977, 0.4...|
|420|[-0.0034182696, 0...|
|430|[0.0034633707, -0...|
|440|[7.157677E-4, -8....|
|450|[0.024876615, 0.0...|
+---+--------------------+
only showing top 20 rows

