## Imports

In [1]:
# Report the machine's memory usage in human-readable units before any heavy work starts.
!free -h

             total       used       free     shared    buffers     cached
Mem:          7.8G       3.4G       4.4G        64K       1.8G       772M
-/+ buffers/cache:       843M       7.0G
Swap:           0B         0B         0B


In [2]:
!pip install --upgrade pip
!pip install sagemaker_pyspark
!pip install pyspark
!pip install gdown



# Session setup

## Current user's data retrieval

In [3]:
import botocore.session

session = botocore.session.get_session()
# Looks up the AWS credentials available to this session; the access and secret
# keys are read later when the Spark session is configured.
credentials = session.get_credentials()
# get_credentials() returns None when no credentials can be found: fail here with
# a clear message rather than with an AttributeError when the keys are first read.
if credentials is None:
    raise RuntimeError("No AWS credentials found for the current session")

## S3 setup

In [4]:
# Creates the boto3 S3 client shared by the rest of the notebook
import boto3

s3 = boto3.client('s3')
# Name of the bucket the datasets and the trained model are uploaded to / read from
bucket = "cristo-test"

## PySpark setup

In [5]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import sagemaker_pyspark

# Put the jars shipped with sagemaker_pyspark on the driver's classpath.
sagemaker_jars = ":".join(sagemaker_pyspark.classpath_jars())
conf = SparkConf().set("spark.driver.extraClassPath", sagemaker_jars)

# Build (or reuse) the Spark session, forwarding the AWS keys as S3A settings.
builder = SparkSession.builder.config(conf=conf)
builder = builder.config('fs.s3a.access.key', credentials.access_key)
builder = builder.config('fs.s3a.secret.key', credentials.secret_key)
spark = builder.appName("recommender-system").getOrCreate()

## Utility functions

### S3 functions

These functions are a bridge to S3 using the `boto3` module

In [6]:
from typing import Union, List
import os
import requests

def list_files(client, bucket: str):
    """Return the keys of every object stored in an S3 bucket.

    Args:
        client: boto3 S3 client (anything exposing a compatible ``list_objects``).
        bucket: name of the bucket to list.

    Returns:
        A list with the key of each object; empty when the bucket has no objects.
    """
    filenames = []
    request = {"Bucket": bucket, "Prefix": ""}
    while True:
        response = client.list_objects(**request)
        # "Contents" is missing from the response when nothing matches
        contents = response.get("Contents", [])
        filenames.extend(entry["Key"] for entry in contents)
        if not response.get("IsTruncated") or not contents:
            break
        # a single response holds a limited number of keys: resume after the last
        # key seen (NextMarker is only returned when a delimiter is specified)
        request["Marker"] = response.get("NextMarker", contents[-1]["Key"])
    return filenames

def get_object(client, bucket: str, filepath: str) -> bytes:
    """Download an S3 object and return its whole content.

    Args:
        client: boto3 S3 client (anything exposing a compatible ``get_object``).
        bucket: name of the bucket holding the object.
        filepath: key of the object inside the bucket.

    Returns:
        The raw content read from the response body, as bytes (not decoded).
    """
    response = client.get_object(Bucket=bucket, Key=filepath)
    return response["Body"].read()

def load_dataframe(client, bucket: str, filepaths: Union[str, List[str]]):
    """Download one or more delimited files from S3 and load them into one dataframe.

    Each file is saved in the current working directory under its base name and
    is downloaded only if a file with that name is not already there. The files
    are then read through the notebook-level ``spark`` session.

    Args:
        client: boto3 S3 client used for the downloads.
        bucket: name of the bucket holding the files.
        filepaths: a single S3 key or a list of S3 keys; the extension of the
            first one (``.csv`` or ``.tsv``) selects the separator for all of them.

    Returns:
        The dataframe produced by ``spark.read.csv`` with ``header=True`` and
        ``inferSchema=True``.

    Raises:
        ValueError: if ``filepaths`` is empty.
        Exception: if the extension is neither ``.csv`` nor ``.tsv``.
    """
    # wraps a single filepath into a list, if needed
    if isinstance(filepaths, str):
        filepaths = [filepaths]
    if not filepaths:
        raise ValueError("No filepaths provided")
    # checks if the extension is valid before downloading anything
    extension = os.path.splitext(filepaths[0])[-1]
    separators = {".csv": ",", ".tsv": "\t"}
    if extension not in separators:
        raise Exception(f"Unrecognized extension '{extension}'")
    # downloads the datasets from S3, skipping the ones already available locally
    local_paths = []
    for filepath in filepaths:
        local_path = os.path.basename(filepath)
        if not os.path.exists(local_path):
            print(f"Downloading {filepath} to {local_path}")
            # uses the client received as argument, not the notebook-level one
            client.download_file(bucket, filepath, local_path)
        local_paths.append(local_path)
    # loads the dataframe
    df = spark.read.csv(local_paths, header=True, inferSchema=True, sep=separators[extension])
    return df

# Data downloading

Since the datasets are hosted on Google Drive, this code:

- downloads the compressed files (`.csv.gz`) to this notebook's space
- decompresses the previously downloaded files (`.csv.gz` $\rightarrow$ `.csv`)
- uploads the files to an S3 bucket

In [7]:
import re
import gdown
import gzip
import shutil

In [8]:
# (archive file name, Google Drive file id) of each monthly dataset
datasets_ids = [
    ("Dec.csv.gz", "1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P"),
    ("Jan.csv.gz", "1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT"),
    ("Feb.csv.gz", "1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB"),
    ("Mar.csv.gz", "1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8"),
    ("Apr.csv.gz", "1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL")
]

# lists the bucket once: the dataset names are all different, so an upload made
# in one iteration never changes the check done in a later one
files_on_s3 = set(list_files(client=s3, bucket=bucket))

# downloads the unprocessed datasets
for dataset_name, dataset_id in datasets_ids:
    # name of the decompressed file, which is also its key on S3
    csv_name = dataset_name.replace(".gz", "")

    # check if the file is already on S3
    if csv_name in files_on_s3:
        print(f"{csv_name} already on S3")
        continue

    # downloads the data (not needed if the archive or its extraction is already here)
    if os.path.exists(dataset_name) or os.path.exists(csv_name):
        print(f"{dataset_name} already downloaded")
    else:
        print(f"Downloading {dataset_name}...")
        gdown.download(f"https://drive.google.com/uc?id={dataset_id}", dataset_name, quiet=False)

    # extracts the archives
    if os.path.exists(csv_name):
        print(f"{dataset_name} already extracted")
    else:
        print(f"Extracting {dataset_name} to {csv_name}...")
        with gzip.open(dataset_name, 'rb') as fp_in:
            with open(csv_name, 'wb') as fp_out:
                shutil.copyfileobj(fp_in, fp_out)

    # uploads to S3
    print(f"Uploading {csv_name} to S3...")
    with open(csv_name, "rb") as fp:
        s3.upload_fileobj(fp, bucket, csv_name)
    print(f"Successfully Uploaded {csv_name} to S3")

# keeps only the keys that really end in ".csv"; the previous pattern r".*.csv"
# had an unescaped dot and no end anchor, so it also accepted keys like "x.csv.gz"
csvs = [filename for filename in list_files(client=s3, bucket=bucket) if filename.endswith(".csv")]
# df_unprocessed = load_dataframe(client=s3, bucket=bucket, filepaths=csvs).select("event_time", "user_id", "event_type", "product_id")
# df_unprocessed.printSchema()
# df_unprocessed.show(4)

Dec.csv already on S3
Jan.csv already on S3
Feb.csv already on S3
Mar.csv already on S3
Apr.csv already on S3


# Data preprocessing

We now have the datasets uploaded to S3

This code will do the following:

- downloads the `.csv` datasets from S3 to this notebook's space
- loads the datasets into a PySpark dataframe
- preprocesses the dataframe to:
    - transform the implicit feedback into explicit ratings
    - remap ids to small integers, since the recommender system does not accept the original id values

In [9]:
from cc_project import datasets

# Load the already preprocessed dataset (a TSV file kept on S3) and inspect it.
df_processed = load_dataframe(client=s3, bucket=bucket, filepaths="preprocessed_dataset.tsv")
df_processed.printSchema()
row_count = df_processed.count()
print(f"|df_processed| = {row_count}")
df_processed.show(4)

root
 |-- event_time: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)

|df_processed| = 1048574
+--------------------+----------+---------+------+
|          event_time|product_id|  user_id|rating|
+--------------------+----------+---------+------+
|2019-12-01 00:00:...|  25399270|406166684|     3|
|2019-12-01 00:00:...|      3761|440989572|     0|
|2019-12-01 00:01:...|      3855|428825782|     2|
|2019-12-01 00:01:...|      3860|432624845|    18|
+--------------------+----------+---------+------+
only showing top 4 rows



# Recommender system

## Training and test splits

In [10]:
# 80% / 20% random split; the seed is fixed so that re-running the notebook
# yields the same training and test sets
df_train, df_test = df_processed.randomSplit([0.8, 0.2], seed=42)

# prints the size and the first rows of each split
for df_name, df in [("df_train", df_train),
                    ("df_test", df_test)]:
    print(f"|{df_name}| = {df.count()}")
    df.show(4)

|df_train| = 838987
+--------------------+----------+---------+------+
|          event_time|product_id|  user_id|rating|
+--------------------+----------+---------+------+
|2019-12-01 00:00:...|  25399270|406166684|     3|
|2019-12-01 00:00:...|      3761|440989572|     0|
|2019-12-01 00:01:...|      3855|428825782|     2|
|2019-12-01 00:01:...|      3860|432624845|    18|
+--------------------+----------+---------+------+
only showing top 4 rows

|df_test| = 209587
+--------------------+----------+---------+------+
|          event_time|product_id|  user_id|rating|
+--------------------+----------+---------+------+
|2019-12-01 00:02:...|  16200027|438982266|     3|
|2019-12-01 00:06:...|      3681|411869891|     3|
|2019-12-01 00:07:...|  25401814|385698033|     3|
|2019-12-01 00:11:...|      2338|383650265|     2|
+--------------------+----------+---------+------+
only showing top 4 rows



## Training

In [11]:
from cc_project import recommender_system

In [13]:
# Train the recommender with the project helper, with tuning and logging enabled.
model = recommender_system.train_recommender_system(
    df_train=df_train,
    df_test=df_test,
    tuning=True,
    logs=True,
)

# Preview a few test-set predictions, leaving out the rows that contain nulls.
predictions = model.transform(df_test)
predictions.na.drop().show(4)

	training model 1/12
		found best model with RMSE=4.933335635127373:
			rank=10	reg_param=1
	training model 2/12
	training model 3/12
	training model 4/12
	training model 5/12
	training model 6/12
	training model 7/12
	training model 8/12
	training model 9/12
	training model 10/12
	training model 11/12
	training model 12/12
Training time: 310s
+--------------------+----------+---------+------+----------+
|          event_time|product_id|  user_id|rating|prediction|
+--------------------+----------+---------+------+----------+
|2019-12-25 15:56:...|      3749|442302775|     7| 3.1553183|
|2019-12-02 08:45:...|      3749|425489638|     5| 2.3343449|
|2019-12-25 07:35:...|      3749|439871079|     8|  6.504874|
|2019-12-11 10:39:...|      3749|416292228|     2|  4.159937|
+--------------------+----------+---------+------+----------+
only showing top 4 rows



### Saving the model

In [14]:
# writes the model archive to the current folder
model_path = recommender_system.save_model_zip(model=model, model_name="model", output_folder=".", logs=False)

# the S3 key is the archive's file name, without its folder
model_key = model_path.split("/")[-1]

# uploads the archive to S3
with open(model_path, "rb") as fp:
    s3.upload_fileobj(fp, bucket, model_key)
print(f"Model saved on S3 to {model_key}")

Model saved on S3 to model.zip


## Evaluation

In [15]:
# Evaluates the trained model on the test split with the project helper (logging enabled);
# presumably the returned value is the root-mean-square error - TODO confirm against cc_project
rmse = recommender_system.evaluate_recommender_system(df=df_test, model=model, logs=True)

Starting evaluation of ALS model
Evaluation time: 8s
Root-mean-square error: 4.933335635127373
