## Setup

In [1]:
import os
# give googe drive the required permission
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Create a folder in your drive and add the name of that folder here.
# For example, for the code below to run correctly, you need to have a folder named FoodRecSys in 'My Drive'.
# The said folder will be your home directory for the rest of the project.
# You will be able to save and read data from the folder.

os.chdir("/content/drive/MyDrive/FoodRecSys/")
os.getcwd()

'/content/drive/MyDrive/FoodRecSys'

In [3]:
os.chdir("/content/drive/MyDrive/food_recsys_project/Code Files/")
os.getcwd()

FileNotFoundError: ignored

In [10]:
try:
  import pyspark
except:
  !pip install pyspark==3.1.2
  import pyspark

In [29]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import *

In [12]:
spark = SparkSession.builder.master("local").config('spark.ui.port', '4050').getOrCreate()

In [13]:
spark

## Imports

In [14]:
import pandas as pd
import numpy as np

from pyspark.sql import functions as F
# Import for typecasting columns
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType,StringType, ArrayType

## Read the data

In [15]:
raw_ratings_df = (spark.read.csv("raw_ratings_small.csv", # modify the path to read the data
                                 header=True,
                                 inferSchema= True))

In [17]:
raw_recipes_df = spark.read.csv("raw_recipies_small.csv", # modify the path to read the data
                                header=True,
                                inferSchema=True)

In [18]:
assert (raw_recipes_df.count(), len(raw_recipes_df.columns)) == (20340, 13)
assert (raw_ratings_df.count(), len(raw_ratings_df.columns)) == (93357, 5)

#### Decide a split date based on the ratings dataframe.

In [19]:
# Find the number of data points in the interaction dataset.
# You can use the count() method.
# The output must be an integer.

num_review_int = raw_ratings_df.count()

#### Task 01 - Train Test Split

Divide the data into train and test based on the 80 - 20 split using the approach discussed. You will have to save the data in a parquet file.

In [20]:
test_num_reviews_int = round(num_review_int *0.2)

In [21]:
# Sort the interactions dataset in descending order of review date.
# Extract ```test_num_reviews_int``` most recent reviews.

temp_ratings_df = (raw_ratings_df.sort("review_date", ascending=False)
                                 .limit(test_num_reviews_int)
                  )

In [22]:
assert temp_ratings_df.count()  == 18671
assert raw_recipes_df.collect()[11][4] <= raw_recipes_df.collect()[10][4]

In [23]:
temp_ratings_df.collect()[-1][4]

'2011-07-17'

Split the data into two parts before and after 2011-07-17.

- All reviews in the ratings data after 2011-07-17 will not exsist in the training set.
- For all future predictions the date will be set at 2011-07-18.   

In [26]:
# Join raw_recipes and raw_ratings
# Use recipe_id as the key to join these dataframes
# The resulting dataframe must have all rows from the raw_ratings dataframe.

interaction_level_df = raw_ratings_df.join(raw_recipes_df.withColumnRenamed('id','recipe_id'),['recipe_id'],'left')

In [30]:
# Use the filter command to separate the datasets.
# All interactions which were rated BEFORE '2011-07-17' will be train data.

train_interaction_level_df  = (interaction_level_df.filter(col("review_date") < '2011-07-17'))

In [31]:
# Use the filter command to separate the datasets.
# All interactions which were rated ON OR AFTER '2011-07-17' will be test data.

test_interaction_level_all_recipies_df  = (interaction_level_df.filter(col("review_date") > '2011-07-17'))

In [35]:
assert (test_interaction_level_all_recipies_df.count(), len(test_interaction_level_all_recipies_df.columns)) == (18660, 17)
assert (train_interaction_level_df.count(), len(train_interaction_level_df.columns)) == (74673, 17)

In [36]:
# create data files for modeling

(train_interaction_level_df.coalesce(1)
                           .write.mode('overwrite')
                           .parquet('train_interaction_level_df.parquet'))  # change the file name and file path

(test_interaction_level_all_recipies_df.coalesce(1)
                                       .write.mode('overwrite')
                                       .parquet('test_interaction_level_df.parquet'))  # change the file name and file path