In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType, IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Data preparation

In [0]:
recipes = spark.read.option("sep", ",").option("quote", "\"").option("escape", "\"").option("header", "true").option("multiline", "true").csv("wasb://workfront@buyingcirclestorage.blob.core.windows.net/recipe/recipes.csv")
reviews = spark.read.option("sep", ",").option("quote", "\"").option("escape", "\"").option("header", "true").option("multiline", "true").csv("wasb://workfront@buyingcirclestorage.blob.core.windows.net/recipe/reviews.csv")

In [0]:
recipes.createOrReplaceTempView("recipes")
reviews.createOrReplaceTempView("reviews")

In [0]:
%sql
select *
from recipes
limit 100

RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen Dessert recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/YUeirxMLQaeE1h3v3qnM_229%20berry%20blue%20frzn%20dess.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/AFPDDHATWzQ0b1CDpDAT_255%20berry%20blue%20frzn%20dess.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/UYgf9nwMT2SGGJCuzILO_228%20berry%20blue%20frzn%20dess.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/PeBMJN2TGSaYks2759BA_20140722_202142.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/picuaETeN.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/38/pictzvxW5.jpg"")",Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"", ""Healthy"", ""Free Of..."", ""Summer"", ""Weeknight"", ""Freezer"", ""Easy"")","c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla yogurt"", ""lemon juice"")",4.5,4.0,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stand for 45 minutes, stirring occasionally."", ""Transfer berry-sugar mixture to food processor."", ""Add yogurt and process until smooth."", ""Strain through fine sieve. Pour into baking pan (or transfer to ice cream maker and process according to manufacturers' directions). Freeze uncovered until edges are solid but centre is soft. Transfer to processor and blend until smooth again."", ""Return to pan and freeze until edges are solid."", ""Transfer to processor and blend until smooth again."", ""Fold in remaining 2 cups of blueberries."", ""Pour into plastic mold and freeze overnight. Let soften slightly to serve."")"
39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/39/picM9Mhnw.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/39/picHv4Ocr.jpg"")",Chicken Breast,"c(""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"", ""Meat"", ""Asian"", ""Indian"", ""Weeknight"", ""Stove Top"")","c(""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ""1"", ""1"", ""1/4"", ""1/4"", ""1/2"", ""1/4"", ""2"", ""3"", NA, ""2"", ""1"", ""1"", ""8"", ""2"", ""1/3"", ""1/3"", ""1/3"", ""6"")","c(""saffron"", ""milk"", ""hot green chili peppers"", ""onions"", ""garlic"", ""clove"", ""peppercorns"", ""cardamom seed"", ""cumin seed"", ""poppy seed"", ""mace"", ""cilantro"", ""mint leaf"", ""fresh lemon juice"", ""plain yogurt"", ""boneless chicken"", ""salt"", ""ghee"", ""onion"", ""tomatoes"", ""basmati rice"", ""long-grain rice"", ""raisins"", ""cashews"", ""eggs"")",3.0,1.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and puree in blender."", ""Add chiles, onions, ginger, garlic, cloves, peppercorns, cardamom seeds, cinnamon, coriander and cumin seeds, poppy seeds, nutmeg, mace, cilantro or mint leaves and lemon juice. Blend into smooth paste. Put paste into large bowl, add yogurt and mix well."", ""Marinate chicken in yogurt mixture with salt, covered for at least 2 - 6 hours in refrigerator."", ""In skillet. heat oil over medium heat for 1 minute. Add ghee and 15 seconds later add onion and fry for about8 minutes."", ""Reserve for garnish."", ""In same skillet, cook chicken with its marinade with tomatoes for about 10 minutes over medium heat, uncovered."", ""Remove chicken pieces from the sauce and set aside. Add rice to sauce, bring to boil, and cook, covered over low heat for 15 minutes."", ""Return chicken and add raisins, cashews and almonds; mix well."", ""Simmer, covered for 5 minutes."", ""Place chicken, eggs and rice in large serving dish in such a way that yellow of the eggs, the saffron-colored rice, the nuts and the chicken make a colorful display."", ""Add reserved onion as garnish."")"
40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,"This is from one of my first Good House Keeping cookbooks. You must use a *zester* in order to avoid getting any of that bitter rind, and when you zest the lemons, zest them onto some sugar from the recipe (the sugar will 'catch' all of the oils). I also advise you from personal experience to use only the best skinned lemons for the best flavor.","c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/40/picJ4Sz3N.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/40/pic23FWio.jpg"")",Beverages,"c(""Low Protein"", ""Low Cholesterol"", ""Healthy"", ""Summer"", ""< 60 Mins"")","c(""1 1/2"", ""1"", NA, ""1 1/2"", NA, ""3/4"")","c(""sugar"", ""lemons, rind of"", ""lemon, zest of"", ""fresh water"", ""fresh lemon juice"")",4.5,10.0,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, put sugar and lemon peel, or zest; add 1 1/2 cups very hot water (not from tap!). With lid fitted firmly, shake jar until sugar is dissolved."", ""Add lemon juice. Refrigerate until chilled."", ""To Serve: Into each 12-ounce glass, over ice cubes, pour 1/4 cup of the lemon syrup."", ""Then add chilled club soda or, if you prefer, water."", ""Stir to mix well."")"
41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to allow the ingredients to soak in the marinade overnight.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/41/picmbLig8.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/41/picL02w0s.jpg"")",Soy/Tofu,"c(""Beans"", ""Vegetable"", ""Low Cholesterol"", ""Weeknight"", ""Broil/Grill"", ""Oven"")","c(""12"", ""1"", ""2"", ""1"", ""10"", ""1"", ""3"", ""2"", ""2"", ""2"", ""1"", ""2"", ""1/2"", ""1/4"", ""4"")","c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""mushrooms"", ""soy sauce"", ""low sodium soy sauce"", ""olive oil"", ""maple syrup"", ""honey"", ""red wine vinegar"", ""lemon juice"", ""garlic cloves"", ""mustard powder"", ""black pepper"")",4.5,2.0,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out excess water, and pat dry with paper towels."", ""Cut tofu into one-inch squares."", ""Set aside. Cut eggplant lengthwise in half, then cut each half into approximately three strips."", ""Cut strips crosswise into one-inch cubes."", ""Slice zucchini into half-inch thick slices."", ""Cut red pepper in half, removing stem and seeds, and cut each half into one-inch squares."", ""Wipe mushrooms clean with a moist paper towel and remove stems."", ""Thread tofu and vegetables on to barbecue skewers in alternating color combinations: For example, first a piece of eggplant, then a slice of tofu, then zucchini, then red pepper, baby corn and mushrooms."", ""Continue in this way until all skewers are full."", ""Make the marinade by putting all ingredients in a blender, and blend on high speed for about one minute until mixed."", ""Alternatively, put all ingredients in a glass jar, cover tightly with the lid and shake well until mixed."", ""Lay the kebabs in a long, shallow baking pan or on a non-metal tray, making sure they lie flat. Evenly pour the marinade over the kebabs, turning them once so that the tofu and vegetables are coated."", ""Refrigerate the kebabs for three to eight hours, occasionally spooning the marinade over them."", ""Broil or grill the kebabs at 450 F for 15-20 minutes, or on the grill, until the vegetables are browned."", ""Suggestions This meal can be served over cooked, brown rice. Amounts can easily be doubled to make four servings."")"
42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from Food.com.,"""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/42/picVEMxk8.jpg""",Vegetable,"c(""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""Healthy"", ""Winter"", ""< 60 Mins"", ""Easy"")","c(""46"", ""4"", ""1"", ""2"", ""1"")","c(""plain tomato juice"", ""cabbage"", ""onion"", ""carrots"", ""celery"")",4.5,11.0,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil."", ""Reduce heat and simmer for 30 minutes (longer if you prefer your veggies to be soft)."", ""Refrigerate until cool."", ""Serve chilled with sour cream."")"
43,Best Blackbottom Pie,34879,Barefoot Beachcomber,PT2H,PT20M,PT2H20M,1999-08-21T10:35:00Z,Make and share this Best Blackbottom Pie recipe from Food.com.,character(0),Pie,"c(""Dessert"", ""Weeknight"", ""Stove Top"", ""< 4 Hours"")","c(""1 1/4"", ""1/4"", ""6"", ""1/3"", ""1/4"", ""1/4"", ""2"", ""3"", ""1"", ""1"", ""1/4"", ""1"", ""2"", ""3"", ""1/4"", ""1/2"", NA)","c(""graham cracker crumbs"", ""sugar"", ""butter"", ""sugar"", ""cornstarch"", ""salt"", ""milk"", ""vanilla extract"", ""water"", ""gelatin"", ""rum"", ""cream of tartar"", ""sugar"")",1.0,1.0,437.9,19.3,10.9,94.3,267.6,58.0,1.8,42.5,7.0,8.0,1 9-inch pie,"c(""Graham Cracker Crust: In small bowl, combine graham cracker crumbs, sugar and butter. Press evenly on bottom and sides of 9-inch pie plate. Chill until firm (about 1 hour)."", ""Chocolate Layer: In medium saucepan, combine sugar, cornstarch and salt. Gradually stir in milk. Cook over medium heat, stirring constantly, until mixture boils. Remove from heat. In small bowl, beat egg yolks. Gradually stir in small amount of hot mixture; return to saucepan. Cook over low heat, stirring constantly, for 2 minutes. Remove from heat."", ""Remove 1-1/2 cups custard to medium bowl; add semi-sweet chocolate morsels and vanilla extract. Stir until morsels are melted and mixture is smooth."", ""Pour into prepared Graham Cracker Crust; chill until set (about 30 minutes)."", ""While Chocolate Layer is chilling, prepare Vanilla Layer."", ""Vanilla Layer: In large bowl, combine cold water and gelatin; let stand 5 minutes. Add remaining warm custard; stir until gelatin dissolves. Cool 15 minutes. Stir in rum; beat with wire whisk until smooth. Set aside."", ""In 1-1/2 quart bowl, combine egg whites and cream of tartar; beat until foamy. Gradually add sugar; beat until stiff peaks form. Fold egg whites into custard; pour over chocolate layer."", ""Chill until set (about 2 hours)."", ""Garnish with whipped cream and chocolate shavings, if desired."", ""Makes one 9-inch pie."")"
44,Warm Chicken A La King,1596,Joan Edington,PT3M,PT35M,PT38M,1999-09-17T04:47:00Z,"I copied this one out of a friend's book so many moons ago that I can't remember where it's from, but it's so decadently fattening that I can't resist pigging out now and then. I usually serve with rice, but I suppose it would go with noodles or new potatoes just as well.","""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/44/picsSKvFd.jpg""",Chicken,"c(""Poultry"", ""Meat"", ""< 60 Mins"")","c(""12"", ""2"", ""3"", ""450"", ""1"", ""2"", ""1/4"", ""1"", NA, NA, ""2"", ""2"", ""1"", NA)","c(""chicken"", ""butter"", ""flour"", ""milk"", ""celery"", ""button mushrooms"", ""green pepper"", ""canned pimiento"", ""salt"", ""black pepper"", ""Worcestershire sauce"", ""parsley"")",5.0,23.0,895.5,66.8,31.9,405.8,557.2,29.1,3.1,5.0,45.3,2.0,,"c(""Melt 1 1/2 ozs butter, add the flour and cook for 2 to 3 minutes, stirring."", ""Gradually add milk and cook, stirring, until thick and smooth."", ""Melt the remaining butter and saute sliced celery, button mushrooms and chopped pepper until soft but not coloured."", ""Add celery, mushrooms, pepper, chicken and pimiento to the sauce and heat through."", ""Season to taste. Combine the egg yolks, double cream and Worcestershire sauce. Add to the chicken mixture and heat through."", ""Transfer to a serving dish and sprinkle with chopped parsley."" )"
45,Buttermilk Pie With Gingersnap Crumb Crust,1580,tristitia,PT50M,PT30M,PT1H20M,1999-08-06T00:40:00Z,Make and share this Buttermilk Pie With Gingersnap Crumb Crust recipe from Food.com.,"""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/45/pic79tPh5.jpg""",Pie,"c(""Dessert"", ""Healthy"", ""Weeknight"", ""Oven"", ""< 4 Hours"")","c(""3/4"", ""1"", ""1"", ""2"", ""3"", ""1/4"", ""1"", ""1/2"", ""1/2"", ""2"")","c(""sugar"", ""margarine"", ""egg"", ""flour"", ""salt"", ""buttermilk"", ""graham cracker crumbs"", ""margarine"")",4.0,3.0,228.0,7.1,1.7,24.5,281.8,37.5,0.5,24.7,4.2,8.0,,"c(""Preheat oven to 350°F."", ""Make pie crust, using 8 inch pie pan, do not bake."", ""Mix sugar and margarine in medium bowl until blended; beat in egg whites and egg."", ""Stir in flour, salt, and buttermilk until well blended."", ""Pour filling into prepared crust, bake 40 minutes or until sharp knife inserted near center comes out clean."", ""Sprinkle with nutmeg and serve warm or chilled."", ""Combine graham crumbs, gingersnap crumbs, and margarine in 8 or 9 inch pie pan, pat mixture evenly on bottom and side of pan."", ""Bake 8 to 10 minutes or until edge of crust is lightly browned."", ""Cool on wire rack."")"
46,A Jad - Cucumber Pickle,1533,Dancer,,PT25M,PT25M,1999-08-11T19:48:00Z,Make and share this A Jad - Cucumber Pickle recipe from Food.com.,character(0),Vegetable,"c(""Thai"", ""Asian"", ""Free Of..."", ""< 30 Mins"")","c(""1/2"", ""5"", ""2"", ""1"", ""1"", ""1"")","c(""rice vinegar"", ""haeo"")",5.0,2.0,4.3,0.0,0.0,0.0,0.7,1.1,0.2,0.2,0.1,,1 cup,"c(""Slice the cucumber in four lengthwise, then slice the pieces to segments about an eighth of an inch thick."", ""Slice the tops of the chilies (green ones can be used if red are not available, but Thais like the color contrast), tap out any loose seeds and discard, then slice the chilies across into thin rounds."", ""Slice the shallots and water chestnuts."", ""Combine and serve. This will keep 2 or 3 weeks in a refrigerator."")"
47,Butter Pecan Cookies,1573,benluc,PT9M,PT55M,PT1H4M,1999-09-07T09:01:00Z,Make and share this Butter Pecan Cookies recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/47/picfnmxck.jpg"", ""https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/47/picCPvxZU.jpg"")",Dessert,"c(""Cookie & Brownie"", ""Fruit"", ""Nuts"", ""Weeknight"", ""Oven"", ""< 4 Hours"")","c(""3/4"", ""1/2"", ""1"", ""1"", ""1"", ""2"", ""1"")","c(""butter"", ""brown sugar"", ""granulated sugar"", ""vanilla extract"", ""flour"", ""pecan halves"")",4.0,2.0,69.0,5.6,1.4,6.3,15.0,4.5,0.6,1.6,0.8,,84 cookies,"c(""Preheat oven to 350 degrees."", ""Cream butter in large mixing bowl."", ""Gradually add brown sugar and granulated sugar."", ""Cream well."", ""Add unbeaten egg yolk and vanilla and beat well."", ""Blend in sifted flour to form a stiff dough."", ""Shape dough into small balls."", ""Place on greased cookie sheet. Flatten cookies with bottom of glass dipped in sugar."", ""Bake at 350 degrees for 7-9 minutes, till golden brown (do not overbrown.) Cool before frosting."", ""Garnish with pecan halves."")"


In [0]:
recipes_clean = spark.sql("""
select RecipeId, Description, RecipeInstructions, ReviewCount, CookTime, PrepTime, RecipeCategory, RecipeIngredientParts, AggregatedRating, Calories, FatContent, SaturatedFatContent, CholesterolContent, SodiumContent, CarbohydrateContent, FiberContent, SugarContent, ProteinContent, RecipeServings 
from recipes
where AggregatedRating != 'NA' and ReviewCount > 1
""")

In [0]:
# recipes_clean = spark.sql("""
# select RecipeId, Description, RecipeInstructions, ReviewCount, CookTime, PrepTime, RecipeCategory, RecipeIngredientParts, AggregatedRating, Calories, FatContent, SaturatedFatContent, CholesterolContent, SodiumContent, CarbohydrateContent, FiberContent, SugarContent, ProteinContent, RecipeServings 
# from recipes
# where AggregatedRating != 'NA'
# """)

In [0]:
recipes_clean.createOrReplaceTempView("recipes_clean")

# Feature processing

In [0]:
### Feature: CookDuration, PrepDuration
recipes_clean = recipes_clean.withColumn('CookHours', F.regexp_extract(F.col('CookTime'), '(\d+)(?=H)', 0).cast('int'))
recipes_clean = recipes_clean.withColumn('CookMinutes', F.regexp_extract(F.col('CookTime'), '(\d+)(?=M)', 0).cast('int'))
recipes_clean = recipes_clean.fillna({'CookHours': 0, 'CookMinutes': 0})
recipes_clean = recipes_clean.withColumn('CookDuration', recipes_clean['CookHours'] * 60 + recipes_clean['CookMinutes'])

recipes_clean = recipes_clean.withColumn('PrepHours', F.regexp_extract(F.col('PrepTime'), '(\d+)(?=H)', 0).cast('int'))
recipes_clean = recipes_clean.withColumn('PrepMinutes', F.regexp_extract(F.col('PrepTime'), '(\d+)(?=M)', 0).cast('int'))
recipes_clean = recipes_clean.fillna({'PrepHours': 0, 'PrepMinutes': 0})
recipes_clean = recipes_clean.withColumn('PrepDuration', recipes_clean['PrepHours'] * 60 + recipes_clean['PrepMinutes'])

recipes_clean = recipes_clean.drop(*['CookTime', 'CookHours', 'CookMinutes', 'PrepTime', 'PrepHours', 'PrepMinutes'])

### Feature: DescriptionLen
recipes_clean = recipes_clean.withColumn('DescriptionLen', F.length(recipes_clean['Description']))
recipes_clean = recipes_clean.drop('Description')

### Feature: RecipeIngredientPartsCount
recipes_clean = recipes_clean.withColumn('RecipeIngredientPartsCount', F.size(F.split(F.col('RecipeIngredientParts'), ',')))
recipes_clean = recipes_clean.drop('RecipeIngredientParts')

### Feature: RecipeInstructionSteps
recipes_clean = recipes_clean.withColumn('RecipeInstructionSteps', F.size(F.split(F.col('RecipeInstructions'), ',')))
recipes_clean = recipes_clean.drop('RecipeInstructions')

In [0]:
### Aggregate RecipeCategory
category_counts = recipes_clean.groupBy('RecipeCategory').agg(F.countDistinct('RecipeId').alias('CategoryCount'))
recipes_clean = recipes_clean.join(category_counts, on = 'RecipeCategory', how = 'left')
recipes_clean = recipes_clean.withColumn('RecipeCategory', F.when(F.col('CategoryCount') < 500, 'Other').otherwise(F.col('RecipeCategory')))
recipes_clean = recipes_clean.drop('CategoryCount')
recipes_clean.groupBy('RecipeCategory').agg(F.countDistinct('RecipeId').alias('count')).orderBy(F.desc('count')).show(50,truncate=False)

### Feature: dummy variables based on RecipeCategory
indexer = StringIndexer(inputCol='RecipeCategory', outputCol='RecipeCategoryIndex')
recipes_clean = indexer.fit(recipes_clean).transform(recipes_clean)
encoder = OneHotEncoder(inputCol='RecipeCategoryIndex', outputCol='RecipeCategoryVec')
recipes_clean = encoder.fit(recipes_clean).transform(recipes_clean)

recipe_category_index_mapping = recipes_clean.select(['RecipeCategory', 'RecipeCategoryIndex']).distinct()
# display(recipe_category_index_mapping)

+----------------+-----+
|RecipeCategory  |count|
+----------------+-----+
|Other           |17860|
|Dessert         |17278|
|Lunch/Snacks    |11701|
|One Dish Meal   |11147|
|Vegetable       |9546 |
|Breakfast       |7122 |
|Beverages       |5567 |
|Chicken         |5076 |
|Pork            |4559 |
|Breads          |4380 |
|Potato          |4297 |
|Chicken Breast  |4275 |
|Quick Breads    |4217 |
|Meat            |4096 |
|Sauces          |3875 |
|Cheese          |3102 |
|Bar Cookie      |2561 |
|Drop Cookies    |2404 |
|Pie             |2375 |
|Yeast Breads    |2263 |
|< 60 Mins       |2050 |
|Stew            |1964 |
|< 30 Mins       |1940 |
|Salad Dressings |1684 |
|Candy           |1684 |
|Beans           |1669 |
|Low Protein     |1622 |
|< 15 Mins       |1557 |
|Spreads         |1491 |
|Smoothies       |1321 |
|Poultry         |1158 |
|Steak           |1151 |
|Frozen Desserts |1101 |
|Onions          |1096 |
|Savory Pies     |1040 |
|Curries         |1001 |
|Rice            |998  |


In [0]:
### prepare response variable
### try multi class classification
# recipes_clean = recipes_clean.withColumn('AggregatedRatingBucket', F.when(recipes_clean['AggregatedRating'].isin([1, 1.5, 2, 2.5]), 'low').when(recipes_clean['AggregatedRating'].isin([3, 3.5, 4]), 'medium').otherwise('high'))
# recipes_clean = recipes_clean.withColumn('AggregatedRatingBucket', F.when(recipes_clean['AggregatedRatingBucket']=='low', '1').when(recipes_clean['AggregatedRatingBucket'] == 'medium', '2').otherwise('3'))
### change to binary class classification
recipes_clean.groupBy('AggregatedRating').count().orderBy('AggregatedRating').show()
recipes_clean = recipes_clean.withColumn('AggregatedRatingBucket', F.when(recipes_clean['AggregatedRating'].isin([1, 1.5, 2, 2.5, 3, 3.5]), 'low').otherwise('high'))
recipes_clean = recipes_clean.withColumn('AggregatedRatingBucket', F.when(recipes_clean['AggregatedRatingBucket']=='low', '1').otherwise('0'))
recipes_clean = recipes_clean.withColumn('AggregatedRatingBucket', recipes_clean['AggregatedRatingBucket'].cast(IntegerType()))
recipes_clean = recipes_clean.drop('AggregatedRating')

+----------------+------+
|AggregatedRating| count|
+----------------+------+
|               1|   225|
|             1.5|    76|
|               2|   328|
|             2.5|   673|
|               3|  2755|
|             3.5|  3978|
|               4| 16063|
|             4.5| 34330|
|               5|109201|
+----------------+------+



In [0]:
recipes_clean.groupBy('AggregatedRatingBucket').count().orderBy('AggregatedRatingBucket').show()

+----------------------+------+
|AggregatedRatingBucket| count|
+----------------------+------+
|                     0|159594|
|                     1|  8035|
+----------------------+------+



In [0]:
### convert string columns to numerical columns
cols_to_convert = ['DescriptionLen', 'RecipeInstructionSteps', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 'RecipeServings']
for col_name in cols_to_convert:
    recipes_clean = recipes_clean.withColumn(col_name, recipes_clean[col_name].cast("float"))

In [0]:
recipes_clean.select(cols_to_convert).describe().show()

+-------+------------------+----------------------+------------------+----------------+-------------------+------------------+------------------+-------------------+-----------------+-----------------+------------------+------------------+
|summary|    DescriptionLen|RecipeInstructionSteps|          Calories|      FatContent|SaturatedFatContent|CholesterolContent|     SodiumContent|CarbohydrateContent|     FiberContent|     SugarContent|    ProteinContent|    RecipeServings|
+-------+------------------+----------------------+------------------+----------------+-------------------+------------------+------------------+-------------------+-----------------+-----------------+------------------+------------------+
|  count|            167629|                167629|            167629|          167629|             167629|            167629|            167629|             167629|           167629|           167629|            167629|            106357|
|   mean|202.45951476176558|    12.37081

### check missing rate

In [0]:
# missing_rates = recipes_clean.select([(F.count(F.when(F.col(c).isNull(), c))/recipes_clean.count()).alias(c) for c in cols_to_convert]).show()

In [0]:
recipes_clean = recipes_clean.fillna(0, subset=cols_to_convert)

In [0]:
recipes_clean = recipes_clean.drop(*['RecipeCategory', 'ReviewCount', 'RecipeId'])

### check association direction between features and response

In [0]:
# feature_cols = ['DescriptionLen', 'RecipeInstructionSteps', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 
#     'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 'RecipeServings', 
#     'CookDuration', 'PrepDuration', 'RecipeIngredientPartsCount'] 
# response_col = 'AggregatedRatingBucket'

# for feature_col in feature_cols:
#     correlation = recipes_clean.stat.corr(feature_col, response_col)
#     print(f"Correlation between {feature_col} and {response_col}: {round(correlation, 4)}")

# Model building

### upsampling minority class

In [0]:
# # Calculating the ratio of weights to oversample
# score1_df = recipes_clean.filter(F.col("AggregatedRatingBucket") == 1)
# score2_df = recipes_clean.filter(F.col("AggregatedRatingBucket") == 2)
# score3_df = recipes_clean.filter(F.col("AggregatedRatingBucket") == 3)

# ratio_3_1 = int(score3_df.count()/score1_df.count())
# ratio_3_2 = int(score3_df.count()/score2_df.count())

# print("ratio_3_1: {}".format(ratio_3_1))
# print("ratio_3_2: {}".format(ratio_3_2))

# # duplicate the minority rows in Successful state
# up1_df = score1_df.withColumn("dummy", F.explode(F.array([F.lit(x) for x in range(int(ratio_3_1+1))]))).drop('dummy')
# # combine both oversampled successful rows and previous majority rows 
# score3_df = score3_df.unionAll(up1_df)

# # duplicate the minority rows in Successful state
# up2_df = score2_df.withColumn("dummy", F.explode(F.array([F.lit(x) for x in range(int(ratio_3_2+1))]))).drop('dummy')
# # combine both oversampled successful rows and previous majority rows 
# score3_df = score3_df.unionAll(up2_df)

# recipes_clean_up = score3_df
# # recipes_clean_up.groupBy('AggregatedRatingBucket').count().orderBy('AggregatedRatingBucket').show()

In [0]:
# # Calculating the ratio of weights to oversample
# score0_df = recipes_clean.filter(F.col("AggregatedRatingBucket") == 0)
# score1_df = recipes_clean.filter(F.col("AggregatedRatingBucket") == 1)

# ratio_1_0 = int(score1_df.count()/score0_df.count())

# print("ratio_1_0: {}".format(ratio_1_0))

# # duplicate the minority rows in Successful state
# up0_df = score0_df.withColumn("dummy", F.explode(F.array([F.lit(x) for x in range(int(ratio_1_0+1))]))).drop('dummy')
# # combine both oversampled successful rows and previous majority rows 
# score1_df = score1_df.unionAll(up0_df)

# recipes_clean_up = score1_df
# # recipes_clean_up.groupBy('AggregatedRatingBucket').count().orderBy('AggregatedRatingBucket').show()

### split train and test data

In [0]:
### use upsampling
# train, test = recipes_clean_up.randomSplit([0.7, 0.3], seed = 123)

### not use upsampling
train, test = recipes_clean.randomSplit([0.7, 0.3], seed = 123)

### downsample majority class

In [0]:
# train.groupBy('AggregatedRatingBucket').count().show()

In [0]:
score0_df = train.filter(F.col('AggregatedRatingBucket') == 0)
score1_df = train.filter(F.col('AggregatedRatingBucket') == 1)
ratio = int(score0_df.count()/score1_df.count())
print(ratio)

sampled_score0_df = score0_df.sample(False, 1/ratio)
train_down = sampled_score0_df.unionAll(score1_df)

19


## random forest

In [0]:
assembler = VectorAssembler(
    inputCols=['DescriptionLen', 'RecipeInstructionSteps', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 
    'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 'RecipeServings', 
    'CookDuration', 'PrepDuration', 'RecipeIngredientPartsCount', 'RecipeCategoryVec'],
    outputCol='features')

rf = RandomForestClassifier(labelCol="AggregatedRatingBucket", featuresCol="features", numTrees=500)
pipeline = Pipeline(stages=[assembler, rf])
model = pipeline.fit(train)
predictions = model.transform(test)

In [0]:
# AUC
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='AggregatedRatingBucket')
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Area Under ROC 0.9294897348025476


In [0]:
display(predictions)

Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,CookDuration,PrepDuration,DescriptionLen,RecipeIngredientPartsCount,RecipeInstructionSteps,RecipeCategoryIndex,RecipeCategoryVec,AggregatedRatingBucket,features,rawPrediction,probability,prediction
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,5,103.0,3,5.0,28.0,"Map(vectorType -> sparse, length -> 58, indices -> List(28), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 13, 14, 43), values -> List(103.0, 5.0, 5.0, 3.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(475.9840479444306, 24.01595205556923))","Map(vectorType -> dense, length -> 2, values -> List(0.9519680958888614, 0.048031904111138474))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,10,380.0,1,4.0,4.0,"Map(vectorType -> sparse, length -> 58, indices -> List(4), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 13, 14, 19), values -> List(380.0, 4.0, 10.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(476.6254546792619, 23.37454532073781))","Map(vectorType -> dense, length -> 2, values -> List(0.9532509093585243, 0.046749090641475644))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,960,358.0,1,9.0,32.0,"Map(vectorType -> sparse, length -> 58, indices -> List(32), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 13, 14, 47), values -> List(358.0, 9.0, 960.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(476.3657220901832, 23.63427790981634))","Map(vectorType -> dense, length -> 2, values -> List(0.9527314441803674, 0.04726855581963273))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,5,116.0,5,10.0,4.0,"Map(vectorType -> sparse, length -> 58, indices -> List(4), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 12, 13, 14, 19), values -> List(116.0, 10.0, 5.0, 5.0, 5.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(475.7549617264586, 24.2450382735416))","Map(vectorType -> dense, length -> 2, values -> List(0.9515099234529169, 0.048490076547083184))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,10,62.0,3,3.0,4.0,"Map(vectorType -> sparse, length -> 58, indices -> List(4), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 12, 13, 14, 19), values -> List(62.0, 3.0, 10.0, 10.0, 3.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(475.3984235654502, 24.601576434549937))","Map(vectorType -> dense, length -> 2, values -> List(0.9507968471309002, 0.04920315286909986))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,10,483.0,4,7.0,4.0,"Map(vectorType -> sparse, length -> 58, indices -> List(4), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 12, 13, 14, 19), values -> List(483.0, 7.0, 15.0, 10.0, 4.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(475.5092599377557, 24.490740062243955))","Map(vectorType -> dense, length -> 2, values -> List(0.951018519875512, 0.04898148012448794))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30,15,217.0,1,5.0,32.0,"Map(vectorType -> sparse, length -> 58, indices -> List(32), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 12, 13, 14, 47), values -> List(217.0, 5.0, 30.0, 15.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(475.1640472661522, 24.83595273384751))","Map(vectorType -> dense, length -> 2, values -> List(0.9503280945323049, 0.049671905467695045))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,3,336.0,1,2.0,32.0,"Map(vectorType -> sparse, length -> 58, indices -> List(32), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 11, 13, 14, 47), values -> List(336.0, 2.0, 1.0, 3.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(476.53621610442343, 23.463783895576114))","Map(vectorType -> dense, length -> 2, values -> List(0.9530724322088477, 0.04692756779115227))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,5,74.0,1,2.0,0.0,"Map(vectorType -> sparse, length -> 58, indices -> List(0), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 11, 13, 14, 15), values -> List(74.0, 2.0, 1.0, 5.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(476.02396880036787, 23.976031199632033))","Map(vectorType -> dense, length -> 2, values -> List(0.9520479376007359, 0.04795206239926408))",0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,45,986.0,1,12.0,0.0,"Map(vectorType -> sparse, length -> 58, indices -> List(0), values -> List(1.0))",0,"Map(vectorType -> sparse, length -> 73, indices -> List(0, 1, 11, 12, 13, 14, 15), values -> List(986.0, 12.0, 1.0, 1.0, 45.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(475.96039547766077, 24.039604522339012))","Map(vectorType -> dense, length -> 2, values -> List(0.951920790955322, 0.048079209044678044))",0.0


In [0]:
# Accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="AggregatedRatingBucket", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print("Test Accuracy = %g" % accuracy)

In [0]:
# Confustion matrix
preds_and_labels = predictions.select(['prediction','AggregatedRatingBucket']).withColumn('AggregatedRatingBucket', F.col('AggregatedRatingBucket').cast(FloatType())).orderBy('prediction')

preds_and_labels = preds_and_labels.select(['prediction','AggregatedRatingBucket'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

In [0]:
# Precision and recall
# Class-0
print('\n--------------Class-0----------------')
print('Precision          :', round(metrics.precision(0), 3))
print('recall             :', round(metrics.recall(0), 3))

# Class-1
print('\n--------------Class-1----------------')
print('Precision          :', round(metrics.precision(1), 3))
print('recall             :', round(metrics.recall(1), 3))

# # Class-3
# print('\n--------------Class-3----------------')
# print('Precision          :', round(metrics.precision(3), 3))
# print('recall             :', round(metrics.recall(3), 3))

# Overall Accuracy
print('\n Overall Accuracy:', round(metrics.accuracy, 3))

In [0]:
### Plot feature importance for top 20 variables
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Get feature importances from the model
feature_importances = model.stages[-1].featureImportances
print(len(feature_importances))

# Convert indices to feature names
feature_names = ['DescriptionLen', 'RecipeInstructionSteps', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 
    'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 'RecipeServings', 
    'CookDuration', 'PrepDuration', 'RecipeIngredientPartsCount', 'RecipeCategoryVec']  # replace with your feature names

num_binary_features = len(feature_importances) - 15
binary_feature_names = ['RecipeCategoryVec_' + str(i) for i in range(num_binary_features)]
feature_names = feature_names + binary_feature_names

importances = {feature_names[i]: importance for i, importance in enumerate(feature_importances)}

# Convert to pandas DataFrame for easier plotting
importances_df = pd.DataFrame(list(importances.items()), columns=['Feature', 'Importance'])
importances_df = importances_df.sort_values(by='Importance', ascending=False)
importances_df = importances_df.iloc[0:20, :]
# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(importances_df['Feature'], importances_df['Importance'], color='skyblue')
plt.xlabel("Importance")
plt.title("Feature Importance")
plt.gca().invert_yaxis()
plt.show()

### multinomial logistic regression

In [0]:
assembler = VectorAssembler(
    inputCols=['DescriptionLen', 'RecipeInstructionSteps', 'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 
    'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent', 'RecipeServings', 
    'CookDuration', 'PrepDuration', 'RecipeIngredientPartsCount', 'RecipeCategoryVec'],
    outputCol='features')

lr = LogisticRegression(featuresCol = 'features', labelCol = 'AggregatedRatingBucket', maxIter=10)

pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train)
lrModel = model.stages[-1]

predictions = model.transform(test)

In [0]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

In [0]:
# Getting the training summary
trainingSummary = lrModel.summary

In [0]:
# AUC
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='AggregatedRatingBucket')
print('Test Area Under ROC', evaluator.evaluate(predictions))

In [0]:
# Accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="AggregatedRatingBucket", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print("Test Accuracy = %g" % accuracy)

In [0]:
# Confusion matrix
preds_and_labels = predictions.select(['prediction','AggregatedRatingBucket']).withColumn('AggregatedRatingBucket', F.col('AggregatedRatingBucket').cast(FloatType())).orderBy('prediction')

#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','AggregatedRatingBucket'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

In [0]:
# Precision and recall
# Class-0
print('\n--------------Class-0----------------')
print('Precision          :', round(metrics.precision(0), 3))
print('recall             :', round(metrics.recall(0), 3))

# Class-1
print('\n--------------Class-1----------------')
print('Precision          :', round(metrics.precision(1), 3))
print('recall             :', round(metrics.recall(1), 3))

# Overall Accuracy
print('\n Overall Accuracy:', round(metrics.accuracy, 3))

In [0]:
# # for multiclass, we can inspect metrics on a per-label basis
# print("False positive rate by label:")
# for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
#     print("label %d: %s" % (i, round(rate, 4)))

# print("True positive rate by label:")
# for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
#     print("label %d: %s" % (i, round(rate, 4)))

# print("Precision by label:")
# for i, prec in enumerate(trainingSummary.precisionByLabel):
#     print("label %d: %s" % (i, round(prec, 4)))

# print("Recall by label:")
# for i, rec in enumerate(trainingSummary.recallByLabel):
#     print("label %d: %s" % (i, round(rec, 4)))

# print("F-measure by label:")
# for i, f in enumerate(trainingSummary.fMeasureByLabel()):
#     print("label %d: %s" % (i, round(f, 4)))

# accuracy = trainingSummary.accuracy
# falsePositiveRate = trainingSummary.weightedFalsePositiveRate
# truePositiveRate = trainingSummary.weightedTruePositiveRate
# fMeasure = trainingSummary.weightedFMeasure()
# precision = trainingSummary.weightedPrecision
# recall = trainingSummary.weightedRecall
# print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
#           % (round(accuracy, 4), round(falsePositiveRate, 4), round(truePositiveRate, 4), round(fMeasure, 4), round(precision, 4), round(recall, 4)))