# Ice cream!: A preliminary analysis 

In [None]:
import matplotlib.pyplot as plt
import math
import os
import random
import cv2

# code for visualization borrowed from Marília Prata: https://www.kaggle.com/mpwolke/cherry-oops-i-didn-t-it-again
def visualize_images(path, n_images, is_random=True, figsize=(16, 16)):
    """Show up to ``n_images`` images from the directory ``path`` in a near-square grid.

    Parameters
    ----------
    path : str
        Directory whose entries are read as image files.
    n_images : int
        Number of images requested. Capped at the number of directory entries;
        nothing is drawn when the resulting count is zero or negative.
    is_random : bool, default True
        If True, draw a reproducible sample (fixed seed 0) of the entries;
        otherwise take the first ``n_images`` entries in sorted order.
    figsize : tuple, default (16, 16)
        Figure size handed to ``plt.figure``.

    Notes
    -----
    Entries that OpenCV cannot decode are skipped and leave their grid cell empty.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # "first n" slice and the seeded sample pick the same files on every run.
    all_names = sorted(os.listdir(path))

    # Never ask for more images than exist (random.sample would raise ValueError).
    n_images = min(n_images, len(all_names))
    if n_images <= 0:
        return

    if is_random:
        # A private generator gives a fixed draw without resetting the global
        # `random` state for the rest of the notebook.
        image_names = random.Random(0).sample(all_names, n_images)
    else:
        image_names = all_names[:n_images]

    # Grid layout: w columns (floor of the square root), h rows to fit everything.
    w = int(n_images ** .5)
    h = math.ceil(n_images / w)

    plt.figure(figsize=figsize)
    for ind, image_name in enumerate(image_names):
        img = cv2.imread(os.path.join(path, image_name))
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning None
            # instead of raising; skip it rather than crash in cvtColor.
            continue
        # OpenCV loads pixels as BGR, matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(h, w, ind + 1)
        plt.imshow(img)
        plt.xticks([])
        plt.yticks([])

    plt.show()
# Preview a 3x3 grid of images from the combined image folder.
visualize_images(path='../input/ice-cream-dataset/combined/images', n_images=9)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.dates as mdates

**We will only look at Ben & Jerry's data to begin with.**

In [None]:
# Load the Ben & Jerry's product table and review table, in that order.
prod, rev = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ice-cream-dataset/bj/products.csv",
        "../input/ice-cream-dataset/bj/reviews.csv",
    )
)

# Product data

In [None]:
# Preview the first rows of the product table.
prod.head()

In [None]:
# (number of rows, number of columns) of the product table.
prod.shape

## Distribution of average rating

In [None]:
# sns.distplot is deprecated (and removed in recent seaborn releases); a
# density-scaled histplot with a KDE overlay is its documented replacement.
fig, ax = plt.subplots()
sns.histplot(prod["rating"].dropna(), bins=25, kde=True, stat="density", ax=ax)
# Ratings live on a 1-5 star scale, so pin the x-axis to that range.
ax.set(
    xlim=(1, 5),
    xlabel="Average rating",
    ylabel="Density",
    title="Distribution of average product rating",
)
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the product ratings.
prod["rating"].describe()

Ratings are generally very positive (median 4.6), and the distribution is skewed left.

## Best and worst products by average rating

Best reviews

In [None]:
# Ten products with the highest rating, shown with their review counts.
prod.sort_values(by="rating", ascending=False).head(10)[["name", "rating", "rating_count"]]

Ice Cream Sammie and Chocolate Peanut Butter Split are the highest rated flavors (although they also have fewer reviews). Of the flavors with 70+ reviews we see Sweet Like Sugar Cookie Dough Core, Chocolate Therapy®, and Phish Food®.

Worst reviews

In [None]:
# Ten products with the lowest rating, shown with their review counts.
prod.sort_values(by="rating", ascending=True).head(10)[["name", "rating", "rating_count"]]

Chocolate Chip Cookie Dough Core is the most disliked flavor by far. Also, "core" flavors tend to be disliked more often. Interestingly, the disliked flavors tend to have more reviews.

## Most reviewed products

In [None]:
# Ten products with the largest number of ratings.
prod.sort_values(by="rating_count", ascending=False).head(10)[["name", "rating", "rating_count"]]

Several flavors have > 200 reviews. The top four most reviewed flavors have generally good ratings (4.6-4.7).

## Common and uncommon ingredients

Common ingredients

In [None]:
def split_ingredients(ingred_list):
    """Split one ingredient string on its top-level commas.

    A plain ``ingred_list.split(", ")`` is not enough because commas also occur
    inside parentheses: "LIQUID SUGAR (SUGAR, WATER)" is a single ingredient.

    Parameters
    ----------
    ingred_list : str
        Comma-separated ingredient text of one product.

    Returns
    -------
    list of str
        Ingredients with surrounding whitespace removed; empty pieces are dropped.
    """
    ingredients = []
    start = 0
    depth = 0  # parenthesis nesting level; a counter (not a flag) copes with nested groups
    for i, char in enumerate(ingred_list):
        if char == "(":
            depth += 1
        elif char == ")":
            depth = max(depth - 1, 0)  # tolerate a stray ")"
        elif char == "," and depth == 0:
            ingredients.append(ingred_list[start:i].strip())
            start = i + 1
    # The text after the last top-level comma is an ingredient too; without this
    # every product's final ingredient would be silently dropped.
    ingredients.append(ingred_list[start:].strip())
    return [ingred for ingred in ingredients if ingred]


# Pool the ingredients of every product (rows without ingredient text are skipped).
big_ingred_list = []
for ingred_list in prod["ingredients"].dropna():
    big_ingred_list.extend(split_ingredients(ingred_list))

# Frequency of each ingredient across all products; keep the 30 most frequent.
ct = Counter(big_ingred_list)
most_common = ct.most_common(30)
most_common

Uncommon ingredients

In [None]:
# most_common() orders from most to least frequent, so the last 20 entries
# are the 20 rarest ingredients.
ct.most_common()[-20:]

In [None]:
# Number of unique ingredients (distinct ingredient strings counted in `ct`)
len(ct)

## Unique ingredients in the top flavors
The code below collects the ingredients used in the 10 highest-rated flavors and "subtracts" any ingredient that is among the 30 most common ingredients across all flavors (computed above).

In [None]:
def split_ingredients(ingred_list):
    """Split one ingredient string on its top-level commas.

    Commas inside parentheses belong to a single ingredient, e.g.
    "LIQUID SUGAR (SUGAR, WATER)" stays in one piece.

    Parameters
    ----------
    ingred_list : str
        Comma-separated ingredient text of one product.

    Returns
    -------
    list of str
        Ingredients with surrounding whitespace removed; empty pieces are dropped.
    """
    ingredients = []
    start = 0
    depth = 0  # parenthesis nesting level; a counter (not a flag) copes with nested groups
    for i, char in enumerate(ingred_list):
        if char == "(":
            depth += 1
        elif char == ")":
            depth = max(depth - 1, 0)  # tolerate a stray ")"
        elif char == "," and depth == 0:
            ingredients.append(ingred_list[start:i].strip())
            start = i + 1
    # The text after the last top-level comma is an ingredient too; without this
    # every product's final ingredient would be silently dropped.
    ingredients.append(ingred_list[start:].strip())
    return [ingred for ingred in ingredients if ingred]


# Ingredient text of the 10 highest-rated products.
top_flavor_ingreds = prod.sort_values("rating", ascending=False)["ingredients"].head(10)
big_ingred_list2 = []
for ingred_list in top_flavor_ingreds.dropna():
    big_ingred_list2.extend(split_ingredients(ingred_list))

ct2 = Counter(big_ingred_list2)
# Build the lookup of overall-common ingredients once (a set gives O(1) membership tests)
# instead of rebuilding a list for every candidate ingredient.
common_ingreds = {ingred for ingred, _ in most_common}
[(ingred, count) for ingred, count in ct2.most_common() if ingred not in common_ingreds]

We see peanuts and chocolate liquor appear in three of the top 10 flavors. Cocoa and lemon juice also appear in two.

# Reviews data

In [None]:
# Preview the first rows of the reviews table.
rev.head()

In [None]:
# (number of rows, number of columns) of the reviews table.
rev.shape

## Review dates

In [None]:
# Parse the YYYY-MM-DD date strings into datetimes (stored back on `rev`).
rev["date"] = pd.to_datetime(rev["date"], format="%Y-%m-%d")

# Matplotlib histograms work on numbers, so convert the dates to Matplotlib date numbers.
mpl_data = mdates.date2num(rev["date"])

fig, ax = plt.subplots()
ax.hist(mpl_data, bins="auto")
# One major tick per year, labelled as month/year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%Y'))
plt.show()

Reviews date back to 2017 and have been consistent since. Notably, there has been a spike in reviews in the last few months!

## Controversial products

In [None]:
# group by flavor and compute standard deviation of star ratings
# Group reviews by flavor and compute the standard deviation of the star ratings;
# keep the 5 flavors whose ratings are most spread out.
std_rev = rev.groupby("key")["stars"].std().sort_values(ascending=False).head(5)
std_rev.name = "stdev_stars"

# Attach each flavor's mean rating and rating count. A left join keeps exactly
# these 5 keys in their sorted order, rather than building the outer union with
# every product and relying on the first 5 rows of that union being the right ones.
std_rev.to_frame().join(prod.set_index("key")[["rating", "rating_count"]])

The controversial products tend to have a low average rating, meaning there are a few positive outliers rating them as 5-star.

## Helpfulness of reviews

In [None]:
sum_votes = rev[["helpful_yes","helpful_no"]].sum(axis=1) # sum yes & no votes for each review
has_votes = sum_votes > 0 # reviews with votes
# Share of "helpful" votes per review. Only reviews with at least one vote are on the
# left-hand side, so index alignment leaves NaN for reviews without any votes.
rev["vote_ratio"] = rev.loc[has_votes,"helpful_yes"].div(sum_votes)

# sns.distplot is deprecated (and removed in recent seaborn releases); a
# density-scaled histplot with a KDE overlay is its documented replacement.
fig, ax = plt.subplots()
sns.histplot(rev["vote_ratio"].dropna(), bins=15, kde=True, stat="density", ax=ax)
# A ratio is bounded by 0 and 1, so pin the x-axis to that range.
ax.set(
    xlim=(0, 1),
    xlabel="Share of helpful votes",
    ylabel="Density",
    title="Helpfulness ratio of reviews with at least one vote",
)
plt.show()

There is a bimodal distribution. Some reviews are marked as very helpful, some are considered very unhelpful.

Below are some "unhelpful reviews." They generally have negative sentiment.

In [None]:
# Text of the 5 reviews with the lowest helpful-vote ratio (NaN ratios sort last),
# printed with a blank line between reviews.
least_helpful_texts = rev.sort_values(by="vote_ratio").loc[:, "text"].iloc[:5].to_numpy()
print("\n\n".join(least_helpful_texts))

# More to come...