# Midterm Assignment

 ## Import necessary libraries

In [7]:
import os
import re

import numpy as np
import pandas as pd
from lets_plot import * # This imports all of ggplot2's functions

LetsPlot.setup_html()

 ### Read the datasets

In [8]:
# Collect every CSV in the Waitrose data folder. os.listdir() returns entries
# in arbitrary order, so the paths are sorted to make the row order of the
# combined frame (and therefore which copy of a duplicated row is kept by
# later drop_duplicates calls) reproducible between runs and machines.
all_files = sorted(
    os.path.join('../data/waitrose', file)
    for file in os.listdir('../data/waitrose')
    if file.endswith('.csv')
)

# Stack the per-file frames; each file keeps its own row labels.
df = pd.concat((pd.read_csv(file) for file in all_files))

In [9]:
# Rebuild `df` from scratch by re-reading every path in `all_files`
# (same statement as the end of the previous cell).
df = pd.concat(map(pd.read_csv, all_files))

# De-duplicating, dropping and renaming columns

In [10]:
# Raw scrape columns that are discarded outright.
columns_to_discard = ['data-product-name', 'data-product-type', 'data-product-index']

# Mapping from the verbose scraped column names to short working names.
short_names = {
    'data-product-id': 'id',
    'data-product-on-offer': 'offer',
    'product-page': 'page',
    'product-name': 'name',
    'product-size': 'size',
}

# One chained pass: remove exact duplicate rows, drop the discarded columns,
# then apply the short names.
df = (
    df.drop_duplicates()
      .drop(columns=columns_to_discard)
      .rename(columns=short_names)
)

# Dropping unwanted columns

In [11]:
# Remove the image URL, product page link and offer description columns.
df = df.drop(['image-url', 'page', 'offer-description'], axis='columns')

## Downcast `id` from 64-bit to 32-bit integers

In [12]:
# Store the product id as a 32-bit integer.
df = df.astype({'id': 'int32'})

In [13]:
# For every product name, count the distinct (non-missing) categories it appears under.
category_count = df.groupby(by='name')['category'].nunique(dropna=True)
category_count

name
019521 - Essential Flageolet Beans in Water      3
035738 - Solero Exotic Ice cream Lolly           3
044889 - Essential Double Cream Large            2
060167 - Kronenbourg 1664 Cans                   1
086143 - Cooks' Ingredients Garlic               3
                                                ..
£100,000 a Month for a Year Scratchcard          1
Öpso Japanese Sakura Tree Reed Diffuser          2
Öpso Mediterranean Citrus Grove Reed Diffuser    2
Öpso Nordic Birch Forest Reed Diffuser           2
Ürziger Würzgarten Kabinett Loosen               2
Name: category, Length: 15418, dtype: int64

In [14]:
# Names of the products that are listed under more than one category.
is_multi_category = category_count.gt(1)
multi_category_products = category_count.loc[is_multi_category].index
multi_category_products[:10]

Index(['019521 - Essential Flageolet Beans in Water',
       '035738 - Solero Exotic Ice cream Lolly',
       '044889 - Essential Double Cream Large',
       '086143 - Cooks' Ingredients Garlic',
       '10 Mins to Bed Where's Little Dinosaur',
       '100 Warm White Firefly Solar String',
       '118x180CM Duni Yellow Table Cover', '1664 Blanc Lager Bottles',
       '1ltr Square Bottle Ocean Bound Plastic', '2 Beef Tomahawk Steaks'],
      dtype='object', name='name')

In [15]:
def _join_unique(categories):
    """Return the distinct values of one product's categories as a ', '-separated string."""
    return ', '.join(categories.unique())


# Give every row of a product the same combined category label.
df['category'] = df.groupby('name')['category'].transform(_join_unique)

In [16]:
# (rows, columns) after merging category labels, before the next de-duplication
df.shape

(25378, 7)

In [17]:
# Remove rows that are exact duplicates, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [18]:
# (rows, columns) after de-duplication
df.shape

(16065, 7)

In [19]:
# Summarise columns, non-null counts and dtypes; memory_usage='deep' requests
# pandas' deep memory estimate.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 16065 entries, 0 to 1438
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              16065 non-null  int32 
 1   offer           16065 non-null  bool  
 2   name            16054 non-null  object
 3   size            16028 non-null  object
 4   item-price      16054 non-null  object
 5   price-per-unit  15713 non-null  object
 6   category        16054 non-null  object
dtypes: bool(1), int32(1), object(5)
memory usage: 5.6 MB


# Data Cleaning

In [20]:
def clean_price(price):
    """Convert a scraped price label into a float amount in pounds.

    Parameters
    ----------
    price : str or number
        A label such as ``'£2.60'``, ``'£1.50 each est.'`` or ``'85p'``.
        Non-string values (for example the float NaN pandas uses for missing
        cells) are passed straight to ``float``.

    Returns
    -------
    float
        The price in pounds; pence labels are divided by 100.

    Raises
    ------
    ValueError
        If the text left after cleaning is not a valid number.
    TypeError
        If ``price`` is neither a string nor convertible by ``float``
        (e.g. ``None``).
    """
    if isinstance(price, str):
        if '£' in price:
            price = price.replace('£', '').replace('each est.', '').replace('-', '').strip()
        elif 'p' in price:
            pence = price.replace('p', '').replace('each est.', '').replace('-', '').strip()
            # Divide by 100 instead of prefixing '0.': prefixing turned '5p'
            # into 0.5 and made fractional pence such as '67.1p' unparseable.
            return float(pence) / 100

    return float(price)

In [21]:
# Parse each scraped price label into a numeric `price` column.
df['price'] = df['item-price'].map(clean_price)

# Specific product: raspberries

In [22]:
# Keep every product whose name mentions "raspberries" (case-insensitive);
# rows with a missing name are excluded via na=False.
# No category filter is applied: a clause of the form
# `contains('everyday') | ~contains('everyday')` is true for every row, so
# products from all categories are selected.
rasp_berries = df[df['name'].str.contains('raspberries', case=False, na=False)]

In [23]:
# Display the raspberry selection
rasp_berries

Unnamed: 0,id,offer,name,size,item-price,price-per-unit,category,price
14,88061,True,Waitrose Raspberries,150g,£2.60,£17.34/kg,"Dietary & Lifestyle, Fresh & Chilled, Summer",2.6
41,890609,False,Essential Raspberries,150g,£2.00,£13.34/kg,"Dietary & Lifestyle, Everyday Value, Fresh & C...",2.0
173,771268,False,Essential Frozen Raspberries,350g,£2.35,67.1p/100g,"Dietary & Lifestyle, Everyday Value, Frozen, W...",2.35
200,96831,False,Duchy Organic Raspberries,125g,£2.85,£22.80/kg,"Dietary & Lifestyle, Fresh & Chilled, Organic ...",2.85
246,49829,True,Waitrose Raspberries,225g,£2.75,£12.23/kg,"Dietary & Lifestyle, Fresh & Chilled, Summer",2.75
331,88050,True,No.1 Speciality Raspberries,200g,£3.50,£17.50/kg,"Dietary & Lifestyle, Fresh & Chilled, Summer, ...",3.5
400,662398,False,"Innocent Kids Strawberries, Raspberries & Appl...",4x150ml,£3.80,63.3p/100ml,"Dietary & Lifestyle, Fresh & Chilled, Tea, Cof...",3.8
1323,55339,False,Waitrose Frozen British Raspberries,300g,£3.40,£1.14/100g,"Dietary & Lifestyle, Frozen",3.4
1764,749455,True,"Innocent Kids Strawberries, Raspberries & Appl...",10x150ml,£5.50,36.7p/100ml,"Dietary & Lifestyle, Fresh & Chilled, Tea, Cof...",5.5
1801,611803,False,Duchy Organic Frozen Raspberries,200g,£3.80,£19/kg,"Dietary & Lifestyle, Frozen, Organic Shop, Wai...",3.8


In [24]:
# Print the whole raspberry selection as plain text; display.max_rows=None
# lifts pandas' row limit inside this `with` block only.
with pd.option_context('display.max_rows', None,):
    print(rasp_berries)

          id  offer                                               name  \
14     88061   True                               Waitrose Raspberries   
41    890609  False                              Essential Raspberries   
173   771268  False                       Essential Frozen Raspberries   
200    96831  False                          Duchy Organic Raspberries   
246    49829   True                               Waitrose Raspberries   
331    88050   True                        No.1 Speciality Raspberries   
400   662398  False  Innocent Kids Strawberries, Raspberries & Appl...   
1323   55339  False                Waitrose Frozen British Raspberries   
1764  749455   True  Innocent Kids Strawberries, Raspberries & Appl...   
1801  611803  False                   Duchy Organic Frozen Raspberries   
2260  757723  False      Divine FT 70% Dark Chocolate with Raspberries   
2880  520323  False        Cooks' Ingredients Freeze Dried Raspberries   
116   409892  False          TruFru Ra

In [25]:
# Products that mention raspberries but are not plain raspberries.
# Names copied from a truncated table display are stored as literal prefixes
# (without the trailing "..."), so nothing in this list acts as a wildcard.
rasp_remove = ["Divine FT 70% Dark Chocolate with Raspberries",
               "Innocent Kids Strawberries,Raspberries & Appl",
               "Innocent Kids Strawberries, Raspberries & Appl",
               "TruFru Raspberries White & Dark Chocolate",
               "Cooks' Ingredients Freeze Dried Raspberries"]

# str.contains treats its pattern as a regular expression, so each entry is
# escaped to be matched as plain text before the alternatives are joined.
rasp_remove_pattern = '|'.join(re.escape(entry) for entry in rasp_remove)

# Drop every row whose name contains one of the excluded entries (case-insensitive).
raspberries = rasp_berries[
    ~rasp_berries['name'].str.contains(rasp_remove_pattern, case=False, na=False)
]

In [26]:
# Display the selection after removing the excluded products
raspberries

Unnamed: 0,id,offer,name,size,item-price,price-per-unit,category,price
14,88061,True,Waitrose Raspberries,150g,£2.60,£17.34/kg,"Dietary & Lifestyle, Fresh & Chilled, Summer",2.6
41,890609,False,Essential Raspberries,150g,£2.00,£13.34/kg,"Dietary & Lifestyle, Everyday Value, Fresh & C...",2.0
173,771268,False,Essential Frozen Raspberries,350g,£2.35,67.1p/100g,"Dietary & Lifestyle, Everyday Value, Frozen, W...",2.35
200,96831,False,Duchy Organic Raspberries,125g,£2.85,£22.80/kg,"Dietary & Lifestyle, Fresh & Chilled, Organic ...",2.85
246,49829,True,Waitrose Raspberries,225g,£2.75,£12.23/kg,"Dietary & Lifestyle, Fresh & Chilled, Summer",2.75
331,88050,True,No.1 Speciality Raspberries,200g,£3.50,£17.50/kg,"Dietary & Lifestyle, Fresh & Chilled, Summer, ...",3.5
1323,55339,False,Waitrose Frozen British Raspberries,300g,£3.40,£1.14/100g,"Dietary & Lifestyle, Frozen",3.4
1801,611803,False,Duchy Organic Frozen Raspberries,200g,£3.80,£19/kg,"Dietary & Lifestyle, Frozen, Organic Shop, Wai...",3.8


In [27]:
# Pack weight in grams: strip the 'g' from labels such as '150g'.
pack_grams = raspberries['size'].str.replace('g', '', regex=False).astype('float')

# new_price = item price per 1000 g, i.e. the price per kilogram.
rasp_berries_plot = raspberries.assign(
    new_price=raspberries['price'] / pack_grams * 1000
)

rasp_berries_plot[['name', 'price', 'size', 'new_price']]

Unnamed: 0,name,price,size,new_price
14,Waitrose Raspberries,2.6,150g,17.333333
41,Essential Raspberries,2.0,150g,13.333333
173,Essential Frozen Raspberries,2.35,350g,6.714286
200,Duchy Organic Raspberries,2.85,125g,22.8
246,Waitrose Raspberries,2.75,225g,12.222222
331,No.1 Speciality Raspberries,3.5,200g,17.5
1323,Waitrose Frozen British Raspberries,3.4,300g,11.333333
1801,Duchy Organic Frozen Raspberries,3.8,200g,19.0


In [28]:
# Order the products from the lowest to the highest price per kilogram.
rasp_plot = rasp_berries_plot.sort_values('new_price')

In [29]:
# Line through each product's price per kilogram.
# `new_price` is derived from prices scraped in pounds and scaled to 1000 g,
# so the x axis is labelled in £ per kg.
(
    ggplot(rasp_plot, aes(x='new_price', y='name')) +
    geom_line(color='blue', size=1.2) +
    labs(
        title='Price of Raspberries ',
        x='Price per kg (£)',
        y='Product Name'
    ) +
    theme(
        axis_text_x=element_text(hjust=1),
        plot_title=element_text(size=14, face='bold')
    ) + ggsize(800, 800)
)

In [30]:
# One point per product at its price per kilogram; hovering shows the name
# and the per-kg price.
# Axis titles follow the aesthetics: x carries `new_price` (in £ per kg) and
# y carries the product `name`.
(
    ggplot(rasp_plot, aes(x='new_price', y='name')) +
    geom_point(color='green', size=3, tooltips=layer_tooltips().line('@name').line('@new_price')) +
    labs(
        title='Price of Raspberries',
        x='Price per kg (£)',
        y='Product Name'
    ) +
    theme(
        axis_text_x=element_text(hjust=1),
        plot_title=element_text(size=14, face='bold')
    ) +
    ggsize(800, 800)
)


In [31]:
# Order the products from the most to the least expensive pack price.
rasp_plot = rasp_plot.sort_values(by='price', ascending=False)

# This configures what shows up when you hover your mouse over the plot.
tooltip_setup = (
    layer_tooltips()
        .line('@name')
        .line('[@size -- @price]')
        .format('@price', '£ {.2f}')
)

raspberry = (
    # Maps the columns to the aesthetics of the plot.
    ggplot(rasp_plot, aes(y='name', x='price', fill='name')) +

    # GEOMS

    # Add points to the plot (listen to `x` and `y` and fill aesthetics)
    geom_point(size=3, stroke=1, shape=21, tooltips=tooltip_setup) +

    # SCALES

    # Remove the legend (we can already read the products from the y-axis)
    scale_fill_discrete(guide='none') +

    # Specify names for the axes.
    # `name` is a discrete (text) variable, so the y axis uses a discrete scale.
    scale_y_discrete(name="Product", expand=[0.05, 0.05]) +
    scale_x_continuous(name="Price (£)", expand=[0., 0.05], format='£ {.2f}', breaks=np.arange(0, 20, 0.5)) +

    # LABELS
    # It's nice when the plot tells you the key takeaways
    labs(title='"Raspberries"',
         subtitle="Dots represent the price") +
    theme(axis_text_x=element_text(size=15),
        axis_text_y=element_text(size=17),
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        plot_title=element_text(size=19, face='bold'),
        plot_subtitle=element_text(size=18),
        legend_position='none') +
    ggsize(1000, 500)

)

raspberry