In [2]:
import os
import pandas as pd

In [3]:
# Identify the location of the original files
# This represents the path: ../data/waitrose-2024-07
data_folder = os.path.join('..', 'data', 'waitrose-2024-07')

# Collect the path of every CSV file in the folder.
# os.listdir() returns names in arbitrary order, so the list is sorted to make
# the row order of the combined DataFrame reproducible from run to run.
all_files = sorted(
    os.path.join(data_folder, file)
    for file in os.listdir(data_folder)
    if file.endswith('.csv')
)

# Print the list of files if you want to check
# print(all_files)

# Read every single file as a DataFrame
# Save the dataframes in a list
list_of_dfs = [pd.read_csv(file) for file in all_files]

# Use pd.concat to concatenate all the files into a single DataFrame.
# Each file keeps its own row labels, so index labels repeat across files.
df = pd.concat(list_of_dfs)

# Check that we have all the data
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25418 entries, 0 to 805
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   data-product-id        25418 non-null  int64  
 1   data-product-name      25418 non-null  object 
 2   data-product-type      25418 non-null  object 
 3   data-product-on-offer  25418 non-null  bool   
 4   data-product-index     25408 non-null  float64
 5   image-url              25418 non-null  object 
 6   product-page           25418 non-null  object 
 7   product-name           25407 non-null  object 
 8   product-size           25363 non-null  object 
 9   item-price             25407 non-null  object 
 10  price-per-unit         24976 non-null  object 
 11  offer-description      7201 non-null   object 
 12  category               25418 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(10)
memory usage: 2.5+ MB


# Exploring the item-price column
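Cleaning needed:
Remove the £ symbol
Convert pence to pounds (e.g. 85p -> 0.85)
Remove the trailing ' each est.' text
Replace price ranges with their average (e.g. £1.50-£3.00 -> 2.25)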

If cleaning works:
### This should not raise an error
df['item-price'].astype(float)

### There should be no None or NaN values
df['item-price'].astype(float).isna().sum() == 0



In [58]:
# Plain Python list of the raw price labels, handy for eyeballing the formats
price_list = df['item-price'].to_numpy().tolist()

In [34]:
def _amount_to_pounds(text, digits_only=False):
    """Parse a single amount such as '£1.20', '85p' or '3' into pounds.

    Args:
        text: One amount, without any range separator.
        digits_only: When True, accept only plain decimal digits (with at most
            one '.'); used for the two ends of a range.

    Returns:
        The amount in pounds as a float, or None if it cannot be parsed.
    """
    text = text.replace('£', '').strip()

    # A 'p' marks a pence value, which is divided by 100 to get pounds
    divisor = 1
    if 'p' in text:
        text = text.replace('p', '').strip()
        divisor = 100

    if digits_only and not text.replace('.', '', 1).isdigit():
        return None

    try:
        return float(text) / divisor
    except ValueError:
        return None


def clean_item_price(item_price: str):
    """Convert a raw price label into a price in pounds.

    Handles, in any combination:
      * the '£' symbol            ('£3.50'           -> 3.5)
      * pence values              ('85p'             -> 0.85)
      * a trailing ' each est.'   ('£2.50 each est.' -> 2.5)
      * ranges, which are averaged ('£1.50-£3.00' -> 2.25, '50p-80p' -> 0.65,
        '80p-£1.20' -> 1.0)

    Each end of a range is converted on its own, so pence and pound amounts
    can be mixed within one range.

    Args:
        item_price: The raw label. Non-string values are converted with
            ``str`` first, so a missing value (NaN) comes back as NaN.

    Returns:
        The price as a float, or None when the label cannot be parsed.
    """
    # Convert to string, trim whitespace and drop the ' each est.' marker
    item_price = str(item_price).strip()
    item_price = item_price.removesuffix(' each est.')

    # Ranges: average the two ends, ignoring parts that are not valid amounts
    if '-' in item_price:
        amounts = [_amount_to_pounds(part, digits_only=True)
                   for part in item_price.split('-')]
        amounts = [amount for amount in amounts if amount is not None]

        # Only average when there are exactly two usable amounts
        if len(amounts) != 2:
            return None
        return (amounts[0] + amounts[1]) / 2

    # Single amount
    return _amount_to_pounds(item_price)

# Run the cleaner over every raw price label in the column
new_item_price = df['item-price'].apply(clean_item_price)

# Flag whatever still fails numeric conversion, then express it as a
# percentage of all rows
not_numeric = pd.to_numeric(new_item_price, errors='coerce').isna()
total_invalid_numbers = not_numeric.sum()
ratio_invalid_numbers = 100 * (total_invalid_numbers / len(df))

print(f"{ratio_invalid_numbers:.2f}% of the column cannot be directly converted to a float.")



0.04% of the column cannot be directly converted to a float.


In [35]:
# Boolean mask: True where the cleaned price is not a valid number
problematic_rows = pd.to_numeric(new_item_price, errors='coerce').isna()

# Show the original (uncleaned) values of the rows that failed.
# The mask is already boolean, so it is used directly with .loc
df.loc[problematic_rows, 'item-price'].head(20)

48    NaN
49    NaN
24    NaN
48    NaN
49    NaN
48    NaN
49    NaN
48    NaN
49    NaN
48    NaN
49    NaN
Name: item-price, dtype: object

In [36]:
# Full rows behind the failed prices (the mask is already boolean)
df[problematic_rows]

Unnamed: 0,data-product-id,data-product-name,data-product-type,data-product-on-offer,data-product-index,image-url,product-page,product-name,product-size,item-price,price-per-unit,offer-description,category
48,722909,John Lewis Bumble Bee Napkin 33cm,G,True,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/john-le...,,,,,25% Off. Was £2.00,Home
49,624080,Cocktail Sticks Pot 150pk,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/cocktai...,,,,,,Home
24,522601,Essential Pedal Bin Liners Tie Handles,G,False,25.0,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/essenti...,,,,,,Household
48,85201,Essential Leeks,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/essenti...,,,,,,Dietary & Lifestyle
49,85513,Duchy Organic Carrots,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/duchy-o...,,,,,,Dietary & Lifestyle
48,67459,No.1 Royal Deeside Still Water,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/no1-roy...,,,,,,"Tea, Coffee & Soft Drinks"
49,69460,Waitrose Pineapple Fruit Juice,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/waitros...,,,,,,"Tea, Coffee & Soft Drinks"
48,522314,Waitrose Red Leicester Cheese & Sriracha Burge...,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/waitros...,,,,,,Summer
49,750319,Waitrose 2 Lemon & Lime Daisy Cheesecakes,G,True,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/waitros...,,,,,25% Off. Was £4.00,Summer
48,85247,Essential Cauliflower,G,False,,https://ecom-su-static-prod.wtrecom.com/images...,https://www.waitrose.com/ecom/products/essenti...,,,,,,Fresh & Chilled


In [37]:
# Every row that is not problematic is a row we keep
valid_rows = ~problematic_rows

# Summing a boolean mask counts its True entries
n_problematic = problematic_rows.sum()
n_valid = valid_rows.sum()

print(f"There are {n_problematic} rows with NaN in the original DataFrame")
print(f"There will be {n_valid} rows left once we remove those")

There are 11 rows with NaN in the original DataFrame
There will be 25407 rows left once we remove those


In [38]:
# Keep only the rows with a usable price; copy so later column assignments
# work on an independent DataFrame rather than a slice of the original
df = df.loc[valid_rows].copy()

In [39]:
# Preview the cleaned prices as floats without modifying df yet
(
    df['item-price']
        .apply(clean_item_price)
        .astype(float)
)

0       3.50
1       3.50
2       9.00
3      12.00
4       3.75
       ...  
801     7.00
802     6.00
803     1.05
804    15.49
805    14.99
Name: item-price, Length: 25407, dtype: float64

In [40]:
# Replace the raw text column with its cleaned, numeric version
cleaned_prices = df['item-price'].apply(clean_item_price)
df['item-price'] = cleaned_prices.astype(float)

# Confirm the column now holds floats
df['item-price'].dtype

dtype('float64')

In [41]:
# Sanity check: count the missing prices and confirm there are none
missing_prices = df['item-price'].isna().sum()
missing_prices == 0

True

In [44]:
# Star import: supplies the plotting names used in the next cell
# (ggplot, aes, geom_*, scale_*, theme, ...) as well as LetsPlot itself
from lets_plot import *
# Set lets-plot up for HTML output so figures render inside the notebook
LetsPlot.setup_html()
# numpy is used to generate the x-axis breaks of the plot
import numpy as np


In [45]:
# Summary statistics of the price per category, ordered by median price
price_stats = df.groupby('category')['item-price'].describe()
plot_df = (
    price_stats
        .reset_index()
        .rename(columns={'25%': 'Q1', '50%': 'median', '75%': 'Q3'})
        .sort_values(by='median')
)

# plot_df.head() to see how it looks like

# Hover tooltip: the category name, then the three quartiles formatted as £
tooltip_setup = (
    layer_tooltips()
        .line('@category')
        .line('[@Q1 -- @median -- @Q3]')
)
for quartile in ('@Q1', '@median', '@Q3'):
    tooltip_setup = tooltip_setup.format(quartile, '£ {.2f}')

# Base plot: map the summary columns to the aesthetics
g = ggplot(plot_df, aes(y='category', x='median', xmin='Q1', xmax='Q3', fill='category'))

# GEOMS

# Horizontal bar from Q1 to Q3 (reads the `xmin` and `xmax` aesthetics)
g += geom_linerange(size=1, alpha=0.75, tooltips=tooltip_setup)

# Dot at the median (reads the `x`, `y` and `fill` aesthetics)
g += geom_point(size=3, stroke=1, shape=21, tooltips=tooltip_setup)

# SCALES

# No legend: the categories can already be read from the y-axis
g += scale_fill_discrete(guide='none')

# Axis names, padding, tick format and tick positions
# NOTE(review): `category` is a discrete variable; a discrete y scale may be
# the more fitting choice here - confirm against the lets-plot documentation
g += scale_y_continuous(name="Categories\n(from largest to smallest median)", expand=[0.05, 0.05])
g += scale_x_continuous(name="Price (£)", expand=[0., 0.05], format='£ {.2f}', breaks=np.arange(0, 20, 2.5))

# LABELS
# The title states the key takeaway, the subtitle explains how to read the plot
g += labs(
    title='"Beer, Wine & Spirits" has the highest median price for individual items',
    subtitle="Dots represent the median price, bars represent the 25th and 75th percentiles",
)

# THEME: font sizes for ticks, axis titles and headings
g += theme(
    axis_text_x=element_text(size=15),
    axis_text_y=element_text(size=17),
    axis_title_x=element_text(size=20),
    axis_title_y=element_text(size=20),
    plot_title=element_text(size=19, face='bold'),
    plot_subtitle=element_text(size=18),
    legend_position='none',
)

# Overall figure size in pixels
g += ggsize(1000, 500)

g