# Online Retail Dataset Analysis

This notebook loads and examines the online retail dataset, focusing on the invoice identifier column (named `InvoiceNo` in this dataset; there is no `InvoiceId` column), where some values start with a letter.

In [3]:
# Import necessary libraries
!pip install --quiet pandas
import pandas as pd


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Load the online retail dataset (CSV hosted in the
# "Spark: The Definitive Guide" repository on GitHub)
url = 'https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/retail-data/all/online-retail-dataset.csv'

# InvoiceNo mixes purely numeric ids with letter-prefixed ones, so read it as
# text explicitly instead of relying on type inference; chunked inference can
# otherwise leave a column holding a mix of numbers and strings.
df = pd.read_csv(url, dtype={'InvoiceNo': str})

# Display the shape and the first few rows of the dataset
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
# The invoice identifier column of this dataset is called InvoiceNo
# (not InvoiceId); print the column names to confirm it.
print("Column names in the dataset:")
print(list(df.columns))

# Name of the invoice identifier column, reused by the following cells
invoice_col = 'InvoiceNo'

# Preview the first 10 invoice identifiers
print(f"\nFirst 10 values of {invoice_col}:")
print(df[invoice_col].iloc[:10].tolist())

Column names in the dataset:
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

First 10 values of InvoiceNo:
['536365', '536365', '536365', '536365', '536365', '536365', '536365', '536366', '536366', '536367']


In [5]:
# Analyze the pattern of invoice numbers

def starts_with_letter(value):
    """Return True when ``value`` is a non-empty string starting with a letter.

    Values that are not ``str`` instances (numbers, ``None``, NaN, ...) and the
    empty string yield False. "Letter" means whatever ``str.isalpha`` accepts.
    """
    # value[:1] is '' for an empty string, and ''.isalpha() is False,
    # so no separate length check is needed.
    return isinstance(value, str) and value[:1].isalpha()

# Flag each row according to whether its invoice number begins with a letter
df['starts_with_letter'] = df[invoice_col].apply(starts_with_letter)

# Tally the flagged rows against the size of the whole dataset
total_rows = df.shape[0]
letter_count = df['starts_with_letter'].sum()

print(f"Number of invoices that start with a letter: {letter_count} out of {total_rows} ({letter_count/total_rows:.2%})")

# Collect the distinct letter-prefixed invoice numbers and show a few of them
lettered_invoices = df.loc[df['starts_with_letter'], invoice_col].unique()
print("\nExamples of invoices that start with a letter (showing up to 20):")
print(lettered_invoices[:20].tolist())

Number of invoices that start with a letter: 9291 out of 541909 (1.71%)

Examples of invoices that start with a letter (showing up to 20):
['C536379', 'C536383', 'C536391', 'C536506', 'C536543', 'C536548', 'C536606', 'C536622', 'C536625', 'C536642', 'C536734', 'C536737', 'C536757', 'C536758', 'C536760', 'C536807', 'C536812', 'C536814', 'C536815', 'C536816']


In [9]:
# Split the rows into two frames using the boolean 'starts_with_letter' flag:
# one for letter-prefixed invoice numbers, one for all remaining rows
df_with_letters = df.loc[df['starts_with_letter']]
df_without_letters = df.loc[~df['starts_with_letter']]

print(f"DataFrame with letter invoices: {len(df_with_letters)} rows")
print(f"DataFrame without letter invoices: {len(df_without_letters)} rows")

DataFrame with letter invoices: 9291 rows
DataFrame without letter invoices: 532618 rows


In [10]:
# Preview the first three rows whose invoice number starts with a letter
print("Sample rows from invoices starting with letters:")
df_with_letters.iloc[:3]

Sample rows from invoices starting with letters:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,starts_with_letter
141,C536379,D,Discount,-1,12/1/2010 9:41,27.5,14527.0,United Kingdom,True
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,12/1/2010 9:49,4.65,15311.0,United Kingdom,True
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,12/1/2010 10:24,1.65,17548.0,United Kingdom,True


In [11]:
# Preview the first three rows whose invoice number does not start with a letter
print("Sample rows from invoices with no letters:")
df_without_letters.iloc[:3]

Sample rows from invoices with no letters:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,starts_with_letter
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,False
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,False
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,False


In [12]:
# Make sure a 'sets' output directory exists under the current working directory
import os
sets_dir = os.path.join(os.getcwd(), 'sets')
os.makedirs(sets_dir, exist_ok=True)

# Destination CSV paths for the two subsets
letters_file = os.path.join(sets_dir, 'invoices_with_letters.csv')
no_letters_file = os.path.join(sets_dir, 'invoices_no_letters.csv')

# Write each subset to its own file, leaving the DataFrame index out
for subset, destination in (
    (df_with_letters, letters_file),
    (df_without_letters, no_letters_file),
):
    subset.to_csv(destination, index=False)

print(f"Saved dataframe with letter invoices to: {letters_file}")
print(f"Saved dataframe with number invoices to: {no_letters_file}")
print(f"Number of rows saved: {len(df_with_letters)} with letters, {len(df_without_letters)} without letters")

Saved dataframe with letter invoices to: /home/coder/src/query/sets/invoices_with_letters.csv
Saved dataframe with number invoices to: /home/coder/src/query/sets/invoices_no_letters.csv
Number of rows saved: 9291 with letters, 532618 without letters


In [5]:
# Save the original dataset (source columns only) to a local CSV file
import os

# Ensure the sets directory exists
sets_dir = os.path.join(os.getcwd(), 'sets')
os.makedirs(sets_dir, exist_ok=True)

# Define the path for the original dataset
original_file = os.path.join(sets_dir, 'original_dataset.csv')

# An earlier cell adds a helper 'starts_with_letter' column to df in place.
# Drop it so that a top-to-bottom run exports only the columns that came from
# the source file; errors='ignore' keeps the cell working when the helper
# column has not been added (yet).
df_original = df.drop(columns=['starts_with_letter'], errors='ignore')

# Save the dataframe to a CSV file without the index
df_original.to_csv(original_file, index=False)

print(f"Saved original dataset to: {original_file}")
print(f"Number of rows saved: {df_original.shape[0]} rows, {df_original.shape[1]} columns")

Saved original dataset to: /home/coder/src/query/sets/original_dataset.csv
Number of rows saved: 541909 rows, 8 columns
