In [1]:
import pandas as pd
# Source workbook: one sheet per fiscal year.
file_path = 'online_retail_II.xlsx'
sheet_names = ['Year 2009-2010', 'Year 2010-2011']

# Passing a list of sheet names makes pandas open the workbook once and
# return a dict {sheet name: DataFrame}, instead of re-opening the file
# for every sheet as two separate read_excel calls would.
sheets = pd.read_excel(file_path, sheet_name=sheet_names)
df_1 = sheets['Year 2009-2010']
df_2 = sheets['Year 2010-2011']

# Stack both years; ignore_index=True renumbers the rows 0..n-1 so the
# merged frame does not carry repeated index labels from the two sheets.
df_master = pd.concat([df_1, df_2], ignore_index=True)
print(f"Total rows after merging: {len(df_master)}")

Total rows after merging: 1067371


In [2]:
# Quick data-quality report on the merged frame: nulls per column,
# number of fully duplicated rows, and descriptive statistics for the
# two numeric columns.
print("--- Missing Values ---")
nulls_per_column = df_master.isna().sum()
print(nulls_per_column)

duplicate_count = df_master.duplicated().sum()
print(f"\nDuplicate rows found: {duplicate_count}")

print("\n--- Summary Statistics ---")
numeric_columns = ['Quantity', 'Price']
numeric_summary = df_master[numeric_columns].describe()
print(numeric_summary)

--- Missing Values ---
Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

Duplicate rows found: 34335

--- Summary Statistics ---
           Quantity         Price
count  1.067371e+06  1.067371e+06
mean   9.938898e+00  4.649388e+00
std    1.727058e+02  1.235531e+02
min   -8.099500e+04 -5.359436e+04
25%    1.000000e+00  1.250000e+00
50%    3.000000e+00  2.100000e+00
75%    1.000000e+01  4.150000e+00
max    8.099500e+04  3.897000e+04


In [3]:
# Step 1: discard rows that are exact copies of an earlier row.
deduplicated = df_master.drop_duplicates()
dropped_duplicates = len(df_master) - len(deduplicated)
print(f"Removed {dropped_duplicates} duplicate rows.")

# Step 2: keep only lines whose quantity AND price are strictly positive
# (zero values are dropped together with the negative ones).
positive_quantity = deduplicated['Quantity'] > 0
positive_price = deduplicated['Price'] > 0
sellable_rows = positive_quantity & positive_price

# Step 3: drop whatever still lacks a customer identifier.
df_clean = (
    deduplicated[sellable_rows]
    .dropna(subset=['Customer ID'])
)

print(f"Final rows for analysis: {len(df_clean)}")

Removed 34335 duplicate rows.
Final rows for analysis: 779425


In [4]:
# Normalise the column labels first (lower-case, spaces -> underscores),
# then derive the line total from the normalised names.
#
# Doing it in this order keeps the cell safe to re-run: on a second run
# the rename is a no-op and `total_sales` is simply recomputed in place,
# whereas computing from 'Quantity'/'Price' would raise KeyError once the
# labels have been lower-cased. Both steps return a new frame rather than
# mutating the filtered frame produced by the cleaning cell.
df_clean = df_clean.rename(columns=lambda col: col.lower().replace(' ', '_'))

# Revenue per invoice line; appended as the last column on first run.
df_clean = df_clean.assign(total_sales=df_clean['quantity'] * df_clean['price'])

In [5]:
# Show the first five rows of the cleaned frame as a sanity check.
preview_rows = df_clean.head(5)
print(preview_rows)

  invoice stockcode                          description  quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          invoicedate  price  customer_id         country  total_sales  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom         83.4  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom         81.0  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom         81.0  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom        100.8  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom         30.0  


In [6]:
# Persist the cleaned data next to the notebook; index=False leaves the
# row index out of the file.
output_csv = 'online_retail_cleaned.csv'
df_clean.to_csv(output_csv, index=False)