In [1]:
# import the data

import pandas as pd

sales = pd.read_csv('messy_customer_sales.csv')
sales.head()

Unnamed: 0,customer_id,customer_name,email,signup_date,last_purchase_date,purchase_amount,product_category,country
0,145.0,Ryan Perez,ryan.perez@example.com,04-29-2025,21/03/2025,133.82,Fashion,france
1,328.0,Rachel Donovan,,07/08/2024,09-11-2024,210.23,Electronics,France
2,358.0,CHRISTOPHER HARRIS,christopher.harris@example.com,13/06/2025,2024-08-24,288.82,Electronics,uk
3,435.0,Patricia Caldwell,patricia.caldwell@example.com,17/06/2025,15/02/2025,312.42,Fashion,UK
4,429.0,Lori Nelson,lorinelson@@example.com,2024-10-30,08-25-2024,159.17,Fashion,france


In [2]:
# Understand the data

sales.dtypes # Show what datatypes the individual columns have
sales.info() # Shows how much memory is used, how many non-null values each column has
sales.describe() # Generates descriptive statistics (e.g., mean, std, min, max) for all numerical columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         981 non-null    float64
 1   customer_name       1000 non-null   object 
 2   email               955 non-null    object 
 3   signup_date         1000 non-null   object 
 4   last_purchase_date  899 non-null    object 
 5   purchase_amount     1000 non-null   float64
 6   product_category    1000 non-null   object 
 7   country             949 non-null    object 
dtypes: float64(2), object(6)
memory usage: 62.6+ KB


Unnamed: 0,customer_id,purchase_amount
count,981.0,1000.0
mean,258.904179,395.91217
std,145.19002,1022.938237
min,1.0,-98.65
25%,137.0,132.7475
50%,258.0,257.365
75%,383.0,384.8625
max,500.0,9771.97


In [3]:
# Deal with missing values

# Step 1: Convert customer_id to string
sales['customer_id'] = sales['customer_id'].astype(str)

# Step 2: Create a mask for missing or blank customer_id
missing_mask = sales['customer_id'].isin(['nan', '', ' '])

# Step 3: Get max of existing numeric customer IDs
existing_ids = pd.to_numeric(sales.loc[~missing_mask, 'customer_id'], errors='coerce')
existing_ids = existing_ids.dropna().astype(int)

max_id = existing_ids.max() if not existing_ids.empty else 0

# Step 4: Generate and assign new customer IDs
new_ids = range(max_id + 1, max_id + 1 + missing_mask.sum())
sales.loc[missing_mask, 'customer_id'] = [f'{i:03d}' for i in new_ids]

# Step 5 (optional): Make sure all IDs are consistently zero-padded strings
sales['customer_id'] = sales['customer_id'].astype(str).str.zfill(3)

In [4]:
# Step 1: Convert to numeric (float), coercing errors to NaN
sales['customer_id'] = pd.to_numeric(sales['customer_id'], errors='coerce')

# Step 2: Drop or fill NaNs if necessary (optional)
# sales['customer_id'] = sales['customer_id'].fillna(0)

# Step 3: Convert to integer (after step 1 removes invalid strings)
sales['customer_id'] = sales['customer_id'].astype('Int64')  # capital 'I' for nullable int type


In [5]:
sales.dtypes
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         1000 non-null   Int64  
 1   customer_name       1000 non-null   object 
 2   email               955 non-null    object 
 3   signup_date         1000 non-null   object 
 4   last_purchase_date  899 non-null    object 
 5   purchase_amount     1000 non-null   float64
 6   product_category    1000 non-null   object 
 7   country             949 non-null    object 
dtypes: Int64(1), float64(1), object(6)
memory usage: 63.6+ KB


In [6]:
sales.value_counts('product_category')

product_category
Home           290
Fashion        286
Electronics    278
electronics     34
fashion         27
home            27
Fashio          20
Hom             20
Electronic      15
fashio           2
hom              1
Name: count, dtype: int64

In [7]:
# deal with product_category

sales['product_category'] = sales['product_category'].str.lower()
sales['product_category'] = sales['product_category'].replace('hom', 'home')
sales['product_category'] = sales['product_category'].replace('electronic', 'electronics')
sales['product_category'] = sales['product_category'].replace('fashio', 'fashion')


In [8]:
sales.value_counts('product_category')

product_category
home           338
fashion        335
electronics    327
Name: count, dtype: int64

In [9]:
# deal with country

sales.value_counts('country')

country
UK                114
United States     114
uk                114
France            105
france            103
usa               102
United Kingdom    101
FRA               100
USA                96
Name: count, dtype: int64

In [10]:
sales['country'] = sales['country'].str.lower()
sales['country'] = sales['country'].replace('united states', 'usa')
sales['country'] = sales['country'].replace('united kingdom', 'uk')
sales['country'] = sales['country'].replace('fra', 'france')

sales.value_counts('country')

country
uk        329
usa       312
france    308
Name: count, dtype: int64

In [11]:
# deal with customer name
sales['customer_name'] = sales['customer_name'].str.title().str.strip()
sales.value_counts('customer_name').head(20)

customer_name
Robert Johnson      2
Chase Smith         2
Andrea Hart         2
Joseph Santos       2
Jeffrey Clark       2
Samantha Moore      2
Rebecca Miller      2
David Clark         2
Mark Adams          2
Andrew Johnson      2
Matthew Hall        1
Matthew Abbott      1
Mary Silva          1
Mary Martinez Md    1
Matthew Martinez    1
Mary Jones          1
Mary Ellis          1
Mary Edwards        1
Matthew Hansen      1
Aaron Briggs        1
Name: count, dtype: int64

In [12]:
sales[sales['customer_name'].str.startswith(' ')]
sales[sales['customer_name'].str.endswith(' ')]

Unnamed: 0,customer_id,customer_name,email,signup_date,last_purchase_date,purchase_amount,product_category,country


In [13]:
# deal with email

sales['email'] = sales['email'].str.strip().str.lower().str.replace('@@', '@', regex=False)


In [14]:
sales['email'].isna().sum()       # missing or invalid
sales['email'].value_counts().head(20)  # most common


email
andrew.johnson@example.com     2
david.clark@example.com        2
jeffrey.clark@example.com      2
joseph.santos@example.com      2
mark.adams@example.com         2
chase.smith@example.com        2
rebecca.miller@example.com     2
samantha.moore@example.com     2
andrea.hart@example.com        2
pamela.bell@example.com        1
barbara.flores@example.com     1
edward.herman@example.com      1
deanna.ochoa@example.com       1
kelly.williams@example.com     1
thomas.martinez@example.com    1
mark.golden@example.com        1
joseph.rivera@example.com      1
peter.hall@example.com         1
william.smith@example.com      1
brenda.white@example.com       1
Name: count, dtype: int64

In [15]:
sales[sales.duplicated('customer_name', keep=False)][['email', 'customer_name']].sort_values('email').head(30)


Unnamed: 0,email,customer_name
245,andrea.hart@example.com,Andrea Hart
660,andrea.hart@example.com,Andrea Hart
64,andrew.johnson@example.com,Andrew Johnson
158,andrew.johnson@example.com,Andrew Johnson
999,chase.smith@example.com,Chase Smith
214,chase.smith@example.com,Chase Smith
799,david.clark@example.com,David Clark
379,david.clark@example.com,David Clark
774,jeffrey.clark@example.com,Jeffrey Clark
574,jeffrey.clark@example.com,Jeffrey Clark


In [16]:
# deal with signup_date 

sales.dtypes
sales.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         1000 non-null   Int64  
 1   customer_name       1000 non-null   object 
 2   email               955 non-null    object 
 3   signup_date         1000 non-null   object 
 4   last_purchase_date  899 non-null    object 
 5   purchase_amount     1000 non-null   float64
 6   product_category    1000 non-null   object 
 7   country             949 non-null    object 
dtypes: Int64(1), float64(1), object(6)
memory usage: 63.6+ KB


In [17]:
sales['purchase_amount'].unique()[:20] 

array([ 133.82,  210.23,  288.82,  312.42,  159.17,  197.21,  335.26,
        325.56,  112.1 ,  380.44,  427.61,  -32.55,   80.46, 8624.18,
        362.98,  321.54,  259.4 ,  273.81,  356.85,  354.76])

In [18]:
sales[sales['purchase_amount'] < 0]


Unnamed: 0,customer_id,customer_name,email,signup_date,last_purchase_date,purchase_amount,product_category,country
11,450,Mark Nichols,mark.nichols@example.com,2025/06/03,,-32.55,fashion,france
50,277,Dr. Allison Stanley,dr..allison.stanley@example.com,24/06/2025,22/06/2025,-21.8,fashion,france
159,146,Ethan Gonzales,,04-25-2025,08/09/2024,-24.36,home,france
168,102,Joseph Garrison,joseph.garrison@example.com,02/01/2025,2024/10/13,-93.2,fashion,usa
190,291,Ashley Johnston,ashley.johnston@example.com,2025/04/23,2024/07/29,-68.41,fashion,usa
225,116,Heather Patton,heather.patton@example.com,2024/12/23,05/02/2025,-6.43,electronics,usa
229,307,John Curtis,john.curtis@example.com,09-28-2024,,-8.96,home,uk
243,200,Michele Murphy,michele.murphy@example.com,2024/09/03,04-17-2025,-68.93,home,uk
282,123,Kevin Powell,kevin.powell@example.com,17/11/2024,20/09/2024,-81.17,fashion,usa
342,92,Samantha Moore,samantha.moore@example.com,2025/02/21,2025-01-14,-12.15,home,usa


In [19]:
sales = sales[sales['purchase_amount'] >= 0]


In [20]:
# deal with dates

sales[['signup_date', 'last_purchase_date']].isna().sum()

signup_date            0
last_purchase_date    97
dtype: int64

In [21]:
sales[['signup_date', 'last_purchase_date']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 971 entries, 0 to 999
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   signup_date         971 non-null    object
 1   last_purchase_date  874 non-null    object
dtypes: object(2)
memory usage: 22.8+ KB


In [22]:
common_formats = [
    '%Y-%m-%d',     # 2023-12-31
    '%d-%m-%Y',     # 31-12-2023
    '%m/%d/%Y',     # 12/31/2023
    '%d.%m.%Y',     # 31.12.2023
    '%b %d, %Y',    # Dec 31, 2023
    '%B %d, %Y',    # December 31, 2023
    '%Y/%m/%d',     # 2023/12/31
]


from datetime import datetime

def try_parse_date(date_str):
    if pd.isna(date_str):
        return pd.NaT
    for fmt in common_formats:
        try:
            return datetime.strptime(date_str.strip(), fmt)
        except (ValueError, TypeError):
            continue
    return pd.NaT  # if all formats fail

sales['signup_date'] = sales['signup_date'].apply(try_parse_date)
sales['last_purchase_date'] = sales['last_purchase_date'].apply(try_parse_date)

sales['signup_date'] = sales['signup_date'].dt.date
sales['last_purchase_date'] = sales['last_purchase_date'].dt.date



In [23]:
sales.dtypes

customer_id             Int64
customer_name          object
email                  object
signup_date            object
last_purchase_date     object
purchase_amount       float64
product_category       object
country                object
dtype: object

In [24]:
import sqlite3

# Create or connect to a local SQLite database file
conn = sqlite3.connect('cleaned_sales.db')

# Export DataFrame to a table named 'sales_data'
sales.to_sql('sales_data', conn, if_exists='replace', index=False)

# Optional: Close the connection
conn.close()



In [25]:
conn = sqlite3.connect('cleaned_sales.db')
pd.read_sql("SELECT * FROM sales_data LIMIT 5;", conn)


Unnamed: 0,customer_id,customer_name,email,signup_date,last_purchase_date,purchase_amount,product_category,country
0,145,Ryan Perez,ryan.perez@example.com,,,133.82,fashion,france
1,328,Rachel Donovan,,2024-07-08,2024-11-09,210.23,electronics,france
2,358,Christopher Harris,christopher.harris@example.com,,2024-08-24,288.82,electronics,uk
3,435,Patricia Caldwell,patricia.caldwell@example.com,,,312.42,fashion,uk
4,429,Lori Nelson,lorinelson@example.com,2024-10-30,,159.17,fashion,france


In [26]:
sales.to_csv('cleaned_sales_backup.csv', index=False)
