# Data Cleaning Techniques: Fixing Common Data Issues

This notebook covers practical techniques for cleaning messy data, including handling formatting issues, special characters, and data entry errors.

In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Create a Messy Dataset

In [2]:
# Create a dataset with various formatting and data entry issues
messy_data = {
    'name': ['John Smith', ' jane doe', 'BOB JOHNSON', 'Alice  Brown ', 'charlie wilson',
             'David Lee!!!', 'Emily@Davis', 'MICHAEL O\'BRIEN', 'Sarah-Jane  Williams', 'Tom_Anderson'],
    'phone': ['(555) 123-4567', '555.987.6543', '555-246-8135', '15551234567', '555 369 2580',
              '(555)4567890', '+1-555-111-2222', '555.333.4444 ext 123', 'CALL: 555-9876', '5559998888'],
    'email': ['john@email.com', 'JANE@GMAIL.COM', 'bob@company..com', 'alice@@email.com', 'charlie@email',
              'david@email.com ', ' emily@email.com', 'michael@email.COM', 'sarah jane@email.com', 'tom@.com'],
    'date_joined': ['2023-01-15', '01/20/2023', '2023.02.28', '03-15-23', '2023/4/1',
                   '20230510', 'June 15, 2023', '2023-07-01 00:00:00', '08/15/23', '2023-09-30'],
    'salary': ['$75,000', '82000', '$90,500.00', '65000.0', '$120k',
              '95,000', 'USD 88000', '77500 dollars', '$105.000', '92000.50'],
    'department': ['Sales', 'sales', 'MARKETING', 'IT ', ' It',
                  'Human Resources', 'HR', 'Financ', 'Engineering', 'sales '],
    'product_code': ['ABC-123', 'DEF456', 'GHI 789', 'jkl-012', 'MNO_345',
                    'PQR#678', 'STU901', 'VWX-234-A', 'YZ-567', 'ABC123']
}

df = pd.DataFrame(messy_data)
print("Original messy data:")
df

Original messy data:


Unnamed: 0,name,phone,email,date_joined,salary,department,product_code
0,John Smith,(555) 123-4567,john@email.com,2023-01-15,"$75,000",Sales,ABC-123
1,jane doe,555.987.6543,JANE@GMAIL.COM,01/20/2023,82000,sales,DEF456
2,BOB JOHNSON,555-246-8135,bob@company..com,2023.02.28,"$90,500.00",MARKETING,GHI 789
3,Alice Brown,15551234567,alice@@email.com,03-15-23,65000.0,IT,jkl-012
4,charlie wilson,555 369 2580,charlie@email,2023/4/1,$120k,It,MNO_345
5,David Lee!!!,(555)4567890,david@email.com,20230510,95000,Human Resources,PQR#678
6,Emily@Davis,+1-555-111-2222,emily@email.com,"June 15, 2023",USD 88000,HR,STU901
7,MICHAEL O'BRIEN,555.333.4444 ext 123,michael@email.COM,2023-07-01 00:00:00,77500 dollars,Financ,VWX-234-A
8,Sarah-Jane Williams,CALL: 555-9876,sarah jane@email.com,08/15/23,$105.000,Engineering,YZ-567
9,Tom_Anderson,5559998888,tom@.com,2023-09-30,92000.50,sales,ABC123


## Section 1: Cleaning Text Data

In [3]:
df_clean = df.copy()

# Clean names: remove special characters, fix capitalization, trim whitespace
def clean_name(name):
    # Remove special characters except spaces, hyphens, and apostrophes
    name = re.sub(r'[^a-zA-Z\s\-\']', '', name)
    # Remove extra spaces
    name = ' '.join(name.split())
    # Proper case
    name = name.title()
    return name.strip()

df_clean['name'] = df['name'].apply(clean_name)

# Clean and standardize department names
def standardize_department(dept):
    dept = dept.strip().lower()
    # Fix common variations
    dept_mapping = {
        'it': 'IT',
        'hr': 'Human Resources',
        'human resources': 'Human Resources',
        'sales': 'Sales',
        'marketing': 'Marketing',
        'financ': 'Finance',
        'engineering': 'Engineering'
    }
    return dept_mapping.get(dept, dept.title())

df_clean['department'] = df['department'].apply(standardize_department)

print("Cleaned text fields:")
print(df_clean[['name', 'department']])

Cleaned text fields:
                  name       department
0           John Smith            Sales
1             Jane Doe            Sales
2          Bob Johnson        Marketing
3          Alice Brown               IT
4       Charlie Wilson               IT
5            David Lee  Human Resources
6           Emilydavis  Human Resources
7      Michael O'Brien          Finance
8  Sarah-Jane Williams      Engineering
9          Tomanderson            Sales


## Section 2: Standardizing Phone Numbers and Email

In [4]:
# Standardize phone numbers to format: (555) 555-5555
def clean_phone(phone):
    # Extract only digits
    digits = re.sub(r'\D', '', str(phone))
    
    # Handle extensions by taking first 10 digits
    if len(digits) >= 10:
        digits = digits[:10]
    
    # Format if we have exactly 10 digits
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    else:
        return 'Invalid'

df_clean['phone'] = df['phone'].apply(clean_phone)

# Clean email addresses
def clean_email(email):
    # Convert to lowercase and strip whitespace
    email = str(email).lower().strip()
    
    # Fix common issues
    email = re.sub(r'\.{2,}', '.', email)  # Replace multiple dots with single dot
    email = re.sub(r'@{2,}', '@', email)   # Replace multiple @ with single @
    email = re.sub(r'\s+', '', email)      # Remove spaces
    
    # Basic validation
    if re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
        return email
    else:
        return 'Invalid Email'

df_clean['email'] = df['email'].apply(clean_email)

print("Cleaned contact information:")
print(df_clean[['phone', 'email']])

Cleaned contact information:
            phone                email
0  (555) 123-4567       john@email.com
1  (555) 987-6543       jane@gmail.com
2  (555) 246-8135      bob@company.com
3  (155) 512-3456      alice@email.com
4  (555) 369-2580        Invalid Email
5  (555) 456-7890      david@email.com
6  (155) 511-1222      emily@email.com
7  (555) 333-4444    michael@email.com
8         Invalid  sarahjane@email.com
9  (555) 999-8888        Invalid Email


## Section 3: Fixing Date Formats

In [5]:
# Standardize all dates to YYYY-MM-DD format
def parse_date(date_str):
    date_str = str(date_str).strip()
    
    # Try different date formats
    formats = [
        '%Y-%m-%d',           # 2023-01-15
        '%m/%d/%Y',           # 01/20/2023
        '%Y.%m.%d',           # 2023.02.28
        '%m-%d-%y',           # 03-15-23
        '%Y/%m/%d',           # 2023/4/1
        '%Y%m%d',             # 20230510
        '%B %d, %Y',          # June 15, 2023
        '%Y-%m-%d %H:%M:%S',  # 2023-07-01 00:00:00
        '%m/%d/%y',           # 08/15/23
    ]
    
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    
    return 'Invalid Date'

df_clean['date_joined'] = df['date_joined'].apply(parse_date)

print("Original vs Cleaned dates:")
comparison = pd.DataFrame({
    'original': df['date_joined'],
    'cleaned': df_clean['date_joined']
})
print(comparison)

Original vs Cleaned dates:
              original     cleaned
0           2023-01-15  2023-01-15
1           01/20/2023  2023-01-20
2           2023.02.28  2023-02-28
3             03-15-23  2023-03-15
4             2023/4/1  2023-04-01
5             20230510  2023-05-10
6        June 15, 2023  2023-06-15
7  2023-07-01 00:00:00  2023-07-01
8             08/15/23  2023-08-15
9           2023-09-30  2023-09-30


## Section 4: Cleaning Numeric Data with Text

In [6]:
# Clean salary data - convert to numeric
def clean_salary(salary):
    salary = str(salary).upper()
    
    # Remove currency symbols and text
    salary = re.sub(r'[^0-9.,K]', '', salary)
    
    # Handle 'K' notation
    if 'K' in salary:
        salary = salary.replace('K', '000')
    
    # Fix European decimal notation (if present)
    if '.' in salary and ',' in salary:
        if salary.index('.') > salary.index(','):
            salary = salary.replace(',', '')
        else:
            salary = salary.replace('.', '').replace(',', '.')
    else:
        salary = salary.replace(',', '')
    
    try:
        return float(salary)
    except:
        return np.nan

df_clean['salary'] = df['salary'].apply(clean_salary)

print("Cleaned salary data:")
salary_comparison = pd.DataFrame({
    'original': df['salary'],
    'cleaned': df_clean['salary']
})
print(salary_comparison)

Cleaned salary data:
        original   cleaned
0        $75,000   75000.0
1          82000   82000.0
2     $90,500.00   90500.0
3        65000.0   65000.0
4          $120k  120000.0
5         95,000   95000.0
6      USD 88000   88000.0
7  77500 dollars   77500.0
8       $105.000     105.0
9       92000.50   92000.5


## Section 5: Standardizing Product Codes

In [7]:
# Standardize product codes to format: XXX-###
def standardize_product_code(code):
    # Convert to uppercase and remove spaces
    code = str(code).upper().replace(' ', '')
    
    # Extract letters and numbers
    letters = re.findall(r'[A-Z]+', code)
    numbers = re.findall(r'\d+', code)
    
    if letters and numbers:
        # Take first 3 letters and first 3 digits
        letter_part = (letters[0][:3]).ljust(3, 'X')
        number_part = (numbers[0][:3]).zfill(3)
        return f"{letter_part}-{number_part}"
    else:
        return 'Invalid Code'

df_clean['product_code'] = df['product_code'].apply(standardize_product_code)

print("Standardized product codes:")
code_comparison = pd.DataFrame({
    'original': df['product_code'],
    'cleaned': df_clean['product_code']
})
print(code_comparison)

Standardized product codes:
    original  cleaned
0    ABC-123  ABC-123
1     DEF456  DEF-456
2    GHI 789  GHI-789
3    jkl-012  JKL-012
4    MNO_345  MNO-345
5    PQR#678  PQR-678
6     STU901  STU-901
7  VWX-234-A  VWX-234
8     YZ-567  YZX-567
9     ABC123  ABC-123


### Exercise 1: Create a Phone Number Validator
Write a function that validates and formats international phone numbers.

In [8]:
def validate_international_phone(phone, country_code='US'):
    """
    Validate and format phone numbers for different countries
    US: (555) 555-5555
    UK: +44 20 5555 5555
    Return formatted number or 'Invalid'
    """
    # Your code here
    pass

# Test your function
# test_phones = ['555-123-4567', '+44 20 1234 5678', '1234567890']
# for phone in test_phones:
#     print(f"{phone} -> {validate_international_phone(phone)}")

## Final Comparison

In [9]:
# Show before and after
print("BEFORE CLEANING:")
print(df.head(3))
print("\n" + "="*80 + "\n")
print("AFTER CLEANING:")
print(df_clean.head(3))

BEFORE CLEANING:
          name           phone             email date_joined      salary  \
0   John Smith  (555) 123-4567    john@email.com  2023-01-15     $75,000   
1     jane doe    555.987.6543    JANE@GMAIL.COM  01/20/2023       82000   
2  BOB JOHNSON    555-246-8135  bob@company..com  2023.02.28  $90,500.00   

  department product_code  
0      Sales      ABC-123  
1      sales       DEF456  
2  MARKETING      GHI 789  


AFTER CLEANING:
          name           phone            email date_joined   salary  \
0   John Smith  (555) 123-4567   john@email.com  2023-01-15  75000.0   
1     Jane Doe  (555) 987-6543   jane@gmail.com  2023-01-20  82000.0   
2  Bob Johnson  (555) 246-8135  bob@company.com  2023-02-28  90500.0   

  department product_code  
0      Sales      ABC-123  
1      Sales      DEF-456  
2  Marketing      GHI-789  


In [10]:
# Data quality report
print("Data Quality Report:")
print("="*40)
for col in df_clean.columns:
    if 'Invalid' in df_clean[col].astype(str).values or df_clean[col].isnull().any():
        invalid_count = ((df_clean[col] == 'Invalid') | 
                        (df_clean[col].astype(str).str.contains('Invalid')) | 
                        df_clean[col].isnull()).sum()
        print(f"{col}: {invalid_count} issues found")
    else:
        print(f"{col}: ✓ All clean")

Data Quality Report:
name: ✓ All clean
phone: 1 issues found
email: ✓ All clean
date_joined: ✓ All clean
salary: ✓ All clean
department: ✓ All clean
product_code: ✓ All clean


### Exercise 2: Create a Comprehensive Data Cleaner
Combine all cleaning functions into a single pipeline.

In [11]:
def clean_dataset(df):
    """
    Apply all cleaning operations to a dataset
    Return cleaned dataframe and a cleaning report
    """
    # Your code here
    pass

# Test your function
# df_final, report = clean_dataset(df.copy())
# print(report)

## Summary

### Key Data Cleaning Techniques:

1. **Text Cleaning**:
   - Remove special characters
   - Standardize capitalization
   - Trim whitespace
   - Fix encoding issues

2. **Format Standardization**:
   - Phone numbers: Consistent formatting
   - Emails: Lowercase, validate structure
   - Dates: Convert to standard format
   - Codes: Apply consistent patterns

3. **Numeric Data with Text**:
   - Remove currency symbols
   - Handle abbreviations (K, M)
   - Fix decimal separators
   - Convert to appropriate type

4. **Common Issues to Watch For**:
   - Leading/trailing spaces
   - Inconsistent capitalization
   - Special characters in wrong places
   - Mixed formats in same column
   - Data entry variations

Remember: Always preserve the original data and document your cleaning steps!