In [1]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).





# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).






# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

In [2]:
import pandas as pd

# Example input covering three different date layouts (swap in your real data).
data = dict(date_column=['2023/10/26', '26-Nov-22', '12-05-2024'])
df = pd.DataFrame(data=data)

def standardize_date(date_str, formats=None):
    """Convert a single date value to a ``YYYY-MM-DD`` string.

    The candidate formats are tried in order and the first one that parses
    wins, so the order decides how ambiguous inputs are read (with the
    defaults, ``12-05-2024`` is interpreted day-first via ``%d-%m-%Y``).

    Parameters
    ----------
    date_str : str or None
        Raw date value. Leading/trailing whitespace on strings is ignored.
    formats : str or iterable of str, optional
        ``strptime``-style format(s) to try. Defaults to
        ``('%Y/%m/%d', '%d-%b-%y', '%d-%m-%Y')``.

    Returns
    -------
    str or None
        The date as ``YYYY-MM-DD``, or ``None`` when the value is missing,
        blank, or matches none of the formats.
    """
    if formats is None:
        formats = ('%Y/%m/%d', '%d-%b-%y', '%d-%m-%Y')
    elif isinstance(formats, str):
        # A bare string would otherwise be iterated character by character.
        formats = (formats,)

    # Missing values are answered up front: pd.to_datetime can hand back a
    # non-Timestamp (None / NaT) for them, which cannot be formatted.
    if date_str is None:
        return None
    if isinstance(date_str, str):
        date_str = date_str.strip()
        if not date_str:
            return None

    for fmt in formats:
        try:
            parsed = pd.to_datetime(date_str, format=fmt)
            if parsed is pd.NaT:
                # NaN-like input is missing under every format; stop here.
                return None
            return parsed.strftime('%Y-%m-%d')
        except ValueError:
            # This format did not match; fall through to the next one.
            continue
    return None

# Normalise every raw entry and store the result beside the original column.
df['date_column_standardized'] = df['date_column'].map(standardize_date)
print(df)

  date_column date_column_standardized
0  2023/10/26               2023-10-26
1   26-Nov-22               2022-11-26
2  12-05-2024               2024-05-12


In [3]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.








# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).











In [4]:
import pandas as pd

# Example input where each row uses a different date layout.
data = dict(
    date_column=['2023/10/26', '26-Nov-22', '12.05.2024', 'Oct 26, 2023', '20221126'],
)
df = pd.DataFrame(data=data)

def standardize_date_robust(date_str, formats=None):
    """Convert a date value in one of several known layouts to ``YYYY-MM-DD``.

    The candidate formats are tried in order and the first one that parses
    wins, so the order decides how ambiguous inputs are read.

    Parameters
    ----------
    date_str : str or None
        Raw date value. Leading/trailing whitespace on strings is ignored.
    formats : str or iterable of str, optional
        ``strptime``-style format(s) to try. Defaults to
        ``('%Y/%m/%d', '%d-%b-%y', '%d.%m.%Y', '%b %d, %Y', '%Y%m%d')``.

    Returns
    -------
    str or None
        The date as ``YYYY-MM-DD``, or ``None`` when the value is missing,
        blank, or matches none of the formats.
    """
    if formats is None:
        formats = ('%Y/%m/%d', '%d-%b-%y', '%d.%m.%Y', '%b %d, %Y', '%Y%m%d')
    elif isinstance(formats, str):
        # A bare string would otherwise be iterated character by character.
        formats = (formats,)

    # Missing values are answered up front: pd.to_datetime can hand back a
    # non-Timestamp (None / NaT) for them, which cannot be formatted.
    if date_str is None:
        return None
    if isinstance(date_str, str):
        date_str = date_str.strip()
        if not date_str:
            return None

    for fmt in formats:
        try:
            parsed = pd.to_datetime(date_str, format=fmt)
            if parsed is pd.NaT:
                # NaN-like input is missing under every format; stop here.
                return None
            return parsed.strftime('%Y-%m-%d')
        except ValueError:
            # This format did not match; fall through to the next one.
            continue
    return None

# Run the multi-format parser over each entry and keep the result alongside the raw value.
df['date_column_standardized'] = df['date_column'].map(standardize_date_robust)
print(df)

    date_column date_column_standardized
0    2023/10/26               2023-10-26
1     26-Nov-22               2022-11-26
2    12.05.2024               2024-05-12
3  Oct 26, 2023               2023-10-26
4      20221126               2022-11-26
