In [2]:
import pandas as pd

# Existing dataset
main_df = pd.read_csv('D:/Projects/bank/bank-full.csv', sep=';')

# Artificial invalid data to inject
invalid_data = pd.DataFrame({
    'age': [-1, 200, 30, 45],
    'job': ['#manager', '*unknown', 'admin.', 'teacher'],
    'marital': ['single', 'married', '#divorced', 'single'],
    'education': ['tertiary', 'primary', 'xyz', 'secondary'],
    'default': ['no', 'yes', 'no', 'maybe'],
    'balance': ['abc', -50000, 999999999, 1234],
    'housing': ['yes', 'no', 'yes', 'true'],
    'loan': ['no', 'yes', 'no', 'no'],
    'contact': ['telephone', 'cellular', 'sms', 'whatsapp'],
    'day': [32, 0, 15, 10],
    'month': ['abc', 'jan', 'jul', 'dec'],
    'duration': [500, -1, 500, 100],
    'campaign': [0, 100, -10, 2],
    'pdays': [1000, 2000, -5, 50],
    'previous': [-5, 500, 301, 100],
    'poutcome': ['success', 'other', 'fail', 'unknown'],
    'y': ['yes', 'no', 'yes', 'no']
})

# Convert numeric columns correctly where appropriate
numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
for col in numeric_columns:
    invalid_data[col] = pd.to_numeric(invalid_data[col], errors='coerce')

# Append invalid data to main dataset
df_combined = pd.concat([main_df, invalid_data], ignore_index=True)

# Save combined dataset if needed
df_combined.to_csv('D:/Projects/bank/bank-full-with-errors.csv', sep=';', index=False)

print(f"Original records: {len(main_df)}")
print(f"After adding invalid records: {len(df_combined)}")


Original records: 45211
After adding invalid records: 45215


In [3]:
import pandas as pd
from io import StringIO

# -------------------------
# Artificial invalid data as CSV text
invalid_data = """
age;job;marital;education;default;balance;housing;loan;contact;day;month;duration;campaign;pdays;previous;poutcome;y
abc;student;married;primary;no;notanumber;yes;no;cellular;12;jun;300;1;-1;0;unknown;no
88;@manager;single;tertiary;yes;5000;no;no;telephone;;jul;200;2;-1;0;failure;yes
33;technician;single;secondary;no;2000;no;no;cellular;15;abc;250;1;-1;0;success;yes
45;blue-collar;married;secondary;no;1500;yes;yes;unknown;20;dec;300;1;99999;0;other;no
45;blue-collar;married;secondary;no;1500;yes;yes;unknown;20;dec;300;1;99999;0;other;no
;;single;secondary;no;;yes;no;cellular;10;nov;120;1;-1;0;unknown;no
"""

# -------------------------
# Read your original dataset
df = pd.read_csv('D:/Projects/bank/bank-full-with-errors.csv', sep=';')

# -------------------------
# Load invalid data
invalid_df = pd.read_csv(StringIO(invalid_data.strip()), sep=';')

# -------------------------
# Combine datasets
df_combined = pd.concat([df, invalid_df], ignore_index=True)

# -------------------------
# Export combined dataset as CSV (new file)
df_combined.to_csv('D:/Projects/bank/bank-full-with-errors-test.csv', sep=';', index=False)

print("✅ Combined dataset with invalid test data saved.")


✅ Combined dataset with invalid test data saved.


In [4]:
import pandas as pd
import numpy as np
import re

# --------------------------
# Load Data (Read only ONCE)
df = pd.read_csv('D:/Projects/bank/bank-full-with-errors-test.csv', sep=';', dtype=str)

# Preserve original CSV row numbers aligned to Excel (Header = Row 1)
df.reset_index(inplace=True)
df.rename(columns={'index': 'OriginalRowNumber_ZeroBased'}, inplace=True)
df['CSV_Row_Number'] = df['OriginalRowNumber_ZeroBased'] + 2  # Header + 1-based counting

total_records = len(df)
total_columns = len(df.columns)
print(f"Total Records: {total_records}")
print(f"Total Columns: {total_columns}")
# --------------------------
# Expected Domains
expected_domains = {
    'age': (18, 95),
    'balance': None,
    'day': (1, 31),
    'duration': (0, np.inf),
    'campaign': (1, np.inf),
    'pdays': (-1, 999),
    'previous': (0, 300),
    'job': ["admin.", "unknown", "unemployed", "management", "housemaid",
            "entrepreneur", "student", "blue-collar", "self-employed",
            "retired", "technician", "services"],
    'marital': ["married", "divorced", "single"],
    'education': ["unknown", "secondary", "primary", "tertiary"],
    'default': ["yes", "no"],
    'housing': ["yes", "no"],
    'loan': ["yes", "no"],
    'contact': ["unknown", "telephone", "cellular"],
    'month': ["jan", "feb", "mar", "apr", "may", "jun",
              "jul", "aug", "sep", "oct", "nov", "dec"],
    'poutcome': ["unknown", "other", "failure", "success"],
    'y': ["yes", "no"]
}

categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']
numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# --------------------------
# Reporting Utility
report = []
detailed_report = []

def add_issue(pillar, column, description, rows):
    adjusted_rows = df.loc[rows, 'CSV_Row_Number'].tolist() if rows else None
    count = len(rows) if rows else 0
    percentage = (count / total_records) * 100
    percentage_str = "<0.01%" if 0 < percentage < 0.01 else f"{percentage:.2f}%"
    report.append({
        'Pillar': pillar,
        'Column Name': column,
        'Issue description': description,
        'Row number': adjusted_rows,
        'Issue Record Count': count,
        'Percentage': percentage_str
    })

# --------------------------
# Completeness Check
missing_cols = df.columns[df.isnull().any()].tolist()
if missing_cols:
    rows = df[df[missing_cols].isnull().any(axis=1)].index.tolist()
    for r in rows:
        detailed_report.append({
            'Row': df.loc[r, 'CSV_Row_Number'],
            'Pillar': 'Completeness',
            'Column': ', '.join(missing_cols),
            'Actual Value': 'Missing',
            'Expected': 'Non-missing value'
        })
    add_issue('Completeness', ', '.join(missing_cols), 'Missing values', rows)
else:
    add_issue('Completeness', 'All Columns', 'No missing values found', [])

# --------------------------
# Uniqueness Check
duplicates = df[df.duplicated()].index.tolist()
for r in duplicates:
    detailed_report.append({
        'Row': df.loc[r, 'CSV_Row_Number'],
        'Pillar': 'Uniqueness',
        'Column': 'All Columns',
        'Actual Value': 'Duplicate Row',
        'Expected': 'Unique Row'
    })
add_issue('Uniqueness', 'All Columns', 'Duplicate records', duplicates)

# --------------------------
# Accuracy Checks
accuracy_found = False
for col in numeric_columns:
    if col in df.columns:
        converted_col = pd.to_numeric(df[col], errors='coerce')
        non_numeric_rows = df[pd.to_numeric(df[col], errors='coerce').isna() & df[col].notna()].index.tolist()
        if non_numeric_rows:
            for r in non_numeric_rows:
                detailed_report.append({
                    'Row': df.loc[r, 'CSV_Row_Number'],
                    'Pillar': 'Accuracy',
                    'Column': col,
                    'Actual Value': df.loc[r, col],
                    'Expected': 'Numeric'
                })
            add_issue('Accuracy', col, 'Non-numeric value found in numeric column', non_numeric_rows)
            accuracy_found = True
        if col == 'balance':
            continue
        expected_range = expected_domains[col]
        if expected_range is not None:
            valid_mask = converted_col.between(*expected_range)
            invalid_range_rows = df[~valid_mask & converted_col.notna()].index.tolist()
            if invalid_range_rows:
                for r in invalid_range_rows:
                    detailed_report.append({
                        'Row': df.loc[r, 'CSV_Row_Number'],
                        'Pillar': 'Accuracy',
                        'Column': col,
                        'Actual Value': df.loc[r, col],
                        'Expected': f"{expected_range[0]} to {expected_range[1]}"
                    })
                add_issue('Accuracy', col, f'Values out of range {expected_range}', invalid_range_rows)
                accuracy_found = True

for col in categorical_columns:
    if col in df.columns:
        invalid_rows = df[~df[col].isin(expected_domains[col])].index.tolist()
        if invalid_rows:
            for r in invalid_rows:
                detailed_report.append({
                    'Row': df.loc[r, 'CSV_Row_Number'],
                    'Pillar': 'Accuracy',
                    'Column': col,
                    'Actual Value': df.loc[r, col],
                    'Expected': f"{expected_domains[col]}"
                })
            add_issue('Accuracy', col, 'Values not in expected domain list', invalid_rows)
            accuracy_found = True

if not accuracy_found:
    add_issue('Accuracy', 'All relevant columns', 'No accuracy issues found', [])

# --------------------------
# Consistency Checks (Special Characters in Categorical Columns)
special_characters = r'[^a-zA-Z0-9 ._-]'
consistency_found = False
for col in categorical_columns:
    if col in df.columns:
        invalid_rows = df[df[col].astype(str).apply(lambda x: bool(re.search(special_characters, x)))].index.tolist()
        if invalid_rows:
            for r in invalid_rows:
                detailed_report.append({
                    'Row': df.loc[r, 'CSV_Row_Number'],
                    'Pillar': 'Consistency',
                    'Column': col,
                    'Actual Value': df.loc[r, col],
                    'Expected': 'No special characters allowed'
                })
            add_issue('Consistency', col, 'Special characters not allowed in categorical fields', invalid_rows)
            consistency_found = True

if not consistency_found:
    add_issue('Consistency', 'All relevant columns', 'No consistency issues found', [])

# --------------------------
# --------------------------
# Create DataFrames
dq_report = pd.DataFrame(report).sort_values(by='Pillar').reset_index(drop=True)

detailed_report_df = pd.DataFrame(detailed_report)
if not detailed_report_df.empty:
    detailed_report_df = detailed_report_df.sort_values(by='Row').reset_index(drop=True)

# --------------------------
# Display Summary
from IPython.display import display
display(dq_report.style.set_properties(**{'text-align': 'left'})
        .set_caption("Data Quality Report (Accuracy, Completeness, Consistency, Uniqueness)"))

if not detailed_report_df.empty:
    display(detailed_report_df.style.set_properties(**{'text-align': 'left'})
            .set_caption("Detailed Expected vs Actual Values Report (With Pillars)"))
else:
    print("No issues found.")

import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, Alignment
from datetime import datetime

# --------------------------
# --------------------------
# Convert lists to strings for Excel
if 'Row number' in dq_report.columns:
    dq_report['Row number'] = dq_report['Row number'].apply(
        lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x
    )

# --------------------------
# Export to Excel
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, Alignment
from datetime import datetime

output_file = f"D:/Projects/bank/Data_Quality_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"

wb = openpyxl.Workbook()
wb.remove(wb.active)  # remove default sheet

# --------------------------
# Write Summary Report
ws1 = wb.create_sheet(title="Summary_Report")
for r_idx, row in enumerate(dataframe_to_rows(dq_report, index=False, header=True), 1):
    ws1.append(row)
    for c_idx, cell in enumerate(ws1[r_idx], 1):
        if r_idx == 1:
            cell.font = Font(bold=True)
            ws1.auto_filter.ref = ws1.dimensions
        cell.alignment = Alignment(wrap_text=True, vertical='top')

for col in ws1.columns:
    max_length = max(len(str(cell.value)) if cell.value is not None else 0 for cell in col)
    ws1.column_dimensions[col[0].column_letter].width = max_length + 2

# --------------------------
# Write Detailed Report
ws2 = wb.create_sheet(title="Detailed_Report")
if not detailed_report_df.empty:
    for r_idx, row in enumerate(dataframe_to_rows(detailed_report_df, index=False, header=True), 1):
        ws2.append(row)
        for c_idx, cell in enumerate(ws2[r_idx], 1):
            if r_idx == 1:
                cell.font = Font(bold=True)
                ws2.auto_filter.ref = ws2.dimensions
            cell.alignment = Alignment(wrap_text=True, vertical='top')

    for col in ws2.columns:
        max_length = max(len(str(cell.value)) if cell.value is not None else 0 for cell in col)
        ws2.column_dimensions[col[0].column_letter].width = max_length + 2

# --------------------------
# Save Workbook
wb.save(output_file)
print(f"\n Data Quality Report saved to:\n{output_file}")


Total Records: 45221
Total Columns: 19


Unnamed: 0,Pillar,Column Name,Issue description,Row number,Issue Record Count,Percentage
0,Accuracy,previous,"Values out of range (0, 300)","[45213, 45214, 45215]",3,<0.01%
1,Accuracy,poutcome,Values not in expected domain list,[45215],1,<0.01%
2,Accuracy,age,Non-numeric value found in numeric column,[45217],1,<0.01%
3,Accuracy,age,"Values out of range (18, 95)","[45213, 45214]",2,<0.01%
4,Accuracy,balance,Non-numeric value found in numeric column,[45217],1,<0.01%
5,Accuracy,day,"Values out of range (1, 31)","[45213, 45214]",2,<0.01%
6,Accuracy,duration,"Values out of range (0, inf)",[45214],1,<0.01%
7,Accuracy,campaign,"Values out of range (1, inf)","[45213, 45215]",2,<0.01%
8,Accuracy,pdays,"Values out of range (-1, 999)","[45213, 45214, 45215, 45220, 45221]",5,0.01%
9,Accuracy,month,Values not in expected domain list,"[45213, 45219]",2,<0.01%


Unnamed: 0,Row,Pillar,Column,Actual Value,Expected
0,45213,Completeness,"age, job, balance, day",Missing,Non-missing value
1,45213,Accuracy,age,-1,18 to 95
2,45213,Accuracy,day,32.0,1 to 31
3,45213,Accuracy,campaign,0,1 to inf
4,45213,Accuracy,pdays,1000,-1 to 999
5,45213,Accuracy,month,abc,"['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']"
6,45213,Accuracy,job,#manager,"['admin.', 'unknown', 'unemployed', 'management', 'housemaid', 'entrepreneur', 'student', 'blue-collar', 'self-employed', 'retired', 'technician', 'services']"
7,45213,Accuracy,previous,-5,0 to 300
8,45213,Consistency,job,#manager,No special characters allowed
9,45214,Accuracy,pdays,2000,-1 to 999



 Data Quality Report saved to:
D:/Projects/bank/Data_Quality_Report_20250718_103148.xlsx
