In [12]:
import pandas as pd
from datetime import datetime
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, Alignment

# -------------------------
# Load Data
# dtype=str keeps every CSV column as text; the former RangeIndex becomes an
# explicit zero-based position column.
df = (
    pd.read_csv('D:/Projects/bank/bank-full.csv', sep=';', dtype=str)
    .reset_index()
    .rename(columns={'index': 'OriginalRowNumber_ZeroBased'})
)
# Excel-style row numbers: +1 for the header line, +1 for 1-based counting.
df['CSV_Row_Number'] = df['OriginalRowNumber_ZeroBased'] + 2

total_records = df.shape[0]

# -------------------------
# Reporting Utilities
# Accumulators filled by add_summary() / add_detailed(); one dict per report row.
summary_report, detailed_report = [], []

def add_summary(pillar, column, description, rows):
    """Append one summary row for a data-quality check to ``summary_report``.

    Reads the module-level ``df`` (to translate index labels into
    ``CSV_Row_Number`` values) and ``total_records`` (for the percentage).

    Args:
        pillar: Data-quality pillar the check belongs to (e.g. 'Timeliness').
        column: Name of the column that was checked.
        description: Issue text; stored as '' when no rows failed.
        rows: Index labels of ``df`` that failed the check. Any iterable is
            accepted; ``None`` or an empty iterable means "no issues".
    """
    # Normalise to a list so pandas Index / ndarray inputs do not hit the
    # ambiguous-truth-value error in the emptiness checks below.
    rows = list(rows) if rows is not None else []
    count = len(rows)
    adjusted_rows = df.loc[rows, 'CSV_Row_Number'].tolist() if rows else ''

    # An empty dataset has nothing to divide by; report 0% instead of
    # raising ZeroDivisionError.
    percentage = (count / total_records) * 100 if total_records else 0.0
    # Tiny non-zero shares would otherwise round to a misleading "0.00%".
    percentage_str = "<0.01%" if 0 < percentage < 0.01 else f"{percentage:.2f}%"

    summary_report.append({
        'Pillar': pillar,
        'Column Name': column,
        'Issue description': '' if count == 0 else description,
        'Row number': adjusted_rows,
        'Issue Record Count': count,
        'Percentage': percentage_str,
    })

def add_detailed(rows, column, issue_description):
    """Record one detailed-report entry per offending row.

    For every index label in ``rows`` this looks up the CSV row number and
    the offending value in the module-level ``df`` and appends a dict to
    ``detailed_report``.

    Args:
        rows: Index labels of ``df`` that failed the check.
        column: Name of the column whose value is reported.
        issue_description: Text stored with every entry.
    """
    detailed_report.extend(
        {
            'Row Number': int(df.loc[label, 'CSV_Row_Number']),
            'Column Name': column,
            'Value': df.loc[label, column],
            'Issue description': issue_description,
        }
        for label in rows
    )

# -------------------------
# Timeliness Logic Example
# A row is flagged when its month (compared case-insensitively) is not one of these.
recent_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun']

if 'month' in df.columns:
    month_issue = 'Month not within recent months (jan-jun)'
    is_recent = df['month'].str.lower().isin(recent_months)
    # Index labels of the offending rows, shared by both reports.
    invalid_months_rows = df.index[~is_recent].tolist()
    add_summary('Timeliness', 'month', month_issue, invalid_months_rows)
    add_detailed(invalid_months_rows, 'month', month_issue)

# -------------------------
# If no issues found
# True when the summary is empty or every recorded check has a zero count.
no_issues_recorded = not any(
    entry['Issue Record Count'] != 0 for entry in summary_report
)
if no_issues_recorded:
    add_summary('Timeliness', 'All relevant columns', '', [])

# -------------------------
# Create DataFrames
dq_summary = pd.DataFrame(summary_report).reset_index(drop=True)

# Name the columns explicitly: when no issues were recorded, pd.DataFrame([])
# has no columns at all and sort_values(by='Row Number') would raise KeyError
# before the "no issues" path below can run.
detailed_columns = ['Row Number', 'Column Name', 'Value', 'Issue description']
dq_detailed = (
    pd.DataFrame(detailed_report, columns=detailed_columns)
    # A stable sort keeps several issues on the same row in the order recorded.
    .sort_values(by='Row Number', kind='stable')
    .reset_index(drop=True)
)

# -------------------------
# Show in VS Code (Console)
print("\nSummary Report:")
print(dq_summary.to_string(index=False))

print("\nDetailed Report:")
# Fall back to a plain message when there is nothing to list.
detail_text = (
    "No timeliness issues found."
    if dq_detailed.empty
    else dq_detailed.to_string(index=False)
)
print(detail_text)

# -------------------------
# Export to Excel
# Excel does not accept column widths above 255 characters.
MAX_EXCEL_COLUMN_WIDTH = 255


def _write_sheet(workbook, title, frame):
    """Write ``frame`` to a new, formatted worksheet and return the worksheet.

    The header row is bold, every cell is wrapped and top-aligned, an
    auto-filter spans the written range, and each column is sized to its
    longest value (capped at MAX_EXCEL_COLUMN_WIDTH).

    Args:
        workbook: openpyxl Workbook that receives the new sheet.
        title: Worksheet title.
        frame: DataFrame to write (header included, index omitted).
    """
    sheet = workbook.create_sheet(title=title)
    header_font = Font(bold=True)
    wrap_top = Alignment(wrap_text=True, vertical='top')

    for r_idx, row in enumerate(dataframe_to_rows(frame, index=False, header=True), 1):
        sheet.append(row)
        for cell in sheet[r_idx]:
            if r_idx == 1:
                cell.font = header_font
            cell.alignment = wrap_top

    # Set the filter range once, after all rows exist, so it covers the data
    # and not only the header row.
    sheet.auto_filter.ref = sheet.dimensions

    for col in sheet.columns:
        longest = max(len(str(cell.value)) if cell.value is not None else 0 for cell in col)
        # The joined "Row number" text can be thousands of characters long;
        # clamp the width to what Excel supports.
        sheet.column_dimensions[col[0].column_letter].width = min(
            longest + 2, MAX_EXCEL_COLUMN_WIDTH
        )
    return sheet


# Lists cannot be stored in a cell, so flatten them to comma-separated text.
dq_summary['Row number'] = dq_summary['Row number'].apply(
    lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x
)

output_file = f"D:/Projects/bank/Data_Quality_Timeliness_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
wb = Workbook()
# Drop the default sheet so the workbook contains only the two report sheets.
wb.remove(wb.active)

# Summary Sheet
ws1 = _write_sheet(wb, "Summary_Report", dq_summary)

# Detailed Sheet
ws2 = _write_sheet(wb, "Detail_Report", dq_detailed)

wb.save(output_file)
print(f"\nData Quality Timeliness Report saved to:\n{output_file}")



Summary Report:
    Pillar Column Name                        Issue description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        Row number  Issue Record Count Percentage
Timeliness       month Month not within recent months (jan-jun) [12445, 12446, 12447, 12448, 12449, 12450, 12451, 12452, 12453, 12454, 12455, 12456, 12457, 12458, 12459, 12460, 12461