In [None]:
import csv

# Expected schema: column name -> Python type each value must be parseable as.
# Insertion order is the order in which columns are checked and reported.
expected_schema = dict(
    field1=str,
    field2=int,
    field3=float,
    field4=str,
)

# Field separator of the input file: ',' for CSV, '\t' for TSV.
delimiter = ','

def validate_row(row, schema):
    """
    Validate a single row of data against the expected schema.

    Args:
        row: Mapping of column name to raw value. Values are normally
            strings, but may be None (csv.DictReader uses None for columns
            that are absent from a short row).
        schema: Mapping of column name to expected type. Only int, float
            and str are checked; any other type is accepted as-is.

    Returns:
        A dictionary mapping each offending field name to an error message.
        The dictionary is empty when the row is valid.
    """
    exceptions = {}
    for field, expected_type in schema.items():
        if field not in row:
            exceptions[field] = "Missing field"
            continue

        value = row[field]
        try:
            # Attempt the conversion purely to see whether it succeeds;
            # the converted value itself is not needed.
            if expected_type is int:
                int(value)
            elif expected_type is float:
                float(value)
            elif expected_type is str and not isinstance(value, str):
                raise ValueError("Expected string")
        except (TypeError, ValueError):
            # TypeError covers non-string inputs such as None, which would
            # otherwise escape and abort validation of the whole file.
            exceptions[field] = f"Invalid type: expected {expected_type.__name__}, got {type(value).__name__}"
    return exceptions

def validate_file(file_path):
    """
    Validate a CSV/TSV file against the expected schema and print a summary.

    The first line of the file is treated as the header row. Every following
    row is checked with validate_row() against the module-level
    expected_schema, using the module-level delimiter.

    Args:
        file_path: Path of the delimited text file to validate.

    Prints, for every field with at least one problem, the number of
    offending rows and the first example. Row numbers count data rows
    starting at 1; the header row is not counted.
    """
    total_exceptions = {}    # field -> number of rows where it failed
    example_exceptions = {}  # field -> (row_number, message) of first failure

    # newline='' lets the csv module do its own line-ending handling, which
    # is required for quoted fields that contain embedded newlines.
    with open(file_path, 'r', newline='') as file:
        reader = csv.DictReader(file, delimiter=delimiter)
        for row_number, row in enumerate(reader, start=1):
            for field, error in validate_row(row, expected_schema).items():
                total_exceptions[field] = total_exceptions.get(field, 0) + 1
                # Keep only the first failure seen for each field
                example_exceptions.setdefault(field, (row_number, error))

    # Output the summary of exceptions
    print("\nValidation Summary:")
    if total_exceptions:
        for field, count in total_exceptions.items():
            row_number, error_message = example_exceptions[field]
            print(f"Field '{field}': {count} errors")
            print(f"Example from row {row_number}: {error_message}")
    else:
        print("No exceptions found. All data is valid.")

# Example usage: validate 'data.csv' and print the summary.
# NOTE: this is top-level code, so it runs every time the cell/module is executed.
file_path = 'data.csv'  # Update this with your file path
validate_file(file_path)


In [None]:
Python 

In [None]:
import subprocess
import csv
from datetime import datetime, timedelta

def get_dates_last_month(today=None):
    """Return every date of the month preceding *today* as 'YYYY-MM-DD' strings.

    Args:
        today: Reference date (a ``date`` or ``datetime``). Defaults to the
            current local date, which matches the original no-argument
            behaviour.

    Returns:
        A list of date strings in ascending order, one per day of the
        previous calendar month.
    """
    if today is None:
        today = datetime.today()

    # Stepping back one day from the 1st of this month lands on the last day
    # of the previous month, which handles month lengths and year rollover.
    first_day_current_month = today.replace(day=1)
    last_day_previous_month = first_day_current_month - timedelta(days=1)
    first_day_previous_month = last_day_previous_month.replace(day=1)

    # The day-of-month of the last day equals the number of days in the month.
    return [
        (first_day_previous_month + timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(last_day_previous_month.day)
    ]

def run_hive_query(date):
    """Count the rows of my_table for one utc_date via the hive CLI.

    Returns the command's standard output with surrounding whitespace
    removed, or None (after printing the error) when hive exits with a
    non-zero status.
    """
    hive_sql = f"select utc_date, sum(1) as num_rows from my_table where utc_date = '{date}' group by utc_date"
    command = ['hive', '-e', hive_sql]
    try:
        # Argument-list form: the query is passed to hive without a shell
        raw_output = subprocess.check_output(command, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running query for date {date}: {e}")
        return None
    return raw_output.strip()

def save_to_csv(data, output_file):
    """Write a 'utc_date,num_rows' header followed by the rows in *data* to *output_file*.

    The file is created or overwritten. Each item of *data* is written as
    one CSV record.
    """
    header = ['utc_date', 'num_rows']
    with open(output_file, mode='w', newline='') as handle:
        csv_out = csv.writer(handle)
        csv_out.writerow(header)
        for record in data:
            csv_out.writerow(record)

def main():
    """Query Hive once per day of last month and write the counts to a CSV file.

    Days whose query fails or returns nothing are skipped. Output that does
    not split into exactly two whitespace-separated tokens is reported and
    skipped.
    """
    output_file = 'hive_query_results.csv'
    collected = []

    for day in get_dates_last_month():
        raw = run_hive_query(day)
        if not raw:
            # None (query error) or empty output (no row returned)
            continue

        tokens = raw.split()
        if len(tokens) != 2:
            print(f"Unexpected format for date {day}: {raw}")
            continue

        utc_date, num_rows = tokens
        collected.append([utc_date, num_rows])

    # Persist everything that was gathered, then tell the user where it went
    save_to_csv(collected, output_file)
    print(f"Results saved to {output_file}")

# Run the report only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()
