**Function to split chunks based on the chunksize**

In [None]:
# prompt: Create a function to split a large csv into chunks in a specified folder or path using the chunksize parameter in read_csv

import pandas as pd
import os


def split_csv_into_chunks(file_path, chunksize, output_directory, sep=','):
  """Split a large CSV file into numbered chunk files.

  The input is streamed with ``pd.read_csv(..., chunksize=chunksize)``, and each
  chunk is written to ``output_directory`` as ``chunk_1.csv``, ``chunk_2.csv``,
  and so on. Chunks are written with pandas' default comma separator,
  regardless of the ``sep`` used to read the input.

  Errors are reported with ``print`` rather than raised.

  Args:
    file_path: The path to the large CSV file.
    chunksize: The number of rows per chunk.
    output_directory: The directory where the chunks should be saved. It is
      created (including parents) if it does not exist.
    sep: The field delimiter of the input file.
  """
  try:
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_directory, exist_ok=True)

    # Track the count explicitly so the summary below never depends on a loop
    # variable that would be unbound if the reader yielded no chunks.
    chunk_count = 0
    reader = pd.read_csv(file_path, chunksize=chunksize, sep=sep)
    for chunk_count, chunk in enumerate(reader, start=1):
      output_file = os.path.join(output_directory, f"chunk_{chunk_count}.csv")
      chunk.to_csv(output_file, index=False)

    print(f"File '{file_path}' split into {chunk_count} chunks in '{output_directory}'.")

  except FileNotFoundError as e:
    print(f"Error: {e}")
  except Exception as e:
    print(f"An unexpected error occurred: {e}")


# Example usage:
# Split the semicolon-delimited '/content/lifebear.csv' into chunks of up to
# 1,000,000 rows each, written to '/content/chunks'.
# Replace both paths with your own input file and output directory.
split_csv_into_chunks('/content/lifebear.csv', 1000000, '/content/chunks', sep=';')


  for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunksize, sep=sep)):


File '/content/lifebear.csv' split into 4 chunks in '/content/chunks'.


Function to remove invalid rows from a CSV

In [None]:
# prompt: Create a function that removes invalid rows from a CSV. Detect the number of columns based on the header and the specified delimiter. Rows where the number of columns don't match the expected number of columns and may cause problems when trying to use the read_csv function. Create a new CSV file with the valid rows.

import csv

def remove_invalid_rows(input_file, output_file, delimiter=','):
  """Copy a CSV file, keeping only rows whose field count matches the header.

  The first row is treated as the header and defines the expected number of
  columns. Every later row with a different number of fields is skipped.
  An empty input file produces an empty output file.

  Args:
    input_file: Path to the input CSV file (read as UTF-8).
    output_file: Path to the output CSV file (written as UTF-8).
    delimiter: Delimiter used in the CSV file.
  """

  # newline='' on both handles leaves line-ending handling to the csv module,
  # which is required for quoted fields containing line breaks to round-trip.
  with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
       open(output_file, 'w', newline='', encoding='utf-8') as outfile:

    reader = csv.reader(infile, delimiter=delimiter)
    writer = csv.writer(outfile, delimiter=delimiter)

    # An empty file has no header to measure rows against; stop quietly
    # instead of letting next() raise StopIteration.
    header = next(reader, None)
    if header is None:
      return

    expected_num_columns = len(header)
    writer.writerow(header)
    writer.writerows(row for row in reader if len(row) == expected_num_columns)

# prompt: Create a function that removes invalid rows from a CSV. Detect the number of columns based on the header and the specified delimiter. Rows where the number of columns don't match the expected number of columns and may cause problems when trying to use the read_csv function.

import csv

def remove_invalid_rows_from_csv(csv_file_path, delimiter=','):
  """Read a CSV file and return only the rows whose field count matches the header.

  The first row is treated as the header and defines the expected number of
  columns. Rows with a different number of fields are skipped and reported
  with a printed warning.

  Args:
    csv_file_path: The path to the CSV file (read as UTF-8).
    delimiter: The delimiter used in the CSV file.

  Returns:
    A list of rows (each a list of strings) starting with the header, or an
    empty list when the file is empty.
  """

  # newline='' leaves line-ending handling to the csv module so quoted fields
  # that contain line breaks are parsed as a single field.
  with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=delimiter)

    # An empty file has no header; return no rows rather than letting
    # next() raise StopIteration.
    header = next(reader, None)
    if header is None:
      return []

    expected_num_columns = len(header)
    valid_rows = [header]  # Start with the header

    for row in reader:
      if len(row) == expected_num_columns:
        valid_rows.append(row)
      else:
        print(f"Warning: Skipping row with invalid number of columns: {row}")

  return valid_rows

# Example usage:
# valid_rows = remove_invalid_rows_from_csv('my_file.csv')



Function to remove blank records or records with too many blank fields.

In [None]:
# prompt: Create a function to remove blank records or records with too many blank fields either based on the number of columns or based on specific columns. Append the invalid records to a dataframe and drop them from the original dataframe. Return both the updated dataframe and the error dataframe. Include error checking and logging.

import pandas as pd
import re

# Function to remove blank records or records with too many blank fields.
def remove_blank_records(df, threshold=None, specific_columns=None):
    """
    Separates records with blank fields from a dataframe.

    A field is blank when it is missing (NaN/None) or an empty string. The input
    dataframe is not modified; rows are selected by position, so a non-unique
    index is handled correctly.

    Args:
        df (pd.DataFrame): The dataframe to process.
        threshold (int, optional): The minimum number of non-blank fields required in a record.
            If None, only records whose fields are all blank are removed.
        specific_columns (list, optional): A list of specific column names to check for blank fields.
            If provided, threshold is ignored and records with any blank field in the specified columns
            are removed.

    Returns:
        tuple: (valid_df, error_df). valid_df holds the remaining records with their
               original index labels; error_df holds the removed records with a fresh
               0..n-1 index. If an error occurs it is printed and
               (df, empty dataframe) is returned.
    """
    try:
        if specific_columns:
            checked = df[list(specific_columns)]
            blank = checked.isna() | checked.eq('')
            invalid = blank.any(axis=1)
        else:
            blank = df.isna() | df.eq('')
            if threshold is None:
                invalid = blank.all(axis=1)
            else:
                invalid = (~blank).sum(axis=1) < threshold

        # One boolean-mask split replaces the row-by-row concat/drop loop, which
        # was quadratic and dropped every row sharing a duplicated index label.
        error_df = df[invalid].reset_index(drop=True)
        valid_df = df[~invalid].copy()

        print("Blank record removal complete. Records with blank fields appended to error_df.")
        return valid_df, error_df

    except Exception as e:
        print(f"Error occurred during blank record removal: {e}")
        return df, pd.DataFrame()


Function to remove duplicate records based on specified columns.

In [None]:
# prompt: Create a function to remove duplicate records based on specified columns. Add the duplicates records to separate dataframe and drop them from the original. Include error checking and logging.

def remove_duplicate_records(df, columns):
    """
    Splits a dataframe into unique records and duplicate records.

    The first occurrence of each combination of values in ``columns`` is kept;
    later occurrences are moved to the duplicates dataframe. The input
    dataframe is not modified.

    Args:
        df (pd.DataFrame): The dataframe to process.
        columns (list): A list of column names to consider for duplicate detection.

    Returns:
        tuple: (unique_df, duplicate_df). unique_df keeps the original index labels;
               duplicate_df has a fresh 0..n-1 index. If an error occurs (for example
               a column in ``columns`` is missing) it is printed and
               (df, empty dataframe) is returned.
    """
    try:
        # duplicated() flags rows by position, so the split stays correct when the
        # index has repeated labels; comparing index labels of the de-duplicated
        # frame against the original does not.
        is_duplicate = df.duplicated(subset=columns, keep='first')
        duplicate_df = df[is_duplicate].reset_index(drop=True)
        df_deduplicated = df[~is_duplicate].copy()

        print("Duplicate removal complete. Duplicate records appended to duplicate_df.")
        return df_deduplicated, duplicate_df

    except Exception as e:
        print(f"Error occurred during duplicate removal: {e}")
        return df, pd.DataFrame()


Function to validate email addresses. It removes records with invalid email addresses and saves them to a separate dataframe.

In [None]:
# prompt: create a function to validate email addresses in a dataframe, append the records with an invalid email address to a dataframe and drop them from the original dataframe. Return both the updated dataframe and the error dataframe. Include error checking and logging.

import pandas as pd
import re

def validate_and_remove_invalid_emails(df, email_column):
    """
    Splits a dataframe into records with valid and invalid email addresses.

    An address is valid when it is a string matching a basic
    ``local@domain.tld`` pattern. Missing values (NaN/None) and non-string
    values are treated as invalid. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): The dataframe containing email addresses.
        email_column (str): The name of the column containing email addresses.

    Returns:
        tuple: (valid_df, error_df). valid_df keeps the original index labels;
            error_df holds the invalid records with a fresh 0..n-1 index. If an
            error occurs (for example the column is missing) it is printed and
            (df, empty dataframe) is returned.
    """
    try:
        # Regular expression for basic email validation
        email_pattern = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")

        def _is_valid(value):
            # Blank cells read from CSV arrive as float NaN; passing them to the
            # regex would raise TypeError and abort the whole validation.
            return isinstance(value, str) and email_pattern.match(value) is not None

        is_valid = df[email_column].map(_is_valid).astype(bool)

        error_df = df[~is_valid].reset_index(drop=True)
        valid_df = df[is_valid].copy()

        print("Validation complete. Invalid email records appended to error_df.")
        return valid_df, error_df

    except Exception as e:
        print(f"Error occurred during email validation: {e}")
        return df, pd.DataFrame()  # Return empty error dataframe in case of error



Function to remove time from the date

In [None]:
# prompt: Create a function to remove the time from a date in specified columns

def remove_time_from_date(df, columns):
  """Strip the time-of-day from datetime columns, leaving plain dates.

  Only columns that exist in the frame and already have a datetime64 dtype are
  converted; any other requested column is left as it is. The frame is
  modified in place.

  Args:
    df: The DataFrame containing the date columns.
    columns: A list of column names to process.

  Returns:
    The same DataFrame, with the time component removed from the columns that
    qualified. If an error occurs it is printed and the DataFrame is returned
    in whatever state it had reached.
  """
  try:
    for name in columns:
      if name not in df.columns:
        continue
      series = df[name]
      if not pd.api.types.is_datetime64_any_dtype(series):
        continue
      df[name] = series.dt.date
  except Exception as e:
    print(f"An error occurred: {e}")
  return df


# Example usage:
# Assuming your DataFrame is 'df' and you want to remove time from columns 'date_column1' and 'date_column2'
# df = remove_time_from_date(df, ['date_column1', 'date_column2'])


Function to validate phone numbers

In [None]:
# prompt: create a function to validate phone numbers in a dataframe including the characters "+", "-", "(", ")", ".". If it is a valid phone number but includes these characters, remove them. Append the records with an invalid phone number to a dataframe and drop them from the original dataframe. Return both the updated dataframe and the error dataframe. Include error checking and logging.

def validate_and_remove_invalid_phone_numbers(df, phone_column):
    """
    Splits a dataframe into records with valid and invalid phone numbers.

    The characters "+", "-", "(", ")", "." and spaces are stripped from each
    value; the value is valid when only ASCII digits remain. Valid numbers are
    stored in their cleaned, digits-only string form. Missing values are
    invalid. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): The dataframe containing phone numbers.
        phone_column (str): The name of the column containing phone numbers.

    Returns:
        tuple: (valid_df, error_df). valid_df keeps the original index labels and
            has the phone column replaced by cleaned strings; error_df holds the
            invalid records with a fresh 0..n-1 index. If an error occurs it is
            printed and (df, empty dataframe) is returned.
    """
    try:
        separators = re.compile(r"[+()\-. ]")

        def _digits_or_none(value):
            # A numeric column containing blanks is read by pandas as float, so
            # 5551234 arrives as 5551234.0; stripping "." from its string form
            # would append a spurious trailing zero.
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            digits = separators.sub("", str(value))
            # isascii() rejects characters such as superscript digits, which
            # str.isdigit() alone would accept.
            if digits.isascii() and digits.isdigit():
                return digits
            return None

        cleaned = df[phone_column].map(_digits_or_none)
        is_valid = cleaned.notna()

        error_df = df[~is_valid].reset_index(drop=True)
        valid_df = df[is_valid].copy()
        # Assign positionally so a non-unique index cannot break alignment.
        valid_df[phone_column] = cleaned[is_valid].to_numpy()

        print("Phone number validation complete. Invalid phone number records appended to error_df.")
        return valid_df, error_df

    except Exception as e:
        print(f"Error occurred during phone number validation: {e}")
        return df, pd.DataFrame()  # Return empty error dataframe in case of error



Function to check and truncate any fields over 52 bits

In [None]:
# prompt: Create a function to check and truncate any fields over 52 bits.

import pandas as pd

def truncate_large_fields(df, columns_to_check=None, max_bits=52):
    """
    Caps numeric values that do not fit in ``max_bits`` bits.

    Any value greater than ``2**max_bits - 1`` in a checked numeric column is
    replaced by that limit. Only the upper bound is enforced; negative values
    are left unchanged. The dataframe is modified in place.

    Args:
        df (pd.DataFrame): The dataframe to process.
        columns_to_check (list, optional): A list of column names to check.
            If None, all numeric columns are checked. Names that are not in the
            dataframe are skipped with a printed warning, and non-numeric
            columns are ignored.
        max_bits (int, optional): The bit width that values must fit in.
            Defaults to 52.

    Returns:
        pd.DataFrame: The same dataframe, with over-limit values capped.
    """
    if columns_to_check is None:
        columns_to_check = df.select_dtypes(include=['number']).columns

    limit = (2 ** max_bits) - 1

    def _cap(value):
        return limit if value > limit else value

    for column in columns_to_check:
        # Skip unknown names instead of raising KeyError, matching the other
        # column-based cleaners in this file.
        if column not in df.columns:
            print(f"Warning: Column '{column}' not found in the dataframe.")
            continue
        if pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].apply(_cap)

    return df


Function to remove non-utf characters

In [None]:
# prompt: Create a function to remove non-utf characters from a dataset.

def remove_non_utf_characters(df):
    """
    Drops characters that cannot be encoded as UTF-8 from every object column.

    Each string value is encoded to UTF-8 with unencodable characters ignored,
    then decoded again. Non-string values are left untouched. The dataframe is
    modified in place.

    Args:
        df (pd.DataFrame): The DataFrame to process.

    Returns:
        pd.DataFrame: The same DataFrame with its object columns cleaned.
    """
    def _scrub(value):
        if not isinstance(value, str):
            return value
        encoded = value.encode('utf-8', 'ignore')
        return encoded.decode('utf-8')

    object_columns = df.select_dtypes(include=['object'])
    for name in object_columns:
        df[name] = df[name].apply(_scrub)
    return df


In [None]:
# prompt: Create a function to remove non-utf characters from specified columns in a dataset.

def remove_non_utf_characters_from_columns(df, columns_to_check):
    """
    Drops characters that cannot be encoded as UTF-8 from the listed columns.

    Each string value is encoded to UTF-8 with unencodable characters ignored,
    then decoded again. Non-string values are left untouched, and column names
    that are not in the dataframe are skipped silently. The dataframe is
    modified in place.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        columns_to_check (list): A list of column names to clean.

    Returns:
        pd.DataFrame: The same DataFrame with the listed columns cleaned.
    """
    def _scrub(value):
        if not isinstance(value, str):
            return value
        encoded = value.encode('utf-8', 'ignore')
        return encoded.decode('utf-8')

    for name in columns_to_check:
        if name not in df.columns:
            continue
        df[name] = df[name].apply(_scrub)
    return df


Function to validate names in a dataframe, converting characters with accents to regular characters. This code snippet removes records with invalid characters but should be duplicated and refined to remove the invalid characters.

In [None]:
# prompt: Create a function to validate names in a dataframe, detecting and removing any invalid characters, convert any alphabetical characters with accents to regular alphabetical characters and drop them from the original dataframe. Return the updated dataframe. Include error checking and logging.

import pandas as pd
import re
import unicodedata


def validate_and_remove_invalid_names(df, name_column):
    """
    Splits a dataframe into records with valid and invalid names.

    Each name first has its accents removed (for example "José" becomes
    "Jose"). It is then valid when it contains only ASCII letters, whitespace,
    hyphens and apostrophes. Missing or non-string values are treated as
    invalid. Letters that have no accent-free decomposition (for example "ø")
    remain non-ASCII and therefore make the name invalid. The input dataframe
    is not modified.

    Args:
        df (pd.DataFrame): The dataframe containing names.
        name_column (str): The name of the column containing names.

    Returns:
        tuple: (valid_df, error_df). valid_df keeps the original index labels and
            holds the accent-free names; error_df holds the invalid records,
            unmodified, with a fresh 0..n-1 index. If an error occurs it is
            printed and (df, empty dataframe) is returned.
    """
    try:
        # Allows '-' as in Pitt-Browne and ' as in O'Connor.
        invalid_chars = re.compile(r"[^a-zA-Z\s\-']")

        def _normalise(value):
            """Return the accent-free name, or None when the value is not a valid name."""
            if not isinstance(value, str):
                return None
            # Strip accents BEFORE validating: checking the raw text first would
            # reject every accented name, so none would ever be converted.
            plain = ''.join(
                c for c in unicodedata.normalize('NFD', value)
                if unicodedata.category(c) != 'Mn'
            )
            return None if invalid_chars.search(plain) else plain

        cleaned = df[name_column].map(_normalise)
        is_valid = cleaned.notna()

        error_df = df[~is_valid].reset_index(drop=True)
        valid_df = df[is_valid].copy()
        # Assign positionally so a non-unique index cannot break alignment.
        valid_df[name_column] = cleaned[is_valid].to_numpy()

        print("Name validation complete. Invalid name records appended to error_df.")
        return valid_df, error_df

    except Exception as e:
        print(f"Error occurred during name validation: {e}")
        return df, pd.DataFrame()  # Return empty error dataframe in case of error


In [None]:
# prompt: Create a function to validate names in a dataframe based on specified columns, detecting and removing any invalid characters, convert any alphabetical characters with accents to regular alphabetical characters and drop them from the original dataframe. Return the updated dataframe. Include error checking and logging.

def validate_and_remove_invalid_names_in_columns(df, name_columns):
    """
    Normalises names in the given columns and drops rows with invalid names.

    For each listed column, string values first have their accents removed
    (for example "Müller" becomes "Muller"). A row is then dropped when the
    value still contains anything other than ASCII letters, whitespace, hyphens
    or apostrophes. Missing and non-string values are left untouched and do not
    cause the row to be dropped. Columns that are not in the dataframe are
    skipped with a printed warning. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): The dataframe containing names.
        name_columns (list): A list of column names containing names to validate.

    Returns:
        pd.DataFrame: A new dataframe with valid, accent-free names, keeping the
            original index labels. If an error occurs it is printed and the
            original dataframe is returned.
    """
    try:
        # Allows '-', like in Pitt-Browne, and ' as in O'Connor.
        invalid_chars = re.compile(r"[^a-zA-Z\s\-']")

        def _strip_accents(value):
            if not isinstance(value, str):
                return value
            return ''.join(
                c for c in unicodedata.normalize('NFD', value)
                if unicodedata.category(c) != 'Mn'
            )

        def _has_invalid_chars(value):
            return isinstance(value, str) and invalid_chars.search(value) is not None

        result = df.copy()

        for name_column in name_columns:
            if name_column not in result.columns:
                print(f"Warning: Column '{name_column}' not found in the dataframe.")
                continue

            # Strip accents BEFORE validating: checking the raw text first would
            # drop every accented name instead of converting it.
            normalised = result[name_column].map(_strip_accents)
            keep = ~normalised.map(_has_invalid_chars).astype(bool)

            result = result[keep].copy()
            # Assign positionally so a non-unique index cannot break alignment.
            result[name_column] = normalised[keep].to_numpy()

            print(f"Name validation complete for column: {name_column}. Rows with invalid names removed.")

        return result

    except Exception as e:
        print(f"Error occurred during name validation: {e}")
        return df  # Return the original dataframe in case of error


Function to go through each CSV chunk, run the validation functions and merge the chunks into the final CSV. **Use when chunks are created using chunksize in read_csv.**

In [None]:
# prompt: Create a function to get chunked csvs from a specified folder, runs the validation functions and merges the chunks into a specified final valid csv file and final error csv file. INclude error checking and logging.

import csv
import pandas as pd
import re
import unicodedata
import os
import logging

def process_chunked_csvs(input_folder, output_valid_csv, output_error_csv, email_column_name='email', phone_column_name='phone', name_column_name='name', duplicate_columns=None, output_duplicates_csv=None):
  """
  Processes chunked CSV files from a specified folder, runs validation functions,
  and merges the results into final valid and error CSV files.

  Each ``.csv`` file in ``input_folder`` is read and passed through email
  validation, duplicate removal, phone validation, name validation, large-field
  truncation and non-UTF-8 character removal. A chunk that fails is logged and
  skipped. Progress is logged to ``processing_log.txt``.

  NOTE: duplicates are removed within each chunk only; a record repeated in two
  different chunk files is not detected here.

  Args:
      input_folder (str): The path to the folder containing chunked CSV files.
      output_valid_csv (str): The path to the output CSV file for valid records.
      output_error_csv (str): The path to the output CSV file for error records.
      email_column_name (str): The name of the email column.
      phone_column_name (str): The name of the phone column.
      name_column_name (str): The name of the name column.
      duplicate_columns (list, optional): Columns used to detect duplicate
          records. Defaults to ``[email_column_name]``.
      output_duplicates_csv (str, optional): If given, the removed duplicate
          records are written to this path; otherwise only their count is logged.
  """

  def _merge(frames):
    # pd.concat raises on an empty list, e.g. when the folder holds no CSV files.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

  try:
    # Setup logging
    logging.basicConfig(filename='processing_log.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # The original body passed an undefined name to remove_duplicate_records,
    # so every chunk failed with NameError; the key columns are now explicit.
    if duplicate_columns is None:
      duplicate_columns = [email_column_name]

    # Collect per-chunk frames and concatenate once at the end; growing a
    # dataframe with concat inside the loop re-copies all earlier rows.
    valid_frames = []
    error_frames = []
    duplicate_frames = []

    # Sorted so the output row order does not depend on os.listdir ordering.
    for filename in sorted(os.listdir(input_folder)):
      if not filename.endswith(".csv"):
        continue

      file_path = os.path.join(input_folder, filename)
      logging.info(f"Processing file: {file_path}")

      try:
        df = pd.read_csv(file_path, low_memory=False)

        # Run validation functions
        df, chunk_error_df = validate_and_remove_invalid_emails(df, email_column_name)
        df, chunk_duplicates_df = remove_duplicate_records(df, duplicate_columns)
        df, chunk_error_df_phone = validate_and_remove_invalid_phone_numbers(df, phone_column_name)
        df, chunk_error_df_name = validate_and_remove_invalid_names(df, name_column_name)
        df = truncate_large_fields(df)
        df = remove_non_utf_characters(df)
      except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")
        continue

      valid_frames.append(df)
      error_frames.extend([chunk_error_df, chunk_error_df_phone, chunk_error_df_name])
      duplicate_frames.append(chunk_duplicates_df)

      logging.info(f"File {file_path} processed successfully.")

    valid_df = _merge(valid_frames)
    error_df = _merge(error_frames)
    duplicates_df = _merge(duplicate_frames)

    # Save final dataframes
    valid_df.to_csv(output_valid_csv, index=False)
    error_df.to_csv(output_error_csv, index=False)

    logging.info(f"Final valid data saved to {output_valid_csv}.")
    logging.info(f"Final error data saved to {output_error_csv}.")

    if output_duplicates_csv is not None:
      duplicates_df.to_csv(output_duplicates_csv, index=False)
      logging.info(f"Final duplicates data saved to {output_duplicates_csv}.")
    else:
      logging.info(f"{len(duplicates_df)} duplicate records removed (not saved).")

  except Exception as e:
    logging.critical(f"Critical error during processing: {e}")


# Example usage:
# Process every chunk CSV in '/content/chunks' with the default column names
# ('email', 'phone', 'name'); results go to the two CSV paths given.
# NOTE(review): the lifebear sample output later in this file shows a
# 'mail_address' column and no phone/name columns — confirm the column
# arguments match your data before running.
process_chunked_csvs('/content/chunks', 'final_valid_data.csv', 'final_error_data.csv')


# *Final code with necessary functions*

In [None]:
# prompt: Create a function to get chunked csvs from a specified folder, runs the validation functions and merges the chunks into a specified final valid csv file and final error csv file. INclude error checking and logging.

import csv
import pandas as pd
import re
import unicodedata
import os
import logging
import datetime as dt


# prompt: Create a function to remove duplicate records based on specified columns. Add the duplicates records to separate dataframe and drop them from the original. Include error checking and logging.

def remove_duplicate_records(df, columns):
    """
    Splits a dataframe into unique records and duplicate records.

    The first occurrence of each combination of values in ``columns`` is kept;
    later occurrences are moved to the duplicates dataframe. The input
    dataframe is not modified.

    Args:
        df (pd.DataFrame): The dataframe to process.
        columns (list): A list of column names to consider for duplicate detection.

    Returns:
        tuple: (unique_df, duplicate_df). unique_df keeps the original index labels;
               duplicate_df has a fresh 0..n-1 index. If an error occurs (for example
               a column in ``columns`` is missing) it is printed and
               (df, empty dataframe) is returned.
    """
    try:
        # duplicated() flags rows by position, so the split stays correct when the
        # index has repeated labels; comparing index labels of the de-duplicated
        # frame against the original does not.
        is_duplicate = df.duplicated(subset=columns, keep='first')
        duplicate_df = df[is_duplicate].reset_index(drop=True)
        df_deduplicated = df[~is_duplicate].copy()

        print("Duplicate removal complete. Duplicate records appended to duplicate_df.")
        return df_deduplicated, duplicate_df

    except Exception as e:
        print(f"Error occurred during duplicate removal: {e}")
        return df, pd.DataFrame()


# prompt: Create a function to remove the time from a date in specified columns

def remove_time_from_date(df, columns):
  """Convert the given columns to plain dates, discarding the time-of-day.

  Each listed column that exists in the frame is parsed with
  ``pd.to_datetime(..., errors='coerce')`` and then reduced to its date part.
  Column names that are not in the frame are skipped. The frame is modified
  in place.

  Args:
    df: The DataFrame containing the date columns.
    columns: A list of column names to process.

  Returns:
    The same DataFrame, with the time component removed from the processed
    columns. If an error occurs it is printed and the DataFrame is returned in
    whatever state it had reached.
  """
  try:
    for name in columns:
      if name not in df.columns:
        continue
      # Parse first (unparseable values are coerced), then keep the date part.
      parsed = pd.to_datetime(df[name], errors='coerce')
      df[name] = parsed
      df[name] = df[name].dt.date
  except Exception as e:
    print(f"An error occurred: {e}")
  return df


# prompt: create a function to validate emails addresses in a dataframe, append the records with an invalid email address to a dataframe and drop them from the original dataframe. Return both the updated dataframe and the error dataframe. Include error checking and loggging.

def validate_and_remove_invalid_emails(df, email_column):
    """
    Splits a dataframe into records with valid and invalid email addresses.

    An address is valid when it is a string matching a basic
    ``local@domain.tld`` pattern. Missing values (NaN/None) and non-string
    values are treated as invalid. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): The dataframe containing email addresses.
        email_column (str): The name of the column containing email addresses.

    Returns:
        tuple: (valid_df, error_df). valid_df keeps the original index labels;
            error_df holds the invalid records with a fresh 0..n-1 index. If an
            error occurs (for example the column is missing) it is printed and
            (df, empty dataframe) is returned.
    """
    try:
        # Regular expression for basic email validation
        email_pattern = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")

        def _is_valid(value):
            # Blank cells read from CSV arrive as float NaN; passing them to the
            # regex would raise TypeError and abort the whole validation.
            return isinstance(value, str) and email_pattern.match(value) is not None

        is_valid = df[email_column].map(_is_valid).astype(bool)

        error_df = df[~is_valid].reset_index(drop=True)
        valid_df = df[is_valid].copy()

        print("Validation complete. Invalid email records appended to error_df.")
        return valid_df, error_df

    except Exception as e:
        print(f"Error occurred during email validation: {e}")
        return df, pd.DataFrame()  # Return empty error dataframe in case of error


# prompt: Create a function to get chunked csvs from a specified folder, runs the validation functions and merges the chunks into a specified final valid csv file and final error csv file. INclude error checking and logging.

def process_chunked_csvs(input_folder, output_valid_csv, output_error_csv, output_duplicates_csv, email_column_name='mail_address', date_columns=('created_at',)):
  """
  Processes chunked CSV files from a specified folder, runs validation functions,
  and merges the results into final valid, error and duplicate CSV files.

  Each ``.csv`` file in ``input_folder`` is read, has its email column
  validated and its date columns reduced to plain dates. A chunk that fails is
  logged and skipped. After all chunks are merged, duplicates are removed
  across the whole dataset using the email column as the key. Progress is
  logged to ``processing_log.txt``.

  Args:
      input_folder (str): The path to the folder containing chunked CSV files.
      output_valid_csv (str): The path to the output CSV file for valid records.
      output_error_csv (str): The path to the output CSV file for error records.
      output_duplicates_csv (str): The path to the output CSV file for duplicate records.
      email_column_name (str): The name of the email column; also the key used
          for duplicate detection.
      date_columns (list): Column names whose time component should be
          removed. The default is an immutable tuple so it cannot be shared and
          mutated between calls.
  """

  def _merge(frames):
    # pd.concat raises on an empty list, e.g. when the folder holds no CSV files.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

  try:
    # Setup logging
    logging.basicConfig(filename='processing_log.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Collect per-chunk frames and concatenate once at the end; growing a
    # dataframe with concat inside the loop re-copies all earlier rows.
    valid_frames = []
    error_frames = []

    # Sorted so that row order, and therefore which duplicate counts as the
    # "first" occurrence, does not depend on os.listdir ordering.
    for filename in sorted(os.listdir(input_folder)):
      if not filename.endswith(".csv"):
        continue

      file_path = os.path.join(input_folder, filename)
      logging.info(f"Processing file: {file_path}")

      try:
        df = pd.read_csv(file_path, low_memory=True)

        # Run validation functions
        df, chunk_error_df = validate_and_remove_invalid_emails(df, email_column_name)
        df = remove_time_from_date(df, date_columns)
      except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")
        continue

      valid_frames.append(df)
      error_frames.append(chunk_error_df)

      logging.info(f"File {file_path} processed successfully.")

    valid_df = _merge(valid_frames)
    error_df = _merge(error_frames)

    # Remove duplicates from the full dataframe. The key follows
    # email_column_name instead of a hard-coded 'mail_address', so callers
    # with a differently named email column are de-duplicated too.
    valid_df, duplicates_df = remove_duplicate_records(valid_df, [email_column_name])

    # Save final dataframes
    valid_df.to_csv(output_valid_csv, index=False)
    error_df.to_csv(output_error_csv, index=False)
    duplicates_df.to_csv(output_duplicates_csv, index=False)

    logging.info(f"Final valid data saved to {output_valid_csv}.")
    logging.info(f"Final error data saved to {output_error_csv}.")
    logging.info(f"Final duplicates data saved to {output_duplicates_csv}.")

  except Exception as e:
    logging.critical(f"Critical error during processing: {e}")


# Example usage:
# Process every chunk CSV in '/content/chunks' with the default
# 'mail_address' email column and 'created_at' date column. Valid records,
# invalid-email records and duplicate records go to the three CSV paths
# given, in that order.
process_chunked_csvs('/content/chunks', 'final_valid_data.csv', 'final_error_data.csv', 'duplicates_df.csv')

  df = pd.read_csv(file_path, low_memory=True)


Validation complete. Invalid email records appended to error_df.
Validation complete. Invalid email records appended to error_df.
Validation complete. Invalid email records appended to error_df.
Validation complete. Invalid email records appended to error_df.
Duplicate removal complete. Duplicate records appended to duplicate_df.


In [None]:
# prompt: Create code to read a csv and show sample of dataframe

import pandas as pd

# Load the cleaned output from '/content/final_valid_data.csv'; replace the
# path with your own CSV file as needed.
df = pd.read_csv('/content/final_valid_data.csv')

# Show a sample of the dataframe (e.g., the first 5 rows)
print(df.head())


  df = pd.read_csv('/content/final_valid_data.csv')


   id    login_id              mail_address                          password  \
0   1    sugimoto   sugimoto@lifebear.co.jp  f0bac04aa1b45cf443d722d6f71c0250   
1   2         kou  nakanishi@lifebear.co.jp  48207c322ee5bb156ffec9f08c960aaa   
2   3      yusuke     yuozawa1208@gmail.com  048261a8024ce51d379eb53cc51aaf33   
3   4  entyan1106        endo1106@gmail.com  cd77a9dac26260a104facda5665eb3ab   
4   5      kuriki          kuriki@wavy4.com  a026597c294cc48cd20ae361f10cbab1   

   created_at          salt birthday_on  gender  
0  2012-01-13  yGwBKynnsctI  1984-11-09     0.0  
1  2012-01-14  aha6EuRYCDvU  1986-11-13     0.0  
2  2012-01-17  PVS59dPWk9BH  1984-12-08     0.0  
3  2012-01-17  vLZI6TVCJowN  1987-11-06     0.0  
4  2012-01-17  swFznWWk79fg  1986-10-21     0.0  


In [None]:
# prompt: Generate code to show all duplicate mail_address from lifebear.csv

import pandas as pd

# Replace '/content/lifebear.csv' with the actual path to your CSV file
# (the file is expected to be semicolon-delimited).
df = pd.read_csv('/content/lifebear.csv', sep=";", low_memory=True)

# Check if 'mail_address' exists in the DataFrame
if 'mail_address' in df.columns:
    # keep=False marks every member of a duplicate group, not only the repeats.
    duplicate_emails = df[df.duplicated(subset=['mail_address'], keep=False)]
    print(duplicate_emails)
    # Export inside this branch: duplicate_emails is only defined when the
    # column exists, so exporting unconditionally would raise NameError.
    duplicate_emails.to_csv('duplicate_emails.csv', index=False)
else:
    print("The 'mail_address' column does not exist in the DataFrame.")

  df = pd.read_csv('/content/lifebear.csv', sep=";", low_memory=True)


               id         login_id                    mail_address  \
136           138        maaam1120             ammma1120@gmail.com   
221           223         exuernok            2br02b1215@gmail.com   
227           229     takayuki0930          takayuki0930@gmail.com   
332           334           hiromi       hiromi.sono.111@gmail.com   
621           623          UZUMAME       uzumame.uzumame@gmail.com   
...           ...              ...                             ...   
3679894  11593957          0enaka0           dara0o0arad@gmail.com   
3679926  11594071         ns109097             ns109097@icloud.com   
3679964  11594208      takayamasae        as_coco_0520@yahoo.co.jp   
3680042  11594511          hyx0630             hyx0630@ezweb.ne.jp   
3680353  11595638  relapishoptest9  relapishoptest9@lifebear.co.jp   

                                 password           created_at          salt  \
136      f2dea97eab78a50d6cc615c2f172c890  2012-05-28 11:58:21  48dpW9JT7u1w   