In [10]:
import pandas as pd
import logging

# Configure the root logger once, at import time: INFO level and above,
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Absolute path to the input data file; handed to main() when this file is
# executed as a script.
# NOTE(review): this is a machine-specific location — confirm or replace it
# before running elsewhere.
file_path = '/Users/san./Documents/GitHub/text-generator-sagemaker/dataset/finefoods_processed.csv'

def custom_parser(file_path, encoding=None):
    """
    Parse a file of blank-line-separated reviews into a list of dicts.

    Each review is a run of ``key: value`` lines; a blank (or
    whitespace-only) line ends the current review. A line that is not a
    ``key: value`` pair is treated as a continuation and appended, separated
    by a single space, to the value of the most recent key in the same
    review. If there is no such key, the line is logged as an error and
    skipped.

    Attributes with an empty value (``key: `` or ``key:``) are kept as an
    empty string instead of being mistaken for a continuation line.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            uses the platform's default encoding.

    Returns:
        A list with one ``dict`` per review, mapping attribute name to its
        string value, in file order.
    """
    structured_data = []
    current_review = {}
    last_key = None  # Most recent key in the current review, for continuation lines
    with open(file_path, 'r', encoding=encoding) as file:
        for line_number, line in enumerate(file, 1):
            line = line.strip()

            if not line:  # Blank line: the current review (if any) is complete
                if current_review:
                    structured_data.append(current_review)
                    current_review = {}
                last_key = None
                continue

            key, separator, value = line.partition(': ')
            if (
                not separator
                and len(line) > 1
                and line.endswith(':')
                and not any(char.isspace() for char in line)
            ):
                # strip() removed the trailing space of an empty-valued
                # attribute ("key: " -> "key:"); recover it as key -> ''.
                # Restricted to whitespace-free tokens so that ordinary
                # sentences ending in ':' still count as continuations.
                key, separator, value = line[:-1], ':', ''

            if separator:
                current_review[key] = value
                last_key = key
            elif last_key:  # Continuation of the previous attribute's value
                current_review[last_key] += ' ' + line
            else:
                logging.error(f"Malformed line {line_number} has no key to append to: {line}")

    if current_review:  # File did not end with a blank line
        structured_data.append(current_review)
    return structured_data

def clean_data(df):
    """
    Clean the review DataFrame in place and return it.

    Steps, in order:
      1. Drop rows that are missing any critical field
         (``product/productId``, ``review/userId``, ``review/score``).
         This must happen before any filling, otherwise the missing values
         would already have been replaced and nothing would be dropped.
      2. Drop exact duplicate rows.
      3. Convert ``review/score`` to float.
      4. Convert ``review/time`` from epoch seconds to datetime; values that
         are missing or not numeric become ``NaT``.
      5. Fill remaining missing values in all other columns with ``''``.

    Args:
        df: DataFrame with at least the critical columns above and
            ``review/time``. It is modified in place.

    Returns:
        The same DataFrame object, cleaned.

    Raises:
        KeyError: If a critical column or ``review/time`` is absent.
        ValueError: If a present ``review/score`` value is not a valid float.
    """
    critical_fields = ['product/productId', 'review/userId', 'review/score']

    # Drop incomplete rows first, while missing values are still NaN/None.
    df.dropna(subset=critical_fields, inplace=True)

    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Correct data types
    df['review/score'] = df['review/score'].astype(float)
    # Coerce to numbers before converting so a missing or malformed
    # timestamp yields NaT instead of aborting the whole clean-up.
    df['review/time'] = pd.to_datetime(
        pd.to_numeric(df['review/time'], errors='coerce'), unit='s'
    )

    # Fill missing text with an empty string; typed columns are left alone so
    # their dtypes are not degraded back to object.
    typed_columns = ('review/score', 'review/time')
    for column in df.columns:
        if column not in typed_columns:
            df[column] = df[column].fillna('')

    return df

def main(file_path):
    """
    Parse the data file at ``file_path``, clean the result, and log a summary.

    Any exception raised along the way is caught and reported through the
    logger as an error; nothing is returned.
    """
    try:
        # Parse the raw file into records and load them into a DataFrame
        records = custom_parser(file_path)
        cleaned = clean_data(pd.DataFrame(records))

        # Report the size of the cleaned data, then a short preview
        logging.info(f"Shape of the cleaned DataFrame: {cleaned.shape}")
        logging.info("First few rows of the cleaned DataFrame:")
        logging.info(cleaned.head())
    except Exception as err:
        logging.error(f"An error occurred: {err}")

# Run the parse-and-clean pipeline on the module-level file_path only when
# this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main(file_path)


2024-04-14 13:50:21,198 - INFO - First 5 lines from the file:
2024-04-14 13:50:21,199 - INFO - product/productId: B001E4KFG0
2024-04-14 13:50:21,200 - INFO - review/userId: A3SGXH7AUHU8GW
2024-04-14 13:50:21,201 - INFO - review/profileName: delmartian
2024-04-14 13:50:21,201 - INFO - review/helpfulness: 1/1
2024-04-14 13:50:21,201 - INFO - review/score: 5.0
2024-04-14 13:50:21,392 - ERROR - Line 753580 is malformed: 88 years old. ...
2024-04-14 13:50:21,559 - ERROR - Line 1416685 is malformed: ...creative powers b...
2024-04-14 13:50:21,585 - ERROR - Line 1521590 is malformed: School Princi...
2024-04-14 13:50:21,774 - ERROR - Line 2270671 is malformed: School Princi...
2024-04-14 13:50:21,910 - ERROR - Line 2809464 is malformed: I am a voracious reader/li...
2024-04-14 13:50:21,963 - ERROR - Line 3018833 is malformed: School Princi...
2024-04-14 13:50:22,276 - ERROR - Line 4306898 is malformed: ...creative powers b...
  df['review/time'] = pd.to_datetime(df['review/time'], unit='s')
2