In [1]:
import pytesseract

In [2]:
# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): this is a hard-coded, machine-specific Windows install path —
# adjust it to wherever tesseract.exe lives on the machine running this notebook.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
import datetime
import os
import re

import pandas as pd
from PIL import Image

# Folder containing the images to process. The value is a relative path, so it is
# resolved against the current working directory of the kernel.
folder_path = "Sleep Time Data"  # Replace with your actual folder path

# Function to extract date from filename (e.g., "2-27 sleep time.jpg" -> "2025-02-27")
def extract_date_from_filename(filename, year=2025):
    """Derive an ISO date string from a "<month>-<day>" token in a filename.

    Parameters
    ----------
    filename : str
        Name of the image file, e.g. "2-27 sleep time.jpg".
    year : int, optional
        Year to combine with the month/day found in the name. Filenames carry
        no year, so it has to be supplied; defaults to 2025.

    Returns
    -------
    str
        "YYYY-MM-DD" for the first month-day token that forms a real calendar
        date in ``year``, or "Unknown" if the filename contains none.
    """
    # The lookarounds stop the 1-2 digit groups from matching inside a longer
    # digit run, so "2025-02-27.jpg" is read as 02-27 rather than month "2025".
    for match in re.finditer(r'(?<!\d)(\d{1,2})-(\d{1,2})(?!\d)', filename):
        month, day = (int(part) for part in match.groups())
        try:
            # datetime.date rejects impossible dates (month 13, Feb 30, ...),
            # so those never reach the output as bogus "valid" strings.
            return datetime.date(year, month, day).isoformat()
        except ValueError:
            continue  # not a real date; try the next candidate token
    return "Unknown"

# Function to extract sleep values based on keywords
def extract_value(text, keyword):
    """Return the duration printed on the line(s) following a label in OCR text.

    The label must be the first alphabetic text on its line, so searching for
    "Sleep time" does not accidentally match the tail of "Actual sleep time".
    Matching is case-insensitive and tolerant of extra spaces between the
    label's words, trailing spaces, blank lines and indentation before the value.

    Parameters
    ----------
    text : str
        Full text produced by OCR for one image.
    keyword : str
        Label to look for, treated as literal text (not as a regex).

    Returns
    -------
    str
        The duration in a form like '7h 53m', '7h', '53m' or '9m', or
        "Not Found" when the label or a duration after it is missing.
    """
    # Escape each word (the label is literal text) and allow any run of
    # spaces/tabs between words, since OCR spacing is unreliable.
    label = r"[ \t]+".join(re.escape(word) for word in keyword.split())
    # Hours may be followed by minutes on the SAME line only; [ \t]* (rather
    # than \s*) keeps digits from the next line out of the captured value.
    duration = r"\d+h(?:[ \t]*\d+m?)?|\d+m"
    pattern = rf"^[^A-Za-z\n]*{label}[ \t]*\n\s*({duration})"
    match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
    return match.group(1).strip() if match else "Not Found"

# Maps each output column to the on-screen label that precedes its value in the OCR text
metric_labels = {
    "Time in Bed": "Time in bed",
    "Sleep Latency": "Sleep latency",
    "Sleep Time": "Sleep time",
    "Actual Sleep Time": "Actual sleep time",
}

# List to store data (one dict per processed image)
sleep_data_list = []

# Process each image in the folder; sorted() makes the processing order
# reproducible, since os.listdir returns entries in arbitrary order
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
        continue  # Not an image; skip it

    image_path = os.path.join(folder_path, filename)

    # Extract text from image; the context manager closes the file handle
    # as soon as OCR finishes instead of leaking one handle per image
    with Image.open(image_path) as image:
        extracted_text = pytesseract.image_to_string(image)

    # Date comes from the filename, sleep metrics from the OCR text
    record = {"Date": extract_date_from_filename(filename)}
    for column, label in metric_labels.items():
        record[column] = extract_value(extracted_text, label)

    sleep_data_list.append(record)

# Convert list to DataFrame. Passing the columns explicitly keeps the frame
# well-formed (and sortable by "Date") even when the folder holds no images.
df = pd.DataFrame(sleep_data_list, columns=["Date", *metric_labels])

# Sort data by Date. ISO "YYYY-MM-DD" strings sort chronologically and "Unknown"
# sorts after them; the stable sort keeps filename order among equal dates.
df = df.sort_values(by="Date", kind="stable")

# Save to CSV
csv_filename = "all_sleep_data_sorted.csv"
df.to_csv(csv_filename, index=False)

print(f"All sleep data sorted by date saved to {csv_filename}")



All sleep data sorted by date saved to all_sleep_data_sorted.csv
