In [15]:
import pandas as pd
import json
import logging
from datetime import datetime

In [19]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [20]:
def parse_log_line(line):
    try:
        # Extract JSON data from log line
        if 'Collected battery data:' in line:
            json_str = line.split('Collected battery data: ')[1]
            # Replace single quotes with double quotes for valid JSON format
            json_str = json_str.replace("'", '"')
            # Replace Python None with JSON null for compatibility
            json_str = json_str.replace('None', 'null')
            # Strip leading/trailing spaces and extra characters
            json_str = json_str.strip()
            
            # Ensure the string is valid JSON
            if json_str.startswith('{') and json_str.endswith('}'):
                return json.loads(json_str)
            else:
                logger.error(f"Invalid JSON format in line: {line}")
                return None
    except json.JSONDecodeError as e:
        logger.error(f"JSONDecodeError: {e} for line: {line}")
        return None
    except Exception as e:
        logger.error(f"Error parsing line: {e}")
        return None

def preprocess_battery_data():
    try:
        # Read raw log file
        data_list = []
        log_file_path = '../data/battery_data.csv'
        
        # Check if the file exists
        try:
            with open(log_file_path, 'r') as f:
                for line in f:
                    data = parse_log_line(line)
                    if data:
                        data_list.append(data)
        except FileNotFoundError:
            logger.error(f"File not found: {log_file_path}")
            return None
        
        # Convert to DataFrame
        df = pd.DataFrame(data_list)

        # Check if required columns exist
        expected_columns = ['battery_percentage', 'voltage', 'time', 'timestamp']
        missing_columns = [col for col in expected_columns if col not in df.columns]
        if missing_columns:
            logger.error(f"Missing columns: {missing_columns}")
            return None

        # Convert types
        df['battery_percentage'] = pd.to_numeric(df['battery_percentage'], errors='coerce')
        df['voltage'] = pd.to_numeric(df['voltage'], errors='coerce')
        df['time'] = pd.to_numeric(df['time'], errors='coerce')
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

        # Calculate time differences
        df['time_diff'] = df['time'].diff()

        # Feature engineering
        df['voltage_change'] = df['voltage'].diff()
        df['battery_change'] = df['battery_percentage'].diff()

        # Handle missing values (if any NaNs)
        df.fillna(method='ffill', inplace=True)

        # Save cleaned data
        cleaned_data_path = '../data/clean_battery_data.csv'
        df.to_csv(cleaned_data_path, index=False)
        logger.info(f"Cleaned data saved to {cleaned_data_path}, shape: {df.shape}")

        return df
        
    except Exception as e:
        logger.error(f"Error preprocessing data: {e}")
        raise

In [21]:
if __name__ == "__main__":
    df = preprocess_battery_data()
    if df is not None:
        print("\nFirst few rows of cleaned data:")
        print(df.head())

  df.fillna(method='ffill', inplace=True)
INFO:__main__:Cleaned data saved to ../data/clean_battery_data.csv, shape: (300, 9)



First few rows of cleaned data:
   battery_percentage  voltage temperature full_charge_capacity          time  \
0                  53    11165        None                 None  1.732023e+09   
1                  53    12043        None                 None  1.732023e+09   
2                  54    12130        None                 None  1.732023e+09   
3                  55    12176        None                 None  1.732023e+09   
4                  55    12243        None                 None  1.732023e+09   

            timestamp  time_diff  voltage_change  battery_change  
0 2024-11-19 18:57:33        NaN             NaN             NaN  
1 2024-11-19 18:58:33  60.215339           878.0             0.0  
2 2024-11-19 18:59:33  60.273084            87.0             1.0  
3 2024-11-19 19:00:33  60.220394            46.0             1.0  
4 2024-11-19 19:01:33  60.280137            67.0             0.0  
