In [1]:
pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
from glob import glob
from io import StringIO

In [3]:
def json_to_csv(json_file):
    """Convert a line-delimited JSON file to CSV, excluding the 'hover_log' column.

    Every non-blank line of ``json_file`` is parsed on its own with
    ``pd.read_json``. The per-line frames are concatenated and written next to
    the input file, with the input's final extension swapped for ``.csv``.

    Args:
        json_file: Path of the file to convert.

    Returns:
        The path of the CSV file that was written, or ``None`` when the input
        file does not exist or no line could be parsed.
    """
    dfs = []
    try:
        with open(json_file, encoding="utf8") as file:
            for line in file:
                # Blank separator lines carry no data; skip them instead of
                # reporting each one as a parse error.
                if not line.strip():
                    continue
                try:
                    # Convert JSON string to DataFrame
                    df = pd.read_json(StringIO(line))
                    dfs.append(df)
                except ValueError as e:
                    print(f"Error reading JSON line: {line}\n{e}")
        if dfs:
            df = pd.concat(dfs)
            # Remove the 'hover_log' column if it exists
            if 'hover_log' in df.columns:
                df = df.drop(columns=['hover_log'])
            # Swap only the final extension. A plain str.replace('.json', '.csv')
            # would also rewrite '.json' inside directory names, and would
            # return the input path unchanged (overwriting the source file)
            # when the extension is spelled differently, e.g. '.JSON'.
            csv_file = os.path.splitext(json_file)[0] + '.csv'
            df.to_csv(csv_file, index=False)
            print(f"Converted {json_file} to {csv_file}")
            return csv_file
        else:
            print(f"No data found in {json_file}")
            return None
    except FileNotFoundError as e:
        print(f"File not found: {json_file}\n{e}")
        return None

def list_files(folder_path, pattern):
    """Return every path under ``folder_path`` (at any depth) whose name matches ``pattern``."""
    # '**' combined with recursive=True matches zero or more nested directories,
    # so files sitting directly in folder_path are included as well.
    recursive_pattern = os.path.join(folder_path, '**', pattern)
    matches = glob(recursive_pattern, recursive=True)
    return matches

def convert_json_files_to_csv(folder_path):
    """Run ``json_to_csv`` on every ``*.json`` file under ``folder_path``.

    Returns the list of CSV paths for the files that converted successfully;
    files for which ``json_to_csv`` returned a falsy value are left out.
    """
    # Lazily convert each discovered file, in discovery order.
    conversion_results = (
        json_to_csv(source_path)
        for source_path in list_files(folder_path, '*.json')
    )
    return [csv_path for csv_path in conversion_results if csv_path]

def read_csv_files(csv_files):
    """Load each CSV path into a DataFrame and return the frames as a list.

    Paths that do not exist or point at an empty file are reported on stdout
    and skipped, so the result may be shorter than ``csv_files``.
    """
    loaded_frames = []
    for path in csv_files:
        try:
            frame = pd.read_csv(path)
        except FileNotFoundError as err:
            print(f"CSV file not found: {path}\n{err}")
        except pd.errors.EmptyDataError as err:
            print(f"CSV file is empty: {path}\n{err}")
        else:
            loaded_frames.append(frame)
            print(f"Read {path} into DataFrame")
    return loaded_frames

# Root folder that contains all the batch folders; it is searched recursively
# for *.json files.
# NOTE(review): this is an absolute path on one specific machine — the cell
# will find no files anywhere else; consider making it configurable.
main_folder_path = 'C:/Users/nicol/Documents/Masters/Ox/MSc PsyRes/Research Project/Data Collection - Actual/Speed Data/batches_all'

# Convert every JSON file under the root folder to CSV. Side effect: a .csv
# file is written for each JSON file that converts successfully.
# csv_files holds the paths of the CSVs that were written.
csv_files = convert_json_files_to_csv(main_folder_path)

# Read the CSVs that were just written back in, one DataFrame per file.
# data_frames is consumed by the combine-and-save cell further down.
data_frames = read_csv_files(csv_files)


Converted C:/Users/nicol/Documents/Masters/Ox/MSc PsyRes/Research Project/Data Collection - Actual/Speed Data/batches_all\batch_1\study_result_2301\comp-result_3991\files\20_data_1721392760000.json to C:/Users/nicol/Documents/Masters/Ox/MSc PsyRes/Research Project/Data Collection - Actual/Speed Data/batches_all\batch_1\study_result_2301\comp-result_3991\files\20_data_1721392760000.csv
Converted C:/Users/nicol/Documents/Masters/Ox/MSc PsyRes/Research Project/Data Collection - Actual/Speed Data/batches_all\batch_1\study_result_2302\comp-result_3993\files\21_data_1721393861000.json to C:/Users/nicol/Documents/Masters/Ox/MSc PsyRes/Research Project/Data Collection - Actual/Speed Data/batches_all\batch_1\study_result_2302\comp-result_3993\files\21_data_1721393861000.csv
Converted C:/Users/nicol/Documents/Masters/Ox/MSc PsyRes/Research Project/Data Collection - Actual/Speed Data/batches_all\batch_1\study_result_2303\comp-result_3996\files\22_data_1721394374000.json to C:/Users/nicol/Document

In [4]:
# Stack the per-file DataFrames into a single table and persist it,
# unless nothing was loaded in the previous step.
if not data_frames:
    print("No data frames to combine.")
else:
    # ignore_index=True gives the combined table a fresh 0..n-1 row index
    combined_df = pd.concat(data_frames, ignore_index=True)
    print("Combined all DataFrames into one.")

    # Destination of the combined table (relative to the working directory);
    # adjust the path/filename here if needed
    combined_csv_path = 'combined_data_speed.csv'
    combined_df.to_csv(combined_csv_path, index=False)
    print(f"Saved combined DataFrame to {combined_csv_path}")

Combined all DataFrames into one.
Saved combined DataFrame to combined_data_speed.csv
