In [5]:
import os
import pandas as pd
from itertools import product
from glob import glob
from gzip import BadGzipFile


In [6]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# First and last year to scan (both ends inclusive).
start_year = 2017
end_year = 2017

# Months to scan within each year.
# Use list(range(1, 13)) for the whole year, or e.g. [1, 2, 3] for Jan, Feb, Mar.
desired_months = [9]

# Days to scan within each month.
# Use list(range(1, 32)) for every day, or e.g. [1, 15, 30] for selected days.
desired_days = [6]

# Acronyms whose files should be loaded.
acronyms = [
    "ATHA",
    "FSIM",
    "FSMI",
    "GILL",
    "PINA",
    "RANK",
    "SNKQ",
]

# Root directory of the input files (adjust as needed);
# by default they are expected under a folder named 'carisma_data'.
data_dir = './carisma_data/'

# Accumulator: every acronym gets its own, independent, initially empty list
# that will collect the DataFrames loaded for it.
dfs_dict = dict((acronym, []) for acronym in acronyms)


In [7]:
def _parse_averaged_records(lines):
    """Turn the raw lines of one data file into a DataFrame of averaged samples.

    Scanning starts at index 1 (index 0 is never inspected). A line whose first
    whitespace-separated token consists only of digits is treated as a
    timestamp in ``%Y%m%d%H%M%S`` form (after the `` .`` / `` x`` markers are
    stripped). The two lines following it are each parsed as a row of floats
    and averaged element-wise. Any other line is skipped.

    Parameters
    ----------
    lines : sequence
        Raw line values, e.g. the first column returned by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Timestamp``, ``x``, ``y``, ``z``; one row per timestamp found.

    Raises
    ------
    ValueError
        If a timestamp or value line cannot be converted, or a record does not
        produce exactly three averaged values.
    """
    records = []
    i = 1
    # A record needs the timestamp line plus the two lines after it.
    while i < len(lines) - 2:
        stamp_text = str(lines[i])
        tokens = stamp_text.split()
        # `tokens` is empty for whitespace-only lines; treat those as non-timestamps
        # instead of failing on tokens[0].
        if tokens and tokens[0].isdigit():
            stamp_text = stamp_text.replace(' .', '').replace(' x', '')
            timestamp_dt = pd.to_datetime(stamp_text, format='%Y%m%d%H%M%S')
            first_row = [float(tok) for tok in str(lines[i + 1]).split()]
            second_row = [float(tok) for tok in str(lines[i + 2]).split()]
            averaged = [(v1 + v2) / 2 for v1, v2 in zip(first_row, second_row)]
            records.append([timestamp_dt] + averaged)
            i += 3
        else:
            i += 1
    return pd.DataFrame(records, columns=["Timestamp", "x", "y", "z"])


# Start from empty per-acronym lists so that re-running this cell does not
# append the same files a second time.
for frames in dfs_dict.values():
    frames.clear()

# Loop through each combination of year, month, day, and acronym using itertools.product
for year, month, day, acronym in product(range(start_year, end_year + 1), desired_months, desired_days, acronyms):
    folder_name = f'{year}-{month:02d}-{day:02d}'
    file_path = os.path.join(data_dir, folder_name, f'{year}{month:02d}{day:02d}{acronym}.F02.gz.txt')

    print(f"Trying to access: {file_path}")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # --- Phase 1: read the file (plain text first, gzip as a fallback) ---
    try:
        data = pd.read_csv(file_path, header=None)
        print(f"Successfully read the file as a plain text file: {file_path}")
    except Exception as e:
        print(f"Error reading the file as plain text: {e}")
        try:
            data = pd.read_csv(file_path, header=None, compression='gzip')
            print(f"Successfully read the file as a gzipped file: {file_path}")
        except BadGzipFile:
            print(f"File {file_path} is not a gzipped file.")
            continue
        except Exception as gz_err:
            # Keep going with the remaining files instead of aborting the loop.
            print(f"Error reading the file as gzip: {gz_err}")
            continue

    # --- Phase 2: parse whichever read succeeded ---
    # Kept separate from reading so that a parse failure is reported as such and
    # so that data obtained through the gzip fallback is parsed and stored too.
    try:
        df_averaged = _parse_averaged_records(data[0].tolist())
    except Exception as parse_err:
        print(f"Error parsing {file_path}: {parse_err}")
        continue

    dfs_dict[acronym].append(df_averaged)

Trying to access: ./carisma_data/2017-09-06/20170906ATHA.F02.gz.txt


Successfully read the file as a plain text file: ./carisma_data/2017-09-06/20170906ATHA.F02.gz.txt
Trying to access: ./carisma_data/2017-09-06/20170906FSIM.F02.gz.txt
Successfully read the file as a plain text file: ./carisma_data/2017-09-06/20170906FSIM.F02.gz.txt
Trying to access: ./carisma_data/2017-09-06/20170906FSMI.F02.gz.txt
Successfully read the file as a plain text file: ./carisma_data/2017-09-06/20170906FSMI.F02.gz.txt
Trying to access: ./carisma_data/2017-09-06/20170906GILL.F02.gz.txt
Successfully read the file as a plain text file: ./carisma_data/2017-09-06/20170906GILL.F02.gz.txt
Trying to access: ./carisma_data/2017-09-06/20170906PINA.F02.gz.txt
Successfully read the file as a plain text file: ./carisma_data/2017-09-06/20170906PINA.F02.gz.txt
Trying to access: ./carisma_data/2017-09-06/20170906RANK.F02.gz.txt
Successfully read the file as a plain text file: ./carisma_data/2017-09-06/20170906RANK.F02.gz.txt
Trying to access: ./carisma_data/2017-09-06/20170906SNKQ.F02.gz.txt

In [8]:
# Preview the loaded data: for every acronym that has at least one DataFrame,
# stitch its frames together and show the first rows.
for acronym, dfs in dfs_dict.items():
    if not dfs:
        # Nothing was loaded for this acronym, so there is nothing to show.
        continue
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"Data for {acronym}:")
    display(combined_df.head())

Data for ATHA:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,14055.547,-470.0045,55431.8855
1,2017-09-06 00:00:01,14055.4055,-470.1185,55431.846
2,2017-09-06 00:00:02,14055.278,-470.225,55431.8135
3,2017-09-06 00:00:03,14055.2055,-470.3045,55431.81
4,2017-09-06 00:00:04,14054.989,-470.2915,55431.767


Data for FSIM:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,11501.788,2842.651,57300.4465
1,2017-09-06 00:00:01,11501.8995,2842.7975,57300.253
2,2017-09-06 00:00:02,11502.0075,2842.9855,57300.103
3,2017-09-06 00:00:03,11502.1295,2843.181,57299.928
4,2017-09-06 00:00:04,11502.3615,2843.4045,57299.7595


Data for FSMI:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,11203.2515,-533.4965,57648.0565
1,2017-09-06 00:00:01,11203.028,-533.5475,57648.044
2,2017-09-06 00:00:02,11202.7535,-533.5825,57648.056
3,2017-09-06 00:00:03,11202.555,-533.5945,57648.069
4,2017-09-06 00:00:04,11202.3595,-533.552,57648.084


Data for GILL:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,11299.0205,371.179,58435.544
1,2017-09-06 00:00:01,11298.9965,371.1205,58435.5845
2,2017-09-06 00:00:02,11298.895,370.986,58435.728
3,2017-09-06 00:00:03,11298.709,370.8585,58435.972
4,2017-09-06 00:00:04,11298.556,370.7515,58436.05


Data for PINA:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,15477.39,-11.3725,55285.119
1,2017-09-06 00:00:01,15477.3235,-11.459,55285.106
2,2017-09-06 00:00:02,15477.269,-11.5115,55285.0595
3,2017-09-06 00:00:03,15477.221,-11.557,55285.0565
4,2017-09-06 00:00:04,15477.192,-11.641,55285.028


Data for RANK:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,6305.5255,417.3755,58871.8875
1,2017-09-06 00:00:01,6305.5835,417.412,58871.922
2,2017-09-06 00:00:02,6305.579,417.3795,58871.928
3,2017-09-06 00:00:03,6305.4885,417.3735,58871.9345
4,2017-09-06 00:00:04,6305.4615,417.4115,58871.9655


Data for SNKQ:


Unnamed: 0,Timestamp,x,y,z
0,2017-09-06 00:00:00,11414.843,27.581,56240.0705
1,2017-09-06 00:00:01,11416.292,29.642,56242.086
2,2017-09-06 00:00:02,11415.116,26.1825,56239.726
3,2017-09-06 00:00:03,11416.275,29.3055,56242.636
4,2017-09-06 00:00:04,11415.569,29.396,56242.424


In [None]:
# ######## WRITE RESULTING FILES INTO A FOLDER NAMED 'output_data' (CREATED AUTOMATICALLY IF IT DOES NOT EXIST)
# This export step is disabled; uncomment the lines below to write one tab-separated .dat file per acronym.

# output_directory = "./output_data/"  # Adjust this to the desired directory for the output .dat files

# # Ensure the output directory exists
# os.makedirs(output_directory, exist_ok=True)

# # Construct the date range string for the filename (Output will look like: ATHA_data_20170906_to_20170906.dat)
# date_range_str = f"{start_year}{desired_months[0]:02d}{desired_days[0]:02d}_to_{end_year}{desired_months[-1]:02d}{desired_days[-1]:02d}"

# # Save the DataFrames to .dat files
# # NOTE: dfs_dict maps each acronym to a LIST of DataFrames, so the list is
# # concatenated into a single DataFrame before writing; empty lists are skipped.
# for acronym, dfs in dfs_dict.items():
#     if dfs:
#         combined_df = pd.concat(dfs, ignore_index=True)
#         output_file_path = os.path.join(output_directory, f"{acronym}_data_{date_range_str}.dat")
#         combined_df.to_csv(output_file_path, index=False, sep="\t")  # Using tab separator for .dat files
#         print(f"Saved data for {acronym} to: {output_file_path}")