In [22]:
import pandas as pd
import glob
import numpy as np

import os
import zipfile
from tqdm import tqdm

### Data concatenation

The code below unzips the initial archive into a temporary folder and iterates over all extracted files so that every folder (`nn5`, `mipt_alpha` and `danish_atm_daily`) is packed into a separate dataframe and saved as a .csv file. After the transfer is complete, all temporary folders are removed automatically.

In [23]:
def concatenate_csv_files(folder_path, output_file, exclude_column_name):
    """
    Concatenate every CSV file in a folder side-by-side into one CSV file.

    Each file is read, the excluded column(s) and any column whose name starts
    with 'Unnamed' or 'date' are dropped, and the remaining columns are joined
    column-wise. Files with fewer rows leave NaN gaps, which are then filled by
    forward linear interpolation. The source files are deleted only after the
    output file has been written.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        output_file (str): Path to the output CSV file.
        exclude_column_name (str | list[str] | None): Column name, or a
            collection of column names, to drop from every file. Names that
            are not present in a file are ignored.

    Raises:
        FileNotFoundError: If the folder contains no CSV files.
    """
    # glob returns matches in arbitrary order; sorting keeps the column order
    # of the output reproducible between runs and machines.
    all_files = sorted(glob.glob(f"{folder_path}/*.csv"))

    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the folder: {folder_path}")

    # Accept both a single name and a collection of names. Wrapping a list in
    # another list would make the drop below silently match nothing.
    if exclude_column_name is None:
        excluded = []
    elif isinstance(exclude_column_name, str):
        excluded = [exclude_column_name]
    else:
        excluded = list(exclude_column_name)

    frames = []
    for filename in all_files:
        temp_df = pd.read_csv(filename).drop(columns=excluded, errors='ignore')
        # Drop leftover index columns ("Unnamed: 0", ...) and date columns
        cols_to_drop = [
            col for col in temp_df.columns
            if str(col).startswith(('Unnamed', 'date'))
        ]
        frames.append(temp_df.drop(columns=cols_to_drop))

    # Concatenate once instead of growing a DataFrame inside the loop
    concatenated_df = pd.concat(frames, axis=1)

    # Fill NaN values (e.g. the tails of shorter files) with linear interpolation
    concatenated_df = concatenated_df.interpolate(
        method='linear', limit_direction='forward', axis=0
    )

    # Save the concatenated DataFrame to a new CSV file
    concatenated_df.to_csv(output_file, index=False)

    # Remove the inputs only after the output exists, so a failure above does
    # not lose the extracted data.
    for filename in all_files:
        os.remove(filename)

In [24]:
temp_path = './data/temp'

# Extract the zip archive into the temporary directory
with zipfile.ZipFile('./data/time_series.zip', 'r') as zip_ref:
    zip_ref.extractall(temp_path)

# One input folder per extracted entry, and one output CSV named after it.
# os.listdir returns entries in arbitrary order, so sort to keep `paths_out`
# (and the report printed from it later) deterministic.
dirs = sorted(os.listdir(temp_path))
paths_in = [temp_path + '/' + dir_name for dir_name in dirs]
paths_out = ['./data/' + dir_name + '.csv' for dir_name in dirs]

for path, name in tqdm(zip(paths_in, paths_out), total=len(paths_in)):
    # Pass the column name itself: concatenate_csv_files wraps its argument in
    # a list, so passing ['date'] would make the by-name exclusion a no-op.
    concatenate_csv_files(path, name, 'date')
    # The function deletes the CSV files it read, so the folder is now empty
    os.rmdir(path)

# Every sub-folder has been removed, so the temporary root can go as well
os.rmdir(temp_path)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.60s/it]


In [25]:
# Report, for every generated CSV file, whether any missing values remain
for name in paths_out:
    df = pd.read_csv(name)
    has_missing = df.isna().to_numpy().any()
    message = f'There are NaNs in {name}' if has_missing else f'There is no NaN in {name}'
    print(message)

There is no NaN in ./data/danish_atm_daily.csv
There is no NaN in ./data/mipt_alpha.csv
There is no NaN in ./data/nn5.csv
