In [1]:
import pandas as pd
import glob
import numpy as np

import os
import zipfile
from tqdm import tqdm

### Data concatenation

The code below unzips the initial archive into a temporary folder and iterates over all available files so that every folder (`nn5`, `mipt_alpha` and `danish_atm_daily`) is packed into a separate dataframe and saved as a .csv file. After the transfer is complete, all temporary folders are removed automatically.

In [32]:
def concatenate_csv_files(folder_path, output_file, exclude_column_name):
    """
    Merge every CSV file of a folder side-by-side into a single CSV file.

    Each file is read with ``pd.read_csv``. Helper columns (the excluded
    column(s), ``date`` and any column whose name contains ``Unnamed``) are
    discarded, and every remaining column is renamed to
    ``<file base name>_<column name>s``. The per-file frames are then joined
    column-wise on their row index, so files of different length leave NaNs,
    which are filled by forward linear interpolation before saving.

    The source CSV files are deleted, but only after the output file has been
    written successfully.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        output_file (str): Path to the output CSV file.
        exclude_column_name (str | list[str] | None): Name, or collection of
            names, of the column(s) to leave out of the result.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` files.
    """

    # Sorted so the column order of the output does not depend on the
    # arbitrary order in which the file system lists the files.
    all_files = sorted(glob.glob(f"{folder_path}/*.csv"))

    if not all_files:
        raise FileNotFoundError(f"No CSV files found in the folder: {folder_path}")

    # Accept a single column name as well as a collection of names.
    if exclude_column_name is None:
        excluded = set()
    elif isinstance(exclude_column_name, str):
        excluded = {exclude_column_name}
    else:
        excluded = set(exclude_column_name)
    excluded.add('date')

    frames = []
    for filename in all_files:
        # Base name used as column prefix: e.g. 'data1' from 'path/to/data1.csv'
        base_name = os.path.splitext(os.path.basename(filename))[0]

        temp_df = pd.read_csv(filename)

        # Filter on the ORIGINAL column names: filtering after the prefix has
        # been added would also discard every column of a file whose base name
        # happens to start with 'date' or to contain 'Unnamed'.
        kept = [col for col in temp_df.columns
                if col not in excluded and 'Unnamed' not in col]
        renamed = {col: f"{base_name}_{col}s" for col in kept}
        frames.append(temp_df[kept].rename(columns=renamed))

    # A single column-wise concat instead of growing the frame inside the loop.
    concatenated_df = pd.concat(frames, axis=1)

    # Fill NaN values with linear interpolation (forward direction only, so
    # NaNs located before the first valid value of a column are left as-is).
    concatenated_df = concatenated_df.interpolate(
        method='linear', limit_direction='forward', axis=0
    )

    # Save the concatenated DataFrame to a new CSV file
    concatenated_df.to_csv(output_file, index=False)

    # Remove the inputs only now, so a failure above never destroys source
    # files without having produced the output.
    for filename in all_files:
        os.remove(filename)


In [33]:
# Scratch directory the archive is unpacked into; it is removed again below
temp_path = './data/temp'

# Extract zip file to the temporary directory
with zipfile.ZipFile('./data/time_series.zip', 'r') as zip_ref:
    zip_ref.extractall(temp_path)

# One input folder and one output CSV per extracted top-level entry.
# Sorted so the processing order (and the order of `paths_out`, which later
# cells iterate over) does not depend on the file system's listing order.
dirs = sorted(os.listdir(temp_path))
paths_in = [temp_path + '/' + dir_name for dir_name in dirs]
paths_out = ['./data/' + dir_name + '.csv' for dir_name in dirs]

for path, name in tqdm(zip(paths_in, paths_out), total=len(paths_in)):
    # The third argument is the name of the column to exclude, so pass the
    # name itself rather than a list wrapped around it.
    concatenate_csv_files(path, name, 'date')
    # Remove empty folder (the call above deletes the CSV files it has read)
    os.rmdir(path)

# os.rmdir only succeeds on an empty directory, so this also confirms that
# every extracted folder has been processed and removed.
os.rmdir(temp_path)

100%|██████████| 3/3 [00:15<00:00,  5.21s/it]


In [34]:
# Report, for each generated CSV file, whether it contains any missing value
for name in paths_out:
    df = pd.read_csv(name)
    # True as soon as a single cell of the frame is NaN
    has_missing = bool(df.isna().to_numpy().any())
    message = f'There are NaNs in {name}' if has_missing else f'There is no NaN in {name}'
    print(message)

There is no NaN in ./data/danish_atm_daily.csv
There is no NaN in ./data/mipt_alpha.csv
There is no NaN in ./data/nn5.csv


In [35]:
# Preview the merged mipt_alpha table. Forward slashes keep the path portable
# across operating systems; a backslash inside a normal string literal would
# be treated as the start of an escape sequence.
pd.read_csv('./data/mipt_alpha.csv')

Unnamed: 0,mipt_alpha_0_values,mipt_alpha_1_values,mipt_alpha_10_values,mipt_alpha_100_values,mipt_alpha_101_values,mipt_alpha_102_values,mipt_alpha_103_values,mipt_alpha_104_values,mipt_alpha_105_values,mipt_alpha_106_values,...,mipt_alpha_90_values,mipt_alpha_91_values,mipt_alpha_92_values,mipt_alpha_93_values,mipt_alpha_94_values,mipt_alpha_95_values,mipt_alpha_96_values,mipt_alpha_97_values,mipt_alpha_98_values,mipt_alpha_99_values
0,146100.0,1523400.0,215500.0,1457000.0,169200.0,891000.0,4390000.0,720600.0,1286500.0,2163600.0,...,1853200.0,628000.0,303203.0,346000.0,907800.0,521400.0,432100.0,71500.0,1488000.0,479000.0
1,178000.0,1188200.0,268800.0,951000.0,970900.0,1827200.0,3753300.0,831200.0,1388900.0,3150800.0,...,1037400.0,1549000.0,786500.0,643100.0,1959500.0,969600.0,270500.0,267200.0,1625500.0,1260000.0
2,209900.0,2183400.0,2128100.0,1338000.0,2156200.0,528200.0,512000.0,1006300.0,902900.0,3349900.0,...,610500.0,1629000.0,852900.0,259900.0,0.0,236800.0,502300.0,528400.0,792000.0,999000.0
3,408600.0,1603400.0,2578300.0,1853000.0,1165100.0,956900.0,0.0,646800.0,2121100.0,3606900.0,...,1570700.0,919000.0,980600.0,935800.0,0.0,1365200.0,487500.0,266800.0,679500.0,1435000.0
4,1344200.0,584800.0,1481400.0,1124000.0,1378100.0,2257500.0,1928100.0,1610900.0,961000.0,3740800.0,...,1105500.0,1417500.0,1216900.0,299600.0,886700.0,1294800.0,1219000.0,253100.0,756900.0,3452000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,5111100.0,5493000.0,305900.0,3665000.0,12416300.0,4165000.0,1276700.0,4004600.0,6617100.0,5818700.0,...,9411500.0,8229000.0,3792000.0,4120800.0,4559600.0,3504900.0,457000.0,3031000.0,9237800.0,11500000.0
727,3316700.0,3679400.0,305900.0,3665000.0,14976700.0,9851500.0,1276700.0,6008700.0,9788800.0,5818700.0,...,8726200.0,13753000.0,6332000.0,7118800.0,4559600.0,3504900.0,457000.0,4300000.0,9864100.0,12018000.0
728,11365600.0,4134100.0,305900.0,3665000.0,11522600.0,4888000.0,1276700.0,9972700.0,7439800.0,5818700.0,...,11040000.0,19665500.0,7678000.0,4883400.0,4559600.0,3504900.0,457000.0,5473700.0,8655100.0,13376600.0
729,8075500.0,6649900.0,305900.0,3665000.0,23420900.0,756300.0,1276700.0,7092200.0,7319500.0,5818700.0,...,5795900.0,11452000.0,5560000.0,7431800.0,4559600.0,3504900.0,457000.0,3784300.0,4182000.0,10341000.0
