In [1]:
import pandas as pd
import glob
import os
import re

import re
import pandas as pd
import os
import pytz
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
from concurrent.futures import ThreadPoolExecutor

#### Data Merging

In [3]:
def merge(source_dir, location, type, target_dir):
    print(f'Merging for {location} : {type}')
    print('='*50)

    path = f"{source_dir}/{location}/{type}"

    # Print the path for debugging
    print(f"Looking for CSV files in: {path}")

    # Use glob to get all the CSV files in the directory
    all_files = glob.glob(os.path.join(path, "*.csv"))


    # Initialize an empty list to hold the DataFrames
    df_list = []

    # Loop through the list of files and read each file into a DataFrame
    print("Reading file...")
    for file in tqdm(all_files):
        try:
            df = pd.read_csv(file, encoding='ascii')
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    # Check if df_list is still empty after the loop
    if len(df_list) == 0:
        raise ValueError("No DataFrames were created. Check if the CSV files are valid and readable.")

    # Concatenate all the DataFrames in the list into a single DataFrame
    try:
        merged_df = pd.concat(df_list, ignore_index=True)
    except Exception as e:
        raise ValueError(f"Error concatenating DataFrames: {e}")

    ## Taking cols related to time, meteorological parameters and pm2.5
    # necessary_cols = ['UTCDateTime', 'current_humidity', 'current_temp_f', 'current_dewpoint_f', 'pressure', 'pm2_5_cf_1', 'pm2_5_atm', 'pm2_5_cf_1_b', 'pm2_5_atm_b']
    # merged_df = merged_df[necessary_cols].copy()

    # # New algorithm
    # def calculate_average_with_error_check(value1, value2, threshold=0.10): ## Updated logic
    #     avg = (value1 + value2)/2
    #     if avg < 5: avg = np.nan
    #     if avg > 1000 : avg = np.nan   ## initially taking upper limit of 1000. It was farther readjusted to 500 afterwards
    #     if value1 > 100:
    #         if value1 == 0 or value2 == 0:
    #             error = float('inf')
    #         else:
    #             error = abs(value1 - value2) / value1
    #         if error <= threshold:
    #             return avg
    #         else:
    #             return np.nan
    #     else:
    #         if np.absolute(value1-value2) <= 10:
    #             return avg
    #         else:
    #             return np.nan
        

    # merged_df['pm2_5_atm_avg'] = merged_df.apply(
    #         lambda row: calculate_average_with_error_check(row['pm2_5_atm'], row['pm2_5_atm_b']), axis=1)


    # merged_df['pm2_5_cf_1_avg'] = merged_df.apply(
    #         lambda row: calculate_average_with_error_check(row['pm2_5_cf_1'], row['pm2_5_cf_1_b']), axis=1)

    os.makedirs(f'{target_dir}/{location}', exist_ok=True)
    try:
        merged_df.to_csv(f'{target_dir}/{location}/{type}.csv', index = False)
    except Exception as e:
        print("Error in saving file...")
        print(e)
        while True:
            print("1. Retry")
            print("2. Skip")
            option = int(input())
            if option == 1:
                try: 
                    merged_df.to_csv(f'Data/Raw files/{location}/{type}_merged.csv', index = False)
                except Exception as e:
                    print("Error again", e)
                    continue
            elif option == 2:
                break
            else:
                print("invalid input")
                continue


    print(f'file saved : Data/Raw files/{location}/{type}_merged.csv')

    # length = merged_df.shape[0]
    # atmdata = length - merged_df['pm2_5_atm_avg'].isna().sum()
    # cfdata = length - merged_df['pm2_5_cf_1_avg'].isna().sum()

    # return atmdata/length*100, cfdata/length*100, length


# merge("Ajimpur home data", "indoor type No 12")

In [4]:
source_dir = 'data'
target_dir = 'merged'
locations = [f for f in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, f))]


for location in locations:
    types = ["Indoor", "Outdoor"]
    for type in types:
        merge(source_dir, location, type, target_dir)



Merging for H1 : Indoor
Looking for CSV files in: data/H1/Indoor
Reading file...


  0%|          | 0/482 [00:00<?, ?it/s]

file saved : Data/Raw files/H1/Indoor_merged.csv
Merging for H1 : Outdoor
Looking for CSV files in: data/H1/Outdoor
Reading file...


  0%|          | 0/415 [00:00<?, ?it/s]

file saved : Data/Raw files/H1/Outdoor_merged.csv
Merging for H2 : Indoor
Looking for CSV files in: data/H2/Indoor
Reading file...


  0%|          | 0/471 [00:00<?, ?it/s]

file saved : Data/Raw files/H2/Indoor_merged.csv
Merging for H2 : Outdoor
Looking for CSV files in: data/H2/Outdoor
Reading file...


  0%|          | 0/504 [00:00<?, ?it/s]

file saved : Data/Raw files/H2/Outdoor_merged.csv
Merging for H3 : Indoor
Looking for CSV files in: data/H3/Indoor
Reading file...


  0%|          | 0/167 [00:00<?, ?it/s]

file saved : Data/Raw files/H3/Indoor_merged.csv
Merging for H3 : Outdoor
Looking for CSV files in: data/H3/Outdoor
Reading file...


  0%|          | 0/163 [00:00<?, ?it/s]

file saved : Data/Raw files/H3/Outdoor_merged.csv
Merging for H4 : Indoor
Looking for CSV files in: data/H4/Indoor
Reading file...


  0%|          | 0/423 [00:00<?, ?it/s]

file saved : Data/Raw files/H4/Indoor_merged.csv
Merging for H4 : Outdoor
Looking for CSV files in: data/H4/Outdoor
Reading file...


  0%|          | 0/471 [00:00<?, ?it/s]

file saved : Data/Raw files/H4/Outdoor_merged.csv
Merging for H5 : Indoor
Looking for CSV files in: data/H5/Indoor
Reading file...


  0%|          | 0/319 [00:00<?, ?it/s]

file saved : Data/Raw files/H5/Indoor_merged.csv
Merging for H5 : Outdoor
Looking for CSV files in: data/H5/Outdoor
Reading file...


  0%|          | 0/274 [00:00<?, ?it/s]

file saved : Data/Raw files/H5/Outdoor_merged.csv
Merging for H6 : Indoor
Looking for CSV files in: data/H6/Indoor
Reading file...


  0%|          | 0/478 [00:00<?, ?it/s]

file saved : Data/Raw files/H6/Indoor_merged.csv
Merging for H6 : Outdoor
Looking for CSV files in: data/H6/Outdoor
Reading file...


  0%|          | 0/422 [00:00<?, ?it/s]

file saved : Data/Raw files/H6/Outdoor_merged.csv
