In [1]:
import os

from glob import glob
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
# Folder that holds the raw per-city text files.
base_location = "../data/raw"

# Gather every .txt file in that folder and order the paths alphabetically
# so the cities are always processed in the same sequence.
files = glob(f"{base_location}/*.txt")
files.sort()

## Creating the output directory for the cleaned CSV files

In [3]:
# Create the output directory for the cleaned files. exist_ok=True means
# re-running this cell does not raise if the directory already exists.
os.makedirs("../data/cleaned/", exist_ok=True)

## Processing the raw txt files

In [4]:
def _header_value(line, legacy_tokens=slice(-1, None)):
    """Extract the value from one ``% Key: Value`` style header line.

    Everything after the first colon is returned (whitespace-trimmed), so
    multi-word values such as "New York" are kept whole. If the line has no
    colon, the space-separated tokens selected by ``legacy_tokens`` are joined
    instead (by default only the last token).
    """
    text = line.strip("% ")
    _key, colon, value = text.partition(":")
    if colon:
        return value.strip()
    return " ".join(text.split(" ")[legacy_tokens])


def read_metadata(line_gen):
    """Read the metadata header of a raw file from an iterator of lines.

    Consumes exactly eight lines from ``line_gen`` in this fixed order:
    country, (skipped), city, state, (skipped), population, latitude,
    longitude. The iterator is left positioned on the line after longitude.

    Parameters
    ----------
    line_gen : iterator of str
        Lines of the raw file, starting at the first header line.

    Returns
    -------
    list
        ``[country, city, state, population, latitude, longitude]`` with
        population as ``int`` and latitude/longitude as ``float``.

    Raises
    ------
    StopIteration
        If the iterator runs out before all header lines are read.
    ValueError
        If population, latitude or longitude cannot be parsed as a number.
    """
    country = _header_value(next(line_gen))

    # The line between country and city is not used.
    next(line_gen)
    city = _header_value(next(line_gen))

    # Without a colon, state keeps every token after the first one.
    state = _header_value(next(line_gen), legacy_tokens=slice(1, None))
    # The line between state and population is not used.
    next(line_gen)

    population = int(_header_value(next(line_gen)))
    latitude = float(_header_value(next(line_gen)))
    longitude = float(_header_value(next(line_gen)))

    return [country, city, state, population, latitude, longitude]

In [5]:
def read_rows(line_gen):
    """Read the air-quality records that follow the metadata header.

    The first two lines of ``line_gen`` are skipped. Every remaining line that
    contains at least one tab is treated as a record: its first four fields
    are read as year, month, day and hour, and the fifth as the PM2.5 value.
    Lines without a tab are ignored.

    Parameters
    ----------
    line_gen : iterator of str
        Lines of the raw file, positioned just after the metadata header.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, 2)`` with one ``[timestamp, pm2.5]`` row per
        record, where the timestamp is formatted as ``%Y-%m-%d %H:%M:%S``.
        When there are no records the shape is ``(0, 2)``, so the result can
        always be loaded into a two-column DataFrame.

    Raises
    ------
    StopIteration
        If fewer than two lines are available to skip.
    ValueError
        If a record's date/hour or PM2.5 field cannot be parsed.
    IndexError
        If a tab-separated line has fewer than five fields.
    """
    # Skip the two lines that sit between the metadata and the records.
    next(line_gen)
    next(line_gen)

    aq_rows = []

    for row in line_gen:
        fields = row.split("\t")

        # Lines without any tab are not records.
        if len(fields) <= 1:
            continue

        # Standardizing the time format
        parsed_time = datetime.strptime(
            f"{'-'.join(fields[:3])} {fields[3]}",
            "%Y-%m-%d %H"
        )
        record_time = parsed_time.strftime("%Y-%m-%d %H:%M:%S")

        # float() validates the reading before it is stored.
        pm_25 = float(fields[4])
        aq_rows.append([record_time, pm_25])

    # reshape keeps the result two-dimensional even when no record was found
    # (np.array([]) alone would have shape (0,)).
    return np.array(aq_rows).reshape(-1, 2)

In [6]:
# One metadata row per raw file, filled in by the loop below.
metadata = []

for file in files:
    # The city name is the file name without its extension.
    # str.strip(".txt") must not be used here: it removes any of the characters
    # ".", "t" and "x" from BOTH ends, so "tirupati.txt" would become "irupati".
    city = os.path.splitext(os.path.basename(file))[0]

    print(f"Dumping raw txt to CSV for {city.capitalize()}")
    with open(file) as fp:
        content = fp.read()

    # A single iterator is shared by both readers, so read_rows continues
    # exactly where read_metadata stopped.
    content_gen = iter(content.split("\n"))

    # Reading metadata
    metadata.append(read_metadata(content_gen))

    # Dumping air quality data to CSV
    city_aq_df = pd.DataFrame(read_rows(content_gen), columns=["timestamp_utc", "pm2.5"])
    city_aq_df.to_csv(f"../data/cleaned/{city}_aq.csv", index=False)

    print("Dumped successfully!!")
    print("-" * 50)

Dumping raw txt to CSV for Aligarh
Dumped successfully!!
--------------------------------------------------
Dumping raw txt to CSV for Bengaluru
Dumped successfully!!
--------------------------------------------------
Dumping raw txt to CSV for Gurgaon
Dumped successfully!!
--------------------------------------------------
Dumping raw txt to CSV for Mohali
Dumped successfully!!
--------------------------------------------------


In [7]:
# Turn the per-city metadata rows collected above into a single table,
# one row per city, and render it in the notebook.
metadata_df = pd.DataFrame(
    data=metadata,
    columns=["country", "city", "state", "population", "lat", "lon"],
)
display(metadata_df)

Unnamed: 0,country,city,state,population,lat,lon
0,India,Aligarh,Uttar Pradesh,753207,27.8815,78.0746
1,India,Bengaluru,Karnataka,5104047,12.9719,77.5937
2,India,Gurgaon,Haryana,197340,28.4601,77.0263
3,India,Mohali,Punjab,123484,30.68,76.7221


### Dumping metadata to CSV

In [8]:
# Write the metadata table next to the cleaned per-city CSVs.
# index=False leaves the DataFrame's row index out of the file.
metadata_df.to_csv("../data/cleaned/aq_metadata.csv", index=False)