<a href="https://colab.research.google.com/github/st20310132/air-pollution-analysis/blob/main/CMP7005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
import os

# Plotting and display defaults for the rest of the notebook
plt.style.use('seaborn-v0_8-whitegrid')  # versioned matplotlib name for the seaborn whitegrid style
sns.set_style("whitegrid")
pd.options.display.max_columns = None  # None removes the cap on columns shown for wide frames

# Step 1: Mount Google Drive so files under /content/drive become readable
drive.mount('/content/drive')

# Step 2: Folder that holds the per-station CSV files
# Replace with your actual path to the folder containing the CSV files
data_path = '/content/drive/MyDrive/Colab_Notebooks'

# Step 3: Select and read datasets from different site types
# Maps a site-type label to the station name that must appear in the CSV file name.
sites = {
    'urban': 'Dongsi',
    'suburban': 'Shunyi',
    'rural': 'Dingling',
    'industrial': 'Gucheng'
}

# List the folder once (instead of once per site) and sort the names:
# os.listdir returns entries in arbitrary order, so without sorting the
# "first matching file" could differ between runs when several files match.
csv_files = sorted(f for f in os.listdir(data_path) if f.endswith('.csv'))

# Loaded frames keyed by site type; a site type with no matching file is left out.
dfs = {}
for area_type, site in sites.items():
    # Candidate files are those whose name contains the station name
    matching_files = [f for f in csv_files if site in f]

    if matching_files:
        file_name = matching_files[0]  # Lexicographically first matching file
        file_path = os.path.join(data_path, file_name)
        dfs[area_type] = pd.read_csv(file_path)
        print(f"Loaded {area_type} site: {site} - Shape: {dfs[area_type].shape}")
    else:
        print(f"No file found for {area_type} site: {site}")

# Step 4: Tag each loaded frame with the site type and station it came from
# (the frames stored in dfs are modified in place)
for area_type in dfs:
    df = dfs[area_type]
    df['site_type'] = area_type
    df['site_name'] = sites[area_type]

# Step 5: Merge all datasets
if not dfs:
    # Nothing was loaded in Step 3, so there is nothing to merge or save
    print("No datasets were loaded. Please check file names and try again")
else:
    # Stack the per-site frames into one table with a fresh 0..n-1 index
    merged_df = pd.concat(dfs.values(), ignore_index=True)
    print("\nMerged Dataset Shape:", merged_df.shape)

    # Preview of the first rows
    print("\nFirst 5 rows of the merged dataset:")
    preview = merged_df.head()
    print(preview)

    # Summary of the merged frame (info() prints directly, so no print() wrapper)
    print("\nDataset Information:")
    merged_df.info()

    # Number of missing entries in each column
    print("\nMissing Values Count:")
    missing_counts = merged_df.isna().sum()
    print(missing_counts)

    # Write the merged table next to the source files for later reuse
    merged_file_path = os.path.join(data_path, 'merged_air_quality_data.csv')
    merged_df.to_csv(merged_file_path, index=False)
    print(f"\nMerged dataset saved to: {merged_file_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded urban site: Dongsi - Shape: (35064, 18)
Loaded suburban site: Shunyi - Shape: (35064, 18)
Loaded rural site: Dingling - Shape: (35064, 18)
Loaded industrial site: Gucheng - Shape: (35064, 18)

Merged Dataset Shape: (140256, 20)

First 5 rows of the merged dataset:
   No  year  month  day  hour  PM2.5  PM10  SO2   NO2     CO    O3  TEMP  \
0   1  2013      3    1     0    9.0   9.0  3.0  17.0  300.0  89.0  -0.5   
1   2  2013      3    1     1    4.0   4.0  3.0  16.0  300.0  88.0  -0.7   
2   3  2013      3    1     2    7.0   7.0  NaN  17.0  300.0  60.0  -1.2   
3   4  2013      3    1     3    3.0   3.0  5.0  18.0    NaN   NaN  -1.4   
4   5  2013      3    1     4    3.0   3.0  7.0   NaN  200.0  84.0  -1.9   

     PRES  DEWP  RAIN   wd  WSPM station site_type site_name  
0  1024.5 -21.4   0.0  NNW   5.7  Dongsi     urban    Dongsi  
1  1025.1 -22.1 