In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/airfinal-csv/Air3final.csv
/kaggle/input/airpolllution-csv/AirPollution.csv


In [5]:
pip install networkx torch-geometric

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
import networkx as nx

# Load the dataset
df = pd.read_csv('/kaggle/input/airfinal-csv/Air3final.csv')

# Function to classify pollutant
def classify_pollutant(value, pollutant):
    if pollutant == 'NO2':
        return value >= 180
    elif pollutant == 'O3':
        return value >= 168
    elif pollutant == 'PM2.5':
        return value >= 90
    elif pollutant == 'PM10':
        return value >= 200
    elif pollutant == 'SO2':
        return value >= 380
    elif pollutant == 'CO':
        return value >= 10
    else:
        return False

# Columns of pollutants
pollutant_columns = ['PM2.5', 'PM10', 'NO2', 'O3', 'SO2', 'CO', 'NO', 'NOx', 'Benzene', 'Toluene', 'Xylene']

# Convert Datetime column to datetime objects
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Filter rows based on pollutants exceeding threshold
for pollutant in pollutant_columns:
    df[pollutant + '_exceeds'] = df.apply(lambda x: classify_pollutant(x[pollutant], pollutant), axis=1)

# Extract day for grouping
df['Day'] = df['Datetime'].dt.to_period('D')

# Verify the classification logic
print("Sample data with exceedance columns:")
print(df.head(10)[['City', 'Datetime', 'Day'] + [pollutant + '_exceeds' for pollutant in pollutant_columns]])

# Create a graph
G = nx.Graph()

# Add nodes and edges to the graph
for _, row in df.iterrows():
    city_node = row['City']
    day_node = row['Day'].strftime('%Y-%m-%d')
    combined_node = f"{city_node}_{day_node}"
    
    if not G.has_node(city_node):
        G.add_node(city_node, type='city')
    
    if not G.has_node(combined_node):
        G.add_node(combined_node, type='city_day')
    
    # Connect city node to city_date node
    G.add_edge(city_node, combined_node, type='city_to_date', weight=0)
    
    for pollutant in pollutant_columns:
        if row[pollutant + '_exceeds']:
            if not G.has_node(pollutant):
                G.add_node(pollutant, type='pollutant')
            G.add_edge(combined_node, pollutant, type='date_to_pollutant', weight=row[pollutant])  # Set weight to pollutant value

# Ensure all edges have the same attributes
for u, v, data in G.edges(data=True):
    if 'type' not in data:
        data['type'] = 'unknown'
    if 'weight' not in data:
        data['weight'] = 0

# Convert to PyTorch Geometric Data
data = from_networkx(G)

# Node features (dummy example, replace with actual features)
num_nodes = G.number_of_nodes()
num_node_features = len(pollutant_columns)
data.x = torch.rand(num_nodes, num_node_features)  # Dummy features for each node

# Get the list of nodes
node_list = list(G.nodes)

# Target variable: number of exceedances
exceed_counts = df.groupby(['Day', 'City'])[list(map(lambda x: x + '_exceeds', pollutant_columns))].sum().sum(axis=1).reset_index()
exceed_counts['combined_node'] = exceed_counts['City'] + '_' + exceed_counts['Day'].astype(str)

# Map combined nodes to indices in data
node_indices = {node: idx for idx, node in enumerate(node_list)}
exceed_counts['node_idx'] = exceed_counts['combined_node'].map(node_indices)

# Filter valid node indices
exceed_counts = exceed_counts.dropna(subset=['node_idx'])

# Create target tensor
target = torch.zeros(num_nodes, dtype=torch.long)
target[exceed_counts['node_idx'].astype(int)] = torch.tensor(exceed_counts[0].values, dtype=torch.long)
data.y = target

# Verify nodes and edges
print("Sample nodes and their attributes:")
print(list(G.nodes(data=True))[:10])

print("Sample edges and their attributes:")
print(list(G.edges(data=True))[:10])

# Verify target tensor
print("Sample target tensor values:")
print(target[:10])

from torch_geometric.nn import GCNConv
import torch.nn as nn

class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Hyperparameters
in_channels = num_node_features
hidden_channels = 16
out_channels = target.max().item() + 1  # Number of classes

# Model, optimizer, and loss
model = GCN(in_channels, hidden_channels, out_channels)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Training loop
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

# Train the model
for epoch in range(200):  # Example number of epochs
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss}')

# Evaluation
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    correct = (pred == data.y).sum()
    acc = int(correct) / int(data.y.size(0))
    print(f'Accuracy: {acc:.4f}')

# Prepare data for prediction
x = data.x
edge_index = data.edge_index

# Predict with the trained model
model.eval()
with torch.no_grad():
    out = model(x, edge_index)
    pred = out.argmax(dim=1)

# Verify model output before argmax
print("Sample model output (logits):")
print(out[:10])

# Verify predictions
print("Sample predictions after argmax:")
print(pred[:10])

# Convert predictions to a DataFrame for easier processing
results_df = pd.DataFrame({
    'Node': node_list,
    'Prediction': pred.numpy()
})

# Filter nodes where Prediction is non-zero (indicating exceedance)
exceed_nodes = results_df[results_df['Prediction'] != 0]['Node']

# Debugging: Print the number of nodes with exceedances
print(f"Number of nodes with exceedances: {len(exceed_nodes)}")

# Print results for each exceeding node
for node in exceed_nodes:
    if '_' in node:  # Ensure the node is a combined city_date node
        city, day = node.split('_')
        pollutants_exceeded = []
        for pollutant in pollutant_columns:
            day_period = pd.Period(day, freq='D')  # Convert day string back to period
            if df[(df['City'] == city) & (df['Day'] == day_period)][pollutant + '_exceeds'].any():
                pollutants_exceeded.append(pollutant)
        
        if pollutants_exceeded:
            print(f"Day: {day}, City: {city}, Pollutants Exceeded: {', '.join(pollutants_exceeded)}")
            

Sample data with exceedance columns:
        City            Datetime         Day  PM2.5_exceeds  PM10_exceeds  \
0  Ahmedabad 2015-01-01 01:00:00  2015-01-01          False         False   
1  Ahmedabad 2015-01-01 02:00:00  2015-01-01          False         False   
2  Ahmedabad 2015-01-01 03:00:00  2015-01-01          False         False   
3  Ahmedabad 2015-01-01 04:00:00  2015-01-01          False         False   
4  Ahmedabad 2015-01-01 05:00:00  2015-01-01          False         False   
5  Ahmedabad 2015-01-01 06:00:00  2015-01-01          False         False   
6  Ahmedabad 2015-01-01 07:00:00  2015-01-01          False         False   
7  Ahmedabad 2015-01-01 08:00:00  2015-01-01          False         False   
8  Ahmedabad 2015-01-01 09:00:00  2015-01-01          False         False   
9  Ahmedabad 2015-01-01 10:00:00  2015-01-01          False         False   

   NO2_exceeds  O3_exceeds  SO2_exceeds  CO_exceeds  NO_exceeds  NOx_exceeds  \
0        False       False        F

In [7]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
import networkx as nx
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score
from math import sqrt

# Load the dataset
df = pd.read_csv('/kaggle/input/airfinal-csv/Air3final.csv')

# Function to classify pollutant
def classify_pollutant(value, pollutant):
    if pollutant == 'NO2':
        return value >= 180
    elif pollutant == 'O3':
        return value >= 168
    elif pollutant == 'PM2.5':
        return value >= 90
    elif pollutant == 'PM10':
        return value >= 200
    elif pollutant == 'SO2':
        return value >= 380
    elif pollutant == 'CO':
        return value >= 10
    else:
        return False

# Columns of pollutants
pollutant_columns = ['PM2.5', 'PM10', 'NO2', 'O3', 'SO2', 'CO', 'NO', 'NOx', 'Benzene', 'Toluene', 'Xylene']

# Convert Datetime column to datetime objects
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Filter rows based on pollutants exceeding threshold
for pollutant in pollutant_columns:
    df[pollutant + '_exceeds'] = df.apply(lambda x: classify_pollutant(x[pollutant], pollutant), axis=1)

# Extract day for grouping
df['Day'] = df['Datetime'].dt.to_period('D')

# Verify the classification logic
print("Sample data with exceedance columns:")
print(df.head(10)[['City', 'Datetime', 'Day'] + [pollutant + '_exceeds' for pollutant in pollutant_columns]])

# Create a graph
G = nx.Graph()

# Add nodes and edges to the graph
for _, row in df.iterrows():
    city_node = row['City']
    day_node = row['Day'].strftime('%Y-%m-%d')
    combined_node = f"{city_node}_{day_node}"
    
    if not G.has_node(city_node):
        G.add_node(city_node, type='city')
    
    if not G.has_node(combined_node):
        G.add_node(combined_node, type='city_day')
    
    # Connect city node to city_date node
    G.add_edge(city_node, combined_node, type='city_to_date', weight=0)
    
    for pollutant in pollutant_columns:
        if row[pollutant + '_exceeds']:
            if not G.has_node(pollutant):
                G.add_node(pollutant, type='pollutant')
            G.add_edge(combined_node, pollutant, type='date_to_pollutant', weight=row[pollutant])  # Set weight to pollutant value

# Ensure all edges have the same attributes
for u, v, data in G.edges(data=True):
    if 'type' not in data:
        data['type'] = 'unknown'
    if 'weight' not in data:
        data['weight'] = 0

# Convert to PyTorch Geometric Data
data = from_networkx(G)

# Node features (dummy example, replace with actual features)
num_nodes = G.number_of_nodes()
num_node_features = len(pollutant_columns)
data.x = torch.rand(num_nodes, num_node_features)  # Dummy features for each node

# Get the list of nodes
node_list = list(G.nodes)

# Target variable: number of exceedances
exceed_counts = df.groupby(['Day', 'City'])[list(map(lambda x: x + '_exceeds', pollutant_columns))].sum().sum(axis=1).reset_index()
exceed_counts['combined_node'] = exceed_counts['City'] + '_' + exceed_counts['Day'].astype(str)

# Map combined nodes to indices in data
node_indices = {node: idx for idx, node in enumerate(node_list)}
exceed_counts['node_idx'] = exceed_counts['combined_node'].map(node_indices)

# Filter valid node indices
exceed_counts = exceed_counts.dropna(subset=['node_idx'])

# Create target tensor
target = torch.zeros(num_nodes, dtype=torch.long)
target[exceed_counts['node_idx'].astype(int)] = torch.tensor(exceed_counts[0].values, dtype=torch.long)
data.y = target

# Verify nodes and edges
print("Sample nodes and their attributes:")
print(list(G.nodes(data=True))[:10])

print("Sample edges and their attributes:")
print(list(G.edges(data=True))[:10])

# Verify target tensor
print("Sample target tensor values:")
print(target[:10])

from torch_geometric.nn import GCNConv
import torch.nn as nn

from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import global_max_pool

class EnhancedGCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_heads=4, lstm_hidden_size=16, lstm_num_layers=2, dropout=0.5):
        super(EnhancedGCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = nn.BatchNorm1d(hidden_channels)
        self.conv3 = GCNConv(hidden_channels, out_channels)
        self.bn3 = nn.BatchNorm1d(out_channels)

        self.attention = nn.MultiheadAttention(embed_dim=hidden_channels, num_heads=num_heads, batch_first=True)
        self.lstm = nn.LSTM(hidden_channels, lstm_hidden_size, num_layers=lstm_num_layers, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_size, out_channels)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # Apply GCN layers with batch normalization and dropout
        x = F.relu(self.bn1(self.conv1(x, edge_index)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x, edge_index)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x, edge_index)))
        x = self.dropout(x)

        # Prepare data for attention layer
        x = x.unsqueeze(1)  # Add sequence dimension for attention

        # Apply multihead attention
        x, _ = self.attention(x, x, x)

        # Reshape for LSTM input
        x = x.squeeze(1)  # Remove sequence dimension
        x = x.permute(0, 2, 1)  # LSTM expects (batch, channels, seq_len)

        # Apply LSTM layers
        _, (x, _) = self.lstm(x)
        x = x[-1]  # Take the output of the last LSTM layer

        # Final linear layer
        x = self.fc(x)

        return F.log_softmax(x, dim=1)

# Hyperparameters
in_channels = num_node_features
hidden_channels = 32
out_channels = target.max().item() + 1  # Number of classes

# Model, optimizer, and loss
model = GCN(in_channels, hidden_channels, out_channels)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Training loop
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

# Train the model
for epoch in range(100):  # Example number of epochs
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss}')

# Evaluation
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)

# Accuracy
acc = accuracy_score(data.y.cpu(), pred.cpu())
print(f'Accuracy: {acc:.4f}')

# RMSE
rmse = sqrt(mean_squared_error(data.y.cpu(), pred.cpu()))
print(f'RMSE: {rmse:.4f}')

# R-squared
r2 = r2_score(data.y.cpu(), pred.cpu())
print(f'R-squared: {r2:.4f}')

# Precision, Recall, F1 Score
precision = precision_score(data.y.cpu(), pred.cpu(), average='weighted')
recall = recall_score(data.y.cpu(), pred.cpu(), average='weighted')
f1 = f1_score(data.y.cpu(), pred.cpu(), average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Prepare data for prediction
x = data.x
edge_index = data.edge_index

# Predict with the trained model
model.eval()
with torch.no_grad():
    out = model(x, edge_index)
    pred = out.argmax(dim=1)

# Verify model output before argmax
print("Sample model output (logits):")
print(out[:10])

# Verify predictions
print("Sample predictions after argmax:")
print(pred[:10])

# Convert predictions to a DataFrame for easier processing
results_df = pd.DataFrame({
    'Node': node_list,
    'Prediction': pred.numpy()
})

# Filter nodes where Prediction is non-zero (indicating exceedance)
exceed_nodes = results_df[results_df['Prediction'] != 0]['Node']
exceedances_data = []
# Debugging: Print the number of nodes with exceedances
print(f"Number of nodes with exceedances: {len(exceed_nodes)}")

# Identify the maximum pollutants exceeded per day and the most exceeded pollutant across cities
exceedance_info = df[df[[pollutant + '_exceeds' for pollutant in pollutant_columns]].any(axis=1)]
exceedance_info_grouped = exceedance_info.groupby('Day')

for day, group in exceedance_info_grouped:
    pollutant_count = group[[pollutant + '_exceeds' for pollutant in pollutant_columns]].sum()
    max_pollutant = pollutant_count.idxmax().replace('_exceeds', '')
    max_pollutant_count = pollutant_count.max()

    print(f"Day: {day}, Pollutant exceeded the most across all cities: {max_pollutant}, Count: {max_pollutant_count}")
    exceedances_data.append({
                'Day': day,
        'Pollutants Exceeded': max_pollutant
    })

# Create DataFrame from the collected data
exceedances_df = pd.DataFrame(exceedances_data)

# Save DataFrame to CSV
exceedances_df.to_csv('exceedances_dataset.csv', index=False)

# Debugging: Print the number of nodes with exceedances
print(f"Number of nodes with exceedances: {len(exceedances_df)}")

# Print results for each exceeding node
print("Exceedances Dataset:")
print(exceedances_df.head())

Sample data with exceedance columns:
        City            Datetime         Day  PM2.5_exceeds  PM10_exceeds  \
0  Ahmedabad 2015-01-01 01:00:00  2015-01-01          False         False   
1  Ahmedabad 2015-01-01 02:00:00  2015-01-01          False         False   
2  Ahmedabad 2015-01-01 03:00:00  2015-01-01          False         False   
3  Ahmedabad 2015-01-01 04:00:00  2015-01-01          False         False   
4  Ahmedabad 2015-01-01 05:00:00  2015-01-01          False         False   
5  Ahmedabad 2015-01-01 06:00:00  2015-01-01          False         False   
6  Ahmedabad 2015-01-01 07:00:00  2015-01-01          False         False   
7  Ahmedabad 2015-01-01 08:00:00  2015-01-01          False         False   
8  Ahmedabad 2015-01-01 09:00:00  2015-01-01          False         False   
9  Ahmedabad 2015-01-01 10:00:00  2015-01-01          False         False   

   NO2_exceeds  O3_exceeds  SO2_exceeds  CO_exceeds  NO_exceeds  NOx_exceeds  \
0        False       False        F

  _warn_prf(average, modifier, msg_start, len(result))


Day: 2015-07-29, Pollutant exceeded the most across all cities: PM2.5, Count: 26
Day: 2015-07-30, Pollutant exceeded the most across all cities: PM2.5, Count: 25
Day: 2015-07-31, Pollutant exceeded the most across all cities: PM2.5, Count: 25
Day: 2015-08-01, Pollutant exceeded the most across all cities: PM2.5, Count: 30
Day: 2015-08-02, Pollutant exceeded the most across all cities: PM2.5, Count: 27
Day: 2015-08-03, Pollutant exceeded the most across all cities: PM2.5, Count: 24
Day: 2015-08-04, Pollutant exceeded the most across all cities: PM2.5, Count: 28
Day: 2015-08-05, Pollutant exceeded the most across all cities: PM2.5, Count: 24
Day: 2015-08-06, Pollutant exceeded the most across all cities: PM2.5, Count: 26
Day: 2015-08-07, Pollutant exceeded the most across all cities: PM2.5, Count: 28
Day: 2015-08-08, Pollutant exceeded the most across all cities: PM2.5, Count: 27
Day: 2015-08-09, Pollutant exceeded the most across all cities: PM2.5, Count: 27
Day: 2015-08-10, Pollutant e

In [8]:
from IPython.display import FileLink

# Save DataFrame to CSV
exceedances_df.to_csv('exceedances_dataset.csv', index=False)

# Create a link to download the file
display(FileLink('exceedances_dataset.csv'))

# Verify that the file has been saved correctly
print("Exceedances dataset has been saved to 'exceedances_dataset.csv'.")


Exceedances dataset has been saved to 'exceedances_dataset.csv'.


In [9]:
print(df)

             City            Datetime      PM2.5        PM10     NO    NO2  \
0       Ahmedabad 2015-01-01 01:00:00  63.398114  108.334538   1.00  40.01   
1       Ahmedabad 2015-01-01 02:00:00  63.398114  108.334538   0.02  27.75   
2       Ahmedabad 2015-01-01 03:00:00  63.398114  108.334538   0.08  19.32   
3       Ahmedabad 2015-01-01 04:00:00  63.398114  108.334538   0.30  16.45   
4       Ahmedabad 2015-01-01 05:00:00  63.398114  108.334538   0.12  14.90   
...           ...                 ...        ...         ...    ...    ...   
292772      Patna 2020-06-30 20:00:00  23.500000   65.980000  43.56  41.44   
292773      Patna 2020-06-30 21:00:00  28.420000   70.990000  56.87  37.94   
292774      Patna 2020-06-30 22:00:00  33.940000   70.170000  46.80  33.86   
292775      Patna 2020-06-30 23:00:00  38.620000   64.560000  41.00  27.34   
292776      Patna 2020-07-01 00:00:00  35.420000   57.820000  44.50  31.15   

          NOx   NH3    CO     SO2  ...  NO2_exceeds  O3_exceeds