In [1]:
#Code to cleanse India AQI data
import pandas as pd

def process_data(file_path): #Function to cleanse data from the csv file. 
    # Load the original data
    original_data = pd.read_csv(file_path)

    # Identify attributes with the most missing values. 
    missing_data = original_data.isnull().sum().sort_values(ascending=False)
    print("Attributes with the most missing values:")
    print(missing_data)

    # Remove "Xylene", "Toluene", and "Benzene" and save to a new file. These compounds are not used in the AQI calculation
    data_without_xtb = original_data.drop(columns=['Xylene', 'Toluene', 'Benzene'])
    data_without_xtb.to_csv('AQI_Data_First_Cleanse.csv', index=False)

    # Calculate completeness of dataset for each city. 
    city_completeness = data_without_xtb.groupby('City').apply(lambda group: group.notnull().sum().sum() / group.size)
    city_completeness_percent = city_completeness * 100

    # Identify cities with 90-100% completeness - Select cities that are >90% complete
    top_cities = city_completeness_percent[city_completeness_percent >= 90].index

    # Print the cities with their completeness percentages - show the results to make decsion 
    print("\nCities with 90-100% completeness:")
    print(city_completeness_percent[city_completeness_percent >= 90])

    # Create a dataset of only the cities with >=90% completeness - only >90% included
    data_top_cities = data_without_xtb[data_without_xtb['City'].isin(top_cities)]

    # Remove rows with missing 'AQI' and 'AQI_Bucket'- remove any AQI or AQI buckets for training model
    data_top_cities = data_top_cities.dropna(subset=['AQI', 'AQI_Bucket'])

    # Save the final dataset - this is the dataset to take into WEKA
    # consider how I am going to remove outliers from the dataset
    data_top_cities.to_csv('Cities_Above_90_Completeness.csv', index=False)

# Location of the raw city-level AQI CSV; point this at your own copy of the file
file_path = 'city_day.csv'

# Execute the cleansing pipeline on that file (prints summaries and writes the cleaned CSVs)
process_data(file_path=file_path)


Attributes with the most missing values:
Xylene        18109
PM10          11140
NH3           10328
Toluene        8041
Benzene        5623
AQI            4681
AQI_Bucket     4681
PM2.5          4598
NOx            4185
O3             4022
SO2            3854
NO2            3585
NO             3582
CO             2059
City              0
Date              0
dtype: int64

Cities with 90-100% completeness:
City
Aizawl                98.910824
Amaravati             93.763650
Amritsar              92.389592
Bengaluru             95.849447
Bhopal                97.258451
Chandigarh            98.810729
Coimbatore            94.001594
Delhi                 98.828349
Guwahati              99.662887
Hyderabad             95.390751
Jaipur                98.225383
Kochi                 98.480532
Kolkata               96.276696
Thiruvananthapuram    96.755672
dtype: float64
