In [4]:
import pandas as pd
import numpy as np
#Function created to assess the data from kaggle.

def process_data(file_path, completeness_threshold=90,
                 first_cleanse_path='AQI_Data_First_Cleanse.csv',
                 output_path='Cities_Above_90_Completeness.csv'):
    """Clean the raw air-quality CSV and keep only well-populated cities.

    The function prints the per-column missing-value counts, drops the
    'Xylene', 'Toluene' and 'Benzene' columns, keeps the cities whose share of
    non-null cells meets ``completeness_threshold``, and finally drops every
    row that lacks 'AQI' or 'AQI_Bucket'.

    Parameters
    ----------
    file_path : str
        Path of the raw CSV. It must contain the columns 'City', 'Xylene',
        'Toluene', 'Benzene', 'AQI' and 'AQI_Bucket'.
    completeness_threshold : float, default 90
        Minimum percentage of non-null cells a city needs in order to be kept.
    first_cleanse_path : str, default 'AQI_Data_First_Cleanse.csv'
        Where the frame without the three dropped columns is written.
    output_path : str, default 'Cities_Above_90_Completeness.csv'
        Where the final filtered frame is written.

    Returns
    -------
    pandas.DataFrame
        Rows of the retained cities that have both 'AQI' and 'AQI_Bucket'.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the CSV.
    """
    # Load the raw data.
    original_data = pd.read_csv(file_path)

    # Report which attributes have the most missing values.
    missing_data = original_data.isnull().sum().sort_values(ascending=False)
    print("Attributes with the most missing values:")
    print(missing_data)

    # These three compounds are not used in the AQI calculation.
    data_without_xtb = original_data.drop(columns=['Xylene', 'Toluene', 'Benzene'])
    data_without_xtb.to_csv(first_cleanse_path, index=False)

    # Completeness per city = non-null cells / all cells of that city's rows.
    # Every column of the cleansed frame (including 'City') counts towards the
    # total. Computing it from per-row counts avoids groupby.apply touching
    # the grouping column, which newer pandas versions deprecate.
    filled_per_row = data_without_xtb.notnull().sum(axis=1)
    filled_per_city = filled_per_row.groupby(data_without_xtb['City']).sum()
    cells_per_city = data_without_xtb.groupby('City').size() * data_without_xtb.shape[1]
    city_completeness_percent = filled_per_city / cells_per_city * 100

    # Cities that reach the requested completeness.
    meets_threshold = city_completeness_percent >= completeness_threshold
    top_cities = city_completeness_percent[meets_threshold].index

    # Show the retained cities so the threshold can be reviewed.
    print(f"\nCities with {completeness_threshold}-100% completeness:")
    print(city_completeness_percent[meets_threshold])

    # Keep only the rows of the retained cities.
    data_top_cities = data_without_xtb[data_without_xtb['City'].isin(top_cities)]

    # Rows without a target value cannot be used for training.
    data_top_cities = data_top_cities.dropna(subset=['AQI', 'AQI_Bucket'])

    # Persist the final dataset (the one taken into WEKA).
    data_top_cities.to_csv(output_path, index=False)

    # Hand the cleaned frame back for further processing (e.g. outlier checks).
    return data_top_cities


# Path of the raw input CSV, relative to the notebook's working directory.
file_path = 'city_day.csv'  # replace with your actual file path

# Run the cleaning step. The returned DataFrame is kept as `cleaned_data` and
# is reused by the normality and outlier cells below; process_data also writes
# its intermediate and final CSV files as a side effect.
cleaned_data = process_data(file_path)


Attributes with the most missing values:
Xylene        18109
PM10          11140
NH3           10328
Toluene        8041
Benzene        5623
AQI            4681
AQI_Bucket     4681
PM2.5          4598
NOx            4185
O3             4022
SO2            3854
NO2            3585
NO             3582
CO             2059
City              0
Date              0
dtype: int64

Cities with 90-100% completeness:
City
Aizawl                98.910824
Amaravati             93.763650
Amritsar              92.389592
Bengaluru             95.849447
Bhopal                97.258451
Chandigarh            98.810729
Coimbatore            94.001594
Delhi                 98.828349
Guwahati              99.662887
Hyderabad             95.390751
Jaipur                98.225383
Kochi                 98.480532
Kolkata               96.276696
Thiruvananthapuram    96.755672
dtype: float64


In [10]:
from scipy.stats import shapiro
import numpy as np

#function created to assess whether the attributes are normally distributed. 

def test_normality(data):
    # Results will be stored in a dictionary
    results = {}
    
    # Select only numeric columns
    numeric_columns = data.select_dtypes(include=[np.number])
    
    # Perform Shapiro-Wilk test for each numeric column
    for col in numeric_columns.columns:
        _, p_value = shapiro(data[col].dropna())  # dropna to remove missing values
        results[col] = p_value
    
    # Print the p-values
    print("\nP-values from Shapiro-Wilk test:")
    for col, p_value in results.items():
        print(f"{col}: {p_value}")

# NOTE: this path is assigned but not passed to the call below. The test runs
# on the in-memory `cleaned_data` DataFrame, so editing the path has no effect
# on this cell's result.
file_path = 'Cities_Above_90_Completeness.csv'  #  not read in this cell


# Run the Shapiro-Wilk test on the numeric columns of `cleaned_data`; the
# function prints the p-values itself.
test_normality(cleaned_data)


P-values from Shapiro-Wilk test:
PM2.5: 0.0
PM10: 0.0
NO: 0.0
NO2: 0.0
NOx: 0.0
NH3: 0.0
CO: 0.0
SO2: 0.0
O3: 0.0
AQI: 0.0


In [9]:
import numpy as np
#function to calculate outliers based on IQR

def calculate_outliers(df):
    """Count, per numeric column, the values outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored.

    Returns
    -------
    pandas.Series
        Number of outliers for each numeric column, indexed by column name.
        Missing values never count as outliers.
    """
    numeric = df.select_dtypes(include=[np.number])

    # Quartiles and the fence margin for every numeric column.
    first_quartile = numeric.quantile(0.25)
    third_quartile = numeric.quantile(0.75)
    margin = 1.5 * (third_quartile - first_quartile)

    # A value is an outlier when it falls below the lower fence or above the
    # upper fence; comparisons against NaN are False.
    too_low = numeric.lt(first_quartile - margin)
    too_high = numeric.gt(third_quartile + margin)

    return (too_low | too_high).sum()

# Report how many IQR outliers each numeric column of `cleaned_data` contains.
# This only counts them; no rows are removed here.
print("\nNumber of outliers for each attribute:")
print(calculate_outliers(cleaned_data))


Number of outliers for each attribute:
PM2.5     912
PM10      765
NO       1361
NO2       352
NOx       934
NH3       410
CO        760
SO2       437
O3        312
AQI      1164
dtype: int64


In [12]:
# Remove the outliers and write the result to a new CSV file

def remove_outliers(df):
    """Return the rows of ``df`` that contain no IQR outlier in any numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. Non-numeric columns are carried
        through unchanged and play no part in the outlier decision.

    Returns
    -------
    pandas.DataFrame
        The rows whose numeric values all lie within the 1.5 * IQR fences.
        Missing values are not treated as outliers, so such rows are kept.
    """
    numeric = df.select_dtypes(include=[np.number])

    # Quartiles and the fence margin for every numeric column.
    first_quartile, third_quartile = numeric.quantile(0.25), numeric.quantile(0.75)
    margin = 1.5 * (third_quartile - first_quartile)

    # Flag each cell lying below the lower fence or above the upper fence.
    flagged = numeric.lt(first_quartile - margin) | numeric.gt(third_quartile + margin)

    # Keep only the rows without a single flagged cell.
    rows_to_keep = ~flagged.any(axis=1)
    return df[rows_to_keep]

# Drop every row of `cleaned_data` that holds an IQR outlier in any numeric
# column, then save the result to 'data_outliers_removed.csv' (without the
# index) for the imputation/normalisation step.
data_without_outliers = remove_outliers(cleaned_data)
data_without_outliers.to_csv('data_outliers_removed.csv', index=False)

"""The section below handles null values and normalises the numeric columns other than the AQI index. Missing values are imputed with the column mean, and the normalisation uses a min-max scaler to rescale the values to the range 0-1."""



In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

#It is important to check that the correct files are being used to normalise and impute in this function. 
def impute_and_normalize(file_path, output_file='imputed_normalized_data_nooutliers.csv'):
    """Mean-impute missing values and min-max scale the float columns except 'AQI'.

    Parameters
    ----------
    file_path : str
        CSV file to load.
    output_file : str, default 'imputed_normalized_data_nooutliers.csv'
        Destination CSV; change it when processing a different dataset so an
        earlier result is not overwritten.

    Returns
    -------
    None
        The processed frame is written to ``output_file`` and a confirmation
        message is printed.
    """
    # Load the dataset.
    data = pd.read_csv(file_path)

    # Impute missing values with the column mean. numeric_only=True restricts
    # the mean to numeric columns; without it, text columns make pandas 2.x
    # raise a TypeError (and older versions emit a FutureWarning).
    data = data.fillna(data.mean(numeric_only=True))

    # Float columns are scaled; 'AQI' is deliberately left on its own scale.
    # Filtering (instead of list.remove) also works when 'AQI' is not a float
    # column.
    cols_to_normalize = [
        col for col in data.select_dtypes(include=[float]).columns if col != 'AQI'
    ]

    # Rescale each selected column to the range 0-1. Skipped when nothing was
    # selected, because the scaler cannot be fitted on zero columns.
    if cols_to_normalize:
        scaler = MinMaxScaler()
        data[cols_to_normalize] = scaler.fit_transform(data[cols_to_normalize])

    # Save the imputed and normalized dataset.
    data.to_csv(output_file, index=False)

    print(f"Imputed and normalized data saved to {output_file}") #Check this before completing

# Replace the file name as appropriate to process the intended dataset (with or
# without the outliers). This call reads the outlier-free CSV.
impute_and_normalize('data_outliers_removed.csv')


Imputed and normalized data saved to imputed_normalized_data_nooutliers.csv


  data.fillna(data.mean(), inplace=True)


In [3]:
import pandas as pd
import numpy as np

#Function to check the overall completeness of any csv files that is selected.
def calculate_completeness(file_path):
    """Print the percentage of non-missing cells in a CSV file.

    Parameters
    ----------
    file_path : str
        CSV file to load.

    Returns
    -------
    None
        The result is printed as ``Completeness of the dataset: NN.NN%``.
        A file with no data cells is reported as ``nan``.
    """
    # Load the data.
    data = pd.read_csv(file_path)

    # Total number of cells (rows * columns). DataFrame.size replaces
    # np.product, which is deprecated and no longer exists in NumPy 2.0.
    total_cells = data.size

    # Total number of missing values across all columns.
    total_missing = int(data.isnull().sum().sum())

    # Share of populated cells; guard the zero-cell case against dividing by zero.
    if total_cells == 0:
        completeness = float("nan")
    else:
        completeness = ((total_cells - total_missing) / total_cells) * 100

    # Print the result.
    print(f"Completeness of the dataset: {completeness:.2f}%")

# The file to check is passed directly to the call below; no separate
# file_path variable is used in this cell.

# Change the file name below to check the completeness of a different CSV file.
calculate_completeness('Cities_Above_90_Completeness.csv')

Completeness of the dataset: 98.34%
