In [1]:
#Preprocessing on Kaggle DataSet

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the raw dataset and preview its first rows.
df = pd.read_csv("DataSet.csv")
print(df.head())

# Numeric AQI columns. Any missing/NULL entry in them is replaced by the mean
# of its own column.
numerical_columns = [
    "AQI Value",
    "CO AQI Value",
    "Ozone AQI Value",
    "NO2 AQI Value",
    "PM2.5 AQI Value",
]
imputer = SimpleImputer(strategy="mean")
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# Drop exact duplicate rows so repeated records do not bias later steps.
df = df.drop_duplicates()

# Outliers in the pollutant columns are filtered out using the interquartile range (IQR) method.
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    A value is treated as an outlier when it is below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of the numeric column the fences are computed from.

    Returns:
        A DataFrame holding the non-outlier rows, with their original index
        labels. Rows where ``column`` is NaN are kept, because both
        comparisons against the fences evaluate to False for NaN.
    """
    values = df[column]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread

    # Flag outliers explicitly and negate, rather than testing "inside the
    # fences", so that NaN entries are not dropped.
    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    return df[~is_outlier]

# Apply the IQR filter one numeric column at a time; each pass only sees the
# rows that survived the previous passes.
for col in numerical_columns:
    df = remove_outliers(df, col)

# The filtered frame was produced by boolean-mask indexing. Take an explicit
# copy so the column assignments below act on an independent DataFrame and
# do not raise SettingWithCopyWarning.
df = df.copy()

# Label encoding: replace each category label with an integer code (0, 1, 2, ...).
# A separate encoder is fitted per column and stored in `label_encoders`, so
# every column's label <-> code mapping stays available (e.g. for
# inverse_transform) instead of being overwritten by the next fit.
categorical_columns = ['AQI Category', 'CO AQI Category', 'Ozone AQI Category', 'NO2 AQI Category', 'PM2.5 AQI Category']
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Standardize the numeric columns with StandardScaler.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Persist the cleaned dataset (without the index column) and preview it.
df.to_csv('Cleaned_DataSet.csv', index=False)
print(df.head())

              Country              City  AQI Value AQI Category  CO AQI Value  \
0  Russian Federation        Praskoveya         51     Moderate             1   
1              Brazil  Presidente Dutra         41         Good             1   
2               Italy   Priolo Gargallo         66     Moderate             1   
3              Poland         Przasnysz         34         Good             1   
4              France          Punaauia         22         Good             0   

  CO AQI Category  Ozone AQI Value Ozone AQI Category  NO2 AQI Value  \
0            Good               36               Good              0   
1            Good                5               Good              1   
2            Good               39               Good              2   
3            Good               34               Good              0   
4            Good               22               Good              0   

  NO2 AQI Category  PM2.5 AQI Value PM2.5 AQI Category  
0             Good     