
## **DATA CLEANING**

In [2]:
import pandas as pd
import numpy as np
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
RAW_CSV_PATH = r"C:\Users\NOOR AL MUSABAH\Downloads\global_air_quality_data_100.csv"
df = pd.read_csv(RAW_CSV_PATH)

# Summarise column names, non-null counts and dtypes of the freshly loaded frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   City         10000 non-null  object 
 1   Country      10000 non-null  object 
 2   Date         10000 non-null  object 
 3   PM2.5        10000 non-null  float64
 4   PM10         10000 non-null  float64
 5   NO2          10000 non-null  float64
 6   SO2          10000 non-null  float64
 7   CO           10000 non-null  float64
 8   O3           10000 non-null  float64
 9   Temperature  10000 non-null  float64
 10  Humidity     10000 non-null  float64
 11  Wind Speed   10000 non-null  float64
dtypes: float64(9), object(3)
memory usage: 937.6+ KB


### **DATATYPE CORRECTIONS**

In [17]:
# Parse the date strings; anything unparseable becomes NaT instead of raising.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Force every measurement column to a numeric dtype in one column-wise pass;
# entries that cannot be parsed as numbers become NaN.
numeric_columns = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'Wind Speed']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

print(df.dtypes)

City                 category
Country                object
Date           datetime64[ns]
PM2.5                 float64
PM10                  float64
NO2                   float64
SO2                   float64
CO                    float64
O3                    float64
Temperature           float64
Humidity              float64
Wind Speed            float64
dtype: object


### **STANDARDIZE CATEGORICAL VARIABLES**

In [31]:
# Show the distinct labels as they are when this cell starts running.
print(df["City"].unique())
print(df["Country"].unique())

# Title-casing lowercases everything after the first letter of each word, so
# country acronyms come out as 'Uae', 'Uk', 'Usa'. Map them back to upper case.
# NOTE(review): presumably the raw file spells these 'UAE' / 'UK' / 'USA' — confirm.
COUNTRY_ACRONYMS = {"Uae": "UAE", "Uk": "UK", "Usa": "USA"}

# Trim stray whitespace, normalise the casing, then store as category
# (few distinct labels repeated over many rows).
df["City"] = df["City"].str.strip().str.title().astype("category")
df["Country"] = (
    df["Country"]
    .str.strip()
    .str.title()
    .replace(COUNTRY_ACRONYMS)
    .astype("category")
)

['Bangkok', 'Istanbul', 'Rio De Janeiro', 'Mumbai', 'Paris', ..., 'Dubai', 'Toronto', 'Madrid', 'Johannesburg', 'Moscow']
Length: 20
Categories (20, object): ['Bangkok', 'Beijing', 'Berlin', 'Cairo', ..., 'Seoul', 'Sydney', 'Tokyo', 'Toronto']
['Thailand', 'Turkey', 'Brazil', 'India', 'France', ..., 'Uae', 'Canada', 'Spain', 'South Africa', 'Russia']
Length: 19
Categories (19, object): ['Australia', 'Brazil', 'Canada', 'China', ..., 'Turkey', 'Uae', 'Uk', 'Usa']


In [32]:
# Re-inspect the frame: column dtypes and non-null counts after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   City         10000 non-null  category      
 1   Country      10000 non-null  category      
 2   Date         10000 non-null  datetime64[ns]
 3   PM2.5        10000 non-null  float64       
 4   PM10         10000 non-null  float64       
 5   NO2          10000 non-null  float64       
 6   SO2          10000 non-null  float64       
 7   CO           10000 non-null  float64       
 8   O3           10000 non-null  float64       
 9   Temperature  10000 non-null  float64       
 10  Humidity     10000 non-null  float64       
 11  Wind Speed   10000 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(9)
memory usage: 802.3 KB


### **DUPLICATE REMOVAL**

In [33]:
# Count rows that are exact repeats (in every column) of an earlier row.
# NOTE(review): this only reports the count — no rows are dropped in this cell.
df.duplicated().sum()

np.int64(0)

### **HANDLING MISSING VALUES**

In [35]:
# Per-column tally of missing entries (NaN / NaT).
df.isna().sum()

City           0
Country        0
Date           0
PM2.5          0
PM10           0
NO2            0
SO2            0
CO             0
O3             0
Temperature    0
Humidity       0
Wind Speed     0
dtype: int64

### **OUTLIER DETECTION** (IQR METHOD)

In [None]:
# IQR outlier detection over every numeric column.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Per-column quartiles and the 1.5 * IQR fences.
Q1 = df[numerical_columns].quantile(0.25)
Q3 = df[numerical_columns].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean row mask: True when at least one numeric value lies outside its fences.
# Both comparisons are combined element-wise first, and only then reduced across
# columns; reducing just one side before the `|` does not give a per-row mask.
outliers = (
    (df[numerical_columns] < lower_bound) | (df[numerical_columns] > upper_bound)
).any(axis=1)

# Select the flagged rows with the mask, then report them.
df_outliers = df[outliers]
print(df_outliers)

# Summing a boolean mask counts the True entries, i.e. the flagged rows.
print("Number of outliers:", outliers.sum())

In [3]:
# IQR outlier detection: flag rows where any numeric value falls outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for its column.
numerical_columns = df.select_dtypes(include=['int64','float64']).columns
numeric_values = df[numerical_columns]

# Per-column quartiles and the resulting fences.
Q1, Q3 = numeric_values.quantile(0.25), numeric_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Element-wise comparisons against each column's own fence.
too_low = numeric_values.lt(lower_bound)
too_high = numeric_values.gt(upper_bound)

# Keep the rows that violate a fence in at least one column.
outliers = df[(too_low | too_high).any(axis=1)]

print(outliers)

print("Number of outliers:", len(outliers))


Empty DataFrame
Columns: [City, Country, Date, PM2.5, PM10, NO2, SO2, CO, O3, Temperature, Humidity, Wind Speed]
Index: []
Number of outliers: 0


In [6]:
# Persist the cleaned frame. index=False: the frame has a default RangeIndex that
# carries no information, and writing it would add an unnamed extra column that
# shows up as "Unnamed: 0" when the file is read back.
df.to_csv("cleaned_day2.csv", index=False)
