### Problem Statement 

This project examines how air quality (AQI) is affected by factors such as pm25, pm10, o3, no2, co, etc.

### Data Collection 

#### Installing the Library for generating the Data 

In [3]:
pip install faker 


Note: you may need to restart the kernel to use updated packages.


#### Generating the synthetic data and loading it into a DataFrame

In [4]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta


# Seed every random source up front so that "Restart Kernel -> Run All"
# regenerates exactly the same dataset. The generation code below draws from
# the stdlib `random` module; NumPy and Faker are seeded as well so that any
# later use of them is reproducible too.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

# Cities to simulate; each entry carries the fixed coordinates that are copied
# onto every record generated for that city.
cities = [
    {"city": "Washington", "lat": 38.9072, "lon": -77.0369},
    {"city": "Los Angeles", "lat": 34.0522, "lon": -118.2437},
    {"city": "Chicago", "lat": 41.8781, "lon": -87.6298},
    {"city": "Miami", "lat": 25.7617, "lon": -80.1918},
    {"city": "Phoenix", "lat": 33.4484, "lon": -112.0740},
]

# Simulation window (both endpoints are included by pd.date_range).
start_date = datetime(2019, 1, 1)
end_date = datetime(2023, 12, 31)

# One timestamp per calendar day in the window.
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

def seasonal_value(base, variation, month, amplitude=0.2, rng=None):
    """Simulate one measurement with a yearly sinusoidal cycle plus uniform noise.

    Parameters
    ----------
    base : float
        Baseline level before the seasonal adjustment.
    variation : float
        Half-width of the uniform noise; the noise is drawn from
        ``[-variation, variation]``.
    month : int
        Calendar month (1-12). Month 1 gives a seasonal factor of exactly 1;
        the factor peaks at month 4 and bottoms out at month 10.
    amplitude : float, optional
        Relative size of the seasonal swing. The default of 0.2 (a swing of
        +/-20% around ``base``) matches the previously hard-coded value.
    rng : optional
        Any object exposing ``uniform(a, b)``, e.g. a ``random.Random``
        instance. When omitted, the module-level ``random`` generator is used,
        which is the original behaviour.

    Returns
    -------
    float
        The simulated value, floored at 0 and rounded to 2 decimals.
    """
    noise_source = random if rng is None else rng
    seasonal_factor = 1 + amplitude * np.sin((month - 1) / 12 * 2 * np.pi)  # seasonal cycle
    noisy_value = base * seasonal_factor + noise_source.uniform(-variation, variation)
    # Negative readings are not meaningful here, so floor the result at zero.
    return round(np.clip(noisy_value, 0, None), 2)


# (base, variation) for each simulated pollutant. The dict order is both the
# order in which random draws are made and the column order in the frame.
pollutant_specs = {
    "pm25": (12, 5),
    "pm10": (25, 10),
    "o3": (30, 12),
    "no2": (20, 8),
    "so2": (5, 2),
    "co": (0.5, 0.2),
}

records = []

for city in cities:
    for date in date_range:
        month = date.month

        # Draw every pollutant level for this city/day.
        levels = {
            name: seasonal_value(base, spread, month)
            for name, (base, spread) in pollutant_specs.items()
        }

        # AQI: the largest of the contributing pollutants (pm10 is halved),
        # jittered by up to +/-5, truncated to an int and kept within 0-500.
        dominant = max(levels["pm25"], levels["pm10"] / 2, levels["o3"], levels["no2"])
        aqi = np.clip(int(dominant + random.uniform(-5, 5)), 0, 500)

        # Weather covariates.
        temp = seasonal_value(15, 10, month)  # °C
        humidity = round(random.uniform(30, 90), 1)  # %
        wind_speed = round(random.uniform(0.5, 10), 1)  # m/s

        row = {
            "date": date.strftime("%Y-%m-%d"),
            "city": city["city"],
            "latitude": city["lat"],
            "longitude": city["lon"],
        }
        row.update(levels)
        row.update({
            "aqi": aqi,
            "temperature_c": temp,
            "humidity_percent": humidity,
            "wind_speed_mps": wind_speed,
            "month": month,
            "day": date.day,
        })
        records.append(row)

# Build the frame once from the accumulated list of row dicts.
df = pd.DataFrame(records)

print(f"Dataset generated: {df.shape[0]} rows, {df.shape[1]} columns")
df.head()



Dataset generated: 9130 rows, 16 columns


Unnamed: 0,date,city,latitude,longitude,pm25,pm10,o3,no2,so2,co,aqi,temperature_c,humidity_percent,wind_speed_mps,month,day
0,2019-01-01,Washington,38.9072,-77.0369,15.66,16.18,41.83,19.95,6.67,0.35,44,9.92,87.5,8.8,1,1
1,2019-01-02,Washington,38.9072,-77.0369,14.83,26.0,19.03,22.74,3.75,0.34,25,13.82,32.6,9.7,1,2
2,2019-01-03,Washington,38.9072,-77.0369,7.21,26.41,41.9,13.15,4.08,0.59,41,7.55,65.0,4.2,1,3
3,2019-01-04,Washington,38.9072,-77.0369,13.27,33.97,35.2,20.98,5.64,0.59,31,22.04,65.1,6.6,1,4
4,2019-01-05,Washington,38.9072,-77.0369,13.92,26.74,28.19,16.44,4.59,0.55,29,21.0,66.1,3.8,1,5


#### Importing the necessary libraries 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

#### Shape of the dataset

In [6]:
# (number of rows, number of columns) of the generated frame
df.shape

(9130, 16)

Data Checks to perform

- Check Missing values
- Check Duplicates
- Check data type
- Check the number of unique values of each column
- Check statistics of data set
- Check the various categories present in the different categorical columns

#### Check Missing values

In [8]:
# Per-column count of missing entries
df.isna().sum()

date                0
city                0
latitude            0
longitude           0
pm25                0
pm10                0
o3                  0
no2                 0
so2                 0
co                  0
aqi                 0
temperature_c       0
humidity_percent    0
wind_speed_mps      0
month               0
day                 0
dtype: int64

There are no missing values in the data set

#### Check Duplicates 

In [9]:
# duplicated() flags rows that repeat an earlier row across every column;
# summing the boolean mask gives the number of such rows.
df.duplicated().sum()

np.int64(0)

There are no duplicate values in the data set

#### Check data types

In [10]:
# Column dtypes and non-null counts. `date` shows up as object because it was
# written as a "%Y-%m-%d" string when the records were generated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9130 entries, 0 to 9129
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              9130 non-null   object 
 1   city              9130 non-null   object 
 2   latitude          9130 non-null   float64
 3   longitude         9130 non-null   float64
 4   pm25              9130 non-null   float64
 5   pm10              9130 non-null   float64
 6   o3                9130 non-null   float64
 7   no2               9130 non-null   float64
 8   so2               9130 non-null   float64
 9   co                9130 non-null   float64
 10  aqi               9130 non-null   int64  
 11  temperature_c     9130 non-null   float64
 12  humidity_percent  9130 non-null   float64
 13  wind_speed_mps    9130 non-null   float64
 14  month             9130 non-null   int64  
 15  day               9130 non-null   int64  
dtypes: float64(11), int64(3), object(2)
memory

#### Checking the number of unique values of each column

In [11]:
# Distinct values per column. latitude/longitude are fixed per city, so their
# cardinality should match that of `city`.
df.nunique()

date                1826
city                   5
latitude               5
longitude              5
pm25                1437
pm10                2645
o3                  3075
no2                 2202
so2                  593
co                    61
aqi                   45
temperature_c       2419
humidity_percent     601
wind_speed_mps        96
month                 12
day                   31
dtype: int64

#### Check statistics of data set

In [12]:
# Summary statistics for the numeric columns; the object-typed `date` and
# `city` columns are left out by describe() by default.
df.describe()

Unnamed: 0,latitude,longitude,pm25,pm10,o3,no2,so2,co,aqi,temperature_c,humidity_percent,wind_speed_mps,month,day
count,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0,9130.0
mean,34.80952,-95.03524,12.025487,24.962176,30.079262,19.936947,4.98994,0.497703,30.058488,15.079566,60.045181,5.267711,6.523549,15.72782
std,5.496573,16.900987,3.370121,6.72765,8.122108,5.379282,1.354437,0.136629,8.157374,6.186226,17.259138,2.734293,3.448722,8.799806
min,25.7617,-118.2437,4.61,10.05,12.01,8.0,2.0,0.2,8.0,2.0,30.0,0.5,1.0,1.0
25%,33.4484,-112.074,9.5,20.03,24.19,15.96,3.98,0.4,24.0,10.07,45.0,2.9,4.0,8.0
50%,34.0522,-87.6298,12.08,24.94,30.02,19.995,4.99,0.5,30.0,15.12,60.0,5.3,7.0,16.0
75%,38.9072,-80.1918,14.55,29.94,36.01,23.88,6.0,0.6,36.0,20.21,75.0,7.6,10.0,23.0
max,41.8781,-77.0369,19.39,39.99,47.85,31.98,7.98,0.8,52.0,27.99,90.0,10.0,12.0,31.0
