## Air pollution data analysis

In [None]:
'''importing module'''
import numpy as np 
import pandas as pd 
import matplotlib                  
import matplotlib.pyplot as plt
import seaborn as sns              
plt.style.use('fivethirtyeight')
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
'''Load the measurement summary CSV into a DataFrame and preview it'''
csv_path = '/kaggle/input/air-pollution-in-seoul/AirPollutionSeoul/Measurement_summary.csv'
summary_data = pd.read_csv(csv_path)
summary_data.head()

In [None]:
'''Drop the Address, Latitude and Longitude columns'''
# Still modifies summary_data in place; `columns=` names the axis explicitly.
summary_data.drop(columns=['Address', 'Latitude', 'Longitude'], inplace=True)
summary_data.head()

In [None]:
'''Convert the "Measurement date" column to datetime values'''
measurement_dates = pd.to_datetime(summary_data['Measurement date'])
summary_data['Measurement date'] = measurement_dates
summary_data.head()

In [None]:
'''Print a concise summary of the dataset via DataFrame.info()'''
summary_data.info()

In [None]:
'''Count the missing values in each column'''
missing_mask = summary_data.isna()
missing_mask.sum()

In [None]:
'''Count the distinct station codes present in the dataset'''
summary_data['Station code'].nunique()

In [None]:
'''Select every row in which at least one pollutant reading is below zero'''
pollutant_columns = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']
# Compare all pollutant columns at once, then flag rows with any negative cell.
has_negative_reading = (summary_data[pollutant_columns] < 0).any(axis=1)
negative_data = summary_data[has_negative_reading]
negative_data

In [None]:
'''Remove the rows collected in negative_data from the dataset'''
# Rows are matched by index label and removed from summary_data in place.
rows_to_remove = negative_data.index
summary_data.drop(index=rows_to_remove, inplace=True)
summary_data.head()

In [None]:
# Show descriptive statistics of the dataset as it stands at this point.
summary_data.describe()

In [None]:
'''Plot the six pollutant time series recorded at station 101'''
fig, ax = plt.subplots(3, 2, figsize=(25, 16))
fig.suptitle("Station 101")
df = summary_data[summary_data['Station code'] == 101]
# ax.flat walks the 3x2 grid row by row: SO2/NO2 on the top row,
# O3/CO in the middle, PM10/PM2.5 at the bottom.
for pollutant, panel in zip(['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5'], ax.flat):
    sns.lineplot(x=df['Measurement date'], y=df[pollutant], linewidth=1, ax=panel)
fig.show()

In [None]:
# sns.pairplot(summary_data[['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']])
# sns.pairplot(summary_data[['SO2', 'NO2', 'O3']])

In [None]:
# Keep only the measurements dated between the two 2017 bounds.
# Row-wise conditions must be combined with `&` (element-wise AND); Python's
# `and` tries to reduce each boolean Series to a single truth value and raises.
# NOTE(review): both bounds are exclusive as written, so a reading stamped
# exactly 2017-01-01 00:00 and everything on 2017-12-31 are left out --
# confirm that this is intended.
x = summary_data[
    (summary_data['Measurement date'] > pd.Timestamp('2017-01-01'))
    & (summary_data['Measurement date'] < pd.Timestamp('2017-12-31'))
]
x
# sns.pairplot(summary_data[['SO2', 'NO2', 'O3']])

# Check mean values by day of month

In [None]:

# Add a 'day' column holding the day-of-month number of each measurement.
summary_data['day'] = summary_data['Measurement date'].dt.day
summary_data.head()

In [None]:
# Average every numeric column for each value of the 'day' column.
# numeric_only=True is spelled out so the result does not depend on the pandas
# version's default and the timestamp column is never averaged along with the
# pollutant readings.
monthly_summary_data = summary_data.groupby('day').mean(numeric_only=True)
monthly_summary_data

In [None]:
'''Plot the mean level of each pollutant for each day of the month'''
fig, ax = plt.subplots(3, 2, figsize=(25, 16))
# monthly_summary_data is grouped from the whole summary_data frame, not from a
# single station, so the title names the data that is actually drawn.
# NOTE(review): if a station-101-only view was intended, filter by
# 'Station code' before grouping.
fig.suptitle("Mean pollutant levels by day of month (all stations)")
df = monthly_summary_data
# ax.flat walks the 3x2 grid row by row: SO2/NO2 on the top row,
# O3/CO in the middle, PM10/PM2.5 at the bottom.
for pollutant, panel in zip(['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5'], ax.flat):
    sns.lineplot(x=df.index, y=df[pollutant], linewidth=1, ax=panel)
fig.show()

# Check weekly chart

In [None]:
# Overwrite the 'day' column with the weekday name of each measurement
# (this replaces the day-of-month numbers stored in 'day' earlier).
summary_data['day'] = summary_data['Measurement date'].dt.day_name()
summary_data.head()

In [None]:
# Average every numeric column for each weekday name stored in 'day'.
# groupby sorts its keys, which for day names means alphabetical order, so the
# rows are reindexed into calendar order; a weekday absent from the data
# becomes a NaN row instead of shifting the others.
# numeric_only=True keeps non-numeric columns out of the average.
daily_summary_data = (
    summary_data.groupby('day')
    .mean(numeric_only=True)
    .reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
)
daily_summary_data

In [None]:
# Re-display the per-weekday means to inspect the row order.
# (sort_values cannot put day names in calendar order; reindexing with an
# explicit Monday-to-Sunday list does.)
daily_summary_data

In [None]:
# Weekday names in calendar order, used for the x axis of the weekly chart.
indexes = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

In [None]:
'''Plot the mean level of each pollutant for each day of the week'''
fig, ax = plt.subplots(3, 2, figsize=(25, 16))
# daily_summary_data is grouped from the whole summary_data frame, not from a
# single station, so the title names the data that is actually drawn.
fig.suptitle("Mean pollutant levels by day of week (all stations)")
# Look the rows up by weekday label. Passing the label list next to rows that
# are in a different order would pair the two by position and draw values
# under the wrong weekday.
df = daily_summary_data.reindex(indexes)
# ax.flat walks the 3x2 grid row by row: SO2/NO2 on the top row,
# O3/CO in the middle, PM10/PM2.5 at the bottom.
for pollutant, panel in zip(['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5'], ax.flat):
    # sort=False keeps the Monday-to-Sunday row order instead of letting
    # lineplot sort the string labels.
    sns.lineplot(x=df.index, y=df[pollutant], sort=False, linewidth=1, ax=panel)
fig.show()

In [None]:
# Add a 'month' column holding the month number of each measurement.
summary_data['month'] = summary_data['Measurement date'].dt.month
summary_data.head()

In [None]:
# Average every numeric column for each calendar month.
# By this point 'day' holds weekday names (strings); numeric_only=True keeps
# that column and the timestamp column out of the average, which a bare
# .mean() cannot compute for text on current pandas.
month_summary_data = summary_data.groupby('month').mean(numeric_only=True)
month_summary_data

In [None]:
'''Plot the mean level of each pollutant for each calendar month'''
fig, ax = plt.subplots(3, 2, figsize=(25, 16))
# month_summary_data is grouped from the whole summary_data frame, not from a
# single station, so the title names the data that is actually drawn.
# NOTE(review): if a station-101-only view was intended, filter by
# 'Station code' before grouping.
fig.suptitle("Mean pollutant levels by month (all stations)")
df = month_summary_data
# ax.flat walks the 3x2 grid row by row: SO2/NO2 on the top row,
# O3/CO in the middle, PM10/PM2.5 at the bottom.
for pollutant, panel in zip(['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5'], ax.flat):
    sns.lineplot(x=df.index, y=df[pollutant], linewidth=1, ax=panel)
fig.show()