In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy import stats
import warnings
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Print NumPy arrays with 4 digits of floating-point precision.
np.set_printoptions(precision=4)
# Apply seaborn's defaults with fonts scaled by 1.5, then the 'fivethirtyeight' matplotlib style sheet.
sns.set(font_scale=1.5)
plt.style.use('fivethirtyeight')

# Render matplotlib figures inline and ask the inline backend for the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Loading the data

In [None]:
# Read the cleaned air-quality CSV into `df`.
# The location can be overridden through the AIR_CLEAN_DATA_PATH environment variable so the
# notebook can run on other machines; the original local path stays as the default.
import os

DATA_PATH = os.environ.get(
    'AIR_CLEAN_DATA_PATH',
    '/Users/sandra/Desktop/Projects/air_clean_data.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Build two timestamps from the year/month/day/hour columns:
#   `Date`    - hour resolution
#   `date`    - day resolution
#   `dateInt` - the zero-padded 'YYYYMMDD' string the day-level timestamp is parsed from
ymd_text = (
    df['year'].astype(str)
    + df['month'].astype(str).str.zfill(2)
    + df['day'].astype(str).str.zfill(2)
)
hour_text = df['hour'].astype(str).str.zfill(2)

df['dateInt'] = ymd_text
df['Date'] = pd.to_datetime(ymd_text + hour_text, format='%Y%m%d%H')
df['date'] = pd.to_datetime(ymd_text, format='%Y%m%d')

In [None]:
# Use the hour-resolution `Date` column as the index.
# The guard keeps the cell re-runnable: once `Date` is the index it is no longer a column,
# and calling set_index('Date') again would raise a KeyError.
if df.index.name != 'Date':
    df = df.set_index('Date')
df.head()

# Exploratory Data Analysis

## Interactive graphs using Plotly

In [None]:
# Plotly setup for rendering figures inside the notebook.
import plotly
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import init_notebook_mode, plot, iplot

# To select the 'colab' renderer instead, uncomment:
#import plotly.io as pio
#pio.renderers.default = 'colab'

# Initialise notebook mode once; connected=True loads plotly.js from an online CDN
# instead of embedding the library in the notebook.
init_notebook_mode(connected=True)

In [None]:
# Distinct station names found in the data, in order of first appearance.
station_column = df['station']
unique_values = station_column.unique()
unique_values

In [None]:
# Monitoring stations to plot, in plotting order.
stations = [
    'Dingling',
    'Wanshouxigong',
    'Huairou',
    'Dongsi',
    'Wanliu',
    'Shunyi',
    'Guanyuan',
    'Aotizhongxin',
    'Changping',
    'Gucheng',
    'Nongzhanguan',
    'Tiantan',
]

### Visualization of air pollution and weather condition levels of the whole dataset

In [None]:
# Split the measurement columns into two sub-frames: pollutant readings and weather readings.
air_columns = ['SO2', 'CO', 'NO2', 'O3', 'PM10', 'PM2.5']
weather_columns = ['TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']

air_variables = df[air_columns]
weather_variables = df[weather_columns]

In [None]:
import cufflinks as cf

# Put cufflinks in offline mode before drawing.
cf.go_offline()

# One line trace per pollutant column, using every row of `df`, against the day-level `date` column.
pollutant_traces = [
    go.Scatter(x=df.date, y=df[column], mode='lines', name=column)
    for column in air_variables
]
fig = go.Figure(data=pollutant_traces)
py.iplot(fig)


In [None]:
# One line trace per weather column, using every row of `df`, against the day-level `date` column.
weather_traces = [
    go.Scatter(x=df.date, y=df[column], mode='lines', name=column)
    for column in weather_variables
]
fig = go.Figure(data=weather_traces)
fig.show()

# Also export the figure as a standalone HTML page; "cdn" makes the page load plotly.js
# from a CDN rather than embedding the library in the file.
fig.write_html("output.html", include_plotlyjs="cdn")


### Visualization of air pollution and weather conditions by station

# Air pollution: one figure per station found in the data.
# Each figure uses only that station's rows and carries the station name in its title.
for station in unique_values:
    station_df = df.loc[df['station'] == station]

    layout = go.Layout(title='Air pollution at the station: {}'.format(station),
                       yaxis={'title': 'Level (ug/m^3) '},
                       xaxis={'title': 'Date'})

    fig = go.Figure(layout=layout)

    # NOTE(review): x is the day-level `date` column, so all readings from one day share an
    # x value; confirm whether the hour-level index was intended instead.
    for variable in air_variables:
        fig.add_trace(go.Scatter(x=station_df['date'], y=station_df[variable],
                                 mode='lines', name=variable))
    fig.show()

In [None]:
# Air pollution: one figure per station listed in `stations`.
# The frame is filtered to the current station so each figure shows the data its title names.
for station in stations:
    station_df = df.loc[df['station'] == station]

    layout = go.Layout(title='Air pollution at the station: {}'.format(station),
                       yaxis={'title': 'Level (ug/m^3) '},
                       xaxis={'title': 'Date'})

    fig = go.Figure(layout=layout)

    # NOTE(review): x is the day-level `date` column, so all readings from one day share an
    # x value; confirm whether the hour-level index was intended instead.
    for variable in air_variables:
        fig.add_trace(go.Scatter(x=station_df['date'], y=station_df[variable],
                                 mode='lines', name=variable))
    fig.show()

In [None]:
# Weather conditions: one figure per station listed in `stations`.
# The frame is filtered to the current station so each figure shows the data its title names.
for station in stations:
    station_df = df.loc[df['station'] == station]

    # The weather columns are not concentrations, so the y axis gets a neutral label
    # rather than the pollutant unit ug/m^3.
    layout = go.Layout(title='Weather conditions levels at the station: {}'.format(station),
                       yaxis={'title': 'Value (units differ per variable)'},
                       xaxis={'title': 'Date'})

    fig = go.Figure(layout=layout)

    # NOTE(review): x is the day-level `date` column, so all readings from one day share an
    # x value; confirm whether the hour-level index was intended instead.
    for variable in weather_variables:
        fig.add_trace(go.Scatter(x=station_df['date'], y=station_df[variable],
                                 mode='lines', name=variable))
    fig.show()

In [None]:
# NOTE(review): apart from statsmodels, everything in this cell repeats the setup cells at the
# top of the notebook (same imports, same warning filter, same inline magic).
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

%matplotlib inline


In [None]:
# Daily means: rows are binned by calendar day of the datetime index and averaged.
# Only numeric columns are kept, because text columns such as `station` cannot be averaged
# and make `.mean()` raise a TypeError on pandas >= 2.0.
numeric_readings = df.select_dtypes(include='number')
df_daily = numeric_readings.resample('D').mean()
df_daily.head()

In [None]:
# Weekly means, computed from the daily means ('W' bins).
weekly_bins = df_daily.resample('W')
df_weekly = weekly_bins.mean()
df_weekly.head()

In [None]:
# Monthly means, computed from the daily means ('M' bins).
monthly_bins = df_daily.resample('M')
df_monthly = monthly_bins.mean()
df_monthly.head()

In [None]:
# Correlation matrix of the daily-mean columns, drawn as an annotated heatmap.
pollutant_corr = df_daily.corr()  # Pearson correlation (the default method)
f, ax = plt.subplots(figsize=(15, 10))
# Diverging palette, centred on 0 in the heatmap call below.
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)
# `linewidths` is heatmap's documented keyword for the lines dividing cells;
# `ax=ax` draws on the axes created above instead of relying on the current axes.
sns.heatmap(pollutant_corr, cmap=cmap, annot=True, vmax=1, center=0,
            square=True, linewidths=.5, ax=ax)

In [None]:
# Mean PM10 per station, sorted from highest to lowest, keeping the top five rows.
station_pm10_mean = df.groupby(by='station').agg({'PM10': 'mean'})
pm10_top5 = (
    station_pm10_mean
    .sort_values(by='PM10', ascending=False)
    .head(5)
    .reset_index()
)

# Station names of those five rows, in ranking order.
pm10_top5_st_codes = pm10_top5['station'].tolist()
print('PM Top 5 station codes: {}'.format(pm10_top5_st_codes))
pm10_top5

In [None]:
# Bar chart of the five highest station PM10 means on a 10x8 figure.
top5_fig, top5_ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='station', y='PM10', data=pm10_top5, palette='Set3', ax=top5_ax)

In [None]:
# Columns removed from the weekly frame; PM2.5 is not listed, so it is kept.
cols_to_drop = [
    'PM10', 'SO2', 'NO2', 'CO', 'O3',
    'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM',
]

weekly_data = df_weekly.drop(columns=cols_to_drop)

In [None]:
# `pm_rate` is a second name bound to the same object as `weekly_data` (no copy is made),
# so in-place changes made through either name are visible through both.
pm_rate = weekly_data
pm_rate

In [None]:
# Show the distinct station names in `df`.
df['station'].unique()