# Loading datasets

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Meteorological observations: keep the freshly loaded frame as `meteo_raw`
# and do all further work on an independent copy.
meteo_raw = pd.read_csv('../input/dataset/MeteoManzIzvorno.csv')
meteo = meteo_raw.copy()

# Air-pollution measurements: the working frame is the raw frame without
# the 'MernaStanica' column.
poll_nbg_raw = pd.read_csv('../input/dataset/SviPodaci_mStanica_BeogradNBG.csv')
poll_nbg = poll_nbg_raw.drop(columns=['MernaStanica'])

In [None]:
# Preview the first rows of the untouched meteo data.
# (The original cell ended in a dangling '.', which is a SyntaxError.)
meteo_raw.head()

In [None]:
# A cell renders only its last expression, so both summaries are passed to
# IPython's built-in `display` to make sure neither is silently discarded.
display(meteo.describe(), poll_nbg.describe())

# Feature manipulation

In [None]:
# Inspect the dtype pandas assigned to each column of the meteo frame
meteo.dtypes

In [None]:
# Look at the humidity column as loaded
meteo.loc[:, 'Rel. Hum. (%)']

We remove the rows in which humidity, pressure, or wind direction is not well defined, since there are only a few of them.

In [None]:
def _wind_code(label):
    """Return the text between the first '(' and the first ')' of `label`.

    Falls back to 'C' when either parenthesis is missing.
    """
    if '(' in label and ')' in label:
        return label[label.find('(') + 1:label.find(')')]
    return 'C'


# Row count of the raw data (kept under the name the original cell used)
n = meteo_raw.shape[0]

# Text views of the columns that drive the filter. `map(str)` applies Python's
# str() to every value, so a missing value reads as the string 'nan'.
humidity_text = meteo_raw['Rel. Hum. (%)'].map(str)
pressure_text = meteo_raw['Pressure/ Geopot.'].map(str)
wind_text = meteo_raw['Wind dir'].map(str)

# A row is unusable when humidity lacks a '%' sign, pressure contains '-' or 'm',
# or wind direction contains '-' or is empty.
unusable = (
    ~humidity_text.str.contains('%', regex=False)
    | pressure_text.str.contains('-', regex=False)
    | pressure_text.str.contains('m', regex=False)
    | wind_text.str.contains('-', regex=False)
    | wind_text.eq('')
)

# One boolean-mask selection replaces the row-by-row in-place drop, which rebuilt
# the frame on every removal and relied on the index being 0..n-1.
# The original index labels of the surviving rows are preserved.
meteo = meteo_raw.loc[~unusable].copy()

# Humidity: strip the trailing '%' character and convert to int
meteo['Rel. Hum. (%)'] = humidity_text[~unusable].str[:-1].map(int)

# Pressure: strip the trailing four characters (the unit suffix) and convert to float
meteo['Pressure/ Geopot.'] = pressure_text[~unusable].str[:-4].map(float)

# Wind direction: remove spaces, then keep only the code inside the parentheses
meteo['Wind dir'] = (
    wind_text[~unusable]
    .str.replace(' ', '', regex=False)
    .map(_wind_code)
)

In [None]:
# Check the extracted wind-direction codes
meteo.loc[:, 'Wind dir']

In [None]:
# Store wind direction as a pandas categorical, then list the distinct codes found
meteo['Wind dir'] = meteo['Wind dir'].astype('category')
meteo['Wind dir'].cat.categories


In [None]:
# With default settings `read_csv` loads empty fields as NaN, and str(NaN) is 'nan',
# so the original comparison against '' never matched and missing speeds stayed NaN.
# Parse the column numerically, then replace the gaps with 0 as intended.
wind_speed = pd.to_numeric(meteo['Wins speed (Km/h)'])
meteo['Wins speed (Km/h)'] = wind_speed.fillna(0).astype(float)

In [None]:
# Cloud-layer columns become presence flags: '-' maps to 0, any other text to 1
for layer in ('Low clouds', 'Medium clouds', 'High clouds'):
    meteo[layer] = meteo[layer].map(str).ne('-').astype(int)

# Total cover: keep only the leading character; 'N' is encoded as 9,
# every other leading character is parsed as an integer.
cover_code = meteo['Clouds'].map(str).str[0]
meteo['Clouds'] = cover_code.map(lambda code: 9 if code == 'N' else int(code))

# Wherever total cover is coded 9, mark low clouds as present
meteo.loc[meteo['Clouds'] == 9, 'Low clouds'] = 1
The precipitation column is dropped, since ~90% of its values are unknown. Humidity is a good proxy for the amount of rain, and we will add a new feature 'Prec' (precipitation) based on the column 'Conditions'.

We are dropping 'Max temp. (ºC)', 'Min temp. (ºC)', and 'Prec. (mm)', since they are mostly empty.

In [None]:
# Remove the three columns in a single in-place call
meteo.drop(
    columns=['Max temp. (ºC)', 'Min temp. (ºC)', 'Prec. (mm)'],
    inplace=True,
)

Now we create two new features, 'Fog' and 'Prec'. Both are extracted from the column 'Conditions'.

In [None]:
# Lower-case the free-text weather description so keyword matching ignores case
conditions = meteo['Conditions'].map(lambda text: str(text).lower())
meteo['Conditions'] = conditions

# 1 when the description mentions any of the listed keywords, otherwise 0
fog = [int(any(word in text for word in ('fog', 'overcast'))) for text in conditions]
rain = [int(any(word in text for word in ('rain', 'snow', 'drizzle'))) for text in conditions]

# Place the new flags at fixed column positions 12 and 13
meteo.insert(loc=12, column="Fog", value=fog, allow_duplicates=True)
meteo.insert(loc=13, column="Prec", value=rain, allow_duplicates=True)

Finally, we drop the 'Conditions' column.

In [None]:
# 'Conditions' is removed from the working frame in place
meteo.drop(columns=['Conditions'], inplace=True)

The feature types are now what we want them to be.

In [None]:
# Show the dtype of every column in the working meteo frame
meteo.dtypes

Let's transform the categorical variable 'Wind dir' into dummy variables.


In [None]:
# One indicator column per wind-direction value, attached to the frame by row index
wind_dummies = pd.get_dummies(meteo['Wind dir'])
meteo = meteo.join(wind_dummies)

In [None]:
# Disabled: with this line commented out, the 'Wind dir' column stays in the frame.
#meteo.drop('Wind dir', axis = 1, inplace = True)

In [None]:
# Preview the first rows of the pollution frame
poll_nbg.head()

In [None]:
# Fifth value (position 4) of the row labelled 1. Positional access is made
# explicit with `.iloc`: indexing a label-indexed Series with a bare integer
# relies on a deprecated positional fallback.
poll_nbg.loc[1].iloc[4]

In [None]:
# For the row labelled 4: True where a value's string form is empty
[str(value) == '' for value in poll_nbg.loc[4]]

In [None]:
# Turn empty-string PM25 readings into NaN. The result is assigned back to the
# column: calling `replace(..., inplace=True)` on `poll_nbg['PM25']` operates on
# a temporary object and does not update the frame under pandas Copy-on-Write.
poll_nbg['PM25'] = poll_nbg['PM25'].replace('', np.nan)

# Keep only the rows that have a PM25 value
poll_nbg = poll_nbg.dropna(subset=['PM25'])

In [None]:
# (rows, columns) of the pollution frame at this point
poll_nbg.shape

We build a primary key from the date and the hour of each observation.
The key format is YYYYMMDDHH.

In [None]:
# Build one key per pollution row: 'Datum_i_Vreme' is split on the space into a
# date part and a time part, the date on '-' into three pieces, and the pieces
# are concatenated with the first two characters of the time part.
poll_nbg_key = []
for stamp in poll_nbg['Datum_i_Vreme']:
    date_part, time_part = stamp.split(" ")
    yyyy, mm, dd = date_part.split("-")
    poll_nbg_key.append(yyyy + mm + dd + time_part[0:2])

In [None]:
# Put the key in the first column of the pollution frame
poll_nbg.insert(loc=0, column="Key", value=poll_nbg_key, allow_duplicates=True)

In [None]:

# Preview the pollution frame, now with the 'Key' column in front
poll_nbg.head()

In [None]:
# Build one key per meteo row: 'Date' is split on '/' into day, month, year,
# which are reordered to year+month+day and followed by the first two
# characters of 'UTC time'.
meteo_key = []
for date_text, time_text in zip(meteo['Date'], meteo['UTC time']):
    dd, mm, yyyy = date_text.split("/")
    meteo_key.append(yyyy + mm + dd + time_text[0:2])

In [None]:
# Put the key in the first column of the meteo frame
meteo.insert(loc=0, column="Key", value=meteo_key, allow_duplicates=True)

In [None]:
# Join weather and pollution rows that share the same 'Key'
meteo_poll = pd.merge(meteo, poll_nbg, on='Key')

# Display a view sorted by key; the sorted frame is not stored,
# so `meteo_poll` itself keeps its merge order.
meteo_poll.sort_values(by='Key')

In [None]:
# Display the untouched meteo data for comparison with the processed frame
meteo_raw

In [None]:
# Month number taken from characters 3-4 of 'Date', stored as the second column
month_numbers = [int(date_text[3:5]) for date_text in meteo_poll['Date']]
meteo_poll.insert(loc=1, column="Month", value=month_numbers, allow_duplicates=True)

In [None]:
# Hour taken from the first two characters of 'UTC time', stored as the third column
hour_numbers = [int(time_text[0:2]) for time_text in meteo_poll['UTC time']]
meteo_poll.insert(loc=2, column="Hours", value=hour_numbers, allow_duplicates=True)

In [None]:
# Preview the merged frame with the new 'Month' and 'Hours' columns
meteo_poll.head()

In [None]:
# The original date/time text columns are removed from the merged frame in place
meteo_poll.drop(columns=['Date', 'UTC time', 'Datum_i_Vreme'], inplace=True)

In [None]:
# Part-of-day indicator lists derived from 'Hours' (1 = hour falls in the window):
#   Morning 5-10, MidDay 11-16, Afternoon 17-22, Night 23 and 0-4
hour_values = list(meteo_poll['Hours'])
Morning = [int(5 <= hour < 11) for hour in hour_values]
MidDay = [int(11 <= hour < 17) for hour in hour_values]
Afternoon = [int(17 <= hour < 23) for hour in hour_values]
Night = [int(hour < 5 or hour == 23) for hour in hour_values]

In [None]:
# Insert the part-of-day flags computed in the previous cell (`Morning`, `MidDay`,
# `Afternoon`, `Night`) instead of recomputing them here. The recomputed versions
# used different boundaries (hour 23 counted as Afternoon rather than Night), so
# the two cells disagreed and the prepared lists were never used.
meteo_poll.insert(3, "Morning", Morning, True)
meteo_poll.insert(4, "MidDay", MidDay, True)
meteo_poll.insert(5, "Afternoon", Afternoon, True)
meteo_poll.insert(6, "Night", Night, True)


In [None]:
# Preview the merged frame with the part-of-day columns
meteo_poll.head()

In [None]:
# Write the merged frame to ./out.csv with a header row and without the index column
meteo_poll.to_csv('./out.csv', index = False, header = True)

In [None]:
# Summary statistics of the numeric columns in the merged frame
meteo_poll.describe()

In [None]:
# Readings above this value are discarded (same units as 'Pressure/ Geopot.')
MAX_PRESSURE = 1060

# Row count before filtering (kept under the name the original cell used)
n = meteo_poll.shape[0]

# Select the offending rows with one mask and drop them in a single call.
# Unlike the previous `for i in range(n)` loop, this does not depend on the index
# being 0..n-1, so the cell can be re-run safely. Rows with a missing pressure
# are kept, because NaN > MAX_PRESSURE is False.
too_high = meteo_poll['Pressure/ Geopot.'] > MAX_PRESSURE
meteo_poll.drop(index=meteo_poll.index[too_high], inplace=True)

In [None]:
# Scatter of PM25 against pressure, drawn with red pixel markers on explicit axes.
# Axis labels and a title make the figure readable on its own; the trailing ';'
# suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.plot(meteo_poll['Pressure/ Geopot.'], meteo_poll['PM25'], 'r,')
ax.set(xlabel='Pressure/ Geopot.', ylabel='PM25', title='PM25 vs. pressure');