In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv("datasets/air_quality_dataset.csv")

In [3]:
df.head()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,PM 2.5
0,7.4,9.8,4.8,1017.6,93.0,0.5,4.3,9.4,219.720833
1,7.8,12.7,4.4,1018.5,87.0,0.6,4.4,11.1,182.1875
2,6.7,13.4,2.4,1019.4,82.0,0.6,4.8,11.1,154.0375
3,8.6,15.5,3.3,1018.7,72.0,0.8,8.1,20.6,223.208333
4,12.4,20.9,4.4,1017.3,61.0,1.3,8.7,22.2,200.645833


### What is PM 2.5

Fine particulate matter (PM2.5) is an air pollutant that is a concern for people's health when levels in air are high. PM2.5 are tiny particles in the air that reduce visibility and cause the air to appear hazy when levels are elevated.

### Air Quality Index Scale
The table below defines the Air Quality Index scale as defined by the US-EPA 2016 standard:

| PM 2.5 Range| Label                          |
| ------------| -------------------------------|
| 0-50        | Good                           |
| 51-100      | Moderate                       |
| 101-150     | Unhealthy for Sensitive Groups |
| 151-200     | Unhealthy                      |
| 201-300     | Very Unhealthy                 |
| 301+        | Hazardous                      |

In [4]:
df=df.dropna()

In [5]:
df.columns

Index(['T', 'TM', 'Tm', 'SLP', 'H', 'VV', 'V', 'VM', 'PM 2.5'], dtype='object')

In [6]:
## creating an (1 X n) matrix of air quality index
def air_quality_index_label(row):
    val = row['PM 2.5']
    if val <= 50:
        return "Good"
    elif val <= 100:
        return "Moderate"
    elif val <= 150:
        return "Unhealthy"
    elif val <= 200:
        return "Unhealthy for Strong People"
    else:
        return "Hazardous"
    
air_quality = df.apply(air_quality_index_label, axis=1)

In [7]:
# removing PM 2.5 column and adding quality column in another dataframe
categorical_dataframe = df.iloc[:,:-1]
categorical_dataframe.columns

Index(['T', 'TM', 'Tm', 'SLP', 'H', 'VV', 'V', 'VM'], dtype='object')

In [8]:
categorical_dataframe["Quality"] = air_quality
categorical_dataframe.head()

Unnamed: 0,T,TM,Tm,SLP,H,VV,V,VM,Quality
0,7.4,9.8,4.8,1017.6,93.0,0.5,4.3,9.4,Hazardous
1,7.8,12.7,4.4,1018.5,87.0,0.6,4.4,11.1,Unhealthy for Strong People
2,6.7,13.4,2.4,1019.4,82.0,0.6,4.8,11.1,Unhealthy for Strong People
3,8.6,15.5,3.3,1018.7,72.0,0.8,8.1,20.6,Hazardous
4,12.4,20.9,4.4,1017.3,61.0,1.3,8.7,22.2,Hazardous


In [9]:
import seaborn as sns

In [None]:
sns.pairplot(categorical_dataframe, hue="Quality", height=1)

<seaborn.axisgrid.PairGrid at 0x7fdb979dc8b0>

In [None]:
cor_mat = df.corr()

In [None]:
cor_mat

In [None]:
top_corr_features = cor_mat.index
plt.figure(figsize=(10,10))
#plot heat map
g=sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="RdYlGn")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest, mutual_info_regression, f_regression
from sklearn.neural_network import MLPRegressor

In [None]:
X=df.iloc[:,:-1] ## independent features
Y=df.iloc[:,-1] ## dependent features

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
reg = MLPRegressor()

In [None]:
reg.fit(X_train, Y_train)

In [None]:
reg.score(X_train, Y_train)

In [None]:
reg.score(X_test, Y_test)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(reg, X_train, Y_train, cv=5)

In [None]:
prediction=reg.predict(X_test)

In [None]:
sns.distplot(Y_test-prediction)