This notebook was created to:
2. impute the missing values using the MICE (Multivariate Imputation by Chained Equations) technique
2. identify and eliminate the outliers using the Minimum Covariance Determinant (MCD) method
3. visually represent the processed data
4. analyze the data and draw conclusions.

# 1. Importing packages

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import plotly as py

In [None]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# restriction is given, so the filter applies to all of them).
warnings.filterwarnings('ignore')

# 2. Data Importation

In [None]:
# Location of the raw air-quality CSV.
air_data = "../input/air-quality-data/fin_data.csv"

# Read the file and discard the pm2_5 column in a single chained expression.
df = pd.read_csv(air_data).drop(columns=["pm2_5"])

print("Dataset obtained: \n")
df.head()

In [None]:
# Summary statistics of the raw dataset (shown as the cell output).
df.describe()

# 3. Missing Value Visualization

### 3.1.  Number of Missing Values

In [None]:
# Count the NaNs of the three pollutant columns once, then report them.
raw_missing = df[["so2", "no2", "rspm"]].isnull().sum()

print("Missing values per column:")
for label, column in (("So2  :", "so2"), ("No2  :", "no2"), ("RSPM :", "rspm")):
    print(label, raw_missing[column])

print("\nTotal number of missing values: ")
print(raw_missing["so2"] + raw_missing["no2"] + raw_missing["rspm"])

### 3.2. Visualization

In [None]:
import missingno as msn
# Draw the missingno matrix plot for the raw data; the returned object is kept
# in `fig` and echoed as the cell output.
fig = msn.matrix(df)
fig

In [None]:
# missingno heatmap of the raw data.
msn.heatmap(df)

In [None]:
# missingno dendrogram of the raw data.
msn.dendrogram(df)

# 4. Data Preprocessing

### 4.1. Selecting the required columns

In [None]:
# Features are the columns at positions 8 and 9; the target is the column at
# position 10 (selection is positional, not by name).
x = df.iloc[:, 8:10]
y = df.iloc[:, 10]

# Preview the first five rows of each selection.
print("X data: \n", x.head(5))
print("\nY data: \n", y.head(5))

### 4.2. Applying Multivariate Imputation by Chained Equations - MICE

In [None]:
# Impute the feature columns and the target column together.
# IterativeImputer models each column from the *other* columns; when it is
# given a single column it has nothing to chain on and only returns the
# initial (mean) fill, so imputing `y` on its own is not MICE.
# NOTE: `y` is used further down as the regression target, so imputed target
# values take part in both training and scoring.
mice_imputer = IterativeImputer()
imputed = mice_imputer.fit_transform(pd.concat([x, y], axis=1))

# Split the jointly imputed array back into the shapes the rest of the
# notebook relies on: x -> (n, 2) features, y -> (n, 1) target.
x = imputed[:, :-1]
print("X after imputation: \n",x[:5])
y = imputed[:, -1:]
print("\nY after imputation:\n ",y[:5])

print("\nData sucessfully imputed.")

### 4.3. Exporting Preprocessed Data to a CSV File

In [None]:
# Positional picks from the raw frame (columns 5, 3 and 4); not used in this cell.
loc=df.iloc[:,5]
lati=df.iloc[:,3]
longi=df.iloc[:,4]

# Take an explicit copy: adding columns to a bare `iloc` slice of `df` is a
# chained assignment (SettingWithCopyWarning, which the global warning filter
# would hide), so work on an independent frame instead.
data = df.iloc[:,8:10].copy()

# Carry over the date (second-to-last column) and state (column 2) from the raw frame.
data['date'] = df.iloc[:,-2]
data['state'] = df.iloc[:,2]
# Overwrite/add the pollutant columns with the imputed arrays.
data['so2']=x[:,0]
data['no2']=x[:,1]
data['rspm']=y
data.to_csv("processed_data.csv")

print("Preprocessed data sucessfully writen to another file.")

In [None]:
# Parse the date strings (month-day-year) into timestamps.
data['date'] = pd.to_datetime(data['date'], format='%m-%d-%Y')

# Derive an integer year; rows without a date get the placeholder 0 ...
year_values = data['date'].dt.year.fillna(0.0).astype(int)
data['year'] = year_values

# ... and are then filtered out.
has_year = data['year'] > 0
data = data.loc[has_year]

# 5. Preprocessed Data Visualization

In [None]:
# First rows of the processed frame (shown as the cell output).
data.head()

### 5.1.  Number of Missing Values

In [None]:
# Count the NaNs left in the three pollutant columns of the processed frame.
processed_missing = data[["so2", "no2", "rspm"]].isnull().sum()

print("Missing values per column:")
for label, column in (("So2  :", "so2"), ("No2  :", "no2"), ("RSPM :", "rspm")):
    print(label, processed_missing[column])

print("\nTotal number of missing values: ")
print(processed_missing["so2"] + processed_missing["no2"] + processed_missing["rspm"])

### 5.2. Visualization

In [None]:
# Build a frame of raw (un-imputed) values from `df` to plot next to `data`.
# Copy the slice first: assigning columns onto a bare `iloc` slice of `df` is
# a chained assignment (SettingWithCopyWarning, hidden by the global filter).
temp = df.iloc[:,8:10].copy()
temp["so2"] = df["so2"]
temp["no2"] = df["no2"]
temp["date"] = df["date"]
temp["state"] = df["state"]
temp["rspm"] = df['rspm']
# NOTE(review): this expects `df` to already contain a "year" column - confirm.
temp["year"] = df["year"]
temp.head()

In [None]:
# missingno matrix plots: raw frame first, processed frame second.
msn.matrix(temp,figsize=(8,5))
msn.matrix(data,figsize=(8,5))

In [None]:
# missingno heatmaps: raw frame first, processed frame second.
msn.heatmap(temp,figsize=(8,5))
msn.heatmap(data,figsize=(8,5))

In [None]:
# missingno dendrograms: raw frame first, processed frame second.
msn.dendrogram(temp,figsize=(8,5))
msn.dendrogram(data,figsize=(8,5))

In [None]:
# missingno bar chart for the raw frame, drawn in red.
msn.bar(temp,figsize=(8,5), color="r")
# Open a new figure before drawing the second chart.
plt.figure()
# missingno bar chart for the processed frame, drawn in green.
msn.bar(data,figsize=(8,5), color="g")

# 6. Outlier Detection

### 6.1. Splitting Data into Train and Test Data

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=1
)
print("X_train.shape = ", X_train.shape, "\ny_train.shape = ", y_train.shape)

# Scatter the two feature columns against each other on the current axes.
scatter_axes = plt.gca()
scatter_axes.scatter(x[:, 0], x[:, 1], label="RSMP: So2 vs No2")
scatter_axes.set_xlabel("So2")
scatter_axes.set_ylabel("No2")
scatter_axes.set_title("So2 vs No2")
scatter_axes.legend()
plt.show()

### 6.2. Minimum Covariance Determinant

In [None]:
# Fit the elliptic envelope on all feature rows.
# The underlying covariance estimate starts from random subsets, so seed it:
# without `random_state` the flagged rows can differ between runs even though
# the train/test split itself is seeded.
ee = EllipticEnvelope(contamination=0.0003, random_state=1)
yhat = ee.fit_predict(x)
# fit_predict labels outliers as -1; keep those rows for plotting.
mask1 = yhat == -1
values = x[mask1]
print("MCD sucessfully applied for data")

### 6.3. Outliers Found Using MCD 

In [None]:
# Draw every point first, then overlay the rows flagged as outliers in red.
outlier_axes = plt.gca()
outlier_axes.scatter(x[:, 0], x[:, 1], label="RSMP: So2 vs No2")
outlier_axes.scatter(values[:, 0], values[:, 1], color='r', label="Outliers Detected")
outlier_axes.set_xlabel("So2")
outlier_axes.set_ylabel("No2")
outlier_axes.set_title("So2 vs No2")
outlier_axes.legend()
plt.show()

### 6.4. Predicting and Fitting Data 

In [None]:
# Re-fit the envelope on the training rows only and keep the rows it does not
# label as outliers (-1).
train_labels = ee.fit_predict(X_train)
mask = train_labels != -1
X_train = X_train[mask, :]
y_train = y_train[mask]
print("Shape of dataset after MCD:")
print("X_train.shape = ", X_train.shape, "\ny_train.shape = ", y_train.shape)

# Train a linear regression on the filtered rows and score it on the held-out set.
model = LinearRegression().fit(X_train, y_train)
ypred = model.predict(X_test)
mae = mean_absolute_error(y_test, ypred)
print('\nMAE of MCD: ', mae)

### 6.5. Visualization After Outlier Detection

In [None]:
f, ax = plt.subplots(figsize=(15,15))
ax.set_title('{} by state and year'.format('rspm'))

# Median rspm per (state, year); margins=True adds the overall "All" row/column.
rspm_by_state_year = data.pivot_table(values='rspm', index='state',
                                      columns=['year'], aggfunc='median',
                                      margins=True)

# The cells hold medians, so label the colour bar accordingly (it used to say
# "Annual Average", which did not match the aggregation).
sns.heatmap(rspm_by_state_year,
            annot=True,cmap="BuPu", linewidths=.5, ax=ax,cbar_kws={'label': 'Annual Median'})

In [None]:
# Median rspm per state ...
state_medians = data[['rspm', 'state']].groupby(["state"]).median()

# ... ordered from highest to lowest, limited to the first 34 rows.
ranked_states = state_medians.sort_values(by='rspm', ascending=False).head(34)

ranked_states.plot.bar(color='b')
plt.show()

In [None]:
# Median rspm per year, newest year first.
# Only the numeric column is selected alongside the grouping key: keeping the
# string column `state` in the selection makes `.median()` raise a TypeError
# on pandas >= 2.0 (older versions silently dropped it).
temp = (
    data[['rspm', 'year']]
    .groupby(["year"])
    .median()
    .reset_index()
    .sort_values(by='year', ascending=False)
)

# Draw on the axes created here rather than relying on the implicit current axes.
f,ax=plt.subplots(figsize=(15,5))
sns.pointplot(x='year', y='rspm', data=temp, ax=ax)