In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's built-in 'ggplot' style sheet.
mpl.style.use( 'ggplot' )
# Apply seaborn's 'white' style; it runs after the ggplot style sheet, so
# any rc parameter set by both ends up with the seaborn value.
sns.set_style( 'white' )
# Default figure size as a (width, height) pair.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [2]:
#Setup Helper Functions
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    variables : iterable of str
        Column names of ``df``; one subplot is drawn per name, in order.
    n_rows, n_cols : int
        Grid shape passed to ``fig.add_subplot``; the grid must have at
        least as many cells as there are variables.

    Each subplot is titled with the column's skewness (two decimals) and has
    its tick labels hidden. The figure is displayed with ``plt.show()`` and
    nothing is returned.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for position , var_name in enumerate( variables , start = 1 ):
        ax = fig.add_subplot( n_rows , n_cols , position )
        df[ var_name ].hist( bins = 10 , ax = ax )
        # Round to two decimals. Calling round() without a digit count
        # collapsed the skew to an integer and raised ValueError whenever
        # the skew was NaN (e.g. a column with too few values).
        skew = round( float( df[ var_name ].skew() ) , 2 )
        ax.set_title( 'Skew: ' + str( skew ) )
        ax.set_xticklabels( [] , visible = False )
        ax.set_yticklabels( [] , visible = False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded kernel-density curves of ``var``, one hue per ``target`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    var : str
        Column whose distribution is drawn.
    target : str
        Column used as the ``hue`` of the facet grid.
    **kwargs
        Optional ``row`` and ``col`` column names forwarded to
        ``sns.FacetGrid``; both default to ``None``.

    The x-axis is limited to ``(0, df[var].max())`` and a legend is added.
    """
    # NOTE(review): `shade` is deprecated in newer seaborn releases in favour
    # of `fill` - confirm against the seaborn version this notebook targets.
    grid = sns.FacetGrid(
        df ,
        hue = target ,
        aspect = 4 ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' )
    )
    grid.map( sns.kdeplot , var , shade = True )
    upper_limit = df[ var ].max()
    grid.set( xlim = ( 0 , upper_limit ) )
    grid.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    """Draw a seaborn bar plot of ``target`` against the categories in ``cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cat : str
        Categorical column placed on the x-axis.
    target : str
        Column plotted on the y-axis.
    **kwargs
        Optional ``row`` and ``col`` column names forwarded to
        ``sns.FacetGrid``; both default to ``None``.
    """
    grid = sns.FacetGrid(
        df ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' )
    )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()

def plot_correlation_map( df ):
    """Plot an annotated heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated with
        ``DataFrame.corr()`` (default method). Other columns are ignored.

    The heatmap is drawn on a new 12x10 figure; nothing is returned.
    """
    # Restrict to numeric/bool columns up front. Older pandas dropped
    # non-numeric columns silently inside corr(), but pandas >= 2.0 raises
    # when a string column is present, so select explicitly.
    numeric = df.select_dtypes( include = [ 'number' , 'bool' ] )
    corr = numeric.corr()
    _ , ax = plt.subplots( figsize = ( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    sns.heatmap(
        corr ,
        cmap = cmap ,
        square = True ,
        cbar_kws = { 'shrink' : .9 } ,
        ax = ax ,
        annot = True ,
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    """Summarise each column of ``df`` by distinct-value count and dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``Variable`` (column
        label), ``Levels`` (number of distinct non-missing values) and
        ``Datatype`` (the column's dtype), sorted by ``Levels`` ascending.
    """
    columns = list( df.columns )
    levels = pd.DataFrame( {
        'Variable' : columns ,
        # nunique() ignores missing values, matching the length of the
        # default value_counts() result, without relying on the deprecated
        # top-level pd.value_counts().
        'Levels' : [ df[ name ].nunique() for name in columns ] ,
        'Datatype' : [ df[ name ].dtypes for name in columns ]
    } )
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree on ``(X, y)`` and plot its feature importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels name the features in the plot.
    y : array-like
        Target values.

    The fitted tree is handed to ``plot_model_var_imp`` together with the
    same ``X`` and ``y`` it was trained on.
    """
    # Imported locally: no top-level import of DecisionTreeClassifier is
    # visible in this notebook, so the bare name would raise NameError.
    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y , n_top = 10 ):
    """Plot the most important features of a fitted model and print its score.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_`` and ``score(X, y)``.
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` labels the importances.
    y : array-like
        Target values passed to ``model.score``.
    n_top : int, default 10
        Number of highest-importance features to plot.

    Draws a horizontal bar chart and prints ``model.score(X, y)``.
    """
    imp = pd.DataFrame(
        model.feature_importances_ ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    # Ascending order is kept so the largest bar ends up at the top of the
    # horizontal chart. The top features are therefore the LAST rows; taking
    # the first rows here plotted the least important ones.
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp.tail( n_top ).plot( kind = 'barh' )
    print( model.score( X , y ) )

# Loading Data and Basic Exploration

In [3]:
# Read the pollution CSV from the relative ../input directory into a DataFrame.
data = pd.read_csv("../input/pollution_us_2000_2016.csv")

In [4]:
# (rows, columns) of the raw frame.
data.shape

In [5]:
# Column labels available in the raw frame.
data.columns

In [6]:
# Preview the first three rows.
data.head(3)

# Extracting Data for New York City, Then Preprocessing and Exploring It

In [7]:
# Boolean mask of rows whose City contains 'New York'. na=False makes rows
# with a missing city count as non-matches; without it the mask would contain
# NaN and could not be used for boolean indexing.
NY_mask = data['City'].str.contains('New York', na=False)
# Explicit copy so that `ny` is an independent frame that can be modified
# freely without any link back to `data`.
ny = data[NY_mask].copy()

In [8]:
# Preview the first two rows of the New York subset.
ny.head(2)

In [9]:
# Remove the location/identifier columns from the New York subset.
# drop() returns a new frame, and errors='ignore' skips columns that are
# already gone, so re-running this cell is a no-op instead of a KeyError.
ny = ny.drop(
    ['State', 'Unnamed: 0', 'State Code', 'City', 'Address'],
    axis=1,
    errors='ignore'
)

In [10]:
# First five rows after the column removals in the previous cell.
ny.head()

In [11]:
# (rows, columns) of the trimmed frame.
ny.shape

In [12]:
# Per-column dtype and non-null counts.
ny.info()

In [13]:
# Remove the unit and first-max-hour columns of every pollutant.
# drop() returns a new frame, and errors='ignore' skips columns that are
# already gone, so re-running this cell is a no-op instead of a KeyError.
ny = ny.drop(
    [
        'NO2 Units', 'NO2 1st Max Hour',
        'O3 Units', 'O3 1st Max Hour',
        'SO2 Units', 'SO2 1st Max Hour',
        'CO Units', 'CO 1st Max Hour',
    ],
    axis=1,
    errors='ignore'
)

In [14]:
# Preview after removing the unit and max-hour columns.
ny.head(2)

In [15]:
ny.shape

In [16]:
# Summary statistics of the frame's columns.
ny.describe()

In [17]:
# Correlation heatmap of the remaining columns, using the helper defined earlier.
# NOTE(review): ny may still hold non-numeric columns at this point - confirm
# DataFrame.corr() handles them as intended in the pandas version in use.
plot_correlation_map(ny)

# Creating a new data frame that keeps only the pollutant measurement columns, then using interpolate() and dropna() to handle missing (NaN) values.

In [18]:
# Keep only the Mean / 1st Max Value / AQI columns of the four pollutants.
newdata = pd.DataFrame(
    ny,
    columns=[
        'NO2 Mean', 'NO2 1st Max Value', 'NO2 AQI',
        'O3 Mean', 'O3 1st Max Value', 'O3 AQI',
        'SO2 Mean', 'SO2 1st Max Value', 'SO2 AQI',
        'CO Mean', 'CO 1st Max Value', 'CO AQI',
    ]
)

In [19]:
# True for every column that contains at least one missing value.
newdata.isnull().any()

In [20]:
newdata.shape

In [21]:
# Fill missing values by interpolation, using the pandas default settings.
X = newdata.interpolate()
X.shape

In [22]:
# Check which columns still contain missing values after interpolation.
X.isnull().any()

In [23]:
# Drop any rows that still have a missing value.
# NOTE(review): X is rebound here, so this cell depends on cell 21 having run first.
X = X.dropna()
X.shape

# Creating new dataset free of NaN values and exploring.

In [24]:
# Wrap the cleaned data in a DataFrame; this is the input to the PCA cells below.
x_before_pca = pd.DataFrame(X)
x_before_pca.describe()

In [25]:
x_before_pca.shape

In [26]:
# Correlation heatmap of the cleaned pollutant columns.
plot_correlation_map(x_before_pca)

# Using PCA to reduce variables.

### First we apply PCA for all the 12 variables (n_components = 12), i.e., we create 12 PCs. Then we see the amount of variance that each PC explains and plot that. Based on that, we select the number of PCs that we need.

In [27]:
from sklearn.decomposition import PCA

# RandomizedPCA was removed from scikit-learn in 0.20 (its replacement is
# PCA(svd_solver='randomized')). Import it only where it still exists so this
# cell keeps running on current releases; the name is bound to None otherwise.
try:
    from sklearn.decomposition import RandomizedPCA
except ImportError:
    RandomizedPCA = None

In [28]:
# PCA model that keeps 12 components.
pca = PCA(n_components=12)

In [29]:
# NOTE(review): the columns are passed to PCA without standardisation - confirm this is intended.
pca.fit(x_before_pca)

In [30]:
# The fraction of variance that each PC explains
var = pca.explained_variance_ratio_

In [31]:
# Cumulative variance explained, in percent: the ratios are rounded to
# 4 decimals and scaled by 100 before the running sum is taken.
var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

In [32]:
var1

In [33]:
# Cumulative explained variance (%) against the zero-based component index.
plt.plot(var1)

### So we will keep 3 PCs (although 2 would be the ideal number).

In [34]:
# A second PCA model that keeps only 3 components.
x_pca = PCA(n_components=3)

In [35]:
# Fit the 3-component model on the same cleaned data.
x_pca.fit(x_before_pca)

In [36]:
# x_pca is already fitted by the preceding cell, so project the data with
# transform() instead of fit_transform(), which would refit the same model.
x = x_pca.transform(x_before_pca)

In [37]:
# Type of a single column of the transformed array.
type(x[:,0])

In [38]:
x.shape

In [39]:
# Map each principal-component name to its column of x, then build the frame.
d = {name: x[:, position] for position, name in enumerate(('pc1', 'pc2', 'pc3'))}
x_df = pd.DataFrame(d)

### x_df is the principal component data frame. Each column represents a principal component, and each row holds the principal-component scores of one observation (row) of x_before_pca.

In [40]:
x_df.shape

In [41]:
# Preview the first three rows of component values.
x_df.head(3)

In [42]:
# Summary statistics of the three component columns.
x_df.describe()

In [43]:
# Map the 3-component representation back through the fitted PCA and label
# the reconstructed columns x1 .. x12.
x_new_ndarray = x_pca.inverse_transform(x_df)
x_new = pd.DataFrame(
    x_new_ndarray,
    columns=['x' + str(number) for number in range(1, 13)]
)


### x_new is the reconstructed data from the PC data frame x_df.

In [44]:
x_new.shape

In [45]:
# First three reconstructed rows ...
x_new.head(3)

In [46]:
# ... next to the first three rows of the data that went into PCA, for comparison.
x_before_pca.head(3)

In [47]:
# 2-D scatter of the first principal component against the second.
plt.scatter(x_df['pc1'], x_df['pc2'], color = 'blue')

In [48]:
from mpl_toolkits.mplot3d import Axes3D

In [49]:
# 3-D scatter of the three principal components on a single 3-D axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(x_df['pc1'], x_df['pc2'], x_df['pc3'], zdir='z')