# Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice
import matplotlib.pyplot as plt
from pandas.tools.plotting import parallel_coordinates

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

## 1. K-means clustering

In [None]:
# Read the CSV at `data_file` into a DataFrame and report its dimensions
# and column labels.
data_file = '../data/weather_clustering.csv'
dataset = pd.read_csv(data_file)

print('Data shape: ', dataset.shape)
print('Columns: ', dataset.columns)

# Kept as the final expression so the notebook renders the first three rows.
dataset.head(3)

In [None]:
# Summary statistics of the dataset, transposed so that each described
# column becomes one row of the displayed table.
dataset.describe().transpose()

### Basic Exploratory Data Analysis

In [None]:
# For each rain column, show the shape of the rows that are exactly zero and
# of the rows that are missing; the first element of each shape tuple is the
# number of matching rows.
print('Zero rain accumulation: ', dataset.loc[dataset['rain_accumulation'].eq(0)].shape)
print('NaN rain accumulation: \t', dataset.loc[dataset['rain_accumulation'].isna()].shape)

print('\nZero rain duration: \t', dataset.loc[dataset['rain_duration'].eq(0)].shape)
print('NaN rain duration: \t', dataset.loc[dataset['rain_duration'].isna()].shape)

# Missing-value tally for every column of the dataset.
print('\nCount number of rows for each column having NaN value')
print(dataset.isna().sum())

### Basic Data Cleaning

In [None]:
# Keep only the rows in which both rain measurements are present; NaN values
# in any other column are left untouched by this step.
subset_cols = ['rain_accumulation', 'rain_duration']
dataset_2 = dataset[dataset[subset_cols].notna().all(axis=1)]

# Report how many NaN values remain in each column after the filter.
print('\nCount number of rows for each column having NaN value')
print(dataset_2.isna().sum())

In [None]:
# Drop every row that contains a NaN in any column.  `subset_cols` holds all
# column labels of the dataset, so the filter keeps complete rows only.
subset_cols = dataset.columns
dataset_2 = dataset.dropna(subset=subset_cols)

# Every per-column NaN count printed here should now be zero.
print('\nCount number of rows for each column having NaN value')
print(dataset_2.isna().sum())
print('\nShape of new dataset: ', dataset_2.shape)

### Select subset of features to train the model

In [None]:
# Column names passed on to scaling and clustering.
features = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

# Restrict dataset_2 to the chosen feature columns.
select_df = dataset_2[features]
print('Shape of select_df: ', select_df.shape)

### Scale the features
-- `Previously we used MinMaxScaler, let's use StandardScaler now` <br>
<br>
__MinMaxScaler:__ `For each value in a feature, MinMaxScaler subtracts the minimum value in the feature and then divides by the range. The range is the difference between the original maximum and original minimum.`
<br>
__StandardScaler:__ `standardizes a feature by subtracting the mean and then scaling to unit variance. Unit variance means dividing all the values by the standard deviation.`
<br>
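Properties: `Both scalers apply a linear transform to each feature independently, so neither changes the relative spacing of the values within a feature. They differ in the output: MinMaxScaler maps each feature onto a fixed range (by default [0, 1]), while StandardScaler centers each feature at zero mean with unit variance and leaves its range unbounded.`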
<br>
<br> Choose your scaler wisely

In [None]:
# Standardize the selected features with StandardScaler.
X = StandardScaler().fit_transform(select_df)
print('X shape: ', X.shape, '\n')

# Print the first three scaled records as plain Python lists, each prefixed
# with its position.
sample_records = X[0:3].tolist()
for i, record in enumerate(sample_records):
    print('{} >> {}\n'.format(i, record))

### Build Model

In [None]:
# Fit k-means with 12 clusters on the scaled feature matrix X.
# The fixed random_state makes centroid initialisation repeatable, so the
# centroids (and their cluster numbers) inspected in later cells stay the
# same from one run of the notebook to the next.
kmeans = KMeans(n_clusters=12, random_state=42)
model = kmeans.fit(X)
print("model\n", model)

### Understanding clusters

In [None]:
# Centroids learned by the fitted model, expressed in the same scaled
# feature space as X.
centers = model.cluster_centers_

print('Centroid shape: ', centers.shape)
print('\nCentroid space: \n', centers)

### Plots

In [None]:
# Build a DataFrame of centroids, each tagged with its cluster number
def pd_centers(featuresUsed, centers):
    """Return the centroids as a DataFrame with a ``cluster_number`` column.

    featuresUsed: iterable of column names, one per centroid coordinate.
    centers: sequence of centroids; each one is numbered by its position.
    """
    columns = list(featuresUsed) + ['cluster_number']

    # Tack every centroid's position onto the end of its coordinates.
    rows = [np.append(centroid, number) for number, centroid in enumerate(centers)]

    frame = pd.DataFrame(rows, columns=columns)
    # The appended numbers share the rows' dtype, so cast them back to int.
    return frame.astype({'cluster_number': int})

# Draw a parallel-coordinates plot of the given centroids
def parallel_plot(data):
    """Plot the rows of ``data`` as parallel coordinates keyed on ``cluster_number``.

    data: DataFrame holding a ``cluster_number`` column plus the columns to plot.
    """
    palette = ['b', 'r', 'g', 'y', 'k']
    # Repeat the palette as often as needed to get one colour per row.
    my_colors = list(islice(cycle(palette), len(data)))

    # Fixed y-limits keep separate plots on the same vertical scale.
    axis = plt.figure(figsize=(15, 8)).gca()
    axis.axes.set_ylim([-3, +3])

    parallel_coordinates(data, 'cluster_number', color=my_colors, marker='o')

In [None]:
# Tabulate the centroids: one column per entry in `features` plus the
# cluster_number column added by pd_centers.
P = pd_centers(features, centers)
P

In [None]:
# Pick out groups of centroids by thresholding their scaled feature values.

# DRY DAYS: relative humidity below -0.5
dry_days_centroid = P.loc[P['relative_humidity'] < -0.5]
print('\nDRY DAYS Centroid')
print(dry_days_centroid)

# WARM DAYS: air temperature above 0.5
warm_days_centroid = P.loc[P['air_temp'] > 0.5]
print('\n\nWARM DAYS Centroid')
print(warm_days_centroid)

# COOL DAYS: relative humidity above 0.5 together with air temperature below 0.5
is_humid = P['relative_humidity'] > 0.5
cool_days_centroid = P.loc[is_humid & (P['air_temp'] < 0.5)]
print('\n\nCOOL DAYS Centroid')
print(cool_days_centroid)

In [None]:
# Parallel-coordinates plot of the centroids selected as dry days.
parallel_plot(dry_days_centroid)

In [None]:
# Parallel-coordinates plot of the centroids selected as warm days.
parallel_plot(warm_days_centroid)

In [None]:
# Parallel-coordinates plot of the centroids selected as cool days.
parallel_plot(cool_days_centroid)