In [None]:
import pandas as pd
%matplotlib inline

In [None]:
# Read the weather CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='downsampled_weatherdata.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded data as (rows, columns).
df.shape

In [None]:
# Summary statistics of the loaded columns.
df.describe()

In [None]:
# Shape of the subset of rows whose rain_accumulation is exactly zero.
df.loc[df['rain_accumulation'] == 0.0].shape

# Notice that rain_accumulation is zero for a large number of rows

In [None]:
# Shape of the subset of rows whose rain_duration is exactly zero.
df.loc[df['rain_duration'] == 0.0].shape

# Notice that rain_duration is zero for a large number of rows

In [None]:
# Drop the columns we don't need for clustering.
#
# `drop` returns a new frame instead of mutating `df` in place, and
# errors='ignore' skips labels that are already gone, so this cell can be
# re-run without raising KeyError (the previous `del df[...]` form failed on
# the second execution).
# NOTE(review): errors='ignore' also hides a misspelled or missing column on
# the first run -- confirm the three names against df.columns if in doubt.
df = df.drop(
    ['rain_accumulation', 'rain_duration', 'hpwren_timestamp'],
    axis=1,
    errors='ignore',
)

In [None]:
# Work on an independent (deep) copy so that `df` itself is left untouched;
# DataFrame.copy() is deep by default.
workingDF = df.copy()

In [None]:
# Remove every row that contains at least one missing value and display how
# many rows were discarded.
before = len(workingDF)
workingDF = workingDF.dropna(axis=0, how='any')
after = len(workingDF)
before - after

In [None]:
# List the column labels currently present in workingDF.
workingDF.columns

# Which features to use for clustering?

In [None]:
# Columns fed to the clustering step; everything else is dropped from
# workingDF.
featuresUsed = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

workingDF = workingDF.loc[:, featuresUsed]

In [None]:
# Preview the first three rows of workingDF.
workingDF.head(3)

# KMeans Clustering

In [None]:
from sklearn.cluster import KMeans


In [None]:
# Fit k-means with 12 clusters on the selected features; random_state=0 keeps
# the result reproducible across runs.
#
# `n_jobs` is deliberately not passed: it was deprecated in scikit-learn 0.23
# and removed in 1.0, where passing it raises TypeError. Dropping it does not
# change the result (the old default was a single job).
# `n_init=10` pins the historical default number of initialisations; newer
# scikit-learn releases changed that default to 'auto', which would otherwise
# alter the fitted centers.
kmeans = KMeans(n_clusters=12, random_state=0, n_init=10).fit(workingDF)

In [None]:
# Cluster centers found by the fitted model; consumed below as one row of
# feature values per cluster.
centers = kmeans.cluster_centers_

In [None]:
# Display the raw cluster centers.
centers

In [None]:
import numpy as np

# Helper that pairs each cluster center's feature values with its cluster index (0, 1, 2, ...)

def pd_centers(featuresUsed, centers):
    """Return the cluster centers as a DataFrame labelled with their index.

    Parameters
    ----------
    featuresUsed : iterable of str
        Column names, one per value in each center. The argument itself is
        not modified.
    centers : iterable of 1-D array-like
        One entry per cluster holding that cluster's feature values.

    Returns
    -------
    pandas.DataFrame
        One row per center with the columns from ``featuresUsed`` followed by
        an integer ``'prediction'`` column containing the row's position in
        ``centers`` (0, 1, 2, ...).
    """
    column_names = [*featuresUsed, 'prediction']

    # Append each center's position to its feature values so the label travels
    # with the row.
    labelled_rows = []
    for cluster_id, center in enumerate(centers):
        labelled_rows.append(np.append(center, cluster_id))

    frame = pd.DataFrame(labelled_rows, columns=column_names)
    # np.append promoted the label to the centers' float dtype; restore ints.
    frame['prediction'] = frame['prediction'].astype(int)
    return frame

# Build the labelled table of cluster centers and display it.
P = pd_centers(featuresUsed, centers)
P

In [None]:
from itertools import cycle, islice
import matplotlib.pyplot as plt

# `pandas.tools.plotting` no longer exists in current pandas; the function
# lives in `pandas.plotting`. Fall back to the legacy path only for very old
# pandas installations that predate `pandas.plotting`.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

# Function to help plot the centers on a parallel-coordinates plot

def parallel_plot(data, P):
    """Draw cluster centers on a parallel-coordinates plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to draw. Must contain a ``'prediction'`` column, which is used
        as the class column of the plot.
    P : pandas.DataFrame
        Full table of cluster centers. Only its length is used, to decide how
        many colours to generate.

    Returns
    -------
    None
        The plot is drawn on a newly created 15x8 matplotlib figure.
    """
    # Repeat the five base colours until there is one entry per row of P.
    my_colors = list(islice(cycle(['b', 'r', 'g', 'y', 'k']), None, len(P)))
    plt.figure(figsize=(15,8)) #.gca().axes.set_ylim([-3,+3])
    parallel_coordinates(data, 'prediction', color = my_colors, marker='o')

## Dry Days

In [None]:
# Plot only the cluster centers whose relative_humidity is below 30.0.
parallel_plot(P.loc[P['relative_humidity'] < 30.0], P)

## Warm Days

In [None]:
# Plot only the cluster centers whose air_temp is above 60.0.
parallel_plot(P.loc[P['air_temp'] > 60.0], P)

## Cool Days

In [None]:
# Plot only the cluster centers with relative_humidity above 30 and air_temp
# below 60.
parallel_plot(P.loc[(P['air_temp'] < 60) & (P['relative_humidity'] > 30)], P)