In [61]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice
import matplotlib as plt
from pandas.plotting import parallel_coordinates

%matplotlib inline

### Using Minute-Granularitydata

In [62]:
D = pd.read_csv('minute_weather.csv')
D.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


### Data Sampling

In [63]:
# goal here is to find patterns and trends in our dataset under specific considerations.

# take  every 10th row 

sample_df = D[(D['rowID']%10)==0]
sample_df.shape

(158726, 13)

In [64]:
# Drop all rows with missing values

del sample_df['rain_accumulation']
del sample_df['rain_duration']

B = sample_df.shape[0]
sample_df = sample_df.dropna()
A = sample_df.shape[0]


print("No of rows deleted:", B - A)

No of rows deleted: 46


In [65]:
# after applying `drop.na()` we get 46 deleted rows.

### Features of interest in the data

In [66]:
features = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction', 'max_wind_speed', 'relative_humidity']
select_df = sample_df[features]

### Scale: Data Normalization (mean = 0, SD = 1)

In [67]:
X = StandardScaler().fit_transform(select_df)

### Apply K Means Clustering

In [68]:
KM  = KMeans(n_clusters = 10)
model = KM.fit(X)
centers = model.cluster_centers_ #collect all central points of 10 clusters

  super()._check_params_vs_input(X, default_n_init=10)


### Data Visualization

In [69]:
# 1. Create a function that generates a dataframe with a clusters number column

def pd_centers(featuresUsed, centers):
    colNames = list(featuresUsed)
    colNames.append('prediction')
    Z = [np.append(A,index) for index, A in enumerate(centers)]
    P = pd.DataFrame(Z, columns = colNames)
    P['prediction'] = P['prediction'].astype(int)
    return P

# 2. Create a function that creates parallel plots

def parallel_plots(data):
    my_colors = list(islice(cycle['b','r','g','y','k'], None, len(data)))
    plt.figure(figsize=(15,8)).gca().axes.set_ylim([-3,+3])
    parallel_coordinates(data,'prediction',color = my_colors, marker = 'o')


P = pd_centers(features, centers)

### Visualize Warm Days

In [70]:
parallel_plots(P[P['air_temp'] > 0.5])

TypeError: type 'itertools.cycle' is not subscriptable

### Visualize Dry Days

In [None]:
parallel_plots(P[P['relative_humidity'] > -0.5])