In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice

In [2]:
# Read the minute-by-minute weather readings from a CSV located in the
# current working directory into a DataFrame.
data = pd.read_csv('minute_weather.csv')

In [3]:
# Size of the raw frame as a (rows, columns) tuple.
data.shape

(1587257, 13)

In [4]:
# Preview the first five rows to check the column layout.
data.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [5]:
# Data sampling: keep only the readings whose rowID is divisible by 10,
# i.e. every tenth row of the raw frame.
# .copy() makes the sample an independent frame so that later cells can
# modify it without depending on `data`.
sampled_df = data[(data['rowID'] % 10) == 0].copy()

# Show a bounded preview rather than the whole frame. Only the last
# expression of a cell is rendered, so the cell ends with a single display.
sampled_df.head(10)

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
10,10,2011-09-10 00:10:49,912.3,62.24,144.0,1.2,167.0,1.8,115.0,0.6,0.0,0.0,38.5
20,20,2011-09-10 00:20:49,912.2,63.32,100.0,2.0,122.0,2.5,91.0,1.5,0.0,0.0,58.3
30,30,2011-09-10 00:30:49,912.2,62.60,91.0,2.0,103.0,2.4,71.0,1.4,0.0,0.0,57.9
40,40,2011-09-10 00:40:49,912.2,64.04,81.0,2.6,88.0,2.9,68.0,1.4,0.0,0.0,57.4
50,50,2011-09-10 00:50:49,912.1,63.68,102.0,1.2,119.0,1.5,92.0,1.0,0.0,0.0,51.4
60,60,2011-09-10 01:00:49,912.0,64.04,83.0,0.7,101.0,0.9,73.0,0.5,0.0,0.0,51.4
70,70,2011-09-10 01:10:49,911.9,64.22,82.0,2.0,97.0,2.4,69.0,1.7,0.0,0.0,62.2
80,80,2011-09-10 01:20:49,911.9,61.70,67.0,3.3,70.0,3.5,63.0,3.1,0.0,0.0,71.5
90,90,2011-09-10 01:30:49,911.9,61.34,67.0,3.6,75.0,4.2,62.0,3.1,0.0,0.0,72.5


In [6]:
# Summary statistics of the numeric columns, flipped so that each column
# of the sample becomes one row of the report.
sampled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,158726.0,793625.0,458203.937509,0.0,396812.5,793625.0,1190437.5,1587250.0
air_pressure,158726.0,916.830161,3.051717,905.0,914.8,916.7,918.7,929.5
air_temp,158726.0,61.851589,11.833569,31.64,52.7,62.24,70.88,99.5
avg_wind_direction,158680.0,162.1561,95.278201,0.0,62.0,182.0,217.0,359.0
avg_wind_speed,158680.0,2.775215,2.057624,0.0,1.3,2.2,3.8,31.9
max_wind_direction,158680.0,163.462144,92.452139,0.0,68.0,187.0,223.0,359.0
max_wind_speed,158680.0,3.400558,2.418802,0.1,1.6,2.7,4.6,36.0
min_wind_direction,158680.0,166.774017,97.441109,0.0,76.0,180.0,212.0,359.0
min_wind_speed,158680.0,2.134664,1.742113,0.0,0.8,1.6,3.0,31.6
rain_accumulation,158725.0,0.000318,0.011236,0.0,0.0,0.0,0.0,3.12


In [7]:
# Shape of the subset of sampled rows whose rain_accumulation is exactly 0.
sampled_df.loc[sampled_df['rain_accumulation'].eq(0)].shape

(157812, 13)

In [8]:
# Remove the two rain columns from the sample.
# drop(..., errors='ignore') returns a new frame and is safe to re-run:
# a second execution is a no-op instead of raising KeyError the way
# `del sampled_df[...]` does once the column is already gone.
sampled_df = sampled_df.drop(
    columns=['rain_accumulation', 'rain_duration'],
    errors='ignore',
)

In [9]:
# Discard every row that contains a missing value, recording the row count
# on either side of the operation so the loss can be reported.
rows_before = len(sampled_df)
sampled_df = sampled_df.dropna()
rows_after = len(sampled_df)

In [10]:
# Number of rows removed by dropna().
rows_before - rows_after

46

In [11]:
# Columns that remain after the rain columns were removed.
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [12]:
# Names of the columns selected from the sample as clustering inputs.
features = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

In [13]:
# Restrict the cleaned sample to the columns listed in `features`.
select_df = sampled_df[features]

In [14]:
# Confirm that only the selected feature columns are present.
select_df.columns

Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_direction', 'max_wind_speed', 'relative_humidity'],
      dtype='object')

In [15]:
# Bounded preview of the feature frame; displaying the bare frame would
# render the entire table.
select_df.head(10)

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity
0,912.3,64.76,97.0,1.2,106.0,1.6,60.5
10,912.3,62.24,144.0,1.2,167.0,1.8,38.5
20,912.2,63.32,100.0,2.0,122.0,2.5,58.3
30,912.2,62.60,91.0,2.0,103.0,2.4,57.9
40,912.2,64.04,81.0,2.6,88.0,2.9,57.4
50,912.1,63.68,102.0,1.2,119.0,1.5,51.4
60,912.0,64.04,83.0,0.7,101.0,0.9,51.4
70,911.9,64.22,82.0,2.0,97.0,2.4,62.2
80,911.9,61.70,67.0,3.3,70.0,3.5,71.5
90,911.9,61.34,67.0,3.6,75.0,4.2,72.5


In [16]:
# Standardise every feature column before clustering.
# The fitted scaler is kept (instead of being created and discarded inline)
# so that values in scaled space can be mapped back to the original units
# with scaler.inverse_transform(...).
scaler = StandardScaler()
X = scaler.fit_transform(select_df)
X

array([[-1.48456281,  0.24544455, -0.68385323, ..., -0.62153592,
        -0.74440309,  0.49233835],
       [-1.48456281,  0.03247142, -0.19055941, ...,  0.03826701,
        -0.66171726, -0.34710804],
       [-1.51733167,  0.12374562, -0.65236639, ..., -0.44847286,
        -0.37231683,  0.40839371],
       ...,
       [-0.30488381,  1.15818654,  1.90856325, ...,  2.0393087 ,
        -0.70306017,  0.01538018],
       [-0.30488381,  1.12776181,  2.06599745, ..., -1.67073075,
        -0.74440309, -0.04948614],
       [-0.30488381,  1.09733708, -1.63895404, ..., -1.55174989,
        -0.62037434, -0.05711747]])

In [17]:
#Using kmeans clustering
kmeans = KMeans(n_clusters=12)
model = kmeans.fit(X)
print("model\n", model)

model
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)


In [18]:
# Centroids found by the fitted model: one row per cluster, one column per
# feature, expressed in the standardised space of X.
centers = model.cluster_centers_
centers

array([[ 0.23378992,  0.3194701 ,  1.88794624, -0.65194259, -1.55164204,
        -0.57678283, -0.28275652],
       [-0.69528839,  0.54419858,  0.17720147, -0.58426127,  0.34650733,
        -0.59765038, -0.11466484],
       [ 0.25374433, -0.99439418,  0.66017132, -0.54749651,  0.85142629,
        -0.53008398,  1.15760049],
       [-1.179016  , -0.87692454,  0.44662363,  1.97532102,  0.53859346,
         1.93651789,  0.91516278],
       [ 1.36942778, -0.08328948, -1.2067548 , -0.04581039, -1.07573813,
        -0.02531431, -0.97759623],
       [-0.16111949,  0.86266388, -1.31114296, -0.5897787 , -1.16685247,
        -0.60511719, -0.64273013],
       [ 1.1896684 , -0.254953  , -1.15504874,  2.12639616, -1.05346193,
         2.24364996, -1.13439992],
       [-0.21119294,  0.63149365,  0.40852628,  0.73502573,  0.51666795,
         0.673021  , -0.1500372 ],
       [ 0.13085923,  0.84422124,  1.41042317, -0.63850905,  1.67442749,
        -0.58924018, -0.71416728],
       [ 0.06038633, -0.7877