DATASET: https://www.kaggle.com/julianjose/minute-weather?select=minute_weather.csv

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
data = pd.read_csv('/content/minute_weather.csv')

In [4]:
data.shape

(65989, 13)

In [5]:
data.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [6]:
#data sampling
sampled_df = data[(data['rowID'] % 10) == 0]
sampled_df.shape

(6599, 13)

In [7]:
sampled_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,6599.0,32990.0,19051.115453,0.0,16495.0,32990.0,49485.0,65980.0
air_pressure,6599.0,916.551114,1.683936,910.8,915.3,916.6,917.8,920.7
air_temp,6599.0,68.298409,9.638098,45.68,62.96,70.34,75.56,90.68
avg_wind_direction,6596.0,149.82732,98.709646,0.0,50.0,172.0,212.0,359.0
avg_wind_speed,6596.0,2.208187,1.48922,0.0,1.1,1.9,3.0,23.4
max_wind_direction,6596.0,151.588235,95.102894,0.0,56.0,178.0,218.0,359.0
max_wind_speed,6596.0,2.725606,1.730347,0.1,1.5,2.4,3.6,24.5
min_wind_direction,6596.0,160.028502,103.468194,0.0,57.0,173.0,212.0,359.0
min_wind_speed,6596.0,1.676107,1.28563,0.0,0.8,1.4,2.3,21.6
rain_accumulation,6598.0,3e-05,0.000937,0.0,0.0,0.0,0.0,0.04


In [8]:
sampled_df[sampled_df['rain_accumulation'] == 0].shape

(6589, 13)

In [9]:
del sampled_df['rain_accumulation']
del sampled_df['rain_duration']

In [10]:
rows_before = sampled_df.shape[0]
sampled_df = sampled_df.dropna()
rows_after = sampled_df.shape[0]

In [13]:
rows_before - rows_after

3

In [14]:
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [15]:
features = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction', 'max_wind_speed','relative_humidity']

In [16]:
select_df = sampled_df[features]

In [17]:
select_df.columns

Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_direction', 'max_wind_speed', 'relative_humidity'],
      dtype='object')

In [18]:
select_df

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity
0,912.3,64.76,97.0,1.2,106.0,1.6,60.5
10,912.3,62.24,144.0,1.2,167.0,1.8,38.5
20,912.2,63.32,100.0,2.0,122.0,2.5,58.3
30,912.2,62.60,91.0,2.0,103.0,2.4,57.9
40,912.2,64.04,81.0,2.6,88.0,2.9,57.4
...,...,...,...,...,...,...,...
65940,917.2,49.64,226.0,2.3,240.0,3.1,91.4
65950,917.3,49.82,208.0,1.6,223.0,2.4,91.5
65960,917.2,49.64,220.0,2.9,231.0,3.8,91.5
65970,917.1,49.82,217.0,2.4,233.0,2.7,91.5


In [20]:
X = StandardScaler().fit_transform(select_df)

X

array([[-2.52515769, -0.36820367, -0.53521947, ..., -0.47939332,
        -0.65055822,  0.63107531],
       [-2.52515769, -0.62983139, -0.05903943, ...,  0.16206587,
        -0.53496573, -0.2991539 ],
       [-2.58454731, -0.51770522, -0.504825  , ..., -0.31114173,
        -0.13039204,  0.53805239],
       ...,
       [ 0.38493411, -1.93796997,  0.71095382, ...,  0.83507223,
         0.62095911,  1.94185282],
       [ 0.32554448, -1.91928228,  0.68055935, ...,  0.85610367,
        -0.01479955,  1.94185282],
       [ 0.26615485, -1.91928228,  0.66029637, ...,  0.81404078,
        -0.30378076,  1.94608113]])

In [21]:
#Using kmeans clustering
kmeans = KMeans(n_clusters=12)
model = kmeans.fit(X)
print("model\n", model)

model
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)


In [22]:
centers = model.cluster_centers_
centers

array([[-0.05791413,  0.63915391, -1.03866671,  1.20214432, -0.96598456,
         1.20173192, -0.77021317],
       [-0.65386525, -1.55020032,  0.01193414, -0.64475532,  0.09599329,
        -0.67036252,  1.70568814],
       [-0.0376301 ,  0.20266521,  0.53713061,  1.14013084,  0.62105317,
         1.08886017,  0.09666561],
       [-0.58068283,  0.32553868,  0.28089698, -0.74848886,  0.49920689,
        -0.76018158, -0.33605807],
       [ 0.91840533,  0.79850887,  0.33882659, -0.1970925 ,  0.52891143,
        -0.17890345, -0.58100818],
       [ 0.64828812, -1.36808968,  0.7473095 , -0.19525963,  0.9204461 ,
        -0.16512816,  1.41022768],
       [ 0.12010036,  0.34275927,  1.9371268 , -0.49074998, -1.35593736,
        -0.36179107, -0.49694505],
       [-2.08697811, -1.66403572,  0.54761443,  3.92466831,  0.68849329,
         4.01049726,  1.71981474],
       [-1.27376942, -1.62848482,  0.68529503,  1.24151814,  0.81585794,
         1.27898488,  1.67782871],
       [-0.02124602,  0.4642