In [1]:
import pandas as pd
import pyarrow.parquet as pq
from numpy import nan

# CLean
## Creat a static dictionary for useful columns

In [2]:
# static cols
dic = {
    'windmills':['GSRN','Turbine_type','Parent_GSRN','BBR_municipal','Placement','UTM_x','UTM_y','Capacity_kw','Rotor_diameter','Navhub_height']
}

## Clean and extract windmill data

In [3]:
windmills = pq.read_table('data/ITU_DATA/masterdatawind.parquet').to_pandas()[dic['windmills']]
windmills.shape

(85150, 10)

Windmills have duplicated data by different reviewers.    
Remove duplicated data

In [4]:
windmills.drop_duplicates(subset ="GSRN", keep = 'first', ignore_index = True, inplace = True)
windmills.shape

(9853, 10)

9853 Contains:    
- H: Household turbine : No UTM_xy locations 
- W: Single turbine : full data
- P: Turbine park : No UTM_xy locations
- M: Turbine in a park : No power record in settlement data


In [5]:
print('Household: ',windmills[windmills['Turbine_type'] == 'H'].shape)
print('Single: ',windmills[windmills['Turbine_type'] == 'W'].shape)
print('Park: ',windmills[windmills['Turbine_type'] == 'P'].shape)
print('In a Park: ',windmills[windmills['Turbine_type'] == 'M'].shape)

Household:  (2328, 10)
Single:  (4195, 10)
Park:  (444, 10)
In a Park:  (2886, 10)


Fill Turbine park xy by calculate their CENTRIODs.

In [6]:
aggs = {'UTM_x':lambda x : x.mean(),
        'UTM_y':lambda x : x.mean()}
w_temp = windmills[windmills['Turbine_type'] == 'M'].groupby(['Parent_GSRN'], as_index = False).agg(aggs).rename(columns={'Parent_GSRN':'GSRN','UTM_x':'x_c','UTM_y':'y_c'})
w_temp

Unnamed: 0,GSRN,x_c,y_c
0,570714700000005640,642373.900000,6.139972e+06
1,570714700000012037,664423.733000,6.072868e+06
2,570714700000012051,664630.731000,6.072840e+06
3,570714700000012105,728122.163500,6.165601e+06
4,570714700000050008,684100.400000,6.097951e+06
...,...,...,...
573,570715000000091803,672332.693500,6.060594e+06
574,570715000000258107,636075.348135,6.275118e+06
575,570715000001613493,415507.469388,6.172906e+06
576,570715000001761682,464444.000000,6.283758e+06


578 != 444    
Fill some missing turbine parks

In [7]:
f = windmills[windmills['Turbine_type'] == 'P']

In [8]:
f[f['GSRN'].isin(w_temp['GSRN'])]

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height
1342,570714700000050008,P,,390,LAND,,,3750.0,,
1343,570714700000050015,P,,360,HAV,,,4950.0,,
1344,570714700000050022,P,,250,LAND,,,3780.0,,
1345,570714700000050039,P,,316,LAND,,,600.0,,
1346,570714700000050046,P,,316,LAND,,,400.0,,
...,...,...,...,...,...,...,...,...,...,...
6911,570715000000091803,P,,376,LAND,,,0.0,,
7423,570715000000258107,P,,707,HAV,,,399600.0,,
8616,570715000001613493,P,,561,HAV,,,406700.0,,
8684,570715000001761682,P,,787,LAND,,,77.0,,


In [9]:
f[~f['GSRN'].isin(w_temp['GSRN'])]

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height
5248,570715000000045837,P,,550,LAND,,,6500.0,,
6909,570715000000091780,P,,326,LAND,,,0.0,,


In [10]:
w_temp[~w_temp['GSRN'].isin(f['GSRN'])]

Unnamed: 0,GSRN,x_c,y_c
0,570714700000005640,642373.900000,6139972.000
1,570714700000012037,664423.733000,6072868.224
2,570714700000012051,664630.731000,6072840.224
3,570714700000012105,728122.163500,6165600.771
108,570715000000000058,531655.720000,6322597.070
...,...,...,...
528,570715000000055638,504163.000000,6277061.000
530,570715000000056017,468672.240000,6145816.035
537,570715000000056857,513111.100000,6320199.400
538,570715000000056918,514683.070000,6320302.490


Fill 136 in to windmills

In [11]:
windmills.shape

(9853, 10)

In [12]:
windmills = pd.merge(windmills, w_temp, on='GSRN', how='outer')
# location
windmills[['UTM_x']] = windmills.apply(lambda x: x[['UTM_x']].fillna(value = x['x_c']), axis=1)[['UTM_x']]
windmills[['UTM_y']] = windmills.apply(lambda x: x[['UTM_y']].fillna(value = x['y_c']), axis=1)[['UTM_y']]
windmills.drop(columns=['x_c','y_c'], inplace=True)
# Turbine type
windmills['Turbine_type'].fillna('P', inplace = True)
windmills

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height
0,000000000000000000,H,,760,,,,11.0,13.0,18.2
1,570714700000000027,M,570714700000050459,101,LAND,720898.353000,6171174.877,225.0,27.0,30.0
2,570714700000000034,M,570714700000050459,101,LAND,720993.352000,6171226.877,225.0,27.0,30.0
3,570714700000000041,M,570714700000050466,101,LAND,727504.304000,6178385.864,600.0,44.0,50.0
4,570714700000000058,M,570714700000050466,101,LAND,727628.303000,6178385.865,600.0,44.0,50.0
...,...,...,...,...,...,...,...,...,...,...
9978,570715000000055638,P,,,,504163.000000,6277061.000,,,
9979,570715000000056017,P,,,,468672.240000,6145816.035,,,
9980,570715000000056857,P,,,,513111.100000,6320199.400,,,
9981,570715000000056918,P,,,,514683.070000,6320302.490,,,


Cause a bug in ArcGIS file import, we need to add a 'z' at the end of GSRN and Parent_GSRN

In [13]:
windmills['GSRN'] = [x+'z' for x in windmills['GSRN']]
windmills['Parent_GSRN'].fillna('',inplace = True)
windmills['Parent_GSRN'] = [x+'z' for x in windmills['Parent_GSRN']]
windmills

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height
0,000000000000000000z,H,z,760,,,,11.0,13.0,18.2
1,570714700000000027z,M,570714700000050459z,101,LAND,720898.353000,6171174.877,225.0,27.0,30.0
2,570714700000000034z,M,570714700000050459z,101,LAND,720993.352000,6171226.877,225.0,27.0,30.0
3,570714700000000041z,M,570714700000050466z,101,LAND,727504.304000,6178385.864,600.0,44.0,50.0
4,570714700000000058z,M,570714700000050466z,101,LAND,727628.303000,6178385.865,600.0,44.0,50.0
...,...,...,...,...,...,...,...,...,...,...
9978,570715000000055638z,P,z,,,504163.000000,6277061.000,,,
9979,570715000000056017z,P,z,,,468672.240000,6145816.035,,,
9980,570715000000056857z,P,z,,,513111.100000,6320199.400,,,
9981,570715000000056918z,P,z,,,514683.070000,6320302.490,,,


In [15]:
windmills.to_csv('data/windmills_gis.csv', index=False)

After ArcGIS    
1. Single grid

In [23]:
windmills_single_grid= pd.read_excel('data/windmills_to_single-grid.xlsx')[['GSRN','grid']]
windmills_single_grid['grid'].fillna(value=0, inplace = True)
windmills_single_grid['grid'] = windmills_single_grid['grid'].astype(int)
windmills_single_grid['grid'] = windmills_single_grid['grid'].astype(str)
windmills = pd.merge(windmills, windmills_single_grid, on='GSRN', how='left')
windmills

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height,grid
0,000000000000000000z,H,z,760,,,,11.0,13.0,18.2,0
1,570714700000000027z,M,570714700000050459z,101,LAND,720898.353000,6171174.877,225.0,27.0,30.0,653
2,570714700000000034z,M,570714700000050459z,101,LAND,720993.352000,6171226.877,225.0,27.0,30.0,653
3,570714700000000041z,M,570714700000050466z,101,LAND,727504.304000,6178385.864,600.0,44.0,50.0,653
4,570714700000000058z,M,570714700000050466z,101,LAND,727628.303000,6178385.865,600.0,44.0,50.0,653
...,...,...,...,...,...,...,...,...,...,...,...
9978,570715000000055638z,P,z,,,504163.000000,6277061.000,,,,1243
9979,570715000000056017z,P,z,,,468672.240000,6145816.035,,,,1235
9980,570715000000056857z,P,z,,,513111.100000,6320199.400,,,,1246
9981,570715000000056918z,P,z,,,514683.070000,6320302.490,,,,1246


In [27]:
windmills_multi_grid = pd.read_excel('data/windmills_to_multi-grid.xlsx')[['GSRN','grid']]
windmills_multi_grid['grid'].fillna(value=0, inplace = True)
windmills_multi_grid['grid'] = windmills_multi_grid['grid'].astype(int)
windmills_multi_grid['grid'] = windmills_multi_grid['grid'].astype(str)

In [29]:
def ab(df):
    return','.join(df.values)
    
windmills_multi_grid = windmills_multi_grid.groupby('GSRN')['grid'].apply(ab).reset_index().rename(columns={'grid':'grid_in_range'})
windmills = pd.merge(windmills, windmills_multi_grid, on='GSRN', how='left')
windmills

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height,grid,grid_in_range
0,000000000000000000z,H,z,760,,,,11.0,13.0,18.2,0,
1,570714700000000027z,M,570714700000050459z,101,LAND,720898.353000,6171174.877,225.0,27.0,30.0,653,"651,693,610,736,695,737,611,653,694,654,652,69..."
2,570714700000000034z,M,570714700000050459z,101,LAND,720993.352000,6171226.877,225.0,27.0,30.0,653,"651,693,610,736,695,737,611,653,694,654,652,69..."
3,570714700000000041z,M,570714700000050466z,101,LAND,727504.304000,6178385.864,600.0,44.0,50.0,653,"651,693,610,695,737,611,653,694,654,652,696,60..."
4,570714700000000058z,M,570714700000050466z,101,LAND,727628.303000,6178385.865,600.0,44.0,50.0,653,"651,693,610,695,737,611,653,694,654,652,696,60..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9978,570715000000055638z,P,z,,,504163.000000,6277061.000,,,,1243,"1283,1241,1326,1285,1284,1199,1158,1242,1202,1..."
9979,570715000000056017z,P,z,,,468672.240000,6145816.035,,,,1235,"1233,1191,1275,1195,1318,1276,1235,1192,1150,1..."
9980,570715000000056857z,P,z,,,513111.100000,6320199.400,,,,1246,"1202,1244,1330,1331,1329,1245,1328,1286,1246,1..."
9981,570715000000056918z,P,z,,,514683.070000,6320302.490,,,,1246,"1202,1244,1330,1331,1329,1245,1328,1286,1246,1..."


In [30]:
windmills.to_csv('data/windmill_cleaned.csv')

## Weather Grid Observation

In [5]:
weather_grids = pd.read_csv('data/ITU_DATA/prognosis/grid_coordinates.csv') 
print(len(weather_grids))
weather_grids.head(10)

354


Unnamed: 0,grid,utm_x,utm_y
0,190,895390,6112543
1,191,899875,6128134
2,231,874801,6101548
3,232,879280,6117148
4,233,883767,6132746
5,273,858691,6106142
6,274,863172,6121751
7,275,867660,6137358
8,314,838109,6095118
9,315,842583,6110737


## Data in Observations
We don't have the join table of municipal names and ids.

In [6]:
observed_pressure = pq.read_table('data/ITU_DATA/observations/observed_pressure.parquet').to_pandas()

In [7]:
print(len(observed_pressure))
observed_pressure.tail(10)

22890252


Unnamed: 0_level_0,municipal,subtype,value,unit
timestamp_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-06-24 10:00:00,Horsens,Time atmosfæretryk,1027.078491,hPa
2019-06-24 07:00:00,Københavns,Time atmosfæretryk,1027.699951,hPa
2019-06-24 10:00:00,Mariagerfjord,Time atmosfæretryk,1027.682251,hPa
2019-06-24 00:00:00,Morsø,Time atmosfæretryk,1026.642578,hPa
2019-06-23 22:00:00,Næstved,Time atmosfæretryk,1026.166016,hPa
2019-06-24 16:00:00,Odsherred,Time atmosfæretryk,1026.213379,hPa
2019-06-24 21:00:00,Odsherred,Time atmosfæretryk,1025.884277,hPa
2019-06-24 06:00:00,Slagelse,Time atmosfæretryk,1027.182861,hPa
2019-06-24 14:00:00,Sorø,Time atmosfæretryk,1026.56665,hPa
2019-06-24 08:00:00,Tønder,Time atmosfæretryk,1025.446533,hPa


In [7]:
observed_wind_speed_high10 = pq.read_table('data/ITU_DATA/observations/observed_wind_speed_high10.parquet').to_pandas()
print(len(observed_wind_speed_high10))
observed_wind_speed_high10.tail(10)

## progbisis ENetNEA
resolution in 1 hour    
data from this model since 2018-02-22.

In [4]:
wind_speed_10m = pq.read_table('data/ITU_DATA/prognosis/ENetNEA/wind_speed_10m.parquet').to_pandas().reset_index()

In [5]:
wind_speed_10m.head()

Unnamed: 0,index,190,191,231,232,233,273,274,275,314,...,1412,1413,1445,1446,1447,1448,1449,1450,1451,predicted_ahead
0,2018-02-22 00:00:00,5.3,4.5,5.9,2.7,4.3,5.5,2.3,2.8,3.5,...,1.3,8.1,6.0,5.9,6.8,1.1,2.9,3.8,3.9,1
1,2018-02-22 01:00:00,5.3,3.3,6.0,2.4,4.2,5.6,2.3,2.5,4.5,...,6.8,8.7,5.8,6.3,4.5,4.2,3.7,3.4,2.1,2
2,2018-02-22 02:00:00,4.1,4.8,5.0,1.8,4.0,5.7,2.4,2.2,5.6,...,7.1,8.6,5.2,5.2,3.1,3.2,3.2,2.2,1.5,3
3,2018-02-22 03:00:00,4.5,4.2,4.2,1.7,3.7,5.2,1.9,2.4,5.6,...,8.0,8.6,5.1,3.9,3.3,4.1,3.3,1.5,1.4,4
4,2018-02-22 04:00:00,4.6,4.0,4.3,2.3,3.6,4.6,2.0,2.2,4.6,...,7.6,8.7,4.4,2.7,3.9,3.1,1.5,1.6,3.7,5


In [37]:
wind_speed_10m = wind_speed_10m.drop_duplicated(['index'], keep = last)

(291280, 356)


Unnamed: 0,index,190,191,231,232,233,273,274,275,314,...,1412,1413,1445,1446,1447,1448,1449,1450,1451,predicted_ahead
291220,2020-01-11 14:00:00,11.5,10.8,11.7,9.1,8.6,11.1,9.2,8.7,10.9,...,15.6,15.7,16.4,16.1,15.6,15.6,15.6,15.5,15.7,51
291221,2020-01-11 15:00:00,12.3,11.8,12.4,9.9,9.2,12.1,9.8,9.3,11.7,...,15.6,15.6,16.2,15.9,15.9,15.5,15.8,15.8,15.9,52
291222,2020-01-11 16:00:00,12.9,11.8,13.1,10.7,9.5,12.6,10.1,9.8,11.8,...,15.3,15.2,16.7,16.3,16.0,15.9,15.8,15.8,15.9,53
291223,2020-01-11 17:00:00,13.5,11.6,13.2,11.0,11.2,13.1,10.3,9.8,12.3,...,14.8,15.0,16.6,16.5,16.5,16.3,16.2,16.0,15.9,54
291224,2020-01-11 18:00:00,13.6,11.8,13.4,11.5,11.5,13.2,10.5,10.1,12.7,...,15.3,15.0,16.7,16.7,16.6,16.5,16.0,16.2,16.1,55
291225,2020-01-10 00:00:00,7.9,7.2,9.4,5.6,5.9,8.9,6.5,6.2,8.7,...,10.2,11.1,8.1,7.1,6.7,6.3,5.1,4.7,3.6,1
291226,2020-01-10 01:00:00,8.2,7.2,8.3,5.3,6.0,7.9,6.1,5.3,7.6,...,12.2,11.9,6.9,6.3,5.3,4.7,4.4,3.6,9.3,2
291227,2020-01-10 02:00:00,6.7,5.7,7.6,4.7,5.2,7.0,2.8,4.1,6.8,...,11.9,12.0,5.8,4.5,4.1,2.6,3.1,10.5,11.7,3
291228,2020-01-10 03:00:00,5.8,5.2,6.7,3.5,4.2,6.1,3.4,4.3,6.3,...,11.8,12.1,3.3,2.6,4.0,9.8,10.0,10.7,11.7,4
291229,2020-01-10 04:00:00,3.6,4.9,6.5,2.0,4.1,6.2,3.5,3.0,6.0,...,12.1,11.6,4.3,9.0,10.5,11.9,12.1,11.9,12.2,5


In [10]:
wind_speed_10m['index'] = wind_speed_10m['index'].astype(str)
wind_speed_10m[['date','time']] = wind_speed_10m['index'].str.split(' ', expand = True)
   

In [2]:
wind_direction_10m = pq.read_table('data/ITU_DATA/prognosis/ENetNEA/wind_direction_10m.parquet').to_pandas().reset_index()

In [6]:
wind_direction_10m.head(30)

Unnamed: 0,index,190,191,231,232,233,273,274,275,314,...,1412,1413,1445,1446,1447,1448,1449,1450,1451,predicted_ahead
0,2018-02-22 00:00:00,74,74,72,60,81,75,79,81,16,...,91,80,265,275,305,300,309,332,306,1
1,2018-02-22 01:00:00,70,70,73,56,78,76,86,90,66,...,83,76,276,290,297,335,331,309,293,2
2,2018-02-22 02:00:00,71,79,76,59,83,82,100,88,89,...,76,74,284,292,325,333,324,299,323,3
3,2018-02-22 03:00:00,76,76,81,60,85,92,98,84,86,...,72,72,292,305,343,329,310,303,11,4
4,2018-02-22 04:00:00,75,69,74,50,73,83,69,76,92,...,74,71,298,11,343,325,321,41,61,5
5,2018-02-22 05:00:00,68,62,76,47,68,72,65,68,78,...,79,76,304,9,341,342,82,78,90,6
6,2018-02-22 06:00:00,63,64,63,42,63,66,70,62,60,...,75,74,325,353,344,12,81,91,89,7
7,2018-02-22 07:00:00,69,72,61,45,71,70,72,59,69,...,75,73,14,358,26,100,89,92,84,8
8,2018-02-22 08:00:00,80,62,77,35,68,76,59,71,75,...,77,77,62,44,100,91,92,92,87,9
9,2018-02-22 09:00:00,68,55,79,66,59,48,82,67,71,...,78,82,135,91,104,98,96,92,88,10


## Settlement
resolution in 15 mins

In [31]:
import pyspark

In [32]:
settlement_2018 = spark.read.parquet('data/ITU_DATA/settlement/2018.parquet')

AttributeError: module 'spark' has no attribute 'read'

In [22]:
settlement_2018 = pq.read_table('data/ITU_DATA/settlement/2018.parquet').to_pandas()

In [23]:
settlement_2018 = settlement_2018[settlement_2018.TIME_CET.str.contains(':00:')].reset_index()

In [24]:
settlement_2018.head()

Unnamed: 0,index,GSRN,TS_ID,VAERDI,TIME_CET
0,3,570715000000062988,1471530,0,2018-07-02 00:00:00
1,7,570715000000062988,1471530,0,2018-07-02 01:00:00
2,11,570715000000062988,1471530,0,2018-07-02 02:00:00
3,15,570715000000062988,1471530,0,2018-07-02 03:00:00
4,19,570715000000062988,1471530,0,2018-07-02 04:00:00


In [25]:
settlement_2018.to_csv('data/ITU_DATA/settlement/2018.csv')

In [1]:
test_2018 = settlement_2018.take(10)

NameError: name 'settlement_2018' is not defined

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
feat_cols = ["1","2","3"]
vec_assembler = VectorAssembler(inputCols=feat_cols, outputCol='features')
final_data = vec_assembler.transform(dataset)

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans

In [None]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [None]:
scalerModel = scaler.fit(final_data)

In [None]:
cluster_final_data = scalerModel.transform(final_data)

In [None]:
kmean3 = KMeans(featuresCol='scaledFeatures',k=3)
kmean2 = KMeans(featuresCol='scaledFeatures',k=2)

In [None]:
model_k3 = kmean3.fit(cluster_final_data)
model_k2 = kmean2.fit(cluster_final_data)

In [None]:
wssse_k3 = model_k3.computeCost(cluster_final_data)
wssse_k2 = model_k2.computeCost(cluster_final_data)