In [None]:
import pandas as pd
import pyarrow.parquet as pq
from numpy import nan

# Clean
## Create a static list of useful columns

## Clean and extract windmill data

In [None]:
# Static master-data columns that are kept for every windmill record.
cols = [
    'GSRN',
    'Turbine_type',
    'Parent_GSRN',
    'BBR_municipal',
    'Placement',
    'UTM_x',
    'UTM_y',
    'Capacity_kw',
    'Rotor_diameter',
    'Navhub_height',
]

In [None]:
# Read masterdatawind.parquet into pandas and keep only the columns in `cols`.
windmills = (
    pq.read_table('data/ITU_DATA/masterdatawind.parquet')
    .to_pandas()[cols]
)
# Show (rows, columns) of the loaded table.
windmills.shape

The windmill data contains duplicated rows entered by different reviewers.    
Remove the duplicates, keeping the first row for each GSRN.

In [None]:
# Keep the first row for every GSRN and renumber the index from 0.
windmills = windmills.drop_duplicates(
    subset="GSRN",
    keep='first',
    ignore_index=True,
)
# Show (rows, columns) after de-duplication.
windmills.shape

9853 Contains:    
- H: Household turbine : No UTM_xy locations 
- W: Single turbine : full data
- P: Turbine park : No UTM_xy locations
- M: Turbine in a park : No power record in settlement data


In [None]:
# Print the (rows, columns) shape of the subset for each Turbine_type code.
for label, type_code in (
    ('Household: ', 'H'),
    ('Single: ', 'W'),
    ('Park: ', 'P'),
    ('In a Park: ', 'M'),
):
    print(label, windmills[windmills['Turbine_type'] == type_code].shape)

Fill in the turbine parks' x/y coordinates by calculating their centroids.

In [None]:
# Both coordinate columns are reduced to their mean.
aggs = {axis: (lambda coords: coords.mean()) for axis in ('UTM_x', 'UTM_y')}

# Average the coordinates of the type-'M' rows per Parent_GSRN. The parent id
# is renamed to GSRN and the means to x_c / y_c so the result can be merged
# onto rows keyed by GSRN.
w_temp = (
    windmills[windmills['Turbine_type'] == 'M']
    .groupby(['Parent_GSRN'], as_index=False)
    .agg(aggs)
    .rename(columns={'Parent_GSRN': 'GSRN', 'UTM_x': 'x_c', 'UTM_y': 'y_c'})
)
w_temp

578 != 444    
Fill some missing turbine parks

In [None]:
# All rows whose Turbine_type is 'P'.
f = windmills.loc[windmills['Turbine_type'] == 'P']

In [None]:
# Type-'P' rows whose GSRN also appears in the centroid table w_temp.
f[f['GSRN'].isin(w_temp['GSRN'])]

In [None]:
# Type-'P' rows whose GSRN has no matching row in the centroid table w_temp.
f[~f['GSRN'].isin(w_temp['GSRN'])]

In [None]:
# Centroid rows whose GSRN is not among the type-'P' rows in `f`.
w_temp[~w_temp['GSRN'].isin(f['GSRN'])]

Fill 136 in to windmills

In [None]:
# (rows, columns) of windmills at this point in the notebook.
windmills.shape

In [None]:
# Outer-join the centroids onto the windmills: GSRNs that exist only in
# w_temp are added as new rows, every other row gets x_c / y_c where available.
windmills = pd.merge(windmills, w_temp, on='GSRN', how='outer')
# location: fill missing coordinates from the centroid columns. This is a
# column-wise fill; the previous row-wise apply() did the same thing one row
# at a time.
windmills['UTM_x'] = windmills['UTM_x'].fillna(windmills['x_c'])
windmills['UTM_y'] = windmills['UTM_y'].fillna(windmills['y_c'])
windmills = windmills.drop(columns=['x_c', 'y_c'])
# Turbine type: rows added by the outer join have no type; label them 'P'.
# Assign the result back instead of using inplace=True on a column selection,
# which does not update the frame under pandas Copy-on-Write.
windmills['Turbine_type'] = windmills['Turbine_type'].fillna('P')
windmills

Because of a bug in the ArcGIS file import, we need to append a 'z' to the end of GSRN and Parent_GSRN.

In [None]:
# Append 'z' to both id columns.
# NOTE: this cell is not idempotent; every re-run appends another 'z'.
windmills['GSRN'] = windmills['GSRN'] + 'z'
# Missing parents become '' first, so they end up as the bare string 'z'.
# The fill is assigned back rather than done with inplace=True on a column
# selection, which does not update the frame under pandas Copy-on-Write and
# would leave NaN values that cannot be concatenated with 'z'.
windmills['Parent_GSRN'] = windmills['Parent_GSRN'].fillna('') + 'z'
windmills

In [None]:
# Write the table to data/windmills_gis.csv without the index column
# (presumably the file that is imported into ArcGIS; confirm).
windmills.to_csv('data/windmills_gis.csv', index=False)

After ArcGIS    
1. Single grid

In [None]:
# Read the GSRN -> grid mapping from the Excel file.
windmills_single_grid = pd.read_excel('data/windmills_to_single-grid.xlsx')[['GSRN', 'grid']]
# Missing grid ids become 0, then the ids are normalised to integer strings.
# The fill is assigned back: inplace=True on a column selection does not
# update the frame under pandas Copy-on-Write, and the NaN left behind would
# make astype(int) fail.
windmills_single_grid['grid'] = (
    windmills_single_grid['grid'].fillna(0).astype(int).astype(str)
)
# Attach the grid id to every windmill; unmatched windmills keep NaN.
windmills = pd.merge(windmills, windmills_single_grid, on='GSRN', how='left')
windmills

In [None]:
# Read the GSRN -> grid rows from the multi-grid Excel file.
windmills_multi_grid = pd.read_excel('data/windmills_to_multi-grid.xlsx')[['GSRN', 'grid']]
# Missing grid ids become 0, then the ids are normalised to integer strings.
# The fill is assigned back: inplace=True on a column selection does not
# update the frame under pandas Copy-on-Write, and the NaN left behind would
# make astype(int) fail.
windmills_multi_grid['grid'] = (
    windmills_multi_grid['grid'].fillna(0).astype(int).astype(str)
)

In [None]:
def ab(df):
    """Join the string values of `df` (one group's grid ids) with commas."""
    separator = ','
    grid_ids = df.values
    return separator.join(grid_ids)
    
# Collapse the grid ids of each GSRN into one comma-separated string and
# store it under the column name 'grid_in_range'.
windmills_multi_grid = (
    windmills_multi_grid
    .groupby('GSRN')['grid']
    .apply(ab)
    .reset_index()
    .rename(columns={'grid': 'grid_in_range'})
)
# Attach the per-GSRN grid list to the windmills; unmatched rows keep NaN.
windmills = windmills.merge(windmills_multi_grid, on='GSRN', how='left')
windmills

In [None]:
# Write the table to data/windmill_cleaned.csv. No index=False is passed, so
# the DataFrame index is written as an extra, unnamed first column.
windmills.to_csv('data/windmill_cleaned.csv')

In [None]:
# Reload the cleaned windmill table, parsed as semicolon-separated.
# NOTE(review): the to_csv call earlier in this notebook writes this path with
# the default comma separator, so sep=';' only works if the file was re-saved
# by another tool in between. Confirm which separator the file really uses.
windmills = pd.read_csv('data/windmill_cleaned.csv',sep=';')
# Per-windmill table from Excel; later cells use its GSRN, Land_cover and
# Slope columns.
windmills_lc_slope = pd.read_excel('data/windmills_lc_slope.xlsx')

In [None]:
# Use -1 as the sentinel for missing land-cover and slope values.
# The results are assigned back: inplace=True on a column selection does not
# update the frame under pandas Copy-on-Write.
windmills_lc_slope['Land_cover'] = windmills_lc_slope['Land_cover'].fillna(-1)
windmills_lc_slope['Slope'] = windmills_lc_slope['Slope'].fillna(-1)

In [None]:
# Lookup table from Land_cover code to roughness value. Key -1 is the
# sentinel used for missing land cover.
# NOTE(review): 180 -> 11 is far larger than every other entry (max 1.5);
# confirm it is not a typo.
roughness_dic = {
    210: 0.0,
    220: 0.004,
    200: 0.005,
    202: 0.005,
    140: 0.03,
    150: 0.05,
    11: 0.1,
    14: 0.1,
    130: 0.1,
    180: 11,
    10: 0.1,
    20: 0.3,
    30: 0.3,
    160: 0.5,
    120: 0.5,
    170: 0.6,
    190: 1.0,
    40: 1.5,
    50: 1.5,
    60: 1.5,
    70: 1.5,
    90: 1.5,
    100: 1.5,
    110: 1.5,
    -1: 0.0,
}

# Translate each Land_cover code into its roughness value. An unknown code
# raises KeyError rather than silently producing NaN.
windmills_lc_slope['roughness'] = windmills_lc_slope['Land_cover'].map(roughness_dic.__getitem__)

In [None]:
# Display the land-cover / slope table, including the new roughness column.
windmills_lc_slope

In [None]:
# Same display as the previous cell (duplicate; nothing changed in between).
windmills_lc_slope

In [None]:
# Attach land cover, slope and roughness to every windmill by GSRN;
# windmills without a match keep NaN in those columns.
windmills = windmills.merge(windmills_lc_slope, on='GSRN', how='left')

In [None]:
# Remove the row with index label 0.
# NOTE(review): the reason for dropping this row is not documented. The cell
# is not re-runnable: once label 0 is gone, drop() raises KeyError.
windmills = windmills.drop(index=0)

In [None]:
# Display the merged windmill table.
windmills

In [None]:
# Overwrite data/windmill_cleaned.csv with the enriched table, this time
# without writing the index.
windmills.to_csv('data/windmill_cleaned.csv',index=False)

## Weather Grid Observation

In [None]:
# Load the grid coordinate table that ships with the prognosis data.
weather_grids = pd.read_csv('data/ITU_DATA/prognosis/grid_coordinates.csv') 
# Number of rows in the grid table.
print(len(weather_grids))
# Preview the first 10 rows.
weather_grids.head(10)

## Data in Observations
We don't have the join table of municipal names and ids.

In [None]:
# Read observed_pressure.parquet into a pandas DataFrame.
observed_pressure = pq.read_table('data/ITU_DATA/observations/observed_pressure.parquet').to_pandas()

In [None]:
# Row count, followed by the last 10 rows.
print(len(observed_pressure))
observed_pressure.tail(10)

In [None]:
# Read observed_wind_speed_high10.parquet into a pandas DataFrame.
observed_wind_speed_high10 = pq.read_table('data/ITU_DATA/observations/observed_wind_speed_high10.parquet').to_pandas()
# Row count, followed by the last 10 rows.
print(len(observed_wind_speed_high10))
observed_wind_speed_high10.tail(10)

## Prognosis ENetNEA
resolution in 1 hour    
data from this model since 2018-02-22.

In [None]:
# Read the ENetNEA 10 m wind-speed prognosis into pandas. reset_index() moves
# the index into a regular column; later cells access it as 'index'
# (presumably because the parquet index is unnamed; confirm).
wind_speed_10m = pq.read_table('data/ITU_DATA/prognosis/ENetNEA/wind_speed_10m.parquet').to_pandas().reset_index()

In [None]:
# Preview the first rows of the wind-speed prognosis.
wind_speed_10m.head()

In [None]:
# Keep only the last row for each value of the 'index' column.
# The DataFrame method is drop_duplicates (not drop_duplicated), and `keep`
# takes the string 'last' rather than a bare name.
wind_speed_10m = wind_speed_10m.drop_duplicates(['index'], keep='last')

In [None]:
# Convert the 'index' column to strings so that it can be split as text.
wind_speed_10m['index'] = wind_speed_10m['index'].astype(str)
# Split on the space into two new columns, 'date' and 'time'
# (assumes every value has exactly one space, e.g. 'YYYY-MM-DD HH:MM:SS'; confirm).
wind_speed_10m[['date','time']] = wind_speed_10m['index'].str.split(' ', expand = True)
   

In [None]:
# Read the ENetNEA 10 m wind-direction prognosis into pandas and move the
# index into a regular column.
wind_direction_10m = pq.read_table('data/ITU_DATA/prognosis/ENetNEA/wind_direction_10m.parquet').to_pandas().reset_index()

In [None]:
# Preview the first 30 rows of the wind-direction prognosis.
wind_direction_10m.head(30)

## Settlement
resolution in 15 mins

In [None]:
import pyspark

In [None]:
# `spark` was never created in this notebook, so the original line raised
# NameError on a fresh kernel. Build (or reuse) a session first.
# NOTE: the next cell rebinds settlement_2018 to a pandas DataFrame, and the
# cells after it use the pandas API, not this Spark DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
settlement_2018 = spark.read.parquet('data/ITU_DATA/settlement/2018.parquet')

In [None]:
# Read the 2018 settlement parquet file into a pandas DataFrame. This rebinds
# settlement_2018, replacing whatever the previous cell assigned to it.
settlement_2018 = pq.read_table('data/ITU_DATA/settlement/2018.parquet').to_pandas()

In [None]:
# Keep the rows whose TIME_CET text contains ':00:' (for 'HH:MM:SS' strings
# this selects minute 00; assumes that timestamp format). reset_index() keeps
# the old index as a column.
has_zero_minutes = settlement_2018['TIME_CET'].str.contains(':00:')
settlement_2018 = settlement_2018[has_zero_minutes].reset_index()

In [None]:
# Preview the first rows of the filtered 2018 settlement data.
settlement_2018.head()

# A single windmill

In [1]:
import pandas as pd
import pyarrow.parquet as pq
from numpy import nan

In [3]:
# Load the cleaned windmill table (comma-separated, pandas defaults).
windmills = pd.read_csv('data/windmill_cleaned.csv')

In [7]:
# Display the loaded windmill table.
windmills

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height,grid,grid_in_range,Land_cover,Slope,roughness
0,570714700000000027,M,5.707147e+17,101.0,LAND,720898.353000,6171174.877,225.0,27.0,30.0,653,"651,693,610,736,695,737,611,653,694,654,652,69...",210.0,3.646971,0.0
1,570714700000000034,M,5.707147e+17,101.0,LAND,720993.352000,6171226.877,225.0,27.0,30.0,653,"651,693,610,736,695,737,611,653,694,654,652,69...",100.0,1.012750,1.5
2,570714700000000041,M,5.707147e+17,101.0,LAND,727504.304000,6178385.864,600.0,44.0,50.0,653,"651,693,610,695,737,611,653,694,654,652,696,60...",210.0,8.111279,0.0
3,570714700000000058,M,5.707147e+17,101.0,LAND,727628.303000,6178385.865,600.0,44.0,50.0,653,"651,693,610,695,737,611,653,694,654,652,696,60...",210.0,5.051153,0.0
4,570714700000000065,M,5.707147e+17,101.0,LAND,727817.302000,6178365.866,600.0,44.0,50.0,653,"651,693,610,695,611,653,694,654,652,696,609,65...",210.0,5.885129,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9977,570715000000055638,P,,,,504163.000000,6277061.000,,,,1243,"1283,1241,1326,1285,1284,1199,1158,1242,1202,1...",70.0,2.263636,1.5
9978,570715000000056017,P,,,,468672.240000,6145816.035,,,,1235,"1233,1191,1275,1195,1318,1276,1235,1192,1150,1...",10.0,0.000000,0.1
9979,570715000000056857,P,,,,513111.100000,6320199.400,,,,1246,"1202,1244,1330,1331,1329,1245,1328,1286,1246,1...",130.0,1.432096,0.1
9980,570715000000056918,P,,,,514683.070000,6320302.490,,,,1246,"1202,1244,1330,1331,1329,1245,1328,1286,1246,1...",130.0,0.000000,0.1


In [9]:
# Display only the rows whose Turbine_type is 'W'.
windmills.loc[windmills['Turbine_type'] == 'W']

Unnamed: 0,GSRN,Turbine_type,Parent_GSRN,BBR_municipal,Placement,UTM_x,UTM_y,Capacity_kw,Rotor_diameter,Navhub_height,grid,grid_in_range,Land_cover,Slope,roughness
8,570714700000000102,W,,167.0,LAND,717690.0,6167403.0,300.0,31.0,30.0,652,"735,650,692,651,693,610,736,695,737,611,653,69...",11.0,5.885129,0.1
11,570714700000000133,W,,167.0,LAND,719902.0,6167171.0,1000.0,50.0,55.0,652,"735,608,650,651,693,610,736,695,737,611,653,69...",190.0,1.432096,1.0
12,570714700000000140,W,,167.0,LAND,719970.0,6167337.0,660.0,47.0,40.0,652,"735,650,651,693,610,736,695,737,611,653,694,65...",210.0,0.000000,0.0
13,570714700000000157,W,,167.0,LAND,720031.0,6167484.0,660.0,47.0,40.0,652,"735,650,651,693,610,736,695,737,611,653,694,65...",70.0,0.000000,1.5
14,570714700000000164,W,,169.0,LAND,702314.0,6169097.0,250.0,24.0,30.0,694,"735,650,692,651,693,610,778,736,777,734,695,77...",11.0,2.024868,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9806,571313174001699894,W,,360.0,LAND,644459.0,6073175.0,3600.0,126.0,87.0,771,"855,813,770,771,812,856,687,729,686,730,814,72...",11.0,0.000000,0.1
9807,571313174001699900,W,,360.0,LAND,644136.0,6073257.0,3600.0,126.0,87.0,771,"855,813,770,771,812,856,687,729,686,730,814,72...",11.0,1.432096,0.1
9811,571313174001764448,W,,320.0,LAND,702483.0,6132921.0,225.0,29.0,31.5,650,"690,691,735,733,732,606,648,649,608,650,692,60...",11.0,0.000000,0.1
9841,571313174116253585,W,,265.0,LAND,694932.0,6175477.0,850.0,52.0,44.0,736,"735,692,651,693,778,736,777,780,734,695,779,73...",11.0,1.432096,0.1


In [90]:
# Read the 2019 settlement parquet file as a pyarrow Table; it is converted
# to pandas in the next cell.
settlement = pq.read_table('data/ITU_DATA/settlement/2019.parquet')

In [94]:
# Convert the pyarrow Table to a pandas DataFrame, rebinding the same name.
# The cell can only run once per load: a DataFrame has no to_pandas().
settlement = settlement.to_pandas()

In [122]:
# Select the settlement rows of one turbine; GSRN is compared as a string.
single_windmill = settlement.loc[settlement["GSRN"] == "571313174001764448"]

In [124]:
# Keep rows where characters 5-6 of TIME_CET are '12', i.e. the month part of
# 'YYYY-MM-DD HH:MM:SS' strings (December).
single_windmill = single_windmill[single_windmill['TIME_CET'].str[5:7] == '12']

In [125]:
# Display the December rows of the selected turbine.
single_windmill

Unnamed: 0,GSRN,TS_ID,VAERDI,TIME_CET
142379785,571313174001764448,50200502,0,2019-12-01 00:00:00
142379786,571313174001764448,50200502,0,2019-12-01 01:00:00
142379787,571313174001764448,50200502,0,2019-12-01 02:00:00
142379788,571313174001764448,50200502,0,2019-12-01 03:00:00
142379789,571313174001764448,50200502,0,2019-12-01 04:00:00
...,...,...,...,...
154124102,571313174001764448,50200502,0,2019-12-30 20:00:00
154124103,571313174001764448,50200502,0,2019-12-30 21:00:00
154124104,571313174001764448,50200502,0,2019-12-30 22:00:00
154124105,571313174001764448,50200502,0,2019-12-30 23:00:00


In [131]:
# Keep only the VAERDI and TIME_CET columns of the selected turbine.
test = single_windmill[["VAERDI", "TIME_CET"]]

In [132]:
# Convert VAERDI to float. assign() builds a new frame instead of writing a
# column into `test`, which is a selection taken from single_windmill; writing
# into it triggers SettingWithCopyWarning.
test = test.assign(VAERDI=test["VAERDI"].astype(float))

In [129]:
# Keep rows where characters 5-9 of TIME_CET are '12-01', i.e. the month-day
# part of 'YYYY-MM-DD HH:MM:SS' strings (1 December).
test = test[test["TIME_CET"].str[5:10]=="12-01"]

In [133]:
# Display the VAERDI / TIME_CET frame.
test

Unnamed: 0,VAERDI,TIME_CET
142379785,0.0,2019-12-01 00:00:00
142379786,0.0,2019-12-01 01:00:00
142379787,0.0,2019-12-01 02:00:00
142379788,0.0,2019-12-01 03:00:00
142379789,0.0,2019-12-01 04:00:00
...,...,...
154124102,0.0,2019-12-30 20:00:00
154124103,0.0,2019-12-30 21:00:00
154124104,0.0,2019-12-30 22:00:00
154124105,0.0,2019-12-30 23:00:00


In [77]:
import matplotlib.pyplot as plt

In [137]:
plt.figure(figsize=(30,6))
# TIME_CET holds strings. Plotted as-is, matplotlib treats every distinct
# string as a category with its own tick, which makes rendering very slow;
# the recorded run of this cell was interrupted. Parse the timestamps so a
# real datetime axis is used.
plt.plot(pd.to_datetime(test["TIME_CET"]), test["VAERDI"])

[<matplotlib.lines.Line2D at 0x757a4a080>]

Error in callback <function flush_figures at 0x86fae4f28> (for post_execute):


KeyboardInterrupt: 

In [134]:
# Per-column count of non-null values among rows where VAERDI equals 0.
test[test["VAERDI"] == 0].count()

VAERDI      486
TIME_CET    486
dtype: int64

In [135]:
# Per-column count of non-null values among rows where VAERDI is not 0.
test[test["VAERDI"] != 0].count()

VAERDI      235
TIME_CET    235
dtype: int64

In [139]:
# Share of rows with VAERDI equal to 0, as a percentage of all rows (per column).
test[test["VAERDI"] == 0].count() / test.count() * 100

VAERDI      67.40638
TIME_CET    67.40638
dtype: float64