# Penmanshiel wind farm data exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import folium
import os

In [None]:
# Directory (relative path) from which the per-turbine SCADA and log CSV files are read.
data_path = '../app/data/Penmanshiel/'

## Fleet information

In [None]:
df_info=pd.read_csv("Penmanshiel_WT/Penmanshiel_WT_static.csv")

In [None]:
df_info.columns

In [None]:
df_info.dropna(how="all",inplace=True)
df_info

The dataset consists of 14 turbines (T03 is missing); they all have the same specifications and started operations at the same time. From the coordinates and the fact that they are referred to as a "farm", we assume they are co-located. We confirm this by plotting them on a map:

In [None]:
# Centre the map on the mean latitude/longitude of all turbines.
center_loc = df_info[['Latitude', 'Longitude']].mean().values

m = folium.Map(location=center_loc, zoom_start=13)

# Add one marker per turbine; the popup shows the turbine title and its elevation.
for _j, row in df_info.iterrows():
    turbine_coords = row[['Latitude', 'Longitude']].values
    turbine_label = f"{row['Alternative Title']} ({row['Elevation (m)']}m)"
    folium.Marker(
        location=turbine_coords,
        popup=turbine_label,
        icon=folium.Icon(),
    ).add_to(m)

m

They are indeed co-located in a hilly area in the south of Scotland. As can be seen in the elevation values for each turbine, these can differ substantially, with a maximum difference of 48 meters. This is an important factor to keep in mind.

## Scada data

In [None]:
def read_data_penmanshiel(turbine_number):
    """Load the SCADA and log CSV files of one Penmanshiel turbine.

    Both files are looked up in ``data_path`` as ``scada_TNN.csv`` and
    ``logs_TNN.csv``, where ``NN`` is the turbine number zero-padded to two
    digits.

    Parameters
    ----------
    turbine_number : int
        Turbine number used to build the file names.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The SCADA table indexed by its ``Datetime`` column (values are left
        exactly as read from the CSV; no date parsing is done here), and the
        log table as read from its CSV.
    """
    scada_csv = os.path.join(data_path, f'scada_T{turbine_number:02d}.csv')
    df_scada = pd.read_csv(scada_csv).set_index('Datetime')

    logs_csv = os.path.join(data_path, f'logs_T{turbine_number:02d}.csv')
    df_logs = pd.read_csv(logs_csv)

    return df_scada, df_logs

### Turbine 1
As an example, we'll look into Turbine 1.

In [None]:
df_t1, df_logs_t1 = read_data_penmanshiel(1)

In [None]:
df_t1.head(10)

In [None]:
# Plot the power curve
sns.scatterplot(data=df_t1, x='Wind speed (m/s)', y='Power (kW)')

We see a typical power curve, with some clear curtailment modes as well.

Now let's have a look at the sensors in the dataset. The dataframe has 363 columns. However, most of these are related to the same quantity measured by the SCADA system, reporting the average, standard deviation, minimum, and maximum. We filter out all the columns corresponding to the STD, min, and max (which always have a ',' in their column name). 

In [None]:
len(df_t1.columns)

#### Preprocessing

In [None]:
df_t1.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
sensors = [col for col in df_t1.columns if ',' not in col]
print(len(sensors))
sensors

In [None]:
print(df_t1.index.min())
print(df_t1.index.max())

Around 6 years of data.

Consider one full year of data to understand the features

In [None]:
# Keep the rows of 2019 (both bounds inclusive) and only the columns listed in `sensors`.
in_2019 = (df_t1.index >= "2019-01-01 00:00:00") & (df_t1.index <= "2019-12-31 23:50:00")
df_Penmanshiel = df_t1.loc[in_2019, sensors].copy()

In [None]:
len(df_Penmanshiel.columns)

Drop features that contain only NaN values

In [None]:
df_Penmanshiel.dropna(axis=1,how='all',inplace=True)

In [None]:
len(df_Penmanshiel.columns)

Drop features which are always zero

In [None]:
# Keep only columns with at least one non-zero value. Missing values are
# treated as zero for this check: NaN != 0 evaluates to True, so without the
# fillna a column holding nothing but zeros and NaNs would be kept.
has_nonzero_value = (df_Penmanshiel.fillna(0) != 0).any(axis=0)
df_Penmanshiel = df_Penmanshiel.loc[:, has_nonzero_value]

In [None]:
len(df_Penmanshiel.columns)

Drop features which are always the same

In [None]:
df_Penmanshiel = df_Penmanshiel.loc[:, df_Penmanshiel.nunique() > 1]

In [None]:
len(df_Penmanshiel.columns)

Drop features with more than 90% missing values

In [None]:
col = df_Penmanshiel.columns[df_Penmanshiel.isnull().mean() > 0.9].tolist()
col

In [None]:
df_Penmanshiel.drop(col,inplace=True,axis=1)

In [None]:
len(df_Penmanshiel.columns)

In [None]:
col=df_Penmanshiel.columns[df_Penmanshiel.columns.str.contains("Gear|gear")]
corr=df_Penmanshiel[col].corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
col

- pairplot for highly correlated feature
    - should we keep both features 

#### Correlation analysis

In [None]:
# Feature pairs whose correlation exceeds 0.95.
# Only the strict upper triangle of the correlation matrix is kept, so the
# diagonal (a feature with itself) is excluded and each pair appears once.
# De-duplicating on the correlation value instead would also discard distinct
# pairs that happen to share the same coefficient (e.g. every pair at 1.0).
corr_matrix = df_Penmanshiel.corr()
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
df_high_corr = pd.DataFrame(
    corr_matrix.where(upper_triangle).where(lambda x: x > 0.95).stack().dropna()
)

In [None]:
df_high_corr.head(20)

In [None]:
# Hand-picked subset of sensor columns carried forward for the analysis.
final_columns = [
    # wind and orientation
    'Wind speed (m/s)',
    'Long Term Wind (m/s)',
    'Wind direction (°)',
    'Nacelle position (°)',
    'Vane position 1+2 (°)',
    # component temperatures
    'Gear oil inlet temperature (°C)',
    'Generator bearing rear temperature (°C)',
    'Generator bearing front temperature (°C)',
    'Gear oil temperature (°C)',
    'Rotor bearing temp (°C)',
    # per-axis motor temperatures and currents
    'Temperature motor axis 1 (°C)',
    'Temperature motor axis 2 (°C)',
    'Temperature motor axis 3 (°C)',
    'Motor current axis 1 (A)',
    'Motor current axis 2 (A)',
    'Motor current axis 3 (A)',
    # rotational speeds
    'Rotor speed (RPM)',
    'Generator RPM (RPM)',
    'Gearbox speed (RPM)',
    # blade pitch and yaw angles
    'Blade angle (pitch position) A (°)',
    'Blade angle (pitch position) B (°)',
    'Blade angle (pitch position) C (°)',
    'Yaw bearing angle (°)',
    # gear oil pressures
    'Gear oil inlet pressure (bar)',
    'Gear oil pump pressure (bar)',
    # accelerations
    'Drive train acceleration (mm/ss)',
    'Tower Acceleration X (mm/ss)',
    'Tower Acceleration y (mm/ss)',
]
df_final = df_Penmanshiel[final_columns].copy()

# Correlation matrix of the selected features, rendered as a colour-coded table.
corr = df_final.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
corr[['Gear oil inlet pressure (bar)', 'Gear oil pump pressure (bar)']].mean(axis=0)

In [None]:
# Collapse the three per-axis motor temperatures into their row-wise mean.
df_final["Motor temperature (°C)"] = df_final[['Temperature motor axis 1 (°C)',
                                          'Temperature motor axis 2 (°C)',
                                          'Temperature motor axis 3 (°C)']].mean(axis=1)

# Collapse the three per-axis motor currents into their row-wise mean.
df_final["Motor current (A)"] = df_final[['Motor current axis 1 (A)',
                                          'Motor current axis 2 (A)',
                                          'Motor current axis 3 (A)']].mean(axis=1)

# Circular mean of the three blade pitch angles: averaging the unit vectors
# instead of the raw degrees keeps angles on either side of the 0°/360°
# wrap-around from averaging to a meaningless mid-range value.
temp = np.radians(df_final[['Blade angle (pitch position) A (°)',
                            'Blade angle (pitch position) B (°)',
                            'Blade angle (pitch position) C (°)']])
# Convert angular values to Cartesian coordinates (vectorised over the whole
# frame rather than applying a Python lambda to every row).
x = np.cos(temp)
y = np.sin(temp)
# Average the Cartesian coordinates
mean_x = x.mean(axis=1)
mean_y = y.mean(axis=1)
# Convert the average Cartesian coordinates back to an angle in radians
mean_angle_radians = np.arctan2(mean_y, mean_x)
# np.arctan2 returns values in [-pi, pi], so the result lies in [-180, 180] degrees.
df_final['Blade Angle (pitch position) (°)'] = np.degrees(mean_angle_radians)

# Remove the per-axis inputs that were aggregated above, together with the
# other columns listed here.
df_final.drop(['Blade angle (pitch position) A (°)','Blade angle (pitch position) B (°)','Blade angle (pitch position) C (°)',
                'Motor current axis 1 (A)','Motor current axis 2 (A)','Motor current axis 3 (A)',
                'Temperature motor axis 1 (°C)','Temperature motor axis 2 (°C)','Temperature motor axis 3 (°C)',
                'Yaw bearing angle (°)','Rotor speed (RPM)','Gearbox speed (RPM)','Gear oil inlet pressure (bar)'],axis=1,inplace=True)

In [None]:
corr=df_final.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
corr.stack().value_counts().sort_index(ascending=False)

In [None]:
# Add a cosine and a sine column for each angular feature, so that 0° and 360°
# end up with the same encoded values.
# NOTE: the base name keeps the space that precedes "(°)", so the new column
# names contain a double space (e.g. "Nacelle position  cos").
for col in ['Nacelle position (°)','Vane position 1+2 (°)','Blade Angle (pitch position) (°)']:
    base_name = col.split("(°)")[0]
    angle_rad = np.radians(df_final[col])
    df_final[base_name + " cos"] = np.cos(angle_rad)
    df_final[base_name + " sin"] = np.sin(angle_rad)

In [None]:
def convert_angles_lengths_to_u_v(angles, lengths, conversion='trigonometric', kind='deg'):
    """Convert directions and magnitudes into ``(u, v)`` vector components.

    The components are computed as ``u = -sin(angle) * length`` and
    ``v = -cos(angle) * length``.
    NOTE(review): this looks like the meteorological "direction the wind
    blows from" convention (angle measured clockwise from north) - confirm.

    Parameters
    ----------
    angles : scalar or array-like
        Direction values, in the unit given by ``kind``.
    lengths : scalar or array-like
        Magnitudes (e.g. wind speed) multiplied into both components.
    conversion : str, optional
        Accepted for backward compatibility only; it does not affect the
        result, the formula above is always used.
    kind : str, optional
        ``'rad'`` if ``angles`` are already in radians. Any other value
        (default ``'deg'``) treats ``angles`` as degrees.

    Returns
    -------
    tuple
        ``(u, v)``, each with the type/shape resulting from broadcasting
        ``angles`` against ``lengths``.
    """
    # Previously `kind` was ignored and radians were silently treated as degrees.
    angles_rad = angles if kind == 'rad' else angles * np.pi / 180
    u = -np.sin(angles_rad) * lengths
    v = -np.cos(angles_rad) * lengths
    return u, v

df_final["Wind direction u"],df_final["Wind direction v"] = convert_angles_lengths_to_u_v(df_final['Wind direction (°)'],df_final['Wind speed (m/s)'])

In [None]:
df_final

In [None]:
["Wind speed (m/s)","Long Term Wind (m/s)","Wind direction (°)",]
["Nacelle position (°)","Vane position 1+2 (°)"]
["Energy Export (kWh)","Energy Import (kWh)",]

- Highly correlated
    - Wind speed and Wind speed Sensor 1/2 
    - "Nacelle position (°)" and "Yaw bearing angle (°)"
    - "Energy Export (kWh)" and "Virtual Production (kWh)"
    - "Energy Export counter (kWh)", "Energy Import counter (kWh)", "Reactive Energy Import counter (kvarh)"

- Energy Import counter (kWh) --> missing values
- 

In [None]:
corr=pd.DataFrame(df_Penmanshiel.corrwith(df_Penmanshiel["Front bearing temperature (°C)"]))
corr.style.background_gradient(cmap='coolwarm')

In [None]:
corr = df_Penmanshiel.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
df_final

In [None]:
np.cos(np.deg2rad(df_final['Nacelle position (°)']))

Let's inspect the theoretical energy and compare it to the export energy:

In [None]:
sns.scatterplot(data=df_t1, x='Wind speed (m/s)', y='Energy Export (kWh)')
sns.scatterplot(data=df_t1, x='Wind speed (m/s)', y='Energy Theoretical (kWh)', color='g')

The theoretical energy seems less spread out, but also not a clear 1-1 with wind speed. It remains unclear what this actually quantifies.

Now, let's have a look at the time evolution of the power output. We first resample the data to 1 day.

In [None]:
df_t1.index = pd.to_datetime(df_t1.index)

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))
df_sub = df_t1.resample('1D').median()
sns.lineplot(data=df_sub, x=df_sub.index, y='Power (kW)', ax=ax)

Below we show a heatmap of the missing data.

In [None]:
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df_t1[sensors].isna(), cmap='Grays', cbar=False, ax=ax)

- External Wind   --> ["Wind speed (m/s)","Long Term Wind (m/s)","Wind direction (°)",]
- Wind positioning(internal)   --> ["Nacelle position (°)","Vane position 1+2 (°)","Blade angle (pitch position) A (°)","Yaw"]
- Drive Train (Generator-Rotor-GearBox- Drive train) --> remove rotor speed, GearBox
- Gear
- Motor blade --> mean temp axis/ mean current axis (want to highlight misbehavior)
- Tower



- Metal
- Cable windings from calibration point

- KPI
    - Energy  
    - Power
    - Production-
    - Grid Voltage-Current-frequency
    - Capacity factor (what is this feature ?)
    - Contractual Avail
    - Performance Index

## Logs

Now we look at the log files, again for turbine 1.

In [None]:
df_logs_t1

In [None]:
df_logs_t1['Status'].value_counts()

In [None]:
df_logs_t1['Message'].value_counts()

We see that there are many logs reported, but most of them seem inconsequential. We zoom in on the ones that caused a "Stop".

In [None]:
df_logs_t1[df_logs_t1['Status'] == 'Stop']

1227 instances where the turbine had to be shut down. Let's see what their message was and how long they were.

In [None]:
df_logs_t1[df_logs_t1['Status'] == 'Stop']['Message'].value_counts()

In [None]:
df_logs_t1[df_logs_t1['Status'] == 'Stop']['Duration']

In [None]:
def duration_to_hours(x):
    """Convert a duration string ``"H:M:S"`` into a number of hours.

    Parameters
    ----------
    x : str or missing value
        Duration with colon-separated hours, minutes and seconds. Hours and
        minutes must be integers; seconds may carry a fractional part.

    Returns
    -------
    float
        The duration in hours, or NaN when ``x`` is missing (None / NaN), so
        the function can be applied to a column that contains gaps.

    Raises
    ------
    ValueError
        If a field cannot be parsed as a number.
    IndexError
        If fewer than three colon-separated fields are present.
    """
    if pd.isna(x):
        return float('nan')
    x_split = x.split(':')
    return int(x_split[0]) + int(x_split[1]) / 60 + float(x_split[2]) / 3600

# Take an explicit copy of the "Stop" rows so the new column is written to an
# independent frame rather than to a slice of df_logs_t1 (which triggers
# pandas' SettingWithCopyWarning).
df_stop = df_logs_t1[df_logs_t1['Status'] == 'Stop'].copy()
df_stop['Duration in hours'] = df_stop['Duration'].apply(duration_to_hours)

# Left: all stop durations. Right: only the stops shorter than 2 hours.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
sns.boxplot(df_stop['Duration in hours'], ax=ax1)
sns.boxplot(df_stop[df_stop['Duration in hours'] < 2]['Duration in hours'], ax=ax2)


We see that most stops were less than an hour, but with a few strong outliers. We look into these.

In [None]:
df_stop[df_stop['Duration in hours'] > 50]

Some of these correspond to maintenance. Let's look at all logs where maintenance was reported and how long they took.

In [None]:
# Select the log entries whose IEC category mentions "Maintenance"; str() lets
# non-string entries (e.g. missing values) be tested without raising.
# The explicit copy makes the result an independent frame, so columns can be
# added to it later without a SettingWithCopyWarning.
is_maintenance = df_logs_t1['IEC category'].apply(lambda x: 'Maintenance' in str(x))
df_logs_maintenance = df_logs_t1[is_maintenance].copy()
df_logs_maintenance

In [None]:
# Add the duration in hours. `assign` builds a new frame instead of writing
# into what may be a slice of df_logs_t1, which avoids SettingWithCopyWarning.
df_logs_maintenance = df_logs_maintenance.assign(
    **{'Duration in hours': df_logs_maintenance['Duration'].apply(duration_to_hours)}
)

In [None]:
# Same distribution drawn twice: the full range on the left, and a view
# limited to -1..5 on the y-axis on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
sns.boxplot(df_logs_maintenance['Duration in hours'], ax=ax1)
sns.boxplot(df_logs_maintenance['Duration in hours'], ax=ax2)
ax2.set_ylim(-1, 5)

About half of the maintenance events are less than 1 hour; however, some lasted several hours, up to ~5 days (120 hours). If we select the ones lasting more than 5 hours, we have 10 events left over for turbine 1.

In [None]:
df_logs_maintenance[df_logs_maintenance['Duration in hours'] > 5]

- gear oil pressure vs wind speed and active power
    - is gear oil pressure related to curtailment of T09 and T10? 