In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from plotnine import *
%matplotlib inline

# Data source: https://www.kaggle.com/datasets/theforcecoder/wind-power-forecasting?resource=download

In [2]:
# Load the raw turbine readings; the timestamp column arrives unnamed.
df = pd.read_csv("Data/Turbine_Data.csv")
df = df.rename(columns={'Unnamed: 0': 'Time'})
# Convert the time column to a datetime object
df['Time'] = pd.to_datetime(df['Time'])
# Columns noted by the original analysis as providing no info / only 1 value
df = df.drop(columns=["WTG", "ControlBoxTemperature"])
# Drop rows in which every measurement (everything except Time) is missing.
noTime = df.drop('Time', axis=1)
df = df[noTime.notnull().any(axis=1)].copy()
# Rows without a power reading are dropped rather than imputed; the original
# note says nulls in these two columns are a low percentage of the rows.
df = df.dropna(subset=["ActivePower", "ReactivePower"])
# Median-impute every column whose null fraction is above 0% and below 5%
# (the original note: these distributions are skewed, hence median not mean).
# Columns are picked by label with a boolean mask. Looking a column up by its
# null fraction only ever returns the first match, so two columns sharing the
# same fraction would leave the second one unfilled.
nullPer = df.isnull().sum() / df.shape[0]
lowNullCount = nullPer[(nullPer > 0.0) & (nullPer < .05)].index.tolist()
for col in lowNullCount:
    # Assign the result back: fillna(inplace=True) on df[col] is chained
    # assignment, which warns on recent pandas and is a no-op under Copy-on-Write.
    df[col] = df[col].fillna(df[col].median())
# Removing columns that have high correlations and similar distributions to
# columns that are kept.
# NOTE(review): the original comments say WindSpeed is skipped because its
# distribution differs from ActivePower, yet WindSpeed is listed here and is
# therefore dropped - confirm which one is intended.
to_drop = ['GearboxOilTemperature','Blade2PitchAngle', 'Blade3PitchAngle','GeneratorWinding1Temperature',
           'RotorRPM','NacellePosition','WindSpeed',"TurbineStatus"]
df_new = df.drop(columns=to_drop)
df = df_new
# Remove outliers with the 1.5 * IQR rule on every column except Time.
# Comparisons against NaN are False, so a missing value never flags a row.
# Noting that Blade1PitchAngle had more than 50% missing values
cols = [c for c in df.columns if c != "Time"]
Q3 = df[cols].quantile(0.75)
Q1 = df[cols].quantile(0.25)
iqr = Q3 - Q1
isOutlier = (df[cols] < (Q1 - 1.5 * iqr)) | (df[cols] > (Q3 + 1.5 * iqr))
# .copy() makes the filtered frame independent, so adding the date-part
# columns below does not raise SettingWithCopyWarning.
df = df[~isOutlier.any(axis=1)].copy()
df["year"] = df.Time.dt.year
df["month"] = df.Time.dt.month
df["day"] = df.Time.dt.day

In [3]:
# Fraction of missing values per column, computed separately for each year.
ncdf = pd.DataFrame()
for year in (2018, 2019, 2020):
    yearRows = df[df.year == year]
    ncdf[year] = yearRows.isnull().sum() / len(yearRows)

# Category orders used for the year (fill) and attribute (axis) columns.
ordering = CategoricalDtype([2020,2019,2018], ordered=False)
attributeOrdering = CategoricalDtype(["WindDirection","BearingShaftTemperature", "GearboxBearingTemperature", "GeneratorRPM", "GeneratorWinding2Temperature", "HubTemperature", "MainBoxTemperature", "Blade1PitchAngle"], ordered=False)

# Discard the helper/date columns and the columns excluded from this view,
# then reshape to long form: one row per (attribute, year).
ncdf = (
    ncdf
    .drop(index=["year","month","day","Time","AmbientTemperatue","ActivePower","ReactivePower"])
    .reset_index()
    .melt(id_vars="index")
)
ncdf["variable"] = ncdf["variable"].astype(ordering)
# Fractions -> percentages.
ncdf["value"] *= 100
ncdf["index"] = ncdf["index"].astype(attributeOrdering)
ncdf


Unnamed: 0,index,variable,value
0,BearingShaftTemperature,2018,89.494419
1,Blade1PitchAngle,2018,100.0
2,GearboxBearingTemperature,2018,89.494419
3,GeneratorRPM,2018,89.508693
4,GeneratorWinding2Temperature,2018,89.460162
5,HubTemperature,2018,89.497274
6,MainBoxTemperature,2018,89.491564
7,WindDirection,2018,24.080048
8,BearingShaftTemperature,2019,1.259687
9,Blade1PitchAngle,2019,36.57759


In [4]:
# Horizontal dodged bars: one bar per attribute and year, filled by year.
plot = (
    ggplot(ncdf)
    + aes(x="index", y="value", fill="variable")
    + geom_col(position=position_dodge2())
    + coord_flip()
    + labs(x="Attribute", y="Percentage of Null Values", fill="Year", title="Percent of Null Values by the Year")
)
# ggsave(plot=...) simply forwards to the plot's own save method.
plot.save(filename="Plots/Unique/nullpercents.png")



In [5]:
# Fraction of missing values per column for each calendar month of 2018.
monthlyNullData = pd.DataFrame()
firstYear = df[df.year==2018]
for month in range(1, 13):
    monthRows = firstYear[firstYear.month == month]
    monthlyNullData[month] = monthRows.isnull().sum() / len(monthRows)

# Remove the date helper columns and the columns excluded from this view,
# then expose the attribute names as a regular "index" column.
monthlyNullData = (
    monthlyNullData
    .drop(index=["year","month","day","Time","AmbientTemperatue","ActivePower","ReactivePower"])
    .reset_index()
)
monthlyNullData

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,10,11,12
0,BearingShaftTemperature,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.463245,0.825813,1.0,1.0,0.586595
1,Blade1PitchAngle,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,GearboxBearingTemperature,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.463245,0.825813,1.0,1.0,0.586595
3,GeneratorRPM,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.462899,0.825813,1.0,1.0,0.588285
4,GeneratorWinding2Temperature,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.460818,0.825289,1.0,1.0,0.58575
5,HubTemperature,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.462899,0.825813,1.0,1.0,0.587159
6,MainBoxTemperature,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.463245,0.825813,1.0,1.0,0.586314
7,WindDirection,0.096005,0.086755,0.083483,0.075326,0.068044,0.089326,0.078043,0.460125,0.332634,0.370178,0.439404,0.454238


In [6]:
# Pick the BearingShaftTemperature row by name instead of by row position, so
# the selection keeps matching the plot title if the column order changes.
specData = monthlyNullData[monthlyNullData["index"] == "BearingShaftTemperature"]
# Long form: one row per month ("variable") with its null fraction ("value").
specData = specData.melt(id_vars='index')
# Fractions -> percentages.
specData["value"] *= 100

In [7]:
# Bar chart of the monthly null percentage held in specData.
plot = (
    ggplot(specData)
    + aes(x="variable", y="value")
    + geom_col()
    + labs(x="Month of the year", y="Percentage of Null Values", title="Bearing Shaft Temperature Null Values by Month in 2018")
)
# ggsave(plot=...) simply forwards to the plot's own save method.
plot.save(filename="Plots/Unique/BearingShaft2018.png")



In [8]:
# Fraction of missing values per column for each calendar month of 2019.
monthlyNullData2019 = pd.DataFrame()
secondYear = df[df.year==2019]
for month in range(1, 13):
    monthRows = secondYear[secondYear.month == month]
    monthlyNullData2019[month] = monthRows.isnull().sum() / len(monthRows)

# Remove the date helper columns and the columns excluded from this view,
# then expose the attribute names as a regular "index" column.
monthlyNullData2019 = (
    monthlyNullData2019
    .drop(index=["year","month","day","Time","AmbientTemperatue","ActivePower","ReactivePower"])
    .reset_index()
)
monthlyNullData2019

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,10,11,12
0,BearingShaftTemperature,0.006351,0.005055,0.005355,0.004082,0.018064,0.011755,0.009155,0.002931,0.004555,0.1432,0.000316,0.006871
1,Blade1PitchAngle,1.0,1.0,1.0,1.0,0.276373,0.017072,0.010329,0.022228,0.03679,0.010186,0.001265,0.007808
2,GearboxBearingTemperature,0.006351,0.005055,0.005355,0.004082,0.017702,0.011755,0.009155,0.002931,0.004555,0.1432,0.000316,0.006871
3,GeneratorRPM,0.008257,0.003568,0.002537,0.007483,0.019509,0.010355,0.008216,0.002198,0.005256,0.146195,0.000316,0.007183
4,GeneratorWinding2Temperature,0.005716,0.003568,0.002537,0.001701,0.017702,0.010355,0.008216,0.00171,0.003504,0.142001,0.000316,0.006558
5,HubTemperature,0.006351,0.013084,0.006483,0.008844,0.020231,0.011755,0.009155,0.00342,0.005256,0.14979,0.001582,0.010618
6,MainBoxTemperature,0.006669,0.007434,0.006201,0.004422,0.018786,0.011755,0.009155,0.002687,0.004555,0.143799,0.000949,0.006871
7,WindDirection,0.455065,0.405888,0.392897,0.317347,0.296604,0.345928,0.380986,0.41915,0.402593,0.188137,0.020247,0.099001


In [9]:
# Pick the Blade1PitchAngle row by name instead of by row position, so the
# selection keeps matching the plot title if the column order changes.
pitchData = monthlyNullData2019[monthlyNullData2019["index"] == "Blade1PitchAngle"]
# Long form: one row per month ("variable") with its null fraction ("value").
pitchData = pitchData.melt(id_vars='index')
# Fractions -> percentages.
pitchData["value"] *= 100
plot = (
    ggplot(pitchData)
    + aes(x="variable", y="value")
    + geom_col()
    + labs(x="Month of the year", y="Percentage of Null Values", title="Blade Pitch Null Values by Month in 2019")
)
# ggsave(plot=...) simply forwards to the plot's own save method.
plot.save(filename="Plots/Unique/BladePitch2019.png")



In [10]:
# Pick the BearingShaftTemperature row by name instead of by row position, so
# the selection keeps matching the plot title if the column order changes.
bearData = monthlyNullData2019[monthlyNullData2019["index"] == "BearingShaftTemperature"]
# Long form: one row per month ("variable") with its null fraction ("value").
bearData = bearData.melt(id_vars='index')
# Fractions -> percentages.
bearData["value"] *= 100
# ylim(0, 100) pins the y axis to the full percentage range.
# The title previously misspelled "Temperature" as "Tempurture".
plot = (
    ggplot(bearData)
    + aes(x="variable", y="value")
    + geom_col()
    + labs(x="Month of the year", y="Percentage of Null Values", title="Bearing Shaft Temperature Null Values by Month in 2019")
    + ylim(0,100)
)
# ggsave(plot=...) simply forwards to the plot's own save method.
plot.save(filename="Plots/Unique/BearingShaft2019.png")



## 2020 Null Counts

In [11]:
# Fraction of missing values per column for each calendar month of 2020.
# A month with no rows divides zero nulls by zero rows and comes out as NaN
# (presumably why months 4-12 are empty in the output below).
monthlyNullData2020 = pd.DataFrame()
thirdYear = df[df.year==2020]
for month in range(1, 13):
    monthRows = thirdYear[thirdYear.month == month]
    monthlyNullData2020[month] = monthRows.isnull().sum() / len(monthRows)

# Remove the date helper columns and the columns excluded from this view,
# then expose the attribute names as a regular "index" column.
monthlyNullData2020 = (
    monthlyNullData2020
    .drop(index=["year","month","day","Time","AmbientTemperatue","ActivePower","ReactivePower"])
    .reset_index()
)
monthlyNullData2020

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,10,11,12
0,BearingShaftTemperature,0.053152,0.002245,0.102114,,,,,,,,,
1,Blade1PitchAngle,0.005075,0.001403,0.0,,,,,,,,,
2,GearboxBearingTemperature,0.053152,0.002245,0.102114,,,,,,,,,
3,GeneratorRPM,0.053152,0.002245,0.101518,,,,,,,,,
4,GeneratorWinding2Temperature,0.053152,0.002245,0.101518,,,,,,,,,
5,HubTemperature,0.058226,0.004771,0.102411,,,,,,,,,
6,MainBoxTemperature,0.053419,0.002526,0.102114,,,,,,,,,
7,WindDirection,0.088675,0.047993,0.002084,,,,,,,,,
