In [56]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from plotnine import *
%matplotlib inline

# Data source: https://www.kaggle.com/datasets/theforcecoder/wind-power-forecasting?resource=download

In [48]:
# Load the raw turbine readings from the CSV export.
df = pd.read_csv("Data/Turbine_Data.csv")
# The timestamp column arrives as 'Unnamed: 0'; name it 'Time' and parse it
# into datetime values.
df = df.rename(columns={'Unnamed: 0': 'Time'})
df['Time'] = pd.to_datetime(df['Time'])
# Remove the columns that provide no information (only a single value).
df = df.drop(columns=["WTG", "ControlBoxTemperature"])
# Keep only the rows that hold at least one non-null value besides the timestamp.
noTime = df.drop(columns='Time')
has_any_reading = noTime.notnull().any(axis=1)
df = df.loc[has_any_reading]
# Nulls are reported to be a small share of ActivePower / ReactivePower, so rows
# missing either of them are dropped rather than imputed.
df = df.dropna(subset=["ActivePower", "ReactivePower"])
# Impute sparse gaps: every column whose share of nulls is above 0% but below 5%
# gets its nulls replaced by that column's median (the median is chosen because
# these distributions are described as skewed).
nullPer = df.isnull().sum() / df.shape[0]
# Select the column labels straight from the boolean mask. Selecting by mask
# (rather than looking a column up by its null fraction) keeps every qualifying
# column even when two columns happen to share the same fraction.
lowNullCount = nullPer[(nullPer > 0.0) & (nullPer < .05)].index.tolist()
if lowNullCount:
    # fillna with a Series fills each column named in the Series index with the
    # matching value. The result is assigned back to df, because calling
    # fillna(inplace=True) on df[col] acts on an intermediate object and is not
    # guaranteed to update df itself.
    df = df.fillna(df[lowNullCount].median())
# Columns removed because they were judged highly correlated with, and similarly
# distributed to, columns that stay in the frame.
# NOTE(review): the original note says that, among the correlated columns, the
# ones with the LOWEST null values are removed - confirm that direction is intended.
# NOTE(review): WindSpeed is in this list and is therefore dropped, although the
# original note said it would be skipped because its distribution looks much
# different from ActivePower - confirm which is intended.
to_drop = [
    'GearboxOilTemperature',
    'Blade2PitchAngle',
    'Blade3PitchAngle',
    'GeneratorWinding1Temperature',
    'RotorRPM',
    'NacellePosition',
    'WindSpeed',
    "TurbineStatus",
]
# df_new holds the reduced frame under its own name; df continues as the
# working frame for the following steps.
df_new = df.drop(columns=to_drop)
df = pd.DataFrame(df_new)
# Remove outliers with the interquartile-range (IQR) rule: a row is discarded
# when any non-time column lies more than 1.5 * IQR below Q1 or above Q3.
cols = list(df.columns)
cols.remove("Time")
# Noting that Blade1PitchAngle had more than 50% missing values.
# Comparisons against NaN evaluate to False, so a missing value never marks a
# row as an outlier.
Q3 = df[cols].quantile(0.75)
Q1 = df[cols].quantile(0.25)
iqr = Q3 - Q1
lower_fence = Q1 - 1.5 * iqr
upper_fence = Q3 + 1.5 * iqr
is_outlier = ((df[cols] < lower_fence) | (df[cols] > upper_fence)).any(axis=1)
# Take an explicit copy of the filtered rows so the column assignments below
# write to an independent frame instead of a slice of the previous one
# (avoids SettingWithCopyWarning).
df = df.loc[~is_outlier].copy()
# Calendar parts of the timestamp, used for per-year grouping later on.
df["year"] = df["Time"].dt.year
df["month"] = df["Time"].dt.month
df["day"] = df["Time"].dt.day

In [64]:
# Fraction of null values per column, computed separately for each year.
null_share_by_year = {}
for yr in (2018, 2019, 2020):
    rows_for_year = df[df.year == yr]
    null_share_by_year[yr] = rows_for_year.isnull().sum() / rows_for_year.shape[0]

# One row per (attribute, year) pair: drop the attributes that are not plotted,
# then reshape from wide (one column per year) to long form.
ncdf = (
    pd.DataFrame(null_share_by_year)
    .drop(index=["year", "month", "day", "Time", "AmbientTemperatue", "ActivePower", "ReactivePower"])
    .reset_index()
    .melt(id_vars="index")
)
# Treat the year as an unordered categorical with categories listed 2020, 2019, 2018.
ordering = CategoricalDtype(categories=[2020, 2019, 2018], ordered=False)
ncdf = ncdf.assign(
    variable=ncdf["variable"].astype(ordering),
    # Express the fraction as a percentage.
    value=ncdf["value"] * 100,
)
ncdf


Unnamed: 0,index,variable,value
0,BearingShaftTemperature,2018,89.494419
1,Blade1PitchAngle,2018,100.0
2,GearboxBearingTemperature,2018,89.494419
3,GeneratorRPM,2018,89.508693
4,GeneratorWinding2Temperature,2018,89.460162
5,HubTemperature,2018,89.497274
6,MainBoxTemperature,2018,89.491564
7,WindDirection,2018,24.080048
8,BearingShaftTemperature,2019,1.259687
9,Blade1PitchAngle,2019,36.57759


In [79]:
# Horizontal dodged bar chart: one bar per attribute and year, showing the
# percentage of null values held in ncdf.
plot = (
    ggplot(ncdf)
    + aes(x="index", y="value", fill="variable")
    + geom_col(position=position_dodge2())
    + coord_flip()
    + labs(
        x="Attribute",
        y="Percentage of Null Values",
        fill="Year",
        title="Percent of Null Values by the Year",
    )
)
# Save through the plot's own save() method instead of the module-level ggsave
# helper, so the cell does not depend on that helper being exported by the
# installed plotnine release.
# NOTE(review): the target directory Plots/Unique/ is assumed to exist already.
plot.save(filename="Plots/Unique/nullpercents.png")

