In [1]:
# Stephen Spaulding
# Please see below for my code following the outline proposed by the template and in the midterm document. 
# Load libraries and sub-libraries that might be used subsequently 

import numpy                   as np
import pandas                  as pd 
import matplotlib              as mpl
import matplotlib.pyplot       as plt

from   sklearn.svm             import SVC
from   scipy                   import stats
from   pandas.plotting         import scatter_matrix
from   sklearn.metrics         import confusion_matrix
from   sklearn.model_selection import GridSearchCV
from   sklearn.metrics         import accuracy_score

%matplotlib inline

In [2]:
echo "# Machine-Learning-Final-Project" >> README.md
git init
git add README.md
git commit -m "first commit"
git branch -M main
git remote add origin https://github.com/sfspaulding/Machine-Learning-Final-Project.git
git push -u origin main
                

SyntaxError: invalid syntax (<ipython-input-2-75852efc022d>, line 1)

## Section 1. Read and explore data

In [None]:
# Load the turbine wind/power measurements from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="Turbine_Data.csv")

In [None]:
# Split the DataFrame shape into its row count and column count
row, col = data.shape[0], data.shape[1]

In [None]:
# Report the dataset dimensions
print(f"Number of Rows   : {row}")
print(f"Number of Columns: {col}")

In [None]:
# Preview the first few rows of the loaded data (this cell shows rows only, not the dimensions)
data.head()

In [None]:
# Summary statistics of the "raw" data, i.e. before any NaN removal or wind-speed filtering
data.describe()

In [None]:
# Identify features and labels: predict the "ActivePower" column using only the "WindSpeed" column.
# NOTE(review): the units and the measurement height are not visible in this notebook -- confirm against the data source.
X, y = data["WindSpeed"], data["ActivePower"]

In [None]:
# Inspect the label column: its distinct values, how often each one occurs
# (rarest first), and its overall distribution.
group = y.unique()
y_vals = y.value_counts(ascending=True)

print(group)
print(y_vals)

# Histogram of the label values
y.hist()

In [None]:
# Remove rows where either the label or the feature is missing.
# The original mask was built from y alone, so a NaN wind speed paired with a
# valid power reading stayed in X and would propagate NaN into any later fit.
n_nan = y.isnull().sum()
print('number of nan in labels: ' + str(n_nan))
print('number of nan in features: ' + str(X.isnull().sum()))

# X and y come from the same DataFrame, so the two masks share one index
nan_elems = y.isnull() | X.isnull()

X = X[~nan_elems]
y = y[~nan_elems]

In [None]:
# Number of elements in X after the NaN filtering
X.size

In [None]:
# Number of elements in y; X and y were filtered with the same mask, so this should equal X.size
y.size

In [None]:
# Plot the power series against the row index.
# The original plotted X (wind speed) under the title 'Power'; y holds the
# power column. The x=/y= keywords were dropped because Series.plot only uses
# them for DataFrames.
y.plot(style='o', grid=True, title='Power')

In [None]:
# Plot the wind-speed series against the row index.
# The original plotted y (power) and titled it 'Power'; X holds the wind-speed
# column. The x=/y= keywords were dropped because Series.plot only uses them
# for DataFrames.
X.plot(style='o', grid=True, title='Wind speed')

## Section 2. Linear Regression

In [None]:
# Scatter plot of the label (y) against the feature (X)
plt.scatter(x=X, y=y)

In [None]:
# Predicting power over the full range might be too difficult after all,
# so restrict the dataset to wind speeds of at most 15 m/s.
# NOTE(review): X and y are rebuilt from `data` here and the filter looks at
# WindSpeed only, so rows with a missing ActivePower are not excluded by this
# cell -- confirm that is intended.
low_wind = data['WindSpeed'] <= 15
data1 = data.loc[low_wind]
X, y = data1['WindSpeed'], data1['ActivePower']

# Number of samples left after filtering
sample_size = X.size
sample_size

In [None]:
# Show the filtered dataset as blue dots with labelled axes
plt.plot(X, y, "b.")
plt.title("windspeed vs. power")
plt.xlabel("windspeed (m/s)", fontsize=18)
plt.ylabel("power (MW)", rotation=90, fontsize=18)
plt.axis([0, 39, 0, 17])  # fixed view window: x in [0, 39], y in [0, 17]
plt.show()

### Perform a fitting using the Normal Equations

In [None]:
# Build the design matrix: a column of ones (x0 = 1 for each instance) next to
# the feature. The row count is taken from X itself; the original hard-coded
# 94291, which breaks as soon as the filtering upstream changes.
X_b = np.c_[np.ones((len(X), 1)), X]
# Use Equation 4-4 (Normal Equation) to determine theta: (X^T X)^-1 X^T y
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

In [None]:
# Print the dimensions of the two-column design matrix X_b and display it.
# The original reported data1.shape (the full table, not the two-column
# matrix), never displayed the matrix, and left a stray trailing comma after
# the first print.
nrow, ncol = X_b.shape
print("nrow: " + str(nrow))
print("ncol: " + str(ncol))
X_b

In [None]:
# Display theta_best: the Normal Equation solution, ordered like the columns of X_b (bias term first, then the feature)
theta_best

In [None]:
# Evaluate the fitted line at the two feature values 0 and 20
X_new = np.array([[0], [20]])
# Prepend the bias column (x0 = 1) so the layout matches X_b
X_new_b = np.hstack((np.ones((2, 1)), X_new))
y_predict = X_new_b @ theta_best
y_predict

In [None]:
# Draw the fitted line (red) first, then the data points (blue), in one call
plt.plot(X_new, y_predict, "r-", X, y, "b.")
plt.axis([0, 17, 0, 17])
plt.show()

In [None]:
# Perform regression using sklearn
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
# Fit on the training data. The original call was fit(X_new_b, y_predict),
# i.e. it fitted the two points predicted in the previous step and never saw
# the dataset. LinearRegression estimates its own intercept and expects a 2-D
# feature array, so X is passed as a single column rather than as X_b.
lin_reg.fit(np.asarray(X).reshape(-1, 1), y)
lin_reg.intercept_, lin_reg.coef_

In [None]:
# Least-squares solution of X_b @ theta = y via np.linalg.lstsq, for comparison with theta_best
lstsq_result = np.linalg.lstsq(X_b, y, rcond=1e-6)
theta_best_svd, residuals, rank, s = lstsq_result
theta_best_svd

## Section 3. Polynomial interpolation

In [None]:
# Rebuild features and labels for the polynomial fit.
# The original indexed data['wind speed at 100m (m/s)'] and data['power (MW)'],
# but every other cell reads these quantities from the "WindSpeed" and
# "ActivePower" columns. Use those names, keep the <= 15 m/s subset (data1),
# and drop rows with a missing value in either column, because scikit-learn
# estimators reject NaN input.
poly_rows = data1[['WindSpeed', 'ActivePower']].dropna()
X = poly_rows['WindSpeed']
y = poly_rows['ActivePower']

In [None]:
# Turn X into an (n_samples, 1) column array, the 2-D layout scikit-learn expects.
# np.asarray accepts both a Series and an ndarray, so re-running this cell is
# harmless; the original X.values failed on the second run because X was
# already an ndarray by then.
X = np.asarray(X).reshape(-1, 1)

In [None]:
# Expand the feature with degree-2 polynomial terms (no bias column) for a quadratic fit
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False)
poly_features.fit(X)
X_poly = poly_features.transform(X)
# First original sample, to compare with its expanded version
X[0]

In [None]:
# First row of the polynomial-expanded features, for comparison with X[0]
X_poly[0]

In [None]:
# Fit an ordinary linear regression on the polynomial features
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression().fit(X_poly, y)
lin_reg.intercept_, lin_reg.coef_

In [None]:
# Predict on an evenly spaced grid of 100 wind speeds in [0, 16] and overlay
# the fitted curve on the data
X_new = np.linspace(0, 16, 100).reshape(100, 1)
X_new_poly = poly_features.transform(X_new)
y_new = lin_reg.predict(X_new_poly)
plt.plot(X, y, "b.")
plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
# Plain-text labels: inside $...$ matplotlib typesets the text as math, which
# drops the spaces and sets the words in italics
plt.xlabel("Windspeed (m/s)", fontsize=18)
plt.ylabel("Wind Power (MW)", rotation=90, fontsize=18)
plt.legend(loc="upper left", fontsize=14)
plt.axis([0, 16, 0, 20])

plt.show()

## Section 4. Evaluation and Pipeline

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y, step=1):
    """Plot training and validation RMSE as the training subset grows.

    The data is split 80/20 with a fixed random_state of 10. For each
    training-set size m, the model is refit on the first m training rows and
    then scored (mean squared error) on those same rows and on the whole
    validation set. The square roots of both error series are drawn on the
    current matplotlib axes.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
    X, y : features and targets accepted by train_test_split and by the model
    step : int, default 1
        Spacing between successive training-set sizes. The default evaluates
        every size; each size costs one full refit plus a validation
        prediction, so pass a larger step on big datasets.

    Raises
    ------
    ValueError
        If step is smaller than 1.

    Returns
    -------
    None
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)

    n_train = len(X_train)
    sizes = list(range(1, n_train + 1, step))
    # Always finish on the complete training set; the original range stopped
    # one sample short of it.
    if sizes and sizes[-1] != n_train:
        sizes.append(n_train)

    train_errors, val_errors = [], []
    for m in sizes:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    # Plot against the actual sizes so the x axis matches its label (the
    # original plotted against the list index, i.e. size - 1).
    plt.plot(sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(sizes, np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)

In [None]:
from sklearn.pipeline import Pipeline

# Degree-2 polynomial expansion followed by ordinary linear regression
polynomial_regression = Pipeline([
        ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
        ("lin_reg", LinearRegression()),
    ])

# Display the learning curves of the degree-2 pipeline. The original built the
# pipeline and then passed a plain LinearRegression to plot_learning_curves,
# so the polynomial model was never evaluated.
plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 10000, 0, 14])
plt.show()

In [None]:
# Repeat evaluation with degree=3
from sklearn.pipeline import Pipeline

polynomial_regression = Pipeline([
        ("poly_features", PolynomialFeatures(degree=3, include_bias=False)),
        ("lin_reg", LinearRegression()),
    ])

# Display the learning curves of the degree-3 pipeline. The original passed
# lin_reg here, which reproduced the plain linear-model curves instead of
# evaluating the cubic model defined just above.
plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 10000, 0, 14])
plt.show()

## Section 5. Discussion of results
### What is the benefit of inspecting the data before applying any ML technique?
### How would you go about improving your model to predict power over the full range (including WS> 15 m/s) of windspeeds?

In [None]:
#Discussion Below. I included this text in my Midterm word document in case it's easier to read there.

#    The benefit of inspecting the data can best be summed up by the phrase “begin with the end in mind”.
#Inspecting the data allows us to display and visualize it, allowing one to better understand the problem and 
#the nature of the relationship between the variables in question, as well as the detection of potential noise/outlier data 
#that needs to be scrubbed.

#    In this case we see what appears to be a roughly cubic relationship between wind speed and power output
#over the 0-15 m/s range of wind speeds. We also notice that the power output becomes relatively constant
#following what is effectively a step function at wind speeds greater than 15 m/s.
#Right away we know that the best model to make predictions about power output between 0-15 m/s wind speeds
#is likely not going to be a linear model, since the relationship is nakedly non-linear.
#Furthermore, we can intuit that the regions for which wind speed is above 15 m/s
#are not predictive of power output in regions where the wind speed is less than 15 m/s.
#Thus, we know to exclude that data from our training set and not waste time having
#the model optimize itself and eventually weight this data to zero.
#By inspecting the data at the outset, we avoid model development pathways
#that are strictly suboptimal and thereby save valuable time and computational resources.

#    It is readily apparent that the power output at wind speeds greater than 15 m/s is not
#useful in making predictions regarding wind speeds below this threshold.
#Thus, I would continue to have the model disregard data for wind speeds above 15 m/s
#when performing training, keeping with a model that is maximally accurate
#at predicting power outputs in the 0-15 m/s range
#but unable to accurately make predictions about power output at windspeeds above 15 m/s.
#Once optimized, I would manually hard code the model to have it predict 16MW
#(or whatever the exact average value is in this region) for wind speeds between 15 m/s and some upper threshold.
#The upper threshold I reference is the average wind speed at which the turbine shuts down
#and the system generates zero power output.
#This could be written as a simple step function as I’ve described it,
#or perhaps it would increase accuracy to have the model have another cubic curve around the region of power loss.
