In [1]:
# READ IN THE CSV

import pandas as pd
import numpy as np

# Load the raw dataset. The path is relative, so the CSV must be in the
# notebook's working directory.
df_sf = pd.read_csv('Smart_Farm_dataset.csv')

In [None]:
# DO ANY PRE OBSERVATION HERE

# Bare expression: the notebook renders the frame as this cell's output.
df_sf

In [2]:
# ONE HOT REPRESENT (PREVIOUSLY) STRING ENCODED LOC AND GEN COLUMNS

# Build one indicator column per distinct 'Loc' value and per distinct 'Gen' value.
# The indicator columns are named after the raw category labels (no prefix) and are
# appended to the right of the existing columns, 'Loc' indicators first.
# NOTE: this cell rebinds df_sf and removes 'Loc'/'Gen', so it cannot be run twice
# without re-running the load cell first.
loc_indicators = pd.get_dummies(df_sf['Loc'])
gen_indicators = pd.get_dummies(df_sf['Gen'])
df_sf = pd.concat([df_sf, loc_indicators, gen_indicators], axis=1)

# DROP ANY COLUMNS HERE
COLUMNS_DROPPED = ['Date', 'Loc', 'Gen', 'PlantID']
df_sf = df_sf.drop(columns=COLUMNS_DROPPED)

In [None]:
# THIS MODULE WILL SPLIT UP EACH 100 POINTS IN THE DATAFRAME INTO SEPARATE CYCLES SO THAT WE CAN DO TIME SHIFTING
# ON THE APPROPRIATE CYCLES
# Four consecutive 100-row slices. `.loc` slicing is label-based and includes the end
# label, so each slice covers labels first_label .. first_label + 99.
cycles = [df_sf.loc[first_label:first_label + 99] for first_label in range(0, 400, 100)]

In [3]:
from pandas import DataFrame
from pandas import concat

def series_to_supervised(data, col_names, n_in=1, n_out=1, dropnan=True):
    """
    Reframe a time series as a supervised-learning table of lagged inputs and outputs.

    Arguments:
        data: Observations as a list or a 2-D array-like (one row per time step).
        col_names: Indexable collection of variable names; entry j labels variable j.
        n_in: Number of lag steps (t-n_in ... t-1) emitted as input columns.
        n_out: Number of steps (t ... t+n_out-1) emitted as output columns.
        dropnan: Whether to drop the rows left incomplete by the shifting.
    Returns:
        Pandas DataFrame whose columns are named '<name><j+1>(t-k)', '<name><j+1>(t)'
        and '<name><j+1>(t+k)', where j is the zero-based position of the variable.
    """
    if type(data) is list:
        n_vars = 1
    else:
        n_vars = data.shape[1]
    frame = DataFrame(data)
    print("vars_names", col_names)

    def _labels(suffix):
        # One label per variable: name, 1-based position, then the time suffix.
        return ['%s%d%s' % (col_names[j], j + 1, suffix) for j in range(n_vars)]

    shifted, names = [], []
    # Past steps, oldest first: (t-n_in) ... (t-1).
    for lag in range(n_in, 0, -1):
        shifted.append(frame.shift(lag))
        names.extend(_labels('(t-%d)' % lag))
    # Current and future steps: (t), (t+1), ...
    for lead in range(n_out):
        shifted.append(frame.shift(-lead))
        names.extend(_labels('(t)' if lead == 0 else '(t+%d)' % lead))

    agg = concat(shifted, axis=1)
    agg.columns = names
    # Shifting leaves NaN at the edges; optionally discard those rows.
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
# CREATE THE DATAFRAME TO BE USED FOR TRAIN AND TEST SPLITTING

# Lag each cycle separately (10 past steps -> current step) so that no lag window
# reaches back into the previous cycle.
supervised_frames = [
    series_to_supervised(cycle_frame.values, cycle_frame.columns, 10, 1)
    for cycle_frame in cycles
]
# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration. An empty `cycles` list still yields an
# empty frame, as before.
data = pd.concat(supervised_frames, axis=0) if supervised_frames else pd.DataFrame()

In [None]:
# TRAIN/TEST SPLIT THAT HOLDS OUT THE LAST ROWS OF EVERY CYCLE.
# After the index reset, cycle c occupies positions
# ROWS_PER_CYCLE*c .. ROWS_PER_CYCLE*c + ROWS_PER_CYCLE - 1. The final
# TEST_ROWS_PER_CYCLE rows of each cycle go to the test set, the rest to training.
# The constants reproduce the previous hand-written index lists
# (0-84 / 85-89, 90-174 / 175-179, ...); adjust them if the cycle layout changes.
N_CYCLES = 4
ROWS_PER_CYCLE = 90        # assumes 100-row cycles minus the 10 lag rows dropped per cycle
TEST_ROWS_PER_CYCLE = 5

new_data = data.reset_index(drop=True)

positions = np.arange(N_CYCLES * ROWS_PER_CYCLE)
is_test_position = positions % ROWS_PER_CYCLE >= ROWS_PER_CYCLE - TEST_ROWS_PER_CYCLE
train_rows = new_data.iloc[positions[~is_test_position]]
test_rows = new_data.iloc[positions[is_test_position]]

# Columns whose name contains '(t)' hold current-step values and are excluded from the
# inputs. The pattern is a raw string: '\(' is not a valid escape sequence in a
# normal string literal and triggers a warning on recent Python versions.
is_feature_column = ~new_data.columns.str.contains(r'\(t\)')

y_train = train_rows.loc[:, 'GrowthRate2(t)']
X_train = train_rows.loc[:, is_feature_column]
y_test = test_rows.loc[:, 'GrowthRate2(t)']
X_test = test_rows.loc[:, is_feature_column]

In [None]:
# Display only: the result of reset_index() is not assigned, so `data` is unchanged.
data.reset_index()

In [5]:
# Lag the whole frame in one pass: 10 past steps as inputs, the current step as output.
# NOTE(review): the shift runs over the entire frame, so lag windows are not confined
# to a single 100-row cycle -- confirm this is intended.
lag_inputs, lead_outputs = 10, 1
values = df_sf.values
data = series_to_supervised(values, df_sf.columns, n_in=lag_inputs, n_out=lead_outputs)

vars_names Index(['DAP', 'GrowthRate', 'Temperature', 'Solar.Rad', 'Humidity', 'Rainfall',
       'K', 'N', 'A', 'B'],
      dtype='object')


In [6]:
#TRAIN AND TEST SPLIT RANDOMLY (INSTEAD)

# Inputs are the lagged columns only. Any column whose name contains '(t)' is a
# current-step value and is excluded. The pattern is a raw string because '\(' is not
# a valid escape sequence in a normal string literal.
X = data.loc[:, ~data.columns.str.contains(r'\(t\)')]
y = data.loc[:, 'GrowthRate2(t)']

# `sklearn.cross_validation` was removed in scikit-learn 0.20 and replaced by
# `sklearn.model_selection`. The old module is kept only as a fallback for
# installations that predate the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.95, test_size = 0.05, random_state = 42)
# Seed sweep kept for reference:
# for i in range(0, 50):
#     print(i)
#     X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.95, test_size = 0.05, random_state = i)
#     RF_train(X_train, X_test, y_train, y_test)



In [7]:
# ERROR COMPUTATION FUNCTIONS

from sklearn.metrics import mean_squared_error
def evaluate_errors(prediction, actual):
    """
    Print the RMSE and the mean absolute percentage error, and return the
    per-sample absolute percentage errors.

    Both inputs are flattened to 1-D float arrays and compared position by position.
    This matters for pandas inputs: subtracting a DataFrame/Series from another aligns
    on row and column labels, so inputs with different labels would yield NaN instead
    of element-wise errors.

    Arguments:
        prediction: Predicted values (array-like, Series or single-column DataFrame).
        actual: True values with the same number of elements as `prediction`.
    Returns:
        1-D NumPy array of |prediction - actual| / actual * 100, one entry per sample.
        Entries are inf or nan where `actual` is 0.
    Raises:
        ValueError: If the two inputs do not contain the same number of elements.
    """
    predicted = np.asarray(prediction, dtype=float).ravel()
    observed = np.asarray(actual, dtype=float).ravel()
    if predicted.shape != observed.shape:
        raise ValueError(
            "prediction and actual must have the same number of elements, got %d and %d"
            % (predicted.size, observed.size)
        )

    residual = predicted - observed
    print("RMSE Error: ", np.sqrt(np.mean(residual ** 2)))
    avg_error_vector = np.absolute((residual / observed) * 100)
    print("Average Error details:\n", np.mean(avg_error_vector))
    return avg_error_vector

In [8]:
# FEATURE IMPORTANCE FUNCTIONS

def get_feature_importances(regr, feature_names=None):
    """
    Print every feature's importance in ascending order and return the table.

    Arguments:
        regr: Fitted estimator exposing `feature_importances_`.
        feature_names: Optional names, one per importance value. When omitted, the
            estimator's `feature_names_in_` attribute is used if it exists; otherwise
            the columns of the notebook-level `X_train` are used (the previous
            behaviour).
    Returns:
        DataFrame with columns 'feature' and 'feature_importance', sorted by
        ascending importance.
    Raises:
        ValueError: If the number of names differs from the number of importances.
    """
    importances = np.asarray(regr.feature_importances_, dtype=float)

    if feature_names is None:
        feature_names = getattr(regr, 'feature_names_in_', None)
    if feature_names is None:
        # Fall back to the global split, which may not be the data the model saw.
        feature_names = X_train.columns
    feature_names = list(feature_names)

    if len(feature_names) != len(importances):
        raise ValueError(
            "got %d feature names for %d importances" % (len(feature_names), len(importances))
        )

    feature_importance_df = pd.DataFrame({'feature': feature_names, 'feature_importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by=['feature_importance'])
    for feature, importance in zip(feature_importance_df['feature'],
                                   feature_importance_df['feature_importance']):
        print(feature, 'has importance: ', importance)
    return feature_importance_df

In [9]:
# MODELS RF
from sklearn.ensemble import RandomForestRegressor

def RF_train(X_train, X_test, y_train, y_test):
    """
    Fit a random forest on the training split and report its test-set errors.

    Prints the MAE and RMSE of the test predictions, then prints the fitted model's
    feature importances via get_feature_importances. Returns nothing.

    Arguments:
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for scoring.
    """
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error

    forest = RandomForestRegressor(n_estimators=20, max_depth=15, random_state=0, verbose=0, n_jobs=-1)
    forest.fit(X_train, y_train)

    # Both sides are wrapped in DataFrames before scoring; the metric functions
    # receive them as (predicted, observed).
    predicted = pd.DataFrame(forest.predict(X_test))
    observed = pd.DataFrame(y_test)
    # evaluate_errors(predicted, observed)
    print('MAE', mean_absolute_error(predicted, observed))
    print('RMSE', np.sqrt(mean_squared_error(predicted, observed)))
    get_feature_importances(forest)

# Train and score the random forest on the current split; RF_train prints the results.
RF_train(X_train, X_test, y_train, y_test)

MAE 0.11844984113493803
RMSE 0.24384904931931922
A9(t-5) has importance:  6.107092687644009e-08
N8(t-5) has importance:  1.4540044676836902e-07
B10(t-3) has importance:  1.6513145177284342e-07
K7(t-10) has importance:  1.9441809943552976e-07
K7(t-9) has importance:  2.7964191375554406e-07
A9(t-1) has importance:  3.9825739100017034e-07
N8(t-10) has importance:  4.3478085691295494e-07
A9(t-2) has importance:  6.154446603399998e-07
N8(t-6) has importance:  7.122705431483357e-07
K7(t-7) has importance:  7.269386685733827e-07
N8(t-4) has importance:  8.276870226034404e-07
A9(t-10) has importance:  8.935667008970173e-07
N8(t-9) has importance:  8.977447513172287e-07
N8(t-8) has importance:  9.153436097290487e-07
A9(t-8) has importance:  9.87353122500488e-07
B10(t-10) has importance:  9.96884837386198e-07
A9(t-3) has importance:  1.2509498177882424e-06
B10(t-6) has importance:  1.36851456444999e-06
B10(t-7) has importance:  1.5959922160435203e-06
B10(t-5) has importance:  1.8744576621720063e

In [None]:
# MODELS LIGHTGBM

from lightgbm import LGBMRegressor

# The model is fitted on log1p(target); predictions are mapped back with expm1.
clf = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
clf.fit(X_train, np.log1p(y_train))
preds = np.expm1(clf.predict(X_test))

# Compare as plain 1-D arrays. Wrapping both sides in DataFrames gave them different
# column labels (0 vs 'GrowthRate2(t)'), so the label-aligned subtraction inside
# evaluate_errors produced NaN percentage errors. It also rebound the shared `y_test`
# to a DataFrame for every later cell.
evaluate_errors(preds, np.asarray(y_test).ravel())

In [None]:
# NOTE(review): none of the cells visible above this one assigns `model`; it is only
# assigned in the Keras cells further down, so this cell depends on out-of-order
# execution.
# preds = model.predict(X_test)
preds = model.predict(X_train)

In [None]:
# Show the raw predictions, then the per-sample percentage errors against the
# training targets.
print(preds)
errs = evaluate_errors(preds, y_train)
print(errs)

In [None]:
from sklearn.neural_network import MLPRegressor

# Fixed seed: the network's weight initialisation is random, so without
# `random_state` every run of this cell reports a different score.
clf = MLPRegressor(solver='adam', activation='tanh', random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
# evaluate_errors(preds, y_test)
# print(preds)
# print(y_train)

from sklearn.metrics import mean_squared_error
# scikit-learn's signature is (y_true, y_pred); the value is the same either way for
# MSE, but the conventional order avoids surprises if the metric is swapped later.
print(mean_squared_error(y_test, preds))
# get_feature_importances(clf)

In [None]:
from sklearn.neural_network import MLPRegressor

# Fixed seed: the network's weight initialisation is random, so without
# `random_state` every run of this cell reports different errors.
clf = MLPRegressor(solver='lbfgs', activation='tanh', random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
# Pass the target as a flat array so the comparison is positional regardless of
# whether `y_test` is currently a Series or a single-column DataFrame.
evaluate_errors(preds, np.asarray(y_test).ravel())
print(preds)
print(y_test)

In [None]:
#THIS IS AN EXAMPLE OF A TIME SERIES PROBLEM WITH KERAS

import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# # convert an array of values into a dataset matrix
# def create_dataset(dataset, look_back=1):
# 	dataX, dataY = [], []
# 	for i in range(len(dataset)-look_back-1):
# 		a = dataset[i:(i+look_back), 0]
# 		dataX.append(a)
# 		dataY.append(dataset[i + look_back, 0])
# 	return numpy.array(dataX), numpy.array(dataY)
# # fix random seed for reproducibility
# numpy.random.seed(7)
# # load the dataset
# # dataframe = read_csv('Syngenta/Syngenta_2017/Experiment_dataset.csv', engine='python', skipfooter=3)
# dataset = dataframe.values
# dataset = dataset.astype('float32')
# # normalize the dataset
# scaler = MinMaxScaler(feature_range=(0, 1))
# dataset = scaler.fit_transform(dataset)
# # split into train and test sets
# train_size = int(len(dataset) * 0.67)
# test_size = len(dataset) - train_size
# train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# # reshape into X=t and Y=t+1
look_back = 12
# trainX, trainY = create_dataset(train, look_back)
# testX, testY = create_dataset(test, look_back)
# # reshape input to be [samples, time steps, features]
# trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
# testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
# create and fit the LSTM network
# NOTE(review): the model is fed X_train/y_train from the split cells. Those are
# 2-D tables and the reshape lines above are commented out, while the LSTM layer is
# declared with input_shape=(380, look_back) -- confirm the intended input layout.
model = Sequential()
model.add(LSTM(4, input_shape=(380, look_back)))
# model.add(Flatten())
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=2)
# make predictions
trainPredict = model.predict(X_train)
testPredict = model.predict(X_test)
# invert predictions
# NOTE(review): `scaler`, `trainY`, `testY` and `dataset` are not assigned anywhere in
# this cell (the lines that would define them are commented out). Among the visible
# cells they are only assigned in the later airline-passengers cell, so everything
# from here down depends on that cell having been run first.
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

In [None]:
# FORMER TRAIN AND TEST SPLIT APPROACH

# Split every 100-row cycle into its first 95 rows (train) and last 5 rows (test).
# The selection is done with one boolean mask instead of appending row by row:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop re-copies
# the accumulated frame on every row.
CYCLE_LENGTH = 100
TRAIN_ROWS_PER_CYCLE = 95

# Only complete cycles are used; trailing rows beyond the last full cycle are ignored.
n_rows_used = (len(df_sf) // CYCLE_LENGTH) * CYCLE_LENGTH
# Positional slice -- assumes df_sf still has its default 0..n-1 index, which the
# previous label-based lookups (df_sf.loc[counter]) also relied on.
complete_cycles = df_sf.iloc[:n_rows_used]

position_in_cycle = np.arange(n_rows_used) % CYCLE_LENGTH
is_train_row = position_in_cycle < TRAIN_ROWS_PER_CYCLE

features = complete_cycles.drop(columns=['GrowthRate'])
target = complete_cycles['GrowthRate'].values

X_train = features[is_train_row]
y_train = target[is_train_row]
X_test = features[~is_train_row]
y_test = target[~is_train_row]

In [None]:
#THIS IS AN EXAMPLE OF A TIME SERIES PROBLEM WITH KERAS

# Stacked LSTM for international airline passengers problem with memory
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    """
    Turn a 2-D array into (window, next value) training pairs taken from column 0.

    For each start position s, the window is dataset[s:s+look_back, 0] and the
    target is dataset[s+look_back, 0]. There are len(dataset) - look_back - 1 start
    positions, so the last possible window is not emitted.

    Returns:
        Tuple (X, Y) of NumPy arrays: X holds one window per row, Y the matching targets.
    """
    starts = range(len(dataset) - look_back - 1)
    windows = [dataset[s:s + look_back, 0] for s in starts]
    targets = [dataset[s + look_back, 0] for s in starts]
    return numpy.array(windows), numpy.array(targets)
# fix random seed for reproducibility
# Pipeline of this cell: load one column of the airline CSV, scale it to [0, 1],
# cut it into look_back-sized windows, train a two-layer stateful LSTM, then score
# and plot the predictions in the original units.
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset
# (only column index 1 is read; the last 3 lines of the file are skipped)
dataframe = read_csv('international-airline-passengers.csv', usecols=[1], engine='python', skipfooter=3)
dataset = dataframe.values
dataset = dataset.astype('float32')
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
# split into train and test sets
# (chronological split: first 67% of the rows for training, the remainder for testing)
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# reshape into X=t and Y=t+1
look_back = 3
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], 1))
# create and fit the LSTM network
batch_size = 1
model = Sequential()
# Both LSTM layers are declared stateful with a fixed batch_input_shape; the first
# returns full sequences so the second LSTM receives one input per time step.
model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True, return_sequences=True))
model.add(LSTM(4, batch_input_shape=(batch_size, look_back, 1), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# 100 passes over the data, one epoch per fit() call, in order (shuffle=False);
# the layer state is cleared explicitly after each pass.
for i in range(100):
	model.fit(trainX, trainY, epochs=1, batch_size=batch_size, verbose=2, shuffle=False)
	model.reset_states()
# make predictions
trainPredict = model.predict(trainX, batch_size=batch_size)
# clear the state again before predicting on the test windows
model.reset_states()
testPredict = model.predict(testX, batch_size=batch_size)
# invert predictions
# (the targets are wrapped in a list to make them 2-D for the scaler; the single
# row is read back with [0] below)
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
# shift test predictions for plotting
# (the offsets account for the look_back rows consumed by each windowing step plus
# the one sample create_dataset leaves out at the end of each split)
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

In [None]:
#THIS IS AN EXAMPLE OF A TIME SERIES PROBLEM WITH KERAS

from keras.models import Sequential
from keras.layers import Dense
from sklearn import datasets
import numpy
# fix random seed for reproducibility
# fix random seed for reproducibility
numpy.random.seed(7)
# The network is trained on the notebook's X_train / y_train. The former
# `datasets.load_boston()` call was dropped: its result was never used, and the loader
# was removed in scikit-learn 1.2, so the call fails on current installations.

# create model
model = Sequential()
# Size the input layer from the data instead of a hard-coded 109 so the cell keeps
# working when the number of lagged feature columns changes.
model.add(Dense(12, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
# Single linear output unit with MSE loss: this is a regression model.
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')

# Fit the model
model.fit(X_train, y_train, epochs=150, batch_size=10)
# evaluate the model on the training data
scores = model.evaluate(X_train, y_train)