## Task: To predict patient glucose levels using a Keras-based neural network.

### Original data: https://data.world/uci/pima-indians-diabetes

## Install TensorFlow

In [2]:
# Install the tensorflow R package from GitHub only when it is not already
# installed, so re-running this cell does not download and rebuild it again.
if (!requireNamespace("tensorflow", quietly = TRUE)) {
  devtools::install_github("rstudio/tensorflow")
}

Downloading GitHub repo rstudio/tensorflow@master


reticulate (NA -> 6b94066a4...) [GitHub]
tfruns     (NA -> 1.4         ) [CRAN]


Downloading GitHub repo rstudio/reticulate@master



✔  checking for file ‘/tmp/RtmpUeu48G/remotesee5c89a3/rstudio-reticulate-6b94066/DESCRIPTION’
─  preparing ‘reticulate’:
✔  checking DESCRIPTION meta-information
─  cleaning src
─  checking for LF line-endings in source and make files and shell scripts
─  checking for empty or unneeded directories
─  building ‘reticulate_1.13.0-9000.tar.gz’
   


Installing 1 packages: tfruns
Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
Skipping install of 'reticulate' from a github remote, the SHA1 (6b94066a) has not changed since last install.
  Use `force = TRUE` to force installation


✔  checking for file ‘/tmp/RtmpUeu48G/remotese64f33819/rstudio-tensorflow-1102ce9/DESCRIPTION’
─  preparing ‘tensorflow’:
✔  checking DESCRIPTION meta-information
─  checking for LF line-endings in source and make files and shell scripts
─  checking for empty or unneeded directories
─  building ‘tensorflow_1.14.0.9000.tar.gz’
   


## Install Keras

In [7]:
# Install the keras R package from GitHub only when it is not already
# installed, so re-running this cell does not download and rebuild it again.
if (!requireNamespace("keras", quietly = TRUE)) {
  devtools::install_github("rstudio/keras")
}

Downloading GitHub repo rstudio/keras@master



✔  checking for file ‘/tmp/RtmpUeu48G/remotese47c3107/rstudio-keras-95ea0b5/DESCRIPTION’
─  preparing ‘keras’:
✔  checking DESCRIPTION meta-information
─  checking for LF line-endings in source and make files and shell scripts (432ms)
─  checking for empty or unneeded directories
   Removed empty directory ‘keras/man-roxygen’
─  building ‘keras_2.2.4.1.9001.tar.gz’
   


## Load TensorFlow library

In [5]:
# Attach the tensorflow package for this session.
library(tensorflow)
# Left disabled: uncomment the call below to run install_tensorflow()
# (presumably a one-time backend setup step; confirm before enabling).
#install_tensorflow()

## Load Keras library and read the data

In [8]:
# Attach keras; the model-building functions in later cells come from it.
library(keras)

# Read the first diabetes CSV (looked up in the working directory) into a
# data frame.
diabetes1 <- read.csv(file = "pima-indians-diabetes1.csv")

## Max-Min Normalization

In [None]:
# Rescale a numeric vector to the [0, 1] interval (min-max normalisation).
#
# x: numeric vector.
# Returns a vector of the same length in which the smallest value maps to 0
# and the largest to 1. A constant vector (max == min) would divide by zero
# and produce NaN, so it is mapped to all zeros instead. Missing values are
# not removed: any NA in x makes the whole result NA.
normalize <- function(x) {
  rng <- range(x)
  span <- rng[2] - rng[1]
  # Guard the zero-width case; is.na() keeps NA/NaN spans on the normal path.
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}

# Normalise every column of diabetes1 to [0, 1] and store the result as a
# numeric matrix; later cells index it by column position and pass the pieces
# to the model.
# The former attach() call was removed: no cell refers to the columns by bare
# name, and attaching only put extra objects on the search path.
maxmindf <- as.matrix(as.data.frame(lapply(diabetes1, normalize)))

## Train-validation split

In [9]:
# Draw a label for every row of maxmindf: 1 with probability 0.7 and 2 with
# probability 0.3. The labels are drawn independently, so the group sizes are
# only approximately 70/30 and change between runs unless a seed was set
# beforehand.
ind <- sample(
  x = 2,
  size = nrow(maxmindf),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

## Build X_train, y_train, X_val, y_val

In [None]:
# Split the normalised matrix by the labels in `ind`: rows labelled 1 are used
# for training, rows labelled 2 are held out.
# Columns 1-8 are the predictors and column 9 is the target.
# NOTE(review): this assumes column 9 of diabetes1 is Glucose, because later
# cells rescale predictions with the Glucose range - confirm against the CSV.
# drop = FALSE keeps the predictors as a matrix even if only one row matches.
X_train <- maxmindf[ind == 1, 1:8, drop = FALSE]
X_val <- maxmindf[ind == 2, 1:8, drop = FALSE]
y_train <- maxmindf[ind == 1, 9]
y_val <- maxmindf[ind == 2, 9]

## Sequential model

In [10]:
# Feed-forward network: 8 inputs -> 12 ReLU units -> 8 ReLU units -> 1 linear
# output. Only the first layer overrides the default kernel initialiser.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 12,
    activation = "relu",
    kernel_initializer = "RandomNormal",
    input_shape = c(8)
  ) %>%
  layer_dense(units = 8, activation = "relu") %>%
  layer_dense(units = 1, activation = "linear")

# Print the layer-by-layer overview with parameter counts.
summary(model)

Model: "sequential"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
dense (Dense)                       (None, 12)                      108         
________________________________________________________________________________
dense_1 (Dense)                     (None, 8)                       104         
________________________________________________________________________________
dense_2 (Dense)                     (None, 1)                       9           
Total params: 221
Trainable params: 221
Non-trainable params: 0
________________________________________________________________________________


## Model compilation with mean squared error used as loss function

### Model trained over 150 epochs

In [11]:
# Configure training: mean squared error as the loss, Adam as the optimiser,
# and mean absolute error reported as an additional metric.
compile(
  model,
  loss = "mean_squared_error",
  optimizer = "adam",
  metrics = "mae"
)

# Train for 150 epochs in batches of 50 rows. validation_split = 0.2 is passed
# to fit() in addition to the separate X_val / y_val split made earlier.
history <- fit(
  model,
  X_train,
  y_train,
  epochs = 150,
  batch_size = 50,
  validation_split = 0.2
)

### Model evaluation

In [12]:
# Score the trained model on the held-out rows, then print the model.
model %>% evaluate(X_val, y_val)
model

# Predict the (normalised) target for the held-out predictors.
pred <- data.frame(y = predict(model, as.matrix(X_val)))

# Undo the min-max scaling so both vectors are in the original Glucose units:
# value * (max - min) + min, using the Glucose column of diabetes1.
predicted <- pred$y * abs(diff(range(diabetes1$Glucose))) + min(diabetes1$Glucose)
actual <- y_val * abs(diff(range(diabetes1$Glucose))) + min(diabetes1$Glucose)

# Side-by-side table of predictions and observations.
# attach(df) was removed: `predicted` and `actual` already exist as global
# variables, so the attached copies were masked and the call only produced a
# "masked by .GlobalEnv" message.
df <- data.frame(predicted, actual)

Model
Model: "sequential"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
dense (Dense)                       (None, 12)                      108         
________________________________________________________________________________
dense_1 (Dense)                     (None, 8)                       104         
________________________________________________________________________________
dense_2 (Dense)                     (None, 1)                       9           
Total params: 221
Trainable params: 221
Non-trainable params: 0
________________________________________________________________________________



The following objects are masked _by_ .GlobalEnv:

    actual, predicted



## Glucose level predictions

In [13]:
# Convert the rescaled predictions to matrix form and display them.
predicted <- as.matrix(predicted)
predicted

0
108.33714
129.56989
125.64662
108.13061
111.11950
113.20711
106.49417
96.00997
153.15887
111.73989


In [14]:
# Display the observed target values for the held-out rows, already rescaled
# back from the [0, 1] range in the evaluation cell.
actual

## Mean percentage error

### Percentage difference between predicted and actual values

In [15]:
# Signed relative error of each prediction; positive means over-prediction.
# NOTE(review): any observation equal to 0 makes its ratio Inf or NaN.
mpe <- (predicted - actual) / actual

# Average the signed errors and express the result as a percentage. Errors of
# opposite sign cancel, so this measures bias rather than typical error size.
100 * mean(mpe)

## EXERCISE

### pima-indians-diabetes2.csv contains the predictor variables for the test set.

### pima-indians-diabetes3.csv contains the dependent variable (the glucose readings) for the test set.

### Your task is to use the existing model to generate new predictions for this test set and calculate the mean percentage error on these new predictions.

In [17]:
# Read the test-set predictors (looked up in the working directory) into a
# data frame.
diabetes2 <- read.csv(file = "pima-indians-diabetes2.csv")

The following objects are masked from maxmindf:

    Age, BloodPressure, BMI, DiabetesPedigreeFunction, Insulin,
    Outcome, Pregnancies, SkinThickness



## Max-Min Normalization

In [None]:
# Rescale a numeric vector to the [0, 1] interval (min-max normalisation).
#
# x: numeric vector.
# Returns a vector of the same length in which the smallest value maps to 0
# and the largest to 1. A constant vector (max == min) would divide by zero
# and produce NaN, so it is mapped to all zeros instead. Missing values are
# not removed: any NA in x makes the whole result NA.
normalize <- function(x) {
  rng <- range(x)
  span <- rng[2] - rng[1]
  # Guard the zero-width case; is.na() keeps NA/NaN spans on the normal path.
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}

# Scale each test predictor with the minimum and range of the column of the
# same name in diabetes1. The model was fitted on features scaled with the
# diabetes1 statistics, so the test set must go through the same transform;
# normalising it with its own min/max would shift the inputs the model sees.
# Scaled values may fall slightly outside [0, 1] when a test value lies
# outside the diabetes1 range.
# attach() was removed: it only placed masked copies on the search path.
stopifnot(all(names(diabetes2) %in% names(diabetes1)))
maxmindf2 <- as.data.frame(Map(
  function(x, ref) (x - min(ref)) / (max(ref) - min(ref)),
  diabetes2,
  diabetes1[names(diabetes2)]
))

## Using the predict function in R, generate predictions (pred_test) for the Glucose variable using maxmindf2.

In [18]:
# EXERCISE STUB: the right-hand side of the first assignment is intentionally
# left blank. Until it is filled in, R treats the following line as the
# continuation of that assignment, and the cell stops with
# "object 'pred_test' not found".
# Hint: the evaluation cell builds `pred` as data.frame(y = predict(model, ...));
# do the same here with the normalised test predictors converted to a matrix.
# The second line then rescales the predictions to Glucose units using the
# Glucose range of diabetes1, and the third displays them.
pred_test <- # Complete this line...
predicted_test = pred_test$y * abs(diff(range(diabetes1$Glucose))) + min(diabetes1$Glucose)
predicted_test

ERROR: Error in eval(expr, envir, enclos): object 'pred_test' not found


## Load the actual glucose readings for the test set (unseen data)

In [19]:
# Read the test-set glucose readings (looked up in the working directory)
# into a data frame, then display it.
diabetes3 <- read.csv(file = "pima-indians-diabetes3.csv")
diabetes3

Glucose
<int>
97
83
130
128
149
144
119
108
120
120


## Compare predicted values with actual values.

In [20]:
# Observed glucose readings for the test set.
actual_test <- diabetes3$Glucose

# Side-by-side table of test predictions and observations; data.frame() stops
# with an error if the two vectors have incompatible lengths.
# attach(df2) was removed: `predicted_test` and `actual_test` already exist as
# global variables, so the attached copies would be masked.
df2 <- data.frame(predicted_test, actual_test)
df2

ERROR: Error in data.frame(predicted_test, actual_test): object 'predicted_test' not found


## Mean percentage error calculation

In [21]:
# EXERCISE STUB: the right-hand side of the assignment is intentionally left
# blank. Until it is filled in, R treats the following line as the value being
# assigned, and the cell stops with "object 'mpe2' not found".
# Hint: use the same signed relative error as `mpe` in the validation section,
# computed from predicted_test and actual_test.
mpe2= # Insert the formula for calculating the mean percentage error here...
mean(mpe2)*100

ERROR: Error in mean(mpe2): object 'mpe2' not found
