# Task: To predict patient glucose levels using a Keras-based neural network.

### Original data: https://data.world/uci/pima-indians-diabetes

## Load TensorFlow, Keras and Corrplot libraries

In [None]:
library(tensorflow)
library(keras)
library(corrplot)

## Load dataset

In [None]:
# Read the training data from the working directory.
diabetes1 <- read.csv(file = "pima-indians-diabetes1.csv")
# Expose the columns on the search path so they can be referenced by bare
# name (e.g. `Glucose` instead of `diabetes1$Glucose`).
attach(diabetes1)

In [None]:
# Display the loaded data frame.
diabetes1

# Feature Selection

### As we have already seen, the purpose of feature selection is to identify the features that have the most impact on the dependent variable and remove redundant features.

### The following link also contains further information on feature selection techniques for this dataset. (https://machinelearningmastery.com/feature-selection-with-the-caret-r-package/).

### This example is slightly different because feature selection is being conducted for a regression problem rather than a classification problem.

## Correlation Plot

### Features that are overly correlated with each other are assumed redundant and should be removed from the model.

In [None]:
# Pairwise correlation matrix of every column in the data set.
M <- stats::cor(diabetes1)
# Visualise the matrix, one circle per pair of columns.
corrplot::corrplot(M, method = "circle")

## Display correlation coefficients

In [None]:
# Same correlation matrix, rendered as numeric coefficients.
corrplot::corrplot(M, method = "number")

## Multiple Linear Regression

### Linear regression is used to quantify the impact of features (or independent variables) on the dependent variable.

### In this case, features whose coefficients are significant at the 5% level (p-value below 0.05) are kept in the model.

In [None]:
# Regress Glucose on all eight candidate predictors.
fit <- lm(
  Glucose ~ Pregnancies + Outcome + Age + DiabetesPedigreeFunction +
    BMI + Insulin + SkinThickness + BloodPressure,
  data = diabetes1
)
# Coefficient table with standard errors and p-values.
summary(fit)

In [None]:
# Keep the four selected predictors plus the Glucose target. The columns are
# taken explicitly from `diabetes1` instead of through bare names on the
# search path, so a missing column fails loudly rather than silently resolving
# to some other attached object. The column order matters: later cells index
# the features as columns 1-4 and the target as column 5.
df <- diabetes1[, c("Outcome", "Age", "Insulin", "SkinThickness", "Glucose")]

## Max-Min Normalization

In [None]:
#' Rescale a numeric vector to the [0, 1] interval (max-min normalization).
#'
#' @param x A numeric vector.
#' @param na.rm Should `NA`s be ignored when computing the minimum and
#'   maximum? The default `FALSE` keeps the original behaviour: a single `NA`
#'   makes the whole result `NA`.
#' @return A numeric vector of the same length as `x`. A constant input maps
#'   to zeros instead of the `NaN`s that dividing by a zero range would give.
normalize <- function(x, na.rm = FALSE) {
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]
  # Zero range: every (non-missing) value equals the minimum, so return the
  # zero offsets directly and avoid 0 / 0.
  if (!is.na(span) && span == 0) {
    return(x - bounds[1])
  }
  (x - bounds[1]) / span
}

# Normalize every column of `df` and store the result as a numeric matrix,
# the input format used for the keras model below. The intermediate data
# frame is deliberately not attach()ed: that would make bare column names
# such as `Glucose` silently refer to the normalized values.
maxmindf <- as.matrix(as.data.frame(lapply(df, normalize)))

## Train-validation split

In [None]:
# Fix the RNG seed so the same rows land in each partition on every run.
set.seed(123)
# Draw a 1/2 label for every row: 1 = training (about 70%), 2 = validation
# (about 30%). Partition sizes are random, not exact.
ind <- sample(2, nrow(maxmindf), replace = TRUE, prob = c(0.7, 0.3))

## Build X_train, y_train, X_val, y_val

In [None]:
# Columns 1-4 hold the features and column 5 the target (Glucose, given the
# column order used when the frame was built). `drop = FALSE` keeps the
# feature sets two-dimensional even if a partition ends up with a single row.
X_train <- maxmindf[ind == 1, 1:4, drop = FALSE]
X_val <- maxmindf[ind == 2, 1:4, drop = FALSE]
y_train <- maxmindf[ind == 1, 5]
y_val <- maxmindf[ind == 2, 5]

# Neural Network

## Sequential model

In [None]:
# Feed-forward regression network: 4 inputs -> 12 -> 8 -> 1 linear output.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 12,
    activation = "relu",
    kernel_initializer = "RandomNormal",
    input_shape = 4
  ) %>%
  layer_dense(units = 8, activation = "relu") %>%
  layer_dense(units = 1, activation = "linear")

# Print the layer-by-layer architecture and parameter counts.
summary(model)

## Model compilation with mean squared error used as loss function

### Model trained over 150 epochs

In [None]:
# Configure training: MSE loss, Adam optimizer, MAE reported as a metric.
compile(
  model,
  loss = "mean_squared_error",
  optimizer = "adam",
  metrics = c("mae")
)

# Train for 150 epochs in batches of 50; 20% of the training rows are held
# back by keras for per-epoch validation.
history <- fit(
  model,
  X_train,
  y_train,
  epochs = 150,
  batch_size = 50,
  validation_split = 0.2
)

### Model evaluation

In [None]:
# Loss and MAE on the held-out validation rows (still on the 0-1 scale).
model %>% evaluate(X_val, y_val)
model

# Predict on the validation features, then undo the max-min scaling so that
# predictions and actuals are expressed in the original Glucose units.
pred <- data.frame(y = predict(model, as.matrix(X_val)))

# The Glucose range is read from `diabetes1` rather than from `df`: `df` is
# overwritten at the end of this cell, so reading `df$Glucose` would break
# the cell when it is executed a second time.
glucose_min <- min(diabetes1$Glucose)
glucose_span <- diff(range(diabetes1$Glucose))

predicted <- pred$y * glucose_span + glucose_min
actual <- y_val * glucose_span + glucose_min

# NOTE: this reuses the name `df`, replacing the feature frame built earlier.
df <- data.frame(predicted, actual)

## Glucose level predictions

In [None]:
# Convert the de-normalized predictions to a matrix and show them.
predicted <- as.matrix(predicted)
predicted

In [None]:
# Display the de-normalized actual values for the validation rows.
actual

## Mean percentage error

### Percentage difference between predicted and actual values

In [None]:
# Relative error of each prediction: (predicted - actual) / actual.
mpe <- (predicted - actual) / actual
# Average only the finite ratios: an actual value of 0 yields Inf or NaN,
# which would otherwise make the whole mean meaningless.
mean(mpe[is.finite(mpe)]) * 100

## EXERCISE

### pima-indians-diabetes2.csv contains the predictor variables for the test set.

### pima-indians-diabetes3.csv contains the dependent variable (the glucose readings) for the test set.

### Your task is to use the existing model to generate new predictions for this test set and calculate the mean percentage error on these new predictions.

In [None]:
# Read the test-set predictors from the working directory.
diabetes2 <- read.csv(file = "pima-indians-diabetes2.csv")
# Expose the columns on the search path; this masks identically named
# columns attached by earlier cells.
attach(diabetes2)

In [None]:
# Display the test-set predictors.
diabetes2

In [None]:
# Select the same four features, in the same order, that the model was
# trained on. Taking them explicitly from `diabetes2` guarantees the values
# come from the test file and not from an identically named column that an
# earlier cell placed on the search path.
df2 <- diabetes2[, c("Outcome", "Age", "Insulin", "SkinThickness")]

In [None]:
# Display the selected test-set features.
df2

## Max-Min Normalization

In [None]:
# Rescale the test features with the TRAINING minima and maxima (taken from
# `diabetes1`). The network was fitted on features scaled by those values, so
# rescaling the test set by its own range would feed the model inputs on a
# different scale from the one it learned.
feature_cols <- c("Outcome", "Age", "Insulin", "SkinThickness")

# Max-min scaling of `x` using the range of a reference vector `ref`.
normalize_with <- function(x, ref) {
  (x - min(ref)) / (max(ref) - min(ref))
}

maxmindf2 <- as.data.frame(
  Map(normalize_with, df2[feature_cols], diabetes1[feature_cols])
)

## Using the predict function in R, generate predictions (pred_test) for the Glucose variable using maxmindf2.

In [None]:
# Predict normalized Glucose for the test features. The original placeholder
# left the right-hand side empty, which made R read the next line as part of
# this assignment and fail.
pred_test <- data.frame(y = predict(model, as.matrix(maxmindf2)))
# Undo the max-min scaling with the training Glucose range to get
# predictions in the original units.
predicted_test <- pred_test$y * diff(range(diabetes1$Glucose)) +
  min(diabetes1$Glucose)
predicted_test

## Load the actual glucose readings for the test set (unseen data).

In [None]:
# Read the file holding the test-set glucose readings, then display it.
diabetes3 <- read.csv(file = "pima-indians-diabetes3.csv")
diabetes3

## Compare predicted values with actual values.

In [None]:
# Observed glucose readings for the test set.
actual_test <- diabetes3$Glucose
# Side-by-side comparison of predictions and observations. This reuses the
# name `df2`, replacing the test feature frame. No attach() is needed:
# `predicted_test` and `actual_test` already exist as global variables.
df2 <- data.frame(predicted_test, actual_test)
df2

## Mean percentage error calculation

In [None]:
# Relative error of each test prediction: (predicted - actual) / actual.
# The original placeholder left the right-hand side empty, so R read the next
# line as the value and failed because `mpe2` did not exist yet.
mpe2 <- (predicted_test - actual_test) / actual_test
# Average only the finite ratios: an actual value of 0 yields Inf or NaN.
mean(mpe2[is.finite(mpe2)]) * 100