Permalink
Cannot retrieve contributors at this time
#packages needed | |
install.packages(c("RANN", "caret", "readr", "doParallel", "ROCR")) | |
install.packages("caret", dependencies = c("Depends", "Suggests")) | |
library(caret) | |
library(RANN) | |
library(readr) | |
library(doParallel) | |
library(ROCR) | |
#Import the Land registry data | |
inputdata <- "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2016.txt" | |
names <- c('Transaction_unique_identifier', 'Price', 'Date_of_Transfer', 'Postcode', 'Property_Type', 'Old_New', 'Duration', 'PAON', 'SAON', 'Street', 'Locality', 'Town_City', 'District', 'County', 'PPDCategory_Type', 'Record_Status_monthly_file_only') | |
landregistry <- read_csv(inputdata, col_names=names) | |
#Remove the hour and minutes from the date variable and create a month variable | |
landregistry$Transfer_Date <- as.Date(as.POSIXct(landregistry$Date_of_Transfer,tz="", "%Y-%m-%d %H:%M")) | |
landregistry$Transfer_Month <- as.Date(paste((format(landregistry$Transfer_Date, "%Y-%m")),"-01",sep="")) | |
landregistry$Transfer_Year <- as.numeric(format(landregistry$Transfer_Date, "%Y")) | |
#Remove rows where the value is over £1M, these will skew the model later | |
landregistry <- landregistry[!(landregistry$Price>1000000),] | |
#I want just sales from Jan 2016 | |
sales <- landregistry[ which(landregistry$Transfer_Month=="2016-01-01"), ] | |
#Keep only the variables I want to use | |
columns <- c('Price', 'Property_Type', 'Old_New', 'Duration', 'County') | |
Initialdata <- subset(sales,select=columns) | |
#Define the variable FiveHundredPlus | |
Initialdata$FiveHundredPlus <- ifelse(Initialdata$Price<=500000, 1, 0) | |
#Converting every categorical variable to numerical using dummy variables | |
dmy <- dummyVars(" ~ .", data = Initialdata, fullRank = T) | |
Initialdata_transformed <- data.frame(predict(dmy, newdata = Initialdata)) | |
#Checking the structure of transformed train file | |
str(Initialdata_transformed) | |
#Converting the dependent variable to categorical | |
Initialdata_transformed$FiveHundredPlus <-as.factor(Initialdata_transformed$FiveHundredPlus) | |
#Spliting training set into two parts based on outcome: 75% and 25% | |
index <- createDataPartition(Initialdata_transformed$FiveHundredPlus, p=0.75, list=FALSE) | |
trainSet <- Initialdata_transformed [ index,] | |
testSet <- Initialdata_transformed [-index,] | |
#Factors for the model | |
outcomeName<-'FiveHundredPlus' | |
predictors<-colnames(trainSet)[!(colnames(trainSet) %in% c("Price", "FiveHundredPlus"))] | |
#Try applying some models | |
model_gbm<-train(trainSet[,predictors],trainSet[,outcomeName],method='gbm') | |
model_rf<-train(trainSet[,predictors],trainSet[,outcomeName],method='rf') | |
model_nnet<-train(trainSet[,predictors],trainSet[,outcomeName],method='nnet') | |
model_glm<-train(trainSet[,predictors],trainSet[,outcomeName],method='glm') | |
#Apply the models to the test data | |
pred_gbm <- predict(model_gbm, testSet, type = "raw") | |
pred_rf <- predict(model_rf, testSet, type = "raw") | |
pred_nnet <- predict(model_nnet, testSet, type = "raw") | |
pred_glm <- predict(model_glm, testSet, type = "raw") | |
#Create confusion matricies | |
conf_gbm <- table(testSet$FiveHundredPlus, pred_gbm) | |
conf_rf <- table(testSet$FiveHundredPlus, pred_rf) | |
conf_nnet <- table(testSet$FiveHundredPlus, pred_nnet) | |
conf_glm <- table(testSet$FiveHundredPlus, pred_glm) | |
# Calculate the accuracy | |
acc_gbm <- sum(diag(conf_gbm))/sum(conf_gbm) | |
acc_rf <- sum(diag(conf_rf))/sum(conf_rf) | |
acc_nnet <- sum(diag(conf_nnet))/sum(conf_nnet) | |
acc_glm <- sum(diag(conf_glm))/sum(conf_glm) | |
Acc <- c(acc_gbm, acc_rf, acc_nnet, acc_glm) | |
#Calcuate probabilities | |
pred_p_gbm <- predict(model_gbm, testSet, type = "prob")[,2] | |
pred_p_rf <- predict(model_rf, testSet, type = "prob")[,2] | |
pred_p_nnet <- predict(model_nnet, testSet, type = "prob")[,2] | |
pred_p_glm <- predict(model_glm, testSet, type = "prob")[,2] | |
# Make prediction objects for the ROC curves | |
predict_gbm <- prediction(pred_p_gbm, testSet$FiveHundredPlus) | |
predict_rf <- prediction(pred_p_rf, testSet$FiveHundredPlus) | |
predict_nnet <- prediction(pred_p_nnet, testSet$FiveHundredPlus) | |
predict_glm <- prediction(pred_p_glm, testSet$FiveHundredPlus) | |
#Calculate the performance | |
perf_gbm <- performance(predict_gbm, "tpr", "fpr") | |
perf_rf <- performance(predict_rf, "tpr", "fpr") | |
perf_nnet <- performance(predict_nnet, "tpr", "fpr") | |
perf_glm <- performance(predict_glm, "tpr", "fpr") | |
#Calculate the area under the curves | |
auc_gbm <- performance(predict_gbm,"auc")@y.values[[1]] | |
auc_rf <- performance(predict_rf,"auc")@y.values[[1]] | |
auc_nnet <- performance(predict_nnet,"auc")@y.values[[1]] | |
auc_glm <- performance(predict_glm,"auc")@y.values[[1]] | |
AUC <- c(auc_gbm, auc_rf, auc_nnet, auc_glm) | |
#Plot the ROC curve | |
plot(perf_gbm) | |
plot(perf_rf, add = TRUE, colorize = TRUE) | |
plot(perf_nnet, add = TRUE, colorize = TRUE) | |
plot(perf_glm, add = TRUE, colorize = TRUE) | |
#Print the different performance metric vectors | |
print(Acc) | |
print(AUC) |