Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
114 lines (89 sloc) 4.72 KB
#packages needed
install.packages(c("RANN", "caret", "readr", "doParallel", "ROCR"))
install.packages("caret", dependencies = c("Depends", "Suggests"))
library(caret)
library(RANN)
library(readr)
library(doParallel)
library(ROCR)
#Import the Land registry data
inputdata <- "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2016.txt"
names <- c('Transaction_unique_identifier', 'Price', 'Date_of_Transfer', 'Postcode', 'Property_Type', 'Old_New', 'Duration', 'PAON', 'SAON', 'Street', 'Locality', 'Town_City', 'District', 'County', 'PPDCategory_Type', 'Record_Status_monthly_file_only')
landregistry <- read_csv(inputdata, col_names=names)
#Remove the hour and minutes from the date variable and create a month variable
landregistry$Transfer_Date <- as.Date(as.POSIXct(landregistry$Date_of_Transfer,tz="", "%Y-%m-%d %H:%M"))
landregistry$Transfer_Month <- as.Date(paste((format(landregistry$Transfer_Date, "%Y-%m")),"-01",sep=""))
landregistry$Transfer_Year <- as.numeric(format(landregistry$Transfer_Date, "%Y"))
#Remove rows where the value is over £1M, these will skew the model later
landregistry <- landregistry[!(landregistry$Price>1000000),]
#I want just sales from Jan 2016
sales <- landregistry[ which(landregistry$Transfer_Month=="2016-01-01"), ]
#Keep only the variables I want to use
columns <- c('Price', 'Property_Type', 'Old_New', 'Duration', 'County')
Initialdata <- subset(sales,select=columns)
#Define the variable FiveHundredPlus
Initialdata$FiveHundredPlus <- ifelse(Initialdata$Price<=500000, 1, 0)
#Converting every categorical variable to numerical using dummy variables
dmy <- dummyVars(" ~ .", data = Initialdata, fullRank = T)
Initialdata_transformed <- data.frame(predict(dmy, newdata = Initialdata))
#Checking the structure of transformed train file
str(Initialdata_transformed)
#Converting the dependent variable to categorical
Initialdata_transformed$FiveHundredPlus <-as.factor(Initialdata_transformed$FiveHundredPlus)
#Spliting training set into two parts based on outcome: 75% and 25%
index <- createDataPartition(Initialdata_transformed$FiveHundredPlus, p=0.75, list=FALSE)
trainSet <- Initialdata_transformed [ index,]
testSet <- Initialdata_transformed [-index,]
#Factors for the model
outcomeName<-'FiveHundredPlus'
predictors<-colnames(trainSet)[!(colnames(trainSet) %in% c("Price", "FiveHundredPlus"))]
#Try applying some models
model_gbm<-train(trainSet[,predictors],trainSet[,outcomeName],method='gbm')
model_rf<-train(trainSet[,predictors],trainSet[,outcomeName],method='rf')
model_nnet<-train(trainSet[,predictors],trainSet[,outcomeName],method='nnet')
model_glm<-train(trainSet[,predictors],trainSet[,outcomeName],method='glm')
#Apply the models to the test data
pred_gbm <- predict(model_gbm, testSet, type = "raw")
pred_rf <- predict(model_rf, testSet, type = "raw")
pred_nnet <- predict(model_nnet, testSet, type = "raw")
pred_glm <- predict(model_glm, testSet, type = "raw")
#Create confusion matricies
conf_gbm <- table(testSet$FiveHundredPlus, pred_gbm)
conf_rf <- table(testSet$FiveHundredPlus, pred_rf)
conf_nnet <- table(testSet$FiveHundredPlus, pred_nnet)
conf_glm <- table(testSet$FiveHundredPlus, pred_glm)
# Calculate the accuracy
acc_gbm <- sum(diag(conf_gbm))/sum(conf_gbm)
acc_rf <- sum(diag(conf_rf))/sum(conf_rf)
acc_nnet <- sum(diag(conf_nnet))/sum(conf_nnet)
acc_glm <- sum(diag(conf_glm))/sum(conf_glm)
Acc <- c(acc_gbm, acc_rf, acc_nnet, acc_glm)
#Calcuate probabilities
pred_p_gbm <- predict(model_gbm, testSet, type = "prob")[,2]
pred_p_rf <- predict(model_rf, testSet, type = "prob")[,2]
pred_p_nnet <- predict(model_nnet, testSet, type = "prob")[,2]
pred_p_glm <- predict(model_glm, testSet, type = "prob")[,2]
# Make prediction objects for the ROC curves
predict_gbm <- prediction(pred_p_gbm, testSet$FiveHundredPlus)
predict_rf <- prediction(pred_p_rf, testSet$FiveHundredPlus)
predict_nnet <- prediction(pred_p_nnet, testSet$FiveHundredPlus)
predict_glm <- prediction(pred_p_glm, testSet$FiveHundredPlus)
#Calculate the performance
perf_gbm <- performance(predict_gbm, "tpr", "fpr")
perf_rf <- performance(predict_rf, "tpr", "fpr")
perf_nnet <- performance(predict_nnet, "tpr", "fpr")
perf_glm <- performance(predict_glm, "tpr", "fpr")
#Calculate the area under the curves
auc_gbm <- performance(predict_gbm,"auc")@y.values[[1]]
auc_rf <- performance(predict_rf,"auc")@y.values[[1]]
auc_nnet <- performance(predict_nnet,"auc")@y.values[[1]]
auc_glm <- performance(predict_glm,"auc")@y.values[[1]]
AUC <- c(auc_gbm, auc_rf, auc_nnet, auc_glm)
#Plot the ROC curve
plot(perf_gbm)
plot(perf_rf, add = TRUE, colorize = TRUE)
plot(perf_nnet, add = TRUE, colorize = TRUE)
plot(perf_glm, add = TRUE, colorize = TRUE)
#Print the different performance metric vectors
print(Acc)
print(AUC)