In [None]:
## loading libraries 

library(rjson)
library(jsonlite)
library(readr)
library(ggplot2) 
library(readr) 

## Read the train and test sets plus the sample submission file.
# `fromJSON` is exported by both rjson and jsonlite, and both are attached
# above, so the unqualified name resolves to whichever was attached last.
# Qualify the call so it always uses jsonlite's version, which accepts a file
# path and is expected to simplify the JSON records into a data frame (the
# later column indexing and ggplot calls need a data frame).
train <- jsonlite::fromJSON("../input/randompizza/train.json")
test <- jsonlite::fromJSON("../input/randompizza/test.json")
sample_sub <- read.csv("../input/randompizza/sampleSubmission (1).csv")


In [None]:
# Data exploration: list the column names of each data set.
names(train)
names(test)
# The train data set has many more columns than the test data set.


In [None]:
## Univariate and multivariate analysis
library(ggplot2)

# Jittered plot of the pizza outcome against the requester's number of posts
# on RAOP at retrieval time.
ggplot(
  train,
  aes(
    x = requester_number_of_posts_on_raop_at_retrieval,
    y = requester_received_pizza
  )
) +
  geom_jitter()

## Author's observation: for both outcomes (pizza received or not), most
## points sit at the low end of the number of posts at retrieval.

In [None]:
# Jittered plot of the pizza outcome against the requester's number of
# comments at request time.
ggplot(
  train,
  aes(
    x = requester_number_of_comments_at_request,
    y = requester_received_pizza
  )
) +
  geom_jitter()

# Same plot using the number of comments at retrieval time.
ggplot(
  train,
  aes(
    x = requester_number_of_comments_at_retrieval,
    y = requester_received_pizza
  )
) +
  geom_jitter()

## Author's observation: the frequency is high at the start and at the end
## of the range.

In [None]:
# Net votes (upvotes minus downvotes) against total votes (upvotes plus
# downvotes) at request time, coloured by whether a pizza was received.
ggplot(
  train,
  aes(
    x = requester_upvotes_plus_downvotes_at_request,
    y = requester_upvotes_minus_downvotes_at_request
  )
) +
  geom_point(aes(colour = requester_received_pizza))


In [None]:
# Net votes (upvotes minus downvotes) against total votes (upvotes plus
# downvotes) at retrieval time, coloured by whether a pizza was received.
ggplot(
  train,
  aes(
    x = requester_upvotes_plus_downvotes_at_retrieval,
    y = requester_upvotes_minus_downvotes_at_retrieval
  )
) +
  geom_point(aes(colour = requester_received_pizza))

In [None]:
## wordclouds
library(tm)
library(methods)
library(RColorBrewer)
library(wordcloud)
##
#' Draw a word cloud of the most frequent terms in a set of documents
#'
#' Lower-cases the documents, removes punctuation and English stop words,
#' counts how often each remaining term occurs and plots the most frequent
#' terms with `wordcloud()`.
#'
#' @param documents A character vector with one document per element, or a
#'   list whose elements are character vectors. Each list element is
#'   collapsed into a single space-separated document before cleaning.
#' @param max_words Rank used as the frequency cut-off: every term at least
#'   as frequent as the `max_words`-th most frequent term is drawn (ties can
#'   push the number of drawn terms above `max_words`). When fewer distinct
#'   terms exist, all of them are drawn.
#' @return The value of the `wordcloud()` call; `NULL` (invisibly, with a
#'   warning) when no terms survive the cleaning steps.
make_word_cloud <- function(documents, max_words = 200) {
  stopifnot(is.numeric(max_words), length(max_words) == 1, max_words >= 1)

  # tolower() on a list deparses multi-element entries into strings such as
  # 'c("a", "b")', which glues a stray "c" onto the first term once the
  # punctuation is stripped. Collapse list elements explicitly instead.
  if (is.list(documents)) {
    documents <- vapply(
      documents,
      function(doc) paste(as.character(unlist(doc)), collapse = " "),
      character(1)
    )
  }

  corpus <- Corpus(VectorSource(tolower(documents)))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))

  # Column sums of the document-term matrix are the per-term frequencies;
  # the names of the result are the terms themselves.
  term_counts <- colSums(as.matrix(DocumentTermMatrix(corpus)))
  if (length(term_counts) == 0) {
    warning("No terms left after cleaning; nothing to plot.", call. = FALSE)
    return(invisible(NULL))
  }

  # Clamp the cut-off rank so a vocabulary smaller than `max_words` does not
  # raise "subscript out of bounds".
  cutoff_rank <- min(as.integer(max_words), length(term_counts))
  cutoff_freq <- sort(term_counts, decreasing = TRUE)[[cutoff_rank]]

  wordcloud(names(term_counts), term_counts,
            min.freq = cutoff_freq,
            colors = brewer.pal(8, "Dark2"),
            random.color = TRUE)
}

## Word cloud of roughly the 200 most frequent terms in the request titles
make_word_cloud(train$request_title)


In [None]:
# Word cloud of the most frequent terms in the request text
make_word_cloud(train$request_text)


In [None]:
# Word cloud of the most frequent terms in the edit-aware request text
make_word_cloud(train$request_text_edit_aware)


In [None]:
# Word cloud of the subreddit names recorded for each requester at request
# time
make_word_cloud(train$requester_subreddits_at_request)

In [None]:
# Keep only the columns used for xgboost: text variables and variables that
# are absent from the test data are dropped (selection is by column position).
train <- train[, c(10, 12, 14, 16, 18, 20, 22, 23, 25, 27, 31, 32)]
test <- test[, -c(1, 3, 4, 12, 15)]

# Give the test data a placeholder target column.
test[["requester_received_pizza"]] <- 0

# Recode the target from logical to a factor built from its numeric (0/1)
# value, printing the class before and after the conversion.
class(train[["requester_received_pizza"]])
train[["requester_received_pizza"]] <- as.factor(
  as.numeric(train[["requester_received_pizza"]])
)
class(train[["requester_received_pizza"]])

In [None]:
## Fit an xgboost classifier on the training data.
library(xgboost)
library(Matrix)

# Encode the predictors as a sparse model matrix. The target is the formula's
# response, so it is not part of the matrix; `- 1` drops the intercept.
sparse_matrix <- sparse.model.matrix(requester_received_pizza ~ . - 1,
                                     data = train)
unique(train$requester_received_pizza)

# The target was converted to a factor of "0"/"1", so "1" means a pizza was
# received. Refer to the column by name rather than by position so the label
# cannot silently change when the column selection is edited.
output_vector <- train$requester_received_pizza == "1"

# Train with AUC as the evaluation metric and a fixed number of rounds.
# `nrounds` is spelled out instead of relying on partial argument matching.
bst3 <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
                eta = 1, nthread = 2, nrounds = 10,
                objective = "binary:logistic", eval_metric = "auc")

importance <- xgb.importance(feature_names = colnames(sparse_matrix),
                             model = bst3)
# Gain is the most important column: it reports the improvement in accuracy
# brought by a feature to the branches it is used on.
# Details: http://xgboost.readthedocs.io/en/latest/R-package/discoverYourData.html
head(importance)

# Plot the feature importance.
xgb.plot.importance(importance_matrix = importance)


In [None]:
# Predict on the test set and write the submission file.
# The model was trained on the predictors only (the target is the formula's
# response), so the placeholder `requester_received_pizza` column that was
# added to `test` must not be passed to predict() as an extra feature.
# Column 1 is still dropped by position, as before; presumably it is the
# request id -- verify against the column selection.
test_features <- test[, -1]
test_features <- test_features[
  , setdiff(names(test_features), "requester_received_pizza"),
  drop = FALSE
]

pre <- predict(bst3, as.matrix(test_features))
solution3 <- data.frame(request_id = test$request_id,
                        requester_received_pizza = pre)
write.csv(solution3, file = "xgpizza3", row.names = FALSE)

In [None]:
# Improving accuracy
# Use cross-validation with early stopping to find the best number of rounds.
# AUC is a higher-is-better metric, so it has to be maximised: with
# maximize = FALSE, early stopping would track the lowest AUC instead of the
# highest. `nrounds` is spelled out instead of relying on partial matching.
bst <- xgb.cv(data = sparse_matrix, label = output_vector, max_depth = 4,
              eta = 1, nthread = 2, nfold = 10, nrounds = 200,
              objective = "binary:logistic", print_every_n = 10,
              early_stopping_rounds = 10, maximize = TRUE,
              eval_metric = "auc")
##
# Running the cross-validation several times shows which number of rounds is
# best; that value can then be used when refitting to get better accuracy.


In [None]:
#