In [15]:
library(tm)
library(topicmodels)
library(ggplot2)
library(MASS)
library(scatterplot3d)
library(rgl)

In [16]:
setwd('C:/Users/Suhas/Documents/MSBA/Semester Two/Predictive Analytics/Project/Data/Data')

In [6]:
data_review <- read.csv("review.csv", stringsAsFactors=FALSE)
data_bus <- read.csv("business.csv", stringsAsFactors=FALSE)

In [7]:
# Merging the business and review dataset to obtain the state column
data_intermediate <- merge(x=data_review, y=data_bus, by.x="business_id", by.y="business_id")

# Filtering the data only for Nevada state
data_intermediate <- data_intermediate[data_intermediate$state == 'NV',]

# Finding the indices of business with "restaurant" as category
loc <- grep("Restaurant", data_intermediate$categories, perl=TRUE, value=FALSE)
data_intermediate_subset <- data_intermediate[loc,]

# Selecting only "restaurant" reviews
data_review_restaurant <- subset(data_intermediate, data_intermediate$business_id %in% data_intermediate_subset$business_id &
                            nchar(toString(text)) > 100)

data_review_restaurant <- subset(data_review_restaurant, select = -c(10:23))
colnames(data_review_restaurant) <- c('business_id','date','review_id','stars','text','type','user_id','votes','year')

data_review_restaurant <- data_review_restaurant[data_review_restaurant$year > 2014,]

In [8]:
# Separating into testing (90%) and testing set (10%)
index <- sample(nrow(data_review_restaurant), 240000)

data_review_restaurant.train <- data_review_restaurant[index, ]
data_review_restaurant.test <- data_review_restaurant[-index, ]

cat("Training set: "); dim(data_review_restaurant.train)
cat("Testing set: "); dim(data_review_restaurant.test)

Training set: 

Testing set: 

In [9]:
# Further divide testing set, 1 to build the LDA model and other to apply the LDA model
index <- sample(nrow(data_review_restaurant.train), 120000)
data_review_restaurant.train_1 <- data_review_restaurant.train[index, ]$text
data_review_restaurant.train_2 <- data_review_restaurant.train[-index, ]
cat("Training set 1 size: "); print(length(data_review_restaurant.train_1))
cat("Training set 2 size: "); print(dim(data_review_restaurant.train_2))

Training set 1 size: [1] 120000
Training set 2 size: [1] 120000      9


In [10]:
data_review_restaurant.train_1[3]

In [11]:
write.csv(data_review_restaurant.test_1, "data_review_restaurant.train_1.csv", row.names = FALSE)
write.csv(data_review_restaurant.test_2, "data_review_restaurant.train_2.csv", row.names = FALSE)
write.csv(data_review_restaurant.test, "data_review_restaurant.test.csv", row.names = FALSE)
write.csv(data_review_restaurant, "data_review_restaurant.csv", row.names = FALSE)

In [12]:
train.5stars <- subset(data_review_restaurant.train_2, stars == 5)$text
train.4stars <- subset(data_review_restaurant.train_2, stars == 4)$text
train.3stars <- subset(data_review_restaurant.train_2, stars == 3)$text
train.2stars <- subset(data_review_restaurant.train_2, stars == 2)$text
train.1stars <- subset(data_review_restaurant.train_2, stars == 1)$text
cat("Number of 5-star reviews in test set 2: "); print(length(train.5stars))
cat("Number of 4-star reviews in test set 2: "); print(length(train.4stars))
cat("Number of 3-star reviews in test set 2: "); print(length(train.3stars))
cat("Number of 2-star reviews in test set 2: "); print(length(train.2stars))
cat("Number of 1-star reviews in test set 2: "); print(length(train.1stars))

Number of 5-star reviews in train set 2: [1] 54966
Number of 4-star reviews in train set 2: [1] 26899
Number of 3-star reviews in train set 2: [1] 13524
Number of 2-star reviews in train set 2: [1] 10149
Number of 1-star reviews in train set 2: [1] 14462


In [13]:
write.csv(test.5stars, "train.5stars.csv", row.names = FALSE)
write.csv(test.4stars, "train.4stars.csv", row.names = FALSE)
write.csv(test.3stars, "train.3stars.csv", row.names = FALSE)
write.csv(test.2stars, "train.2stars.csv", row.names = FALSE)
write.csv(test.1stars, "train.1stars.csv", row.names = FALSE)

In [18]:
test.5stars <- subset(data_review_restaurant.test, stars == 5)$text
test.4stars <- subset(data_review_restaurant.test, stars == 4)$text
test.3stars <- subset(data_review_restaurant.test, stars == 3)$text
test.2stars <- subset(data_review_restaurant.test, stars == 2)$text
test.1stars <- subset(data_review_restaurant.test, stars == 1)$text
cat("Number of 5-star reviews in test set 2: "); print(length(test.5stars))
cat("Number of 4-star reviews in test set 2: "); print(length(test.4stars))
cat("Number of 3-star reviews in test set 2: "); print(length(test.3stars))
cat("Number of 2-star reviews in test set 2: "); print(length(test.2stars))
cat("Number of 1-star reviews in test set 2: "); print(length(test.1stars))

Number of 5-star reviews in test set 2: [1] 14079
Number of 4-star reviews in test set 2: [1] 6898
Number of 3-star reviews in test set 2: [1] 3415
Number of 2-star reviews in test set 2: [1] 2548
Number of 1-star reviews in test set 2: [1] 3634


In [19]:
write.csv(test.5stars, "test.5stars.csv", row.names = FALSE)
write.csv(test.4stars, "test.4stars.csv", row.names = FALSE)
write.csv(test.3stars, "test.3stars.csv", row.names = FALSE)
write.csv(test.2stars, "test.2stars.csv", row.names = FALSE)
write.csv(test.1stars, "test.1stars.csv", row.names = FALSE)

In [23]:
topic_dist_train_1_5_df <- read.csv("topic_dist_train_1_5_df.csv", stringsAsFactors=FALSE)

In [24]:
topic_dist_test_1_5_df <- read.csv("topic_dist_test_1_5_df.csv", stringsAsFactors=FALSE)

In [25]:
glm_model <- glm(Star ~ ., data=topic_dist_train_1_5_df)
cat("Logistic rgression:\n"); print(glm_model); cat("\n")

predictions <- predict(glm_model, topic_dist_test_1_5_df)
print(head(round(predictions)))
print(length(predictions))
mround <- function(base, x) {base * round(x/base)}
predictions <- mround(5, predictions)
predictions[predictions==0] <- 1
      
m <- nrow(topic_dist_test_1_5_df)

# Confusion matrix
confusion_matrix <- table(topic_dist_test_1_5_df$Star, predictions)
cat("Confusion matrix: "); print(confusion_matrix); cat("\n")

accuracy <- (sum(predictions == topic_dist_test_1_5_df$Star))/m
cat("Accuracy: "); print(accuracy)

Logistic rgression:

Call:  glm(formula = Star ~ ., data = topic_dist_train_1_5_df)

Coefficients:
(Intercept)       Topic1       Topic2       Topic3       Topic4       Topic5  
    -0.4852       3.9029       3.6205       5.8495       4.3417       6.4065  
     Topic6       Topic7       Topic8       Topic9      Topic10      Topic11  
     5.6346       4.1862       1.1459       2.3335       2.9807       6.4871  
    Topic12      Topic13      Topic14      Topic15      Topic16      Topic17  
     1.2850       3.5980       3.5279       4.7605       7.6908       7.2072  
    Topic18      Topic19      Topic20      Topic21      Topic22      Topic23  
     5.9122       4.5987       5.4266       5.4167       3.6365       1.6975  
    Topic24      Topic25      Topic26      Topic27      Topic28      Topic29  
     7.0411      -1.3068       2.0934       2.4576       3.2785       5.6007  
    Topic30  
     1.3254  

Degrees of Freedom: 27692 Total (i.e. Null);  27662 Residual
Null Deviance:	    11