-
Notifications
You must be signed in to change notification settings - Fork 4
/
script5.Rout
174 lines (164 loc) · 5.63 KB
/
script5.Rout
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
R version 3.1.0 (2014-04-10) -- "Spring Dance"
Copyright (C) 2014 The R Foundation for Statistical Computing
Platform: x86_64-unknown-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library("rjson")
> library("irlba")
>
> businessDatasetFile <- "Dataset/yelp_academic_dataset_business.json"
> businessDatasetFileHandle <- file(businessDatasetFile,open="r")
> businessDatasetFileLines <- readLines(businessDatasetFileHandle)
> numberOfBusinesses <- length(businessDatasetFileLines)
> businesses <- matrix(,nrow=1,ncol=numberOfBusinesses)
> for (i in 1:numberOfBusinesses) {
+ businessJSON <- fromJSON(businessDatasetFileLines[i],method="C")
+ businesses[i] <- businessJSON$business_id
+ }
> close(businessDatasetFileHandle)
> closeAllConnections()
> remove('businessDatasetFile', 'businessDatasetFileHandle', 'businessDatasetFileLines', 'businessJSON')
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1064531 56.9 1710298 91.4 1406117 75.1
Vcells 1640575 12.6 4470640 34.2 4181561 32.0
>
> userDatasetFile <- "Dataset/yelp_academic_dataset_user.json"
> userDatasetFileHandle <- file(userDatasetFile,open="r")
> userDatasetFileLines <- readLines(userDatasetFileHandle)
> numberOfUsers <- length(userDatasetFileLines)
> users <- matrix(,nrow=1,ncol=numberOfUsers)
> for (i in 1:numberOfUsers) {
+ userJSON <- fromJSON(userDatasetFileLines[i],method="C")
+ users[i] <- userJSON$user_id
+ }
> close(userDatasetFileHandle)
> closeAllConnections()
> remove('userDatasetFile', 'userDatasetFileHandle', 'userDatasetFileLines', 'userJSON')
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1135349 60.7 1835812 98.1 1835812 98.1
Vcells 2060082 15.8 7349357 56.1 6919706 52.8
>
> ptm <- proc.time()
> reviewDatasetFile <- "Dataset/yelp_academic_dataset_review.json"
> reviewDatasetFileHandle <- file(reviewDatasetFile,open="r")
> reviewDatasetFileLines <- readLines(reviewDatasetFileHandle)
> numberOfReviews <- length(reviewDatasetFileLines)
> reviews <- matrix(0,nrow=numberOfUsers,ncol=numberOfBusinesses)
> for (i in 1:numberOfReviews) {
+ reviewJSON <- fromJSON(reviewDatasetFileLines[i],method="C")
+ reviews[which(users==reviewJSON$user_id, arr.ind=TRUE)[2], which(businesses==reviewJSON$business_id, arr.ind=TRUE)[2]] <- reviewJSON$stars
+ }
> close(reviewDatasetFileHandle)
> closeAllConnections()
> remove('reviewDatasetFile', 'reviewDatasetFileHandle', 'reviewDatasetFileLines', 'reviewJSON')
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1135621 60.7 2251281 120.3 2251281 120.3
Vcells 1106136698 8439.2 1647752592 12571.4 1285875890 9810.5
> ptm <- proc.time() - ptm
> cat ("Time taken to build the reviews matrix\n")
Time taken to build the reviews matrix
> print(ptm)
user system elapsed
936.925 2.566 939.668
>
> ptm <- proc.time()
> reviewsSVDRank5 <- irlba(reviews, nu=5, nv=5)
> ptm <- proc.time() - ptm
> cat ("Time taken to perform SVD using irlba\n")
Time taken to perform SVD using irlba
> print(ptm)
user system elapsed
226.150 0.000 226.396
>
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1139614 60.9 2251281 120.3 2251281 120.3
Vcells 1106571543 8442.5 1647752592 12571.4 1285875890 9810.5
>
> reviewsSVDRank5$D <- diag(reviewsSVDRank5$d)
>
> ptm <- proc.time()
> weightMatrix <- matrix(0,nrow=numberOfUsers,ncol=numberOfBusinesses)
> Wlocations <- which (reviews != 0, arr.ind=T)
> for (i in 1:dim(Wlocations)[1]) {
+ weightMatrix[Wlocations[i,1], Wlocations[i,2]] = 1
+ }
> remove('Wlocations')
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1139636 60.9 2251281 120.3 2251281 120.3
Vcells 2210254519 16863.0 3479879471 26549.4 3315789536 25297.5
> ptm <- proc.time() - ptm
> cat ("Time taken to calculate weight matrix\n")
Time taken to calculate weight matrix
> print(ptm)
user system elapsed
26.144 2.615 28.776
>
> save.image()
>
> minRank <- 0
> minNorm <- -1
>
> ptm <- proc.time()
> for (i in 2:5) {
+ u <- reviewsSVDRank5$u[,1:i]
+ D <- reviewsSVDRank5$D[1:i,1:i]
+ v <- t(reviewsSVDRank5$v[,1:i])
+ T <- D %*% v
+ remove('D', 'v')
+ LRARank <- u %*% T
+ remove('u', 'T')
+ W <- LRARank * weightMatrix
+ diff <- reviews - W
+ remove('W')
+ fNorm <- norm(diff, "F")
+ remove('diff')
+ cat ("Fnorm for Rank ", i, " = ", fNorm, "\n")
+ if (minNorm == -1) {
+ minNorm <- fNorm
+ minRank <- i
+ } else if (minNorm > fNorm) {
+ minNorm <- fNorm
+ minRank <- i
+ }
+ gc()
+ }
Fnorm for Rank 2 = 2212.238
Fnorm for Rank 3 = 2203.021
Fnorm for Rank 4 = 2196.231
Fnorm for Rank 5 = 2189.701
> remove('LRARank', 'fNorm')
>
> save.image()
>
> ptm <- proc.time() - ptm
> cat ("Time taken to calculate fnorm for each rank\n")
Time taken to calculate fnorm for each rank
> print(ptm)
user system elapsed
380.792 251.493 3973.508
>
> cat("\n\n")
> cat("***************")
***************> cat("Rank with minimum fNorm is ", minRank, "\n")
Rank with minimum fNorm is 5
> cat("Lowest fNorm is ", minNorm, "\n")
Lowest fNorm is 2189.701
> cat("***************")
***************> cat("\n\n")
>
> proc.time()
user system elapsed
1870.601 256.881 5470.266