-
Notifications
You must be signed in to change notification settings - Fork 23
/
IMDB_Paatal_Lok.r
166 lines (123 loc) · 5.12 KB
/
IMDB_Paatal_Lok.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#Text Mining
#Sentiment Analysis
#Extract movie reviews for any movie from IMDB and perform sentiment analysis
library(magrittr)
library(rvest)
library(XML)
# Scrape the user reviews of "Paatal Lok" from IMDB and save them to disk.
rurl <- "https://www.imdb.com/title/tt9680440/reviews?ref_=tt_urv"
# NOTE(review): every request URL is built as "<rurl>=<i>", so it ends in
# "ref_=tt_urv=1", "ref_=tt_urv=2", ... Nothing in this script shows that
# IMDB treats that suffix as a page number, so the ten requests may return
# overlapping reviews -- TODO confirm before trusting the frequency counts
# derived from this data.
page_reviews <- lapply(1:10, function(i) {
  murl <- read_html(paste(rurl, i, sep = "="))
  # Text of every node matching the review CSS class on this page.
  murl %>%
    html_nodes(".show-more__control") %>%
    html_text()
})
# Combine the per-request character vectors once, instead of growing the
# result with c() on every loop iteration.
IMDB_reviews <- unlist(page_reviews)
length(IMDB_reviews)
# One review per row, without row names.
write.table(IMDB_reviews, "Paatal_Lok_Reviews.txt", row.names = FALSE)
# Re-import the saved reviews: choose the review file interactively and read
# it line by line.
rev <- file.choose() %>% readLines()
rev
length(rev)
# Choose the stop-word list interactively and read it the same way.
stpwrd <- file.choose() %>% readLines() #Read-In the stop.txt file
library(tm)
# Build the corpus from the lines that were read, then look at document 2.
rev.corpus <- rev %>%
  VectorSource() %>%
  Corpus()
inspect(rev.corpus[2])
# Data cleaning. Every step takes the result of the previous step
# (rev.corpus1) as its input so that no transformation is thrown away.
# Lower-case first so that the stop-word removal further down matches words
# regardless of their original capitalisation. content_transformer() is tm's
# wrapper for applying a plain character function through tm_map().
rev.corpus1 <- tm_map(rev.corpus, content_transformer(tolower))
inspect(rev.corpus1[6])
# Continue from rev.corpus1 (not rev.corpus); restarting from the raw corpus
# here would discard the lower-casing done above.
rev.corpus1 <- tm_map(rev.corpus1, removePunctuation)
rev.corpus1 <- tm_map(rev.corpus1, removeNumbers)
# Drop the words from the stop-word file, plus "the".
rev.corpus1 <- tm_map(rev.corpus1, removeWords, c(stpwrd, "the"))
# Collapse the runs of whitespace left behind by the removals above.
rev.corpus1 <- tm_map(rev.corpus1, stripWhitespace)
# Build the term-document matrix from the cleaned corpus.
tdm <- rev.corpus1 %>% TermDocumentMatrix()
# The transpose gives the document-term orientation.
dtm <- t(tdm)
# Switch to a plain matrix and total each row, i.e. each term's overall count.
tdm <- as.matrix(tdm)
rows <- tdm %>% rowSums()
rows
# Keep only the terms whose total count is at least 10.
rs_sub <- rows[which(rows >= 10)]
rs_sub
# Bar chart of those frequent terms.
barplot(height = rs_sub, col = rainbow(n = 20), las = 3)
#The Word "watch" is used the most in the Reviews
# Word clouds of the term frequencies.
library(wordcloud)
# windows() exists only on Windows builds of R; on other platforms open the
# default graphics device instead so the script does not stop here.
if (.Platform$OS.type == "windows") windows() else dev.new()
# Cloud restricted to the frequent terms in rs_sub. Terms drawn in larger
# text are the ones used most often in the reviews.
wordcloud(words = names(rs_sub), freq = rs_sub)
# Cloud considering all the terms in rows, ordered from most to least frequent.
rs1 <- sort(rows, decreasing = TRUE)
wordcloud(words = names(rs1), freq = rs1)
# A more attractive cloud: coloured, and drawn with random.order disabled.
wordcloud(
  words = names(rs1), freq = rs1, random.order = FALSE,
  colors = rainbow(30), scale = c(2, 1), rot.per = 0.2
)
# Sentiment analysis: load the positive and negative word lists.
# Both files are chosen interactively; text after ";" on a line is treated as
# a comment and skipped by scan().
pos.words <- scan(file.choose(), what = "character", comment.char = ";") # read-in positive-words.txt
neg.words <- scan(file.choose(), what = "character", comment.char = ";") # read-in negative-words.txt
# Positive word cloud: keep the terms of rs1 that occur in the positive list.
# pos.match is a logical vector with one entry per term in rs1.
pos.match <- names(rs1) %in% pos.words
pos.freq <- rs1[pos.match]
pos.match.words <- names(pos.freq)
# windows() exists only on Windows builds of R; on other platforms open the
# default graphics device instead.
if (.Platform$OS.type == "windows") windows() else dev.new()
wordcloud(words = pos.match.words, freq = pos.freq, colors = rainbow(20), scale = c(3, 1))
#Here the Term Good is used in many of the Reviews
# Negative word cloud: keep the terms of rs1 that occur in the negative list.
# neg.match is a logical vector with one entry per term in rs1.
neg.match <- names(rs1) %in% neg.words
neg.freq <- rs1[neg.match]
neg.match.words <- names(neg.freq)
# windows() exists only on Windows builds of R; on other platforms open the
# default graphics device instead.
if (.Platform$OS.type == "windows") windows() else dev.new()
wordcloud(words = neg.match.words, freq = neg.freq, colors = rainbow(20))
#Here the word "propaganda" is used in most of the Reviews
# Emotion mining with the syuzhet package.
library(syuzhet)
# Convert the review text from UTF-8; the target encoding is left at iconv's
# default.
review <- iconv(rev, from = "UTF-8")
# NRC sentiment scores for the converted text, then a look at the first rows.
x <- review %>% get_nrc_sentiment()
head(x)
head(x, n = 100)
# Split the review text into sentences and inspect the result.
x1 <- rev %>% get_sentences()
class(x1)
str(x1)
head(x1)
# Sentence scores with the "bing" method, followed by their total, mean and
# summary statistics.
sentiment_vector <- x1 %>% get_sentiment(method = "bing")
head(sentiment_vector, n = 100)
sum(sentiment_vector)
mean(sentiment_vector)
summary(sentiment_vector)
# The same figures for the "afinn" method.
afinn_s_v <- x1 %>% get_sentiment(method = "afinn")
head(afinn_s_v, n = 100)
sum(afinn_s_v)
mean(afinn_s_v)
summary(afinn_s_v)
# The same figures for the "nrc" method.
nrc_vector <- x1 %>% get_sentiment(method = "nrc")
head(nrc_vector, n = 100)
sum(nrc_vector)
mean(nrc_vector)
summary(nrc_vector)
# Line plot of the "bing" sentence scores in sentence order, with a red
# reference line at zero.
plot(
  x = sentiment_vector,
  type = "l",
  main = "Plot trajectory",
  xlab = "Narative time",
  ylab = "Emotional valence"
)
abline(h = 0, col = "red")
# The same scores drawn as vertical bars.
plot(
  x = sentiment_vector,
  type = "h",
  main = "Example Plot Trajectory",
  xlab = "Narrative Time",
  ylab = "Emotional Valence"
)
# Sentence with the lowest score (first one, if several tie)
negative <- x1[which.min(sentiment_vector)]
negative
# Sentence with the highest score (first one, if several tie)
positive <- x1[which.max(sentiment_vector)]
positive
# Percentage-based figures computed from the "bing" sentence scores.
percent_vals <- get_percentage_values(sentiment_vector)
plot(
  percent_vals,
  type = "l", main = "Percentage-Based Means",
  xlab = "Narrative Time", ylab = "Emotional Valence", col = "red"
)
# Shape smoothing and normalization using a Fourier-based transformation and
# low-pass filtering, done by get_transformed_values().
ft_values <- get_transformed_values(
  sentiment_vector,
  low_pass_size = 3, x_reverse_len = 100, padding_factor = 2,
  scale_vals = TRUE, scale_range = FALSE
)
# The title names the data set analysed in this script (Paatal Lok reviews).
plot(
  ft_values,
  type = "l", main = "Paatal Lok Reviews using Transformed Values",
  xlab = "Narrative Time", ylab = "Emotional Valence", col = "red"
)
# Score every sentence with the NRC lexicon.
nrc_data <- get_nrc_sentiment(x1)
# NRC scores of the single most negative sentence found earlier.
nrc_score_sent <- get_nrc_sentiment(negative)
# Sentences with a non-zero sadness score; show the first few.
sad_items <- which(nrc_data$sadness > 0)
head(x1[sad_items])
# Horizontal bar plot of each column's share of the overall total, sorted
# ascending.
# NOTE(review): columns 1:10 are plotted -- presumably the eight emotions plus
# the negative/positive columns (confirm against the syuzhet docs); with ten
# bars, col = 1:8 is recycled for the last two.
barplot(
  sort(colSums(prop.table(nrc_data[, 1:10]))),
  horiz = TRUE, cex.names = 0.7, las = 1,
  main = "Emotions", xlab = "Percentage", col = 1:8
)
# Inference from this review set: the web series has almost the same share of
# positive and negative reviews, with the positive ones slightly ahead.