-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextMining_FinalVersion.py
369 lines (326 loc) · 13.1 KB
/
TextMining_FinalVersion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
"""
Mini-Project 3: Text Mining
AUTHOR: SPARSH BANSAL
"""
import Pickling
import pickle
import requests
import string
from string import punctuation
from string import whitespace
from bs4 import BeautifulSoup
import re
import sys
import random
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download, pickle, and re-export the three Project Gutenberg books.
# The Pickling helper lives in a separate module (not shown in this file).
# Every file is opened through a context manager so that it is flushed and
# closed before any later code reads it back.
# NOTE(review): pickle.load can execute arbitrary code when given untrusted
# data; these pickles are presumably produced locally by Pickling.pickler --
# confirm before pointing this at files from another source.

# Book 1 - Voyage To Jupiter
Pickling.pickler('Voyage_To_Jupiter.pickle', 'http://www.gutenberg.org/files/58915/58915-0.txt')
# Reloads the pickled binary file
with open('Voyage_To_Jupiter.pickle', 'rb') as open_voyage:
    reloaded_copy_of_voyage = pickle.load(open_voyage)
# Writes the reloaded content out to a plain text file
with open('pickled_voyage.txt', 'w') as pickled_voyage:
    for line in reloaded_copy_of_voyage:
        pickled_voyage.write(line)

# Book 2 - Dracula
Pickling.pickler('Dracula.pickle', 'http://www.gutenberg.org/ebooks/345.txt.utf-8')
with open('Dracula.pickle', 'rb') as open_dracula:
    reloaded_copy_of_dracula = pickle.load(open_dracula)
with open('pickled_dracula.txt', 'w') as pickled_dracula:
    for line in reloaded_copy_of_dracula:
        pickled_dracula.write(line)

# Book 3 - Alice's Adventures in Wonderland
Pickling.pickler('Alice.pickle', 'http://www.gutenberg.org/ebooks/19033.txt.utf-8')
with open('Alice.pickle', 'rb') as open_alice:
    reloaded_copy_of_alice = pickle.load(open_alice)
with open('pickled_alice.txt', 'w') as pickled_alice:
    for line in reloaded_copy_of_alice:
        pickled_alice.write(line)

# Concatenates the three books into a single file, books.txt
# NOTE(review): the inputs read here (voyage.txt, dracula.txt, alice.txt) are
# not the pickled_*.txt files written above; presumably they are separately
# prepared copies -- confirm that they exist before running.
with open('books.txt', 'w') as books:
    for book_path in ('voyage.txt', 'dracula.txt', 'alice.txt'):
        with open(book_path, 'r') as book:
            for line in book:
                books.write(line)
# Scrapes the HTML file by stripping the HTML tags and the \r and \n characters from every <p> element
def scraper(input_file):
    """Extract the text of every <p> element from an HTML document.

    input_file: the HTML markup (anything BeautifulSoup accepts as input).

    Returns a list of strings, one per <p> element, in document order, with
    all HTML tags and all carriage-return / newline characters removed.
    """
    # Parses the markup with BeautifulSoup's built-in html parser
    soup = BeautifulSoup(input_file, 'html.parser')
    # The parsed paragraphs and the cleaned strings are kept in separate
    # names: reusing a single name for both would replace the list of tags
    # with a string after the first paragraph had been processed.
    return [
        re.sub(r'<.*?>|\r|\n', '', str(paragraph))
        for paragraph in soup.find_all('p')
    ]
# Processes the scraped lines to be (i) all lowercase, (ii) without spaces and line breaks
def get_lines(filename):
    """Read the text file `filename` and return its words as one flat list.

    Every word is lower-cased and stripped of ASCII punctuation
    (string.punctuation). Words are separated by any run of whitespace
    (spaces, tabs, line breaks), and tokens that are empty after cleaning
    (for example a lone '--') are dropped.

    Example: a file containing the line
        My name is Sparsh Bansal. I love Reading Journals!
    yields
        ['my', 'name', 'is', 'sparsh', 'bansal', 'i', 'love', 'reading', 'journals']

    filename: path of the text file to read.
    Returns a list of strings, in the order the words appear in the file.
    """
    # Translation table that deletes every punctuation character in one pass
    strip_punctuation = str.maketrans('', '', string.punctuation)
    words = []
    # The context manager closes the file even if reading fails part-way
    with open(filename) as fp:
        for line in fp:
            # split() without arguments splits on any whitespace, so tabs and
            # carriage returns can never end up embedded inside a word
            for raw_word in line.split():
                cleaned = raw_word.lower().translate(strip_punctuation)
                if cleaned:
                    words.append(cleaned)
    return words
# First Analysis on the processed text file - Frequency of subject related words
def histogram(l):
    """Count how many times each item occurs in the iterable `l`.

    Returns a dictionary mapping every distinct item to its number of
    occurrences.
    """
    counts = {}
    for word in l:
        if word in counts:
            # Item already seen: bump its running total
            counts[word] += 1
        else:
            # First sighting of this item
            counts[word] = 1
    return counts
# __ First Analysis continued __
def most_frequent(l):
    """Rank the words in `l` from most to least frequent.

    Returns a list of (frequency, word) tuples sorted in descending order;
    words that share a frequency are ordered by the word itself, also
    descending.
    """
    # Word -> occurrence count, computed by the histogram helper
    counts = histogram(l)
    # Frequency goes first in each pair so tuple ordering sorts by it
    ranked = [(count, word) for word, count in counts.items()]
    return sorted(ranked, reverse=True)
# Second Analysis on the processed text file - Markov Analysis on a single book text.
# To prevent skewing of the results due to the book header (Project Gutenberg Header)
def skip_file_header(fp):
    """
    Advances the line iterator `fp` past the book header.

    Lines are consumed up to and including the first one that starts with
    'CONTENTS' (the table of contents marker); if no such line exists, the
    iterator is exhausted. Nothing is returned.
    """
    # next() pulls lines through the filter until the first match (or the end);
    # the matched line itself is discarded, only the iterator position matters
    next((line for line in fp if line.startswith('CONTENTS')), None)
# Shared Markov state: maps each prefix (a tuple of words) to the list of words seen right after it
suffix_map = dict()
# Shared Markov state: the most recently processed words, starting out as an empty tuple
prefix = tuple()
# Performs Markov Analysis on the given text and generates text
# Used Think Python 'Markov Analysis - Chapter 13' for reference
# User-defined order
def process_file(filename, order=2):
    """
    Feeds every whitespace-separated word of the file `filename` to
    process_word, which records it in the module-level Markov state.

    filename: path of the text file to read.
    order: number of words per prefix, passed through to process_word.

    The module-level state is added to, not reset, so successive calls
    accumulate into the same model.
    """
    # The context manager guarantees the file is closed once it has been read
    with open(filename) as fp:
        # Skipping the book header is currently disabled:
        # skip_file_header(fp)
        for line in fp:
            # split() without arguments discards leading/trailing whitespace
            # and splits on any run of whitespace in between
            for word in line.split():
                process_word(word, order)
def process_word(word, order=2):
    """
    Records one word in the module-level Markov state (suffix_map, prefix).

    While the current prefix holds fewer than `order` words, the word is only
    appended to the prefix. Afterwards the word is stored as a suffix of the
    current prefix and the prefix window slides forward by one word.
    Nothing is returned; the globals are updated in place.
    """
    global prefix
    if len(prefix) >= order:
        # setdefault creates the suffix list the first time a prefix is seen,
        # so repeated prefixes simply collect more suffixes
        suffix_map.setdefault(prefix, []).append(word)
        # Slides the window on to the next group of words
        prefix = shift(prefix, word)
    else:
        # Still filling the very first prefix
        prefix = prefix + (word,)
# Changes the processing frame to the next pair of words
def shift(t, word):
    """
    Builds the prefix for the next processing frame.

    The first element of the tuple `t` is dropped and `word` is appended at
    the end, so the result has the same length as `t` (or length 1 when `t`
    is empty).
    """
    # Everything except the oldest word
    remaining = t[1:]
    return remaining + (word,)
# Generates the random text from the processed data - Output of the Markov Analysis
def text_generator(n):
    """
    Generates `n` words of random text from the module-level suffix_map,
    prints them separated by spaces and writes them to 'Generated.txt'.

    The walk starts at a prefix chosen uniformly at random from the keys of
    suffix_map. Each step picks a random suffix recorded for the current
    prefix and slides the prefix forward with shift. When the current prefix
    has no recorded suffix (a dead end), the walk restarts from a new random
    prefix; all words are written to the same file.

    Design note from the author: the initial plan was to weight words by how
    frequently they occur in the original text (hence most_frequent and
    histogram). That was dropped because the most common words turned out to
    be common English words (and, the, a, ...), which skewed the output.

    n: number of words to generate.
    Raises IndexError (from random.choice) if suffix_map is empty.
    """
    # Snapshot of the available prefixes, reused for every (re)start
    keys = list(suffix_map.keys())
    current_key = random.choice(keys)
    # A single handle is kept open for the whole run and closed on exit, so
    # restarts never truncate earlier output and the text is flushed to disk
    # before anything else reads the file
    with open('Generated.txt', 'w') as generated_text:
        for _ in range(n):
            suffixes = suffix_map.get(current_key)
            if suffixes is None:
                # Dead end: restart from a fresh random prefix, which always
                # has at least one suffix because it is a key of suffix_map
                current_key = random.choice(keys)
                suffixes = suffix_map[current_key]
            # Chooses a random suffix from those available for this prefix
            word = random.choice(suffixes)
            # Writes every word to the .txt file to keep track of the text
            generated_text.write(str(word) + ' ')
            print(word, end=' ')
            # Moves the prefix window on for the next word
            current_key = shift(current_key, word)
# Third Analysis on the processed text file - Sentiment Analysis on multiple book text.
# Doing Linguistic Post-processing using Vader by the NLTK corpora.
# Calculates the Sentiment scores for all the lines in the file
def analyzer(filename):
    """
    Prints the VADER sentiment scores of every line in the file `filename`.

    Uses the SentimentIntensityAnalyzer from NLTK; one score dictionary is
    printed per line of the file. Nothing is returned.
    Cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social
    Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014
    """
    # Named differently from this function so the function name is not shadowed
    sentiment = SentimentIntensityAnalyzer()
    # The context manager closes the file once every line has been scored
    with open(filename) as fp:
        for line in fp:
            # Calculates and prints the score for each line
            print(sentiment.polarity_scores(line))
# Calculates the average scores from the sentiment analyzer
def average_scores(filename):
    """
    Prints the average VADER sentiment scores over all lines of `filename`.

    Every line of the file is scored with NLTK's SentimentIntensityAnalyzer
    and the 'compound', 'neg', 'neu' and 'pos' values are averaged over the
    number of lines. The result is printed; nothing is returned.

    If the file contains no lines, a short notice is printed instead of
    dividing by zero.
    """
    # Running totals of each score, plus the number of lines scored
    compound = 0
    neg = 0
    neu = 0
    pos = 0
    number = 0
    sentiment = SentimentIntensityAnalyzer()
    # The context manager closes the file once every line has been scored
    with open(filename) as fp:
        for line in fp:
            score = sentiment.polarity_scores(line)
            compound += score['compound']
            neg += score['neg']
            neu += score['neu']
            pos += score['pos']
            number += 1
    if number == 0:
        # An empty file has no average; report that rather than crash
        print('For ' + str(filename) + ' no lines were found, so no Sentiment scores could be computed.')
        return
    # Outputs the average scores for the text file
    string_output = '\n'.join([
        'For ' + str(filename) + ' the Sentiment scores are as follows:',
        'Compound score = ' + str(compound / number),
        'Neg score = ' + str(neg / number),
        'Neu score = ' + str(neu / number),
        'Pos score = ' + str(pos / number),
    ])
    print(string_output)
# Calls the functions in the main screen
# Runs the three analyses when the file is executed as a script
if __name__ == '__main__':
    # (title, text file) for each of the three books
    book_files = [
        ('Voyage to Jupiter', 'voyage.txt'),
        ('Dracula', 'dracula.txt'),
        ('Alice in Wonderland', 'alice.txt'),
    ]

    # Analysis One - most frequent word analysis in the three books
    print('Analysis I')
    for title, path in book_files:
        print('\n')
        print('Histogram of words for ' + title)
        print('\n')
        print(most_frequent(get_lines(path)))

    # Analysis Two - Markov Analysis Text Generator
    print('Analysis II')
    for title, path in book_files + [('3 books combined', 'books.txt')]:
        print('\n')
        print('Markov Analysis for ' + title)
        print('\n')
        # Starts every run from an empty model: process_file only ever adds
        # to the shared state, so without this reset each book's generated
        # text would also draw on all the books processed before it
        suffix_map.clear()
        prefix = ()
        process_file(path, 2)
        print('\n')
        # Each call overwrites Generated.txt, so the file ends up holding
        # the text generated from the combined books
        text_generator(200)

    # Analysis Three - Sentiment Analysis
    # (analyzer(<file>) can be called here instead to print per-line scores)
    print('Analysis III')
    for title, path in book_files + [('Markov Analysis generated text', 'Generated.txt')]:
        print('\n')
        print('Sentiment Analysis for ' + title)
        print('\n')
        average_scores(path)