Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d5599ec
commit aa7d76f
Showing
11 changed files
with
3,576 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
> Anyone knows how much it costs to host a web portal ? | ||
> | ||
Well, it depends on how many visitors you're expecting. | ||
This can be anywhere from less than 10 bucks a month to a couple of $100. | ||
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 | ||
if youre running something big.. | ||
|
||
To unsubscribe yourself from this mailing list, send an email to: | ||
groupname-unsubscribe@egroups.com | ||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import re | ||
from nltk.stem import PorterStemmer | ||
|
||
# Load vocabulary: one entry per line of vocab.txt, kept in file order so that
# a word's list position is its index.
# Only the line terminator is stripped (instead of blindly dropping the last
# character), so a final line without a trailing newline keeps its last letter.
with open('vocab.txt', 'r') as f:
    vocabulary = [line.rstrip('\n') for line in f]
|
||
def process_email(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    The text is lower-cased, HTML tags are removed, and numbers, URLs, email
    addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on whitespace and punctuation; each piece is reduced to its
    alphanumeric characters, stemmed with ``PorterStemmer``, and looked up
    in the module-level ``vocabulary`` list.

    Args:
        email_contents: The email body as a string.

    Returns:
        A list of 0-based positions in ``vocabulary``, one per recognised
        word, in the order the words appear in the email. Words that are not
        in the vocabulary are skipped.
    """
    # ----- Preprocess email -----

    # Lower case
    text = email_contents.lower()

    # Remove all HTML
    text = re.sub(r'<[^<>]+>', '', text)

    # Handle numbers. This runs before the dollar-sign step, so "$100"
    # ends up as "dollarnumber".
    text = re.sub(r'[0-9]+', 'number', text)

    # Handle URLs
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)

    # Handle email addresses
    text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)

    # Dollar sign
    text = re.sub(r'[$]+', 'dollar', text)

    # ----- Tokenize email -----
    words = re.split(r"\s|\@|\$|\/|\#|\.|\-|\:|\&|\*|\+|\=|\[|\]|\?|\!|\(|\)|\{|\}|\,|\'|\'|\"|\>|\_|\<|\;|\%", text)

    # Build the word -> position table once per call instead of scanning the
    # vocabulary list twice for every token. setdefault keeps the first
    # position of a duplicated entry, matching list.index().
    word_index = {}
    for position, entry in enumerate(vocabulary):
        word_index.setdefault(entry, position)

    # Loop-invariant helpers are created once, outside the loop.
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    stemmer = PorterStemmer()

    tokens = []
    for word in words:
        # Remove nonalphanumeric characters
        word = non_alphanumeric.sub('', word)

        # Adjacent delimiters produce empty fragments; they are not words and
        # must never be matched against the vocabulary.
        if not word:
            continue

        # Stem the word
        word = stemmer.stem(word)

        # Get index if it exists
        position = word_index.get(word)
        if position is not None:
            tokens.append(position)

    return tokens
|
||
if __name__ == '__main__':
    # Smoke test: tokenize the bundled sample email and print the indices.
    # Note: every index is 1 lower than in the exercise, because Python lists
    # are 0-indexed.
    with open('emailSample1.txt', 'r') as sample_file:
        sample_text = sample_file.read()
    print(process_email(sample_text))
Binary file not shown.
Binary file not shown.
Oops, something went wrong.