Add Exercise 6

rickwierenga · Jul 10, 2019 · aa7d76f · aa7d76f
1 parent d5599ec
commit aa7d76f
Show file tree

Hide file tree

Showing 11 changed files with 3,576 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -14,12 +14,21 @@ Make sure you have jupyter notebooks installed. You can find instructions [here]
 * [Matplotlib](https://matplotlib.org)
 * [Pandas](https://pandas.pydata.org)
 * [Pillow](https://python-pillow.org)
+* [Natural Language Toolkit](http://www.nltk.org)
 
 ## Instructions
 1. Please download the exercises (pdf) from the Coursera course. Some instructions are included in the Notebooks. 
 2. Complete the exercises in the exercises Notebook.
 3. Compare your answers to the code in solutions Notebook.
 
+## Contents
+1. Linear Regression
+2. Logistic Regression & Regularization
+3. Multiclass Classification & Neural Networks
+4. Neural Networks Learning
+5. Regularized Linear Regression and Bias vs. Variance
+6. Support Vector Machines
+
 ## Copyright Notice
 All code, exercises, data and other files in this repo are ©Stanford University. If you are unhappy about me hosting these files on GitHub for educational purposes, please send me an email.
 

diff --git a/ex6/Support Vector Machines (Exercises).ipynb b/ex6/Support Vector Machines (Exercises).ipynb
diff --git a/ex6/Support Vector Machines (Solutions).ipynb b/ex6/Support Vector Machines (Solutions).ipynb
diff --git a/ex6/emailSample1.txt b/ex6/emailSample1.txt
@@ -0,0 +1,10 @@
+> Anyone knows how much it costs to host a web portal ?
+>
+Well, it depends on how many visitors you're expecting.
+This can be anywhere from less than 10 bucks a month to a couple of $100. 
+You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
+if youre running something big..
+
+To unsubscribe yourself from this mailing list, send an email to:
+groupname-unsubscribe@egroups.com
+
diff --git a/ex6/ex6data1.mat b/ex6/ex6data1.mat
diff --git a/ex6/ex6data2.mat b/ex6/ex6data2.mat
diff --git a/ex6/ex6data3.mat b/ex6/ex6data3.mat
diff --git a/ex6/process_email.py b/ex6/process_email.py
@@ -0,0 +1,65 @@
+import re
+from nltk.stem import PorterStemmer
+
+# Load vocabulary
+vocabulary = []
+with open('vocab.txt', 'r') as f:
+    lines = f.readlines()
+    vocabulary = [line[:-1] for line in lines]
+
+def process_email(email_contents):
+    """ preprocesses a the body of an email and returns a list of word_indices """
+    global vocabulary
+
+    # ----- Preprocess email -----
+
+    # Lower case
+    email_contents = email_contents.lower()
+
+    # Remove all HTML
+    html = re.compile(r'<[^<>]+>')
+    email_contents = html.sub('', email_contents)
+
+    # Handle numbers
+    numbers = re.compile(r'[0-9]+')
+    email_contents = numbers.sub('number', email_contents)
+
+    # Handle URLs
+    urls = re.compile(r'(http|https)://[^\s]*')
+    email_contents = urls.sub('httpaddr', email_contents)
+
+    # Email
+    email_addresses = re.compile(r'[^\s]+@[^\s]+')
+    email_contents = email_addresses.sub('emailaddr', email_contents)
+
+    # Dollar sign
+    dollar_sign = re.compile(r'[$]+')
+    email_contents = dollar_sign.sub('dollar', email_contents)
+
+    # ----- Tokenize email -----
+    tokens = []
+    words = re.split(r"\s|\@|\$|\/|\#|\.|\-|\:|\&|\*|\+|\=|\[|\]|\?|\!|\(|\)|\{|\}|\,|\'|\'|\"|\>|\_|\<|\;|\%", email_contents)
+
+    stemmer = PorterStemmer()
+
+    for word in words:
+        # Remove nonalphanumeric characters
+        alphanumeric = re.compile(r'[^a-zA-Z0-9]')
+        word = alphanumeric.sub('', word)
+
+        # Stem the word
+        word = stemmer.stem(word)
+
+        # Get index if it exists
+        if word in vocabulary:
+            tokens.append(vocabulary.index(word))
+
+    return tokens
+
+if __name__ == '__main__':
+    # Run a test.
+    # Note: the indexes are all 1 lower than the exercise, because Python is 0-indexed.
+    with open('emailSample1.txt', 'r') as email:
+        email_contents = email.read()
+        tokens = process_email(email_contents)
+        print(tokens)
diff --git a/ex6/spamTest.mat b/ex6/spamTest.mat
diff --git a/ex6/spamTrain.mat b/ex6/spamTrain.mat