Skip to content

Commit

Permalink
part 3 - nltk data, text processing, requests
Browse files Browse the repository at this point in the history
  • Loading branch information
mjhea0 committed Mar 16, 2016
1 parent 1f9bfa7 commit 29df7ba
Show file tree
Hide file tree
Showing 6 changed files with 61,837 additions and 11 deletions.
65 changes: 54 additions & 11 deletions app.py
@@ -1,24 +1,67 @@
from flask import Flask
from flask.ext.sqlalchemy import SQLAlchemy
import os
import requests
import operator
import re
import nltk
from flask import Flask, render_template, request
from flask.ext.sqlalchemy import SQLAlchemy
from stop_words import stops
from collections import Counter
from bs4 import BeautifulSoup


# Create the Flask application and load its settings from the configuration
# object named by the APP_SETTINGS environment variable (a KeyError is raised
# here if that variable is not set).
app = Flask(__name__)
app.config.from_object(os.environ['APP_SETTINGS'])
# NOTE(review): this flag is assigned twice; the second assignment wins, so
# the effective value is True. The duplicate looks like a leftover from a
# diff -- confirm which value is intended and drop the other line.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
# Database handle bound to the application.
db = SQLAlchemy(app)

# Imported only after `db` exists. Presumably models.py imports `db` from
# this module, so hoisting this import to the top of the file would create a
# circular import -- verify before moving it.
from models import Result


@app.route('/')
def hello():
    """Return the fixed greeting served at the site root."""
    greeting = "Hello World!"
    return greeting


@app.route('/<name>')
def hello_name(name):
    """Return a greeting that embeds the ``name`` segment of the URL path."""
    template = "Hello {}!"
    return template.format(name)
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the word-count page and, on POST, count the words at a URL.

    On GET the template is rendered with no errors and no results.

    On POST the ``url`` form field is fetched with ``requests``, the page
    text is tokenized with NLTK, tokens without any ASCII letter are
    discarded, and two frequency tables are built: one over all remaining
    words and one with the words in ``stops`` removed. Both tables are
    stored in a ``Result`` row; the ten most frequent non-stop words are
    passed to the template as ``results`` (a list of ``(word, count)``
    tuples, most frequent first).

    Returns:
        The rendered ``index.html`` template. ``errors`` is a list of
        user-facing messages; ``results`` is omitted when the URL could not
        be fetched at all.
    """
    fetch_error = (
        "Unable to get URL. Please make sure it's valid and try again."
    )
    errors = []
    results = {}
    if request.method == "POST":
        # get url that the person has entered
        try:
            url = request.form['url']
            r = requests.get(url)
        except (KeyError, requests.exceptions.RequestException):
            # KeyError: the form has no 'url' field.
            # RequestException: base class of every error requests raises
            # itself (bad schema, invalid URL, connection failure, ...).
            errors.append(fetch_error)
            return render_template('index.html', errors=errors)
        if r.ok:
            # text processing
            # Name the parser explicitly so the extracted text does not
            # depend on which optional parsers happen to be installed.
            raw = BeautifulSoup(r.text, 'html.parser').get_text()
            # set the path -- only once, otherwise the module-level search
            # list grows by one entry on every request
            nltk_data_dir = './nltk_data/'
            if nltk_data_dir not in nltk.data.path:
                nltk.data.path.append(nltk_data_dir)
            tokens = nltk.word_tokenize(raw)
            # remove punctuation, count raw words
            nonPunct = re.compile('.*[A-Za-z].*')
            raw_words = [w for w in tokens if nonPunct.match(w)]
            raw_word_count = Counter(raw_words)
            # stop words
            no_stop_words = [w for w in raw_words if w.lower() not in stops]
            no_stop_words_count = Counter(no_stop_words)
            # ten most frequent words, highest count first; ties keep
            # first-seen order
            results = no_stop_words_count.most_common(10)
            # save the results
            try:
                result = Result(
                    url=url,
                    result_all=raw_word_count,
                    result_no_stop_words=no_stop_words_count
                )
                db.session.add(result)
                db.session.commit()
            except Exception:
                # Best effort: the counts are still shown to the user. Roll
                # back so the session is usable for the next request.
                db.session.rollback()
                errors.append("Unable to add item to database.")
        else:
            # The server answered with a 4xx/5xx status. Report it instead
            # of silently rendering an empty page.
            errors.append(fetch_error)
    return render_template('index.html', errors=errors, results=results)


if __name__ == '__main__':
Expand Down
Binary file added nltk_data/tokenizers/punkt/PY3/english.pickle
Binary file not shown.

0 comments on commit 29df7ba

Please sign in to comment.