In [1]:
import spacy

import os

In [4]:
# Multi-sentence sample text used by the parsing examples.
multiSentence = (
    "This restaurant is a step above the rest with its well-balanced menu and "
    "ability to create innovative versions of traditional recipes. "
    "This is thanks to a critical and non-nostalgic approach and a focus on light cuisine. "
    "The Osteria Francescana has consolidated its position as one of Italy's leading "
    "restaurants, with a talented chef whose reputation is truly international."
)

In [None]:
# `English` is not made available by the bare `import spacy` at the top of the
# notebook; import it explicitly so this cell runs on a fresh kernel instead of
# raising NameError.
from spacy.en import English

# note: the first time you run spaCy in a file it takes a little while to load up its modules
parser = English()
# all you have to do to parse text is call the parser on it;
# the text is converted to unicode before being handed over
parsedData = parser(unicode(multiSentence))

In [28]:
# Let's look at the tokens
# All you have to do is iterate through the parsedData
# Each token is an object with lots of different properties
# A property with an underscore at the end returns the string representation
# while a property without the underscore returns an index (int) into spaCy's vocabulary
# The probability estimate is based on counts from a 3 billion word
# corpus, smoothed using the Simple Good-Turing method.
for i, token in enumerate(parsedData):
    print"original:", token.orth, token.orth_
    print"lowercased:", token.lower, token.lower_
    print"lemma:", token.lemma, token.lemma_
    print"shape:", token.shape, token.shape_
    print"prefix:", token.prefix, token.prefix_
    print"suffix:", token.suffix, token.suffix_
    print"log probability:", token.prob
    print"Brown cluster id:", token.cluster
    print"----------------------------------------"
    if i > 1:
        break
        
# Let's look at the sentences
sents = []
# the "sents" property returns spans
# spans have indices into the original string
# where each index value represents a token
for span in parsedData.sents:
    # go from the start to the end of each span, returning each token in the sentence
    # combine each token using join()
    sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip()
    sents.append(sent)

for sentence in sents:
    print sentence
    print '-`-`-`-`-`-`-`-`-`-`-`-`-`-`-`-'
    
# Let's look at the part of speech tags of the first sentence
for span in parsedData.sents:
    sent = [parsedData[i] for i in range(span.start, span.end)]
    break

for token in sent:
    print token.orth_, token.pos_
    
# Let's look at the dependencies of this example:
example = "The boy with the spotted dog quickly ran after the firetruck."
parsedEx = parser(unicode(example))
# shown as: original token, dependency tag, head word, left dependents, right dependents
for token in parsedEx:
    print token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], \
          [t.orth_ for t in token.rights]
    print '\n'
    
# Let's look at the named entities of this example:
example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October."
parsedEx = parser(unicode(example))
for token in parsedEx:
    print token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)"

print("-------------- entities only ---------------")
# if you just want the entities and nothing else, you can do access the parsed examples "ents" property like this:
ents = list(parsedEx.ents)
for entity in ents:
    print entity.label, entity.label_, ' '.join(t.orth_ for t in entity)
    
messyData = "lol that is rly funny :) This is gr8 i rate it 8/8!!!"
parsedData = parser(unicode(messyData))
for token in parsedData:
    print token.orth_, token.pos_, token.lemma_
    
# it does pretty well! Note that it does fail on the token "gr8", 
# taking it as a verb rather than an adjective meaning "great"
# and "lol" probably isn't a noun...it's more like an interjection

original: 956 lol
lowercased: 956 lol
lemma: 956 lol
shape: 28983 xxx
prefix: 10443 l
suffix: 956 lol
log probability: -8.62125778198
Brown cluster id: 0
----------------------------------------
original: 475 that
lowercased: 475 that
lemma: 475 that
shape: 53740 xxxx
prefix: 3598 t
suffix: 2768 hat
log probability: -4.46450471878
Brown cluster id: 84
----------------------------------------
original: 474 is
lowercased: 474 is
lemma: 488 be
shape: 21581 xx
prefix: 570 i
suffix: 474 is
log probability: -4.45774888992
Brown cluster id: 762
----------------------------------------
lol that is rly funny :)
-`-`-`-`-`-`-`-`-`-`-`-`-`-`-`-
This is gr8
-`-`-`-`-`-`-`-`-`-`-`-`-`-`-`-
i rate it 8/8!!!
-`-`-`-`-`-`-`-`-`-`-`-`-`-`-`-
lol NOUN
that ADJ
is VERB
rly ADV
funny ADJ
:) PUNCT
The det boy [] []


boy nsubj ran [u'The'] [u'with']


with prep boy [] [u'dog']


the det dog [] []


spotted amod dog [] []


dog pobj with [u'the', u'spotted'] []


quickly advmod ran [] []


ran ROOT ran [u'b

In [18]:
from numpy import dot
from numpy.linalg import norm

# you can access known words from the parser's vocabulary
# Use a unicode key: under Python 2 a plain 'NASA' literal is a byte string, and
# the vocabulary lookup then appears to treat it as an integer id, which matches
# the recorded "TypeError: an integer is required".
nasa = parser.vocab[u'NASA']


def cosine(v1, v2):
    """Return the cosine similarity of the vectors v1 and v2."""
    return dot(v1, v2) / (norm(v1) * norm(v2))


# gather all known words that have a vector, take only the lowercased versions,
# and leave out NASA itself
allWords = list({w for w in parser.vocab
                 if w.has_repvec and w.orth_.islower() and w.lower_ != "nasa"})

# sort by similarity to NASA, most similar first
nasaVector = nasa.repvec
allWords.sort(key=lambda w: cosine(w.repvec, nasaVector), reverse=True)
print("Top 10 most similar words to NASA:")
for word in allWords[:10]:
    print(word.orth_)

TypeError: an integer is required

In [19]:
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
# Unicode keys are used for the vocabulary lookups: under Python 2 a plain
# 'king' literal is a byte string, which matches the recorded
# "TypeError: an integer is required" for this cell.
king = parser.vocab[u'king']
man = parser.vocab[u'man']
woman = parser.vocab[u'woman']

result = king.repvec - man.repvec + woman.repvec

# gather all known words that have a vector, take only the lowercased versions,
# and leave out the three words the query was built from
queryWords = (u"king", u"man", u"woman")
allWords = list({w for w in parser.vocab
                 if w.has_repvec and w.orth_.islower() and w.lower_ not in queryWords})
# sort by similarity to the result, most similar first
allWords.sort(key=lambda w: cosine(w.repvec, result), reverse=True)
print("\n----------------------------\nTop 3 closest results for king - man + woman:")
for word in allWords[:3]:
    print(word.orth_)

# the answer we are hoping to see at the top is "queen"

TypeError: an integer is required

In [30]:
from subject_object_extraction import findSVOs

# can still work even without punctuation
# The text is passed as unicode, like every other parser call in this notebook
# (those wrap their input in unicode()); a bare "..." literal is a byte string
# under Python 2.
parse = parser(u"he and his brother shot me and my sister")
print(findSVOs(parse))

ImportError: No module named subject_object_extraction

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re

# A custom stoplist: NLTK's English stopwords, scikit-learn's English stopwords,
# and a few extra tokens
STOPLIST = set(stopwords.words('english')) | {"n't", "'s", "'m", "ca"} | set(ENGLISH_STOP_WORDS)
# List of symbols we don't care about: every ASCII punctuation character plus a
# few multi-character tokens.
# The curly quotes (U+201C, U+201D) are written as unicode literals: under
# Python 2 a plain "..." literal holding them is a byte string, which never
# compares equal to the unicode tokens it is checked against, so they would
# never be filtered out.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", u"\u201c", u"\u201d", "'ve"]

# Every step in a pipeline needs to be a "transformer".
# This one wraps cleanText so it can be used as a pipeline step.
class CleanTextTransformer(TransformerMixin):
    """
    Pipeline step that applies cleanText to every input text
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return self."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding cleanText(text) for each text in X."""
        cleaned = []
        for text in X:
            cleaned.append(cleanText(text))
        return cleaned

    def get_params(self, deep=True):
        """This step has no parameters; always return an empty dict."""
        return {}
    
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    """Return a normalized copy of one raw text sample.

    Steps, in order: strip surrounding whitespace and turn newlines and carriage
    returns into spaces; replace twitter @mentions with "@MENTION"; replace the
    HTML entities &amp;, &gt; and &lt;; lowercase the result.
    """
    cleaned = text.strip()
    for lineBreak in ("\n", "\r"):
        cleaned = cleaned.replace(lineBreak, " ")

    # twitter @mentions: "@" followed by 1 to 15 letters, digits or underscores
    cleaned = re.sub(r"@[a-z0-9_]{1,15}", "@MENTION", cleaned, flags=re.IGNORECASE)

    # HTML symbols, replaced in this fixed order
    for entity, replacement in (("&amp;", "and"), ("&gt;", ">"), ("&lt;", "<")):
        cleaned = cleaned.replace(entity, replacement)

    return cleaned.lower()

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    """Return the filtered list of lemmas for one text sample.

    The sample is parsed with the module-level spaCy `parser`. Each token becomes
    its lowercased, stripped lemma, except tokens whose lemma is "-PRON-", which
    keep their lowercased surface form. Entries found in STOPLIST or SYMBOLS, and
    empty or whitespace-only leftovers, are dropped.
    """
    # lemmatize
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in parser(sample)
    ]

    # leftovers that carry no content
    blanks = ("", " ", "\n", "\n\n")

    # stoplist the tokens, stoplist symbols, remove whitespace entries
    return [
        lemma for lemma in lemmas
        if lemma not in STOPLIST
        and lemma not in SYMBOLS
        and lemma not in blanks
    ]

def printNMostInformative(vectorizer, clf, N):
    """Print the N most informative features for each of the two classes.

    Each coefficient in clf.coef_[0] is paired with the feature name from
    vectorizer.get_feature_names() and the pairs are sorted in ascending order.
    The first N pairs are printed under "Class 1 best: " and the last N pairs,
    largest first, under "Class 2 best: ".
    """
    ranked = sorted(zip(clf.coef_[0], vectorizer.get_feature_names()))
    sections = (
        ("Class 1 best: ", ranked[:N]),
        ("Class 2 best: ", ranked[:-(N + 1):-1]),
    )
    for heading, features in sections:
        print(heading)
        for feat in features:
            print(feat)
    
# the vectorizer and classifier to use
# note: CountVectorizer's tokenizer is replaced by tokenizeText, the custom
# spaCy-based tokenizing function
vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=tokenizeText)
clf = LinearSVC()
# the pipeline to clean, tokenize, vectorize, and classify
pipe = Pipeline([
    ('cleanText', CleanTextTransformer()),
    ('vectorizer', vectorizer),
    ('clf', clf),
])

# data: ten training samples with their labels, in matching order
train = [
    "I love space. Space is great.",
    "Planets are cool. I am glad they exist in space",
    "lol @twitterdude that is gr8",
    "twitter &amp; reddit are fun.",
    "Mars is a planet. It is red.",
    "@Microsoft: y u skip windows 9?",
    "Rockets launch from Earth and go to other planets.",
    "twitter social media &gt; &lt;",
    "@someguy @somegirl @twitter #hashtag",
    "Orbiting the sun is a little blue-green planet.",
]
labelsTrain = [
    "space",
    "space",
    "twitter",
    "twitter",
    "space",
    "twitter",
    "space",
    "twitter",
    "twitter",
    "space",
]

# two held-out samples, one per label
test = [
    "i h8 riting comprehensibly #skoolsux",
    "planets and stars and rockets and stuff",
]
labelsTest = ["twitter", "space"]

# train
pipe.fit(train, labelsTrain)

# test
preds = pipe.predict(test)
print("----------------------------------------------------------------------------------------------")
print("results:")
for (sample, pred) in zip(test, preds):
    # Build one string per line: under Python 2, print("a", b) prints the repr
    # of a tuple rather than the space-separated values.
    print("{} : {}".format(sample, pred))
print("accuracy: {}".format(accuracy_score(labelsTest, preds)))

print("----------------------------------------------------------------------------------------------")
print("Top 10 features used to predict: ")
# show the top features
printNMostInformative(vectorizer, clf, 10)

print("----------------------------------------------------------------------------------------------")
print("The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc")
# let's see what the pipeline was transforming the data into:
# the same cleaning and vectorizing steps, without the classifier
pipe = Pipeline([
    ('cleanText', CleanTextTransformer()),
    ('vectorizer', vectorizer),
])
transform = pipe.fit_transform(train, labelsTrain)

# get the features that the vectorizer learned (its vocabulary)
vocab = vectorizer.get_feature_names()

# For sample i, the slice indptr[i]:indptr[i+1] of `indices` gives the vocabulary
# indexes of its features and the same slice of `data` gives how many times each
# one occurs in the sample.
for i in range(len(train)):
    rowStart, rowEnd = transform.indptr[i], transform.indptr[i + 1]
    indexIntoVocab = transform.indices[rowStart:rowEnd]
    numOccurences = transform.data[rowStart:rowEnd]
    parts = []
    for idx, num in zip(indexIntoVocab, numOccurences):
        parts.append(str((vocab[idx], num)))
    s = "".join(parts)
    print("Sample {}: {}".format(i, s))

----------------------------------------------------------------------------------------------
results:
('i h8 riting comprehensibly #skoolsux', ':', 'twitter')
('planets and stars and rockets and stuff', ':', 'space')
('accuracy:', 1.0)
----------------------------------------------------------------------------------------------
Top 10 features used to predict: 
Class 1 best: 
(-0.53174515364897312, u'planet')
(-0.35387714355466582, u'space')
(-0.21950271095320154, u'mar')
(-0.21950271095320154, u'red')
(-0.15678781478414799, u'earth')
(-0.15678781478414799, u'launch')
(-0.15678781478414799, u'rocket')
(-0.14909820280997324, u'great')
(-0.14909820280997324, u'love')
(-0.099773889976904243, u'blue')
Class 2 best: 
(0.40866468414933055, u'twitter')
(0.35268364630295895, u'@mention')
(0.22672506206603951, u'lol')
(0.22672506206603951, u'gr8')
(0.20433254740639992, u'social')
(0.20433254740639992, u'medium')
(0.2043321367429306, u'reddit')
(0.2043321367429306, u'fun')
(0.1259585842369193



In [2]:
import exifread
# Open image file for reading (binary mode); the with-block closes the file
# handle once the tags have been read instead of leaving it open
with open('/Users/Rebecca/Desktop/cherries.jpg', 'rb') as f:
    # Return Exif tags
    tags = exifread.process_file(f)

In [3]:
# Display the value returned by exifread.process_file for the image opened above
tags

{}

In [14]:
import sys

sys.path.append('/usr/local/lib/python2.7/site-packages')
# USAGE
# python detect_blur.py --images images

# import the necessary packages
from imutils import paths
import argparse
# OpenCV's Python bindings are imported as `cv2`; there is no importable
# `opencv` or `cv3` module, so those imports could never succeed.
import cv2

def variance_of_laplacian(image):
	"""Return the focus measure of `image`: the variance of its Laplacian.

	The Laplacian is computed with cv2.Laplacian using the cv2.CV_64F depth and
	`.var()` is taken of the result.
	"""
	return cv2.Laplacian(image, cv2.CV_64F).var()

# construct the argument parser and parse the arguments
# NOTE(review): parse_args() reads the command line of the running process, so
# this is presumably meant to be run as a script (see USAGE above) rather than
# inside a notebook kernel - confirm.
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--images", required=True,
	help="path to input directory of images")
ap.add_argument("-t", "--threshold", type=float, default=100.0,
	help="focus measures that fall below this value will be considered 'blurry'")
args = vars(ap.parse_args())

# loop over the input images
for imagePath in paths.list_images(args["images"]):
	# load the image; cv2.imread returns None when the file cannot be read,
	# in which case the image is skipped
	image = cv2.imread(imagePath)
	if image is None:
		continue

	# convert it to grayscale, and compute the focus measure of the image
	# using the Variance of Laplacian method
	gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	fm = variance_of_laplacian(gray)
	text = "Not Blurry"

	# if the focus measure is less than the supplied threshold,
	# then the image should be considered "blurry"
	if fm < args["threshold"]:
		text = "Blurry"

	# draw the verdict and focus measure on the image, show it, and wait
	# for a key press before moving on to the next image
	cv2.putText(image, "{}: {:.2f}".format(text, fm), (10, 30),
		cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 3)
	cv2.imshow("Image", image)
	key = cv2.waitKey(0)

ImportError: No module named cv2

In [2]:
# Install imutils, which the blur-detection cell imports for paths.list_images
!pip install imutils

Downloading/unpacking imutils
  Downloading imutils-0.3.6.tar.gz
  Running setup.py egg_info for package imutils
    
Installing collected packages: imutils
  Running setup.py install for imutils
    changing mode of build/scripts-2.7/range-detector from 644 to 755
    
    changing mode of /Users/Rebecca/anaconda/bin/range-detector to 755
  Could not find .egg-info directory in install record for imutils
Successfully installed imutils
Cleaning up...


In [13]:
!pip install opencv
!pip install cv2

Collecting opencv
[31m  Could not find a version that satisfies the requirement opencv (from versions: )[0m
[31mNo matching distribution found for opencv[0m
Collecting cv2
[31m  Could not find a version that satisfies the requirement cv2 (from versions: )[0m
[31mNo matching distribution found for cv2[0m


In [6]:
# List the packages installed in the environment that this pip belongs to
!pip freeze

Exception:
Traceback (most recent call last):
  File "/Users/Rebecca/anaconda/lib/python2.7/site-packages/pip/basecommand.py", line 134, in main
    status = self.run(options, args)
  File "/Users/Rebecca/anaconda/lib/python2.7/site-packages/pip/commands/freeze.py", line 73, in run
    req = pip.FrozenRequirement.from_dist(dist, dependency_links, find_tags=find_tags)
  File "/Users/Rebecca/anaconda/lib/python2.7/site-packages/pip/__init__.py", line 194, in from_dist
    assert len(specs) == 1 and specs[0][0] == '=='
AssertionError

Storing complete log in /Users/Rebecca/.pip/pip.log
