In [115]:
import warnings
warnings.filterwarnings('ignore')

I. Preprocessing
=========

The objective is to **clean** and **standardize** your input data so that it can be manipulated easily afterward.

1. Tokenization
---------------

Split your input into **tokens**, according to specific rules.

In [50]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize import RegexpTokenizer
import nltk.data

**Word tokenizer**

In [6]:
word_tokenize("Welcome to the Data For Good NLP workshops! Hope you'll enjoy it :)")

['Welcome',
 'to',
 'the',
 'Data',
 'For',
 'Good',
 'NLP',
 'workshops',
 '!',
 'Hope',
 'you',
 "'ll",
 'enjoy',
 'it',
 ':',
 ')']

**Tweet tokenizer**

In [8]:
tokenizer = TweetTokenizer()
tweet = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
tokenizer.tokenize(tweet)

[u'This',
 u'is',
 u'a',
 u'cooool',
 u'#dummysmiley',
 u':',
 u':-)',
 u':-P',
 u'<3',
 u'and',
 u'some',
 u'arrows',
 u'<',
 u'>',
 u'->',
 u'<--']

In [9]:
tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
excited_tweet = '@snips: This workshop is sooooooooo cool!!!!!!'
tokenizer.tokenize(excited_tweet)

[u':', u'This', u'workshop', u'is', u'sooo', u'cool', u'!', u'!', u'!']

**Multi-word Expression tokenizer**

In [19]:
tokenizer = MWETokenizer([('Data','For','Good'), ('natural', 'language', 'processing')])
tokenizer.add_mwe(('wednesday', 'evening'))
tokenizer.tokenize('Snips is hosting Data For Good natural language processing workshops on wednesday evening'.split(),)

['Snips',
 'hosting',
 'Data_For_Good',
 'natural_language_processing',
 'workshops',
 'on',
 'wednesday_evening']

**Sentence tokenizer**

In [58]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
text = '''
This sentence tokenizer knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.  And sometimes sentences can start with non-capitalized 
words.  i is a good variable name.
'''
sentences = sent_detector.tokenize(text.strip())
print('\n-----\n'.join(sentences))

This sentence tokenizer knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
-----
And sometimes sentences can start with non-capitalized 
words.
-----
i is a good variable name.


**Regexp tokenizer**

In [31]:
tokenizer = RegexpTokenizer('[A-Z]\w+')
tokenizer.tokenize('Snips is hosting Data For Good natural language processing workshops on wednesday evening')

['Snips', 'Data', 'For', 'Good']

2. Stemming
-----------

Normalize each token by **reducing** it to its linguistic root or **stem**.

In [45]:
from nltk.stem import SnowballStemmer

In [57]:
stemmer = SnowballStemmer("english")
sentence = '''Stemming is the term used in linguistic morphology and information retrieval to describe
the process for reducing inflected words to their word stem'''
for token in word_tokenize(sentence):
    print(token + ' --> ' + stemmer.stem(token))

Stemming --> stem
is --> is
the --> the
term --> term
used --> use
in --> in
linguistic --> linguist
morphology --> morpholog
and --> and
information --> inform
retrieval --> retriev
to --> to
describe --> describ
the --> the
process --> process
for --> for
reducing --> reduc
inflected --> inflect
words --> word
to --> to
their --> their
word --> word
stem --> stem


II. Features
=======

The idea is to build a representation of each token, which can be understood and manipulated easily by a learning algorithm.

Part-Of-Speech (POS) Tagging
--------------------------

Tags meaning here --> https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [37]:
from nltk import pos_tag

In [58]:
sentence = "The cat is hunting silently in the very dark alley"
tokens = word_tokenize(sentence)
tags = pos_tag(tokens)
print tags

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('hunting', 'VBG'), ('silently', 'RB'), ('in', 'IN'), ('the', 'DT'), ('very', 'RB'), ('dark', 'JJ'), ('alley', 'NN')]


Word Embedding with Word2Vec
--------------------------

The objective is to map each word to a vector or real numbers in a relevant way. Words with similar meanings should be close in this vector space.

In [1]:
import word2vec
model = word2vec.load("GoogleNews-vectors-negative300.bin", encoding="ISO-8859-1")

In [10]:
model['good'].shape

(300,)

In [12]:
model['good'][:10].tolist()

[0.018065597862005234,
 0.02786020003259182,
 -0.007781266700476408,
 0.035042908042669296,
 0.014583073556423187,
 -0.005631895735859871,
 0.004298741929233074,
 0.05506742745637894,
 -0.009576943702995777,
 0.06790924072265625]

In [132]:
indexes, metrics = model.cosine('good', n=5)
model.generate_response(indexes, metrics).tolist()

[(u'great', 0.7291509923988669),
 (u'bad', 0.7190051300688269),
 (u'terrific', 0.6889115720662927),
 (u'decent', 0.6837348416440666),
 (u'nice', 0.6836092515280819)]

In [69]:
indexes, metrics = model.cosine('better', n=5)
model.generate_response(indexes, metrics).tolist()

[(u'stronger', 0.6623841784808244),
 (u'quicker', 0.6499592814523818),
 (u'smarter', 0.6418017961667891),
 (u'worse', 0.6248995415773693),
 (u'good', 0.6120729390511608)]

In [71]:
indexes, metrics = model.analogy(pos=['bad', 'better'], neg=['good'], n=5)
model.generate_response(indexes, metrics).tolist()

[(u'worse', 0.2644434422893838),
 (u'uglier', 0.20772927367153107),
 (u'sooner', 0.19263500501334097),
 (u'dumber', 0.18967211741489587),
 (u'differently', 0.18769694485908592)]

Term frequencies
---------------

The objective here is to identify important words in a document.

In [111]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from unidecode import unidecode
import math

In [157]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = SnowballStemmer('english')

def compute_df(filename, with_stemming):
    nb_documents = 0
    idf = {}
    line_number = 0
    for line in open(filename):
        if len(line) == 0:
            continue
        for sentence in sent_detector.tokenize(unidecode(line)):
            nb_documents+=1
            for token in set(word_tokenize(sentence)):
                stem = stemmer.stem(token)
                if stem in idf:
                    idf[stem] += 1
                else:
                    idf[stem] = 1
        line_number+=1
        if line_number % 30000 == 0:
            print(line_number)
    return nb_documents, idf

def compute_tf_idf(string, nb_documents, df, with_stemming=False):
    tokens=word_tokenize(string)
    tf_idf = {}
    if with_stemming:
        tokens = map(lambda token: stemmer.stem(token), tokens)
    frequencies = FreqDist(tokens)
    for freq in frequencies.iteritems():
        token = freq[0]
        tf_idf[token] = freq[1]*math.log(nb_documents/(1+df.get(token, 0)))
    return tf_idf

In [158]:
nb_documents, df = compute_df("raw.en/englishText_10000_20000", True)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000


In [168]:
sentence = "I recently got intoxicated after eating in a terrible restaurant with some of my friends"

tf_idf = compute_tf_idf(sentence, nb_documents, df, with_stemming=True)
[(token, score) for token, score in sorted(tf_idf.iteritems(),key=lambda item: -item[1])]

[(u'intox', 10.160452652325848),
 (u'terribl', 8.945984124827898),
 (u'restaur', 7.70210434005105),
 (u'eat', 7.649216319820633),
 (u'got', 6.949856455000773),
 ('my', 6.375024819828097),
 (u'recent', 6.210600077024653),
 (u'friend', 6.150602768446279),
 ('i', 5.087596335232384),
 (u'some', 4.770684624465665),
 (u'after', 4.343805421853684),
 (u'with', 3.091042453358316),
 ('a', 1.9459101490553132),
 ('in', 1.791759469228055),
 ('of', 1.6094379124341003)]