<h1>TEXT to tensors</h1>

<h3>Tokenisation</h3>

In [1]:
import string
import numpy as np

In [2]:
BEGIN_SYMBOL = '^'
END_SYMBOL = '$'
BLANK = ' '
CHAR_SET = set(string.ascii_lowercase + BEGIN_SYMBOL + END_SYMBOL + BLANK)
CHAR_NUM = len(CHAR_SET)
CHAR_TO_INDICES = {c:i for i, c in enumerate(CHAR_SET)}
INDICES_TO_CHAR = {i:c for c, i in CHAR_TO_INDICES.iteritems()}

In [3]:
CHAR_TO_INDICES

{' ': 0,
 '$': 1,
 '^': 2,
 'a': 3,
 'b': 5,
 'c': 4,
 'd': 7,
 'e': 6,
 'f': 9,
 'g': 8,
 'h': 11,
 'i': 10,
 'j': 13,
 'k': 12,
 'l': 15,
 'm': 14,
 'n': 17,
 'o': 16,
 'p': 19,
 'q': 18,
 'r': 21,
 's': 20,
 't': 23,
 'u': 22,
 'v': 25,
 'w': 24,
 'x': 27,
 'y': 26,
 'z': 28}

In [4]:
INDICES_TO_CHAR

{0: ' ',
 1: '$',
 2: '^',
 3: 'a',
 4: 'c',
 5: 'b',
 6: 'e',
 7: 'd',
 8: 'g',
 9: 'f',
 10: 'i',
 11: 'h',
 12: 'k',
 13: 'j',
 14: 'm',
 15: 'l',
 16: 'o',
 17: 'n',
 18: 'q',
 19: 'p',
 20: 's',
 21: 'r',
 22: 'u',
 23: 't',
 24: 'w',
 25: 'v',
 26: 'y',
 27: 'x',
 28: 'z'}

In [5]:
def vectorize(word, seq_len, vec_size):
    vec = np.zeros((seq_len, vec_size), dtype=int)
    for i, ch in enumerate(word):
        vec[i, CHAR_TO_INDICES[ch]] = 1

    for i in range(len(word), seq_len):
        vec[i, CHAR_TO_INDICES[END_SYMBOL]] = 1

    return vec

In [8]:
say = 'ewqwr qerqewr qrwrqwr qwrwqrqw rqwrqw rqwrqwr qwrwqrq wqrwqr qwrqwrqw'
X = np.zeros((1,150, 29), dtype=int)
Word = BEGIN_SYMBOL + say.lower().strip() + END_SYMBOL
X[0] = vectorize(Word, 150, 29)
X.shape

(1, 150, 29)

In [17]:
X[0][6]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [18]:
print ''.join([INDICES_TO_CHAR[i]
               for i in X[0].argmax(axis=1)
               if INDICES_TO_CHAR[i] not in (BEGIN_SYMBOL,END_SYMBOL)])

ewqwr qerqewr qrwrqwr qwrwqrqw rqwrqw rqwrqwr qwrwqrq wqrwqr qwrqwrqw


<h3>CountVectorizer</h3>

In [118]:
from sklearn.feature_extraction.text import CountVectorizer

In [120]:
vectorizer = CountVectorizer(min_df=1)
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [122]:
corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',
]

In [129]:
x = vectorizer.fit_transform(corpus)
x

<4x9 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [127]:
vectorizer.get_feature_names()

[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']

In [130]:
x.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

<h1>tf - idf</h1>

In [39]:
import math
from textblob import TextBlob as tb

def tf(word, blob):
    return (float)(blob.words.count(word)) / (float)(len(blob.words))  # frequency of word in sentence

def n_containing(word, bloblist):
    return (float)(sum(1 for blob in bloblist if word in blob)) # no. of word in corpus

def idf(word, bloblist):
    return (float)(math.log(len(bloblist)) / (float)(1 + n_containing(word, bloblist))) # idf 

def tfidf(word, blob, bloblist):
    return (float)((float)(tf(word, blob)) * (float)(idf(word, bloblist))) # tf * idf

In [88]:
document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
 California and Malibu, California. Python was followed by two sequels: Python
 II (2002) and Boa vs. Python (2004), both also made-for-TV films.""")

document2 = tb("""Python, from the Greek word, is a genus of
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are
recognised.[2] A member of this genus, P. reticulatus, is among the longest
snakes known.""")

document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum".[1] It was first introduced
in 1955, the same year as Smith &amp; Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made.""")

In [None]:
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

In [103]:
doc1 = [word for word in document1.words]
doc2 = [word for word in document2.words]
doc3 = [word for word in document3.words]
#bloblist = [doc1, doc2, doc3]

In [97]:
bloblist = [document1, document2, document3]

In [114]:
len(bloblist)

3

In [100]:
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:5]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
	Word: The, TF-IDF: 0.01801
	Word: python, TF-IDF: 0.01501
	Word: A, TF-IDF: 0.01501
	Word: films, TF-IDF: 0.01351
	Word: the, TF-IDF: 0.01351
Top words in document 2
	Word: genus, TF-IDF: 0.03052
	Word: A, TF-IDF: 0.02034
	Word: among, TF-IDF: 0.01526
	Word: snakes, TF-IDF: 0.01526
	Word: is, TF-IDF: 0.01526
Top words in document 3
	Word: The, TF-IDF: 0.02469
	Word: Magnum, TF-IDF: 0.01852
	Word: revolver, TF-IDF: 0.01852
	Word: Colt, TF-IDF: 0.01852
	Word: the, TF-IDF: 0.01852


In [112]:
doc3_vec = [scores[w] for w in doc3] 

In [113]:
doc3_vec # sentence in vector form

[0.024687916599283367,
 0.018515937449462522,
 0.009257968724731261,
 0.006171979149820842,
 0.006171979149820842,
 0.006171979149820842,
 0.018515937449462522,
 0.006171979149820842,
 0.018515937449462522,
 0.006171979149820842,
 0.006171979149820842,
 0.0041146527665472275,
 0.018515937449462522,
 0.012343958299641683,
 0.006171979149820842,
 0.006171979149820842,
 0.003085989574910421,
 0.006171979149820842,
 0.006171979149820842,
 0.008229305533094455,
 0.006171979149820842,
 0.006171979149820842,
 0.006171979149820842,
 0.0041146527665472275,
 0.01645861106618891,
 0.006171979149820842,
 0.006171979149820842,
 0.018515937449462522,
 0.0041146527665472275,
 0.008229305533094455,
 0.0041146527665472275,
 0.006171979149820842,
 0.006171979149820842,
 0.003085989574910421,
 0.006171979149820842,
 0.018515937449462522,
 0.006171979149820842,
 0.006171979149820842,
 0.01645861106618891,
 0.006171979149820842,
 0.006171979149820842,
 0.006171979149820842,
 0.012343958299641683,
 0.006171

In [48]:
word_count = {word:(tf(word, document1)*100) for word in document1.words}

In [49]:
word_count

{'13th': 0.819672131147541,
 '2000': 0.819672131147541,
 '2002': 0.819672131147541,
 '2004': 0.819672131147541,
 'A': 4.098360655737705,
 'Angeles': 0.819672131147541,
 'Barron': 0.819672131147541,
 'Boa': 0.819672131147541,
 'Bowe': 0.819672131147541,
 'California': 1.639344262295082,
 'Casper': 0.819672131147541,
 'Clabaugh': 0.819672131147541,
 'Coogan': 0.819672131147541,
 'Dana': 0.819672131147541,
 'David': 0.819672131147541,
 'Dien': 0.819672131147541,
 'Elm': 0.819672131147541,
 'Englund': 0.819672131147541,
 'Freddy': 0.819672131147541,
 'Friday': 0.819672131147541,
 'II': 0.819672131147541,
 'It': 1.639344262295082,
 'Jenny': 0.819672131147541,
 'Karate': 0.819672131147541,
 'Keith': 0.819672131147541,
 'Kid': 0.819672131147541,
 'Krueger': 0.819672131147541,
 'Los': 0.819672131147541,
 'Malibu': 0.819672131147541,
 'McCarthy': 0.819672131147541,
 'Nightmare': 0.819672131147541,
 'Python': 4.098360655737705,
 'Richard': 0.819672131147541,
 'Robert': 0.819672131147541,
 'Sean'

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["This is very strange",
          "This is very nice"]
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
print dict(zip(vectorizer.get_feature_names(), idf))

{u'this': 1.0, u'very': 1.0, u'is': 1.0, u'strange': 1.4054651081081644, u'nice': 1.4054651081081644}


In [117]:
X.toarray()

array([[ 0.44832087,  0.        ,  0.63009934,  0.44832087,  0.44832087],
       [ 0.44832087,  0.63009934,  0.        ,  0.44832087,  0.44832087]])