# sklearn demo

## First example

This example represents documents and queries as unigram TF-IDF vectors and scores each query against each document with cosine similarity.

In [1]:
# import required modules
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# init corpus
# Documents to index: one short "concept" phrase per entry.
doc_corpus = [
    "the first concept",
    "a second concept",
    "a third concept",
    "the last concept",
]

# Queries to score against the documents; query_corpus[i] embeds the phrase
# stored in doc_corpus[i].
query_corpus = [
    "question about the first concept",
    "puzzle involving a second concept",
    "query for a third concept",
    "example of the last concept",
]

In [3]:
# unigram vector representation
# The vocabulary is learned from the documents only; queries are later
# projected into this same feature space with vectorizer.transform().
vectorizer = TfidfVectorizer(min_df=1)
doc_vectors = vectorizer.fit_transform(doc_corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); call whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Single-argument print(...) calls emit the same text under Python 2 and 3.
print("Feature vector: \n%s" % (feature_names,))
# Densify only for display; doc_vectors itself stays sparse.
print("Document corpus vectors: \n%s" % (doc_vectors.toarray(),))

Feature vector: 
[u'concept', u'first', u'last', u'second', u'the', u'third']
Document corpus vectors: 
[[ 0.37919167  0.72664149  0.          0.          0.5728925   0.        ]
 [ 0.46263733  0.          0.          0.88654763  0.          0.        ]
 [ 0.46263733  0.          0.          0.          0.          0.88654763]
 [ 0.37919167  0.          0.72664149  0.          0.5728925   0.        ]]


In [4]:
# cosine similarity score
# Each query is projected into the vocabulary fitted on doc_corpus and then
# scored against every document vector.
for query in query_corpus:
    # Single-argument print(...) calls emit the same text under Python 2 and 3.
    print("Query: \n\t%s" % query)
    q_vector = vectorizer.transform([query])
    print("Query vector: \n\t%s" % (q_vector.toarray(),))
    # cosine_similarity accepts sparse input and returns a dense array by
    # default, so doc_vectors does not need to be densified on every iteration.
    scores = cosine_similarity(q_vector, doc_vectors)
    print("Cosine similarity score: \n\t%s" % (scores,))
    print("\n")

Query: 
	question about the first concept
Query vector: 
	[[ 0.37919167  0.72664149  0.          0.          0.5728925   0.        ]]
Cosine similarity score: 
	[[ 1.          0.17542822  0.17542822  0.47199214]]


Query: 
	puzzle involving a second concept
Query vector: 
	[[ 0.46263733  0.          0.          0.88654763  0.          0.        ]]
Cosine similarity score: 
	[[ 0.17542822  1.          0.2140333   0.17542822]]


Query: 
	query for a third concept
Query vector: 
	[[ 0.46263733  0.          0.          0.          0.          0.88654763]]
Cosine similarity score: 
	[[ 0.17542822  0.2140333   1.          0.17542822]]


Query: 
	example of the last concept
Query vector: 
	[[ 0.37919167  0.          0.72664149  0.          0.5728925   0.        ]]
Cosine similarity score: 
	[[ 0.47199214  0.17542822  0.17542822  1.        ]]




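As seen in the example, each question is matched against the concepts: it scores highest (1.0) on the concept document it mentions. The score is exactly 1.0 because the extra query words (e.g. "question", "about") are not in the fitted vocabulary, so they are ignored and the query vector equals the document vector.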

## Second example

This example builds the TF-IDF vectors from both unigrams and bigrams.

In [5]:
# both unigram and bigram vectors
# ngram_range=(1, 2) adds two-word features next to the single words. The
# custom token_pattern also matches one-character words, which is why 'a'
# and 'a second' appear among the features printed below.
bigram_vectorizer = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
bigram_doc_vectors = bigram_vectorizer.fit_transform(doc_corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); call whichever this installation provides.
if hasattr(bigram_vectorizer, "get_feature_names_out"):
    bigram_feature_names = bigram_vectorizer.get_feature_names_out()
else:
    bigram_feature_names = bigram_vectorizer.get_feature_names()

# Single-argument print(...) calls emit the same text under Python 2 and 3.
print("Feature vector: \n%s" % (bigram_feature_names,))
# Densify only for display; bigram_doc_vectors itself stays sparse.
print("Document corpus vectors: \n%s" % (bigram_doc_vectors.toarray(),))

Feature vector: 
[u'a', u'a second', u'a third', u'concept', u'first', u'first concept', u'last', u'last concept', u'second', u'second concept', u'the', u'the first', u'the last', u'third', u'third concept']
Document corpus vectors: 
[[ 0.          0.          0.          0.26445122  0.50676543  0.50676543
   0.          0.          0.          0.          0.39953968  0.50676543
   0.          0.          0.        ]
 [ 0.39953968  0.50676543  0.          0.26445122  0.          0.          0.
   0.          0.50676543  0.50676543  0.          0.          0.          0.
   0.        ]
 [ 0.39953968  0.          0.50676543  0.26445122  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.50676543  0.50676543]
 [ 0.          0.          0.          0.26445122  0.          0.
   0.50676543  0.50676543  0.          0.          0.39953968  0.
   0.50676543  0.          0.        ]]


In [6]:
# cosine similarity score
# Each query is projected into the unigram+bigram vocabulary fitted on
# doc_corpus and then scored against every document vector.
for query in query_corpus:
    # Single-argument print(...) calls emit the same text under Python 2 and 3.
    print("Query: \n\t%s" % query)
    q_vector = bigram_vectorizer.transform([query])
    print("Query vector: \n\t%s" % (q_vector.toarray(),))
    # cosine_similarity accepts sparse input and returns a dense array by
    # default, so bigram_doc_vectors does not need to be densified on every
    # iteration.
    scores = cosine_similarity(q_vector, bigram_doc_vectors)
    print("Cosine similarity score: \n\t%s" % (scores,))
    print("\n")

Query: 
	question about the first concept
Query vector: 
	[[ 0.          0.          0.          0.26445122  0.50676543  0.50676543
   0.          0.          0.          0.          0.39953968  0.50676543
   0.          0.          0.        ]]
Cosine similarity score: 
	[[ 1.          0.06993445  0.06993445  0.22956641]]


Query: 
	puzzle involving a second concept
Query vector: 
	[[ 0.39953968  0.50676543  0.          0.26445122  0.          0.          0.
   0.          0.50676543  0.50676543  0.          0.          0.          0.
   0.        ]]
Cosine similarity score: 
	[[ 0.06993445  1.          0.22956641  0.06993445]]


Query: 
	query for a third concept
Query vector: 
	[[ 0.39953968  0.          0.50676543  0.26445122  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.50676543  0.50676543]]
Cosine similarity score: 
	[[ 0.06993445  0.22956641  1.          0.06993445]]


Query: 
	example of the last concept
Query vector: 
	[[ 0