# sklearn demo

## First example

This example represents documents and queries as unigram count vectors and scores each query against every document using cosine similarity.

In [1]:
# import required modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# init corpus
# Documents to search over: each one names a single "concept".
doc_corpus = [
    'the first concept',
    'a second concept',
    'a third concept',
    'the last concept',
]

# Queries to score against the documents; query i embeds the text of
# document i after a few extra leading words.
query_corpus = [
    'question about the first concept',
    'puzzle involving a second concept',
    'query for a third concept',
    'example of the last concept',
]

In [3]:
# unigram vector representation
vectorizer = CountVectorizer(min_df=1)
doc_vectors = vectorizer.fit_transform(doc_corpus)
print "Feature vector: \n", vectorizer.get_feature_names()
print "Document corpus vectors: \n", doc_vectors.toarray()

Feature vector: 
[u'concept', u'first', u'last', u'second', u'the', u'third']
Document corpus vectors: 
[[1 1 0 0 1 0]
 [1 0 0 1 0 0]
 [1 0 0 0 0 1]
 [1 0 1 0 1 0]]


In [4]:
# cosine similarity score
for query in query_corpus:
    print "Query: \n\t", query
    q_vector = vectorizer.transform([query])
    print "Query vector: \n\t", q_vector.toarray()
    print "Cosine similarity score: \n\t", cosine_similarity(q_vector, doc_vectors.toarray())
    print "\n"

Query: 
	question about the first concept
Query vector: 
	[[1 1 0 0 1 0]]
Cosine similarity score: 
	[[ 1.          0.40824829  0.40824829  0.66666667]]


Query: 
	puzzle involving a second concept
Query vector: 
	[[1 0 0 1 0 0]]
Cosine similarity score: 
	[[ 0.40824829  1.          0.5         0.40824829]]


Query: 
	query for a third concept
Query vector: 
	[[1 0 0 0 0 1]]
Cosine similarity score: 
	[[ 0.40824829  0.5         1.          0.40824829]]


Query: 
	example of the last concept
Query vector: 
	[[1 0 1 0 1 0]]
Cosine similarity score: 
	[[ 0.66666667  0.40824829  0.40824829  1.        ]]




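As the scores show, each query is most similar (score 1.0) to the document that contains the same concept: query words outside the document vocabulary are ignored, so the query vector equals that document's vector.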

## Second example

This example adds bigram counts to the unigram counts, so documents and queries are compared on word pairs as well as single words.

In [5]:
# both unigram and bigram vectors
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
bigram_doc_vectors = bigram_vectorizer.fit_transform(doc_corpus)
print "Feature vector: \n", bigram_vectorizer.get_feature_names()
print "Document corpus vectors: \n", bigram_doc_vectors.toarray()

Feature vector: 
[u'a', u'a second', u'a third', u'concept', u'first', u'first concept', u'last', u'last concept', u'second', u'second concept', u'the', u'the first', u'the last', u'third', u'third concept']
Document corpus vectors: 
[[0 0 0 1 1 1 0 0 0 0 1 1 0 0 0]
 [1 1 0 1 0 0 0 0 1 1 0 0 0 0 0]
 [1 0 1 1 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 1 0 0 1 1 0 0 1 0 1 0 0]]


In [6]:
# cosine similarity score
for query in query_corpus:
    print "Query: \n\t", query
    q_vector = bigram_vectorizer.transform([query])
    print "Query vector: \n\t", q_vector.toarray()
    print "Cosine similarity score: \n\t", cosine_similarity(q_vector, bigram_doc_vectors.toarray())
    print "\n"

Query: 
	question about the first concept
Query vector: 
	[[0 0 0 1 1 1 0 0 0 0 1 1 0 0 0]]
Cosine similarity score: 
	[[ 1.   0.2  0.2  0.4]]


Query: 
	puzzle involving a second concept
Query vector: 
	[[1 1 0 1 0 0 0 0 1 1 0 0 0 0 0]]
Cosine similarity score: 
	[[ 0.2  1.   0.4  0.2]]


Query: 
	query for a third concept
Query vector: 
	[[1 0 1 1 0 0 0 0 0 0 0 0 0 1 1]]
Cosine similarity score: 
	[[ 0.2  0.4  1.   0.2]]


Query: 
	example of the last concept
Query vector: 
	[[0 0 0 1 0 0 1 1 0 0 1 0 1 0 0]]
Cosine similarity score: 
	[[ 0.4  0.2  0.2  1. ]]


