In [1]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

data = '''Time flies like an arrow
Fruit flies like a banana
Cat sat on the mat
The cat is white.'''

t0 = time()
dataset = data.split('\n')

In [2]:
dataset

['Time flies like an arrow',
 'Fruit flies like a banana',
 'Cat sat on the mat',
 'The cat is white.']

In [4]:
dataset

['Time flies like an arrow', 'Fruit flies like a banana', 'Cat sat on the mat']

In [3]:
#Get TF-IDFs.
print("Extracting tf-idf features...")
#First we initiate an empty tfidf object with specific conditions
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))#max_df=0.95, min_df=2, stop_words='english' #USE HELP TO SEE WHAT EACH DOES)
t0 = time()
#Next we give the data for processing
tfidf = tfidf_vectorizer.fit_transform(dataset)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features...
done in 0.016s.


In [5]:
tfidf.data

array([ 0.35657982,  0.28113163,  0.28113163,  0.35657982,  0.35657982,
        0.35657982,  0.28113163,  0.35657982,  0.35657982,  0.32555709,
        0.32555709,  0.32555709,  0.41292788,  0.41292788,  0.41292788,
        0.41292788,  0.27460308,  0.34829919,  0.34829919,  0.27460308,
        0.34829919,  0.34829919,  0.34829919,  0.34829919,  0.34829919,
        0.31553666,  0.31553666,  0.40021825,  0.40021825,  0.40021825,
        0.40021825,  0.40021825])

In [6]:
dense = tfidf.todense()
dense.shape
print(dense)

[[ 0.35657982  0.35657982  0.35657982  0.          0.          0.          0.
   0.28113163  0.28113163  0.          0.          0.          0.
   0.28113163  0.35657982  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.35657982  0.35657982
   0.        ]
 [ 0.          0.          0.          0.41292788  0.          0.          0.
   0.32555709  0.32555709  0.41292788  0.41292788  0.          0.
   0.32555709  0.          0.41292788  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.27460308  0.
   0.34829919  0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.34829919  0.34829919  0.34829919
   0.34829919  0.34829919  0.27460308  0.          0.34829919  0.          0.
   0.        ]
 [ 0.          0.          0.          0.          0.31553666  0.4002

In [20]:
feature_names = tfidf_vectorizer.get_feature_names()
print(len(feature_names))
feature_names[:3]

27


[u'an', u'an arrow', u'arrow']

In [21]:
import pandas as pd
x = pd.DataFrame(dense)
x.columns = tfidf_vectorizer.get_feature_names()
x['text'] = dataset
x.to_csv('mytfidf.csv', index = False)
x

Unnamed: 0,an,an arrow,arrow,banana,cat,cat is,cat sat,flies,flies like,fruit,...,on the,sat,sat on,the,the cat,the mat,time,time flies,white,text
0,0.35658,0.35658,0.35658,0.0,0.0,0.0,0.0,0.281132,0.281132,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.35658,0.35658,0.0,Time flies like an arrow
1,0.0,0.0,0.0,0.412928,0.0,0.0,0.0,0.325557,0.325557,0.412928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fruit flies like a banana
2,0.0,0.0,0.0,0.0,0.274603,0.0,0.348299,0.0,0.0,0.0,...,0.348299,0.348299,0.348299,0.274603,0.0,0.348299,0.0,0.0,0.0,Cat sat on the mat
3,0.0,0.0,0.0,0.0,0.315537,0.400218,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.315537,0.400218,0.0,0.0,0.0,0.400218,The cat is white.


### Doc Similarity

Given a new query, how to find out which document is it closest to?

In [7]:
new = 'Time flies like Sam'
response = tfidf_vectorizer.transform([new])

In [8]:
dense_response = response.todense()

In [9]:
pd.DataFrame(dense_response, columns=x.columns[0:27])

Unnamed: 0,an,an arrow,arrow,banana,cat,cat is,cat sat,flies,flies like,fruit,...,on,on the,sat,sat on,the,the cat,the mat,time,time flies,white
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401043,0.401043,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.508672,0.508672,0.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

map(lambda x: cosine_similarity(response, x), dense)

[array([[ 0.70100165]]),
 array([[ 0.39168692]]),
 array([[ 0.]]),
 array([[ 0.]])]

====================================
### Using logistic regression to predict closest document
Logistic regression is not used for these types of problems, but let us use it to learn the syntax
* We already have our dataframe with 4x36 shape.
* We create the target attribute which would be unique to each document
* Then create an empty logistic regression object with
```python
model = LogisticRegression()```
* We train the ```model``` using the following method and syntax
```python
model.fit(X, y)``` 
** Here X is our dataframe (tfidf matrix) and y is our list of ground truth labels **
* The prediction is done using the following method on ```model```
```python
model.predict(response)```

In [11]:
truth = [1,2,3,4]

In [12]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [13]:
model.fit(dense, truth)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
model.predict_proba(response)

array([[ 0.30489197,  0.26112455,  0.21699174,  0.21699174]])

In [16]:
model.predict(response)

array([1])