# Word frequency using "TfidfVectorizer"
<b>It transforms a list of documents into a word frequency array, which it outputs as a csr_matrix.</b>

In [17]:
# Let's import TfidVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Sample data in list
documnets = ['cats say meow', 'dogs say woof', 'dogs chase cats']

In [23]:
# intialize the "TfidfVectorizer"
tfidv = TfidfVectorizer()

In [24]:
# apply fit_transfer to tfidv
doc_fre = tfidv.fit_transform(documnets)

In [25]:
# print result using .toarray()
doc_fre.toarray()

array([[0.51785612, 0.        , 0.        , 0.68091856, 0.51785612,
        0.        ],
       [0.        , 0.        , 0.51785612, 0.        , 0.51785612,
        0.68091856],
       [0.51785612, 0.68091856, 0.51785612, 0.        , 0.        ,
        0.        ]])

In [26]:
# get the word
words = tfidv.get_feature_names()

In [27]:
words

['cats', 'chase', 'dogs', 'meow', 'say', 'woof']

# Get Word frequency from wikipedia page 

In [28]:
import requests

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [29]:
df = pd.read_csv('https://raw.githubusercontent.com/sawrupesh04/DataCamp/master/Machine%20Learning/Unsupervised-learning/dataset/Wikipedia%20articles/wikipedia-vectors.csv', index_col=0)

In [30]:
articles = csr_matrix(df.transpose())

In [31]:
titles = list(df.columns)

In [32]:
# Let's create TruncatedSVD instance
tsvd = TruncatedSVD(n_components=50)

# Let's create Kmean instance with cluster=5
kmean = KMeans(n_clusters=5)

In [33]:
# Let's create pipeline
pipeline = make_pipeline(tsvd, kmean)

In [34]:
# fit 
pipeline.fit(articles)

Pipeline(memory=None,
         steps=[('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
                              random_state=None, tol=0.0)),
                ('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=5, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)

In [35]:
labels = pipeline.predict(articles)

In [36]:
# create dateframe
df = pd.DataFrame({'labels': labels, 'articles': titles})

In [37]:
print(df.sort_values('labels'))

    labels                                       articles
29       0                               Jennifer Aniston
28       0                                  Anne Hathaway
27       0                                 Dakota Fanning
26       0                                     Mila Kunis
25       0                                  Russell Crowe
24       0                                   Jessica Biel
23       0                           Catherine Zeta-Jones
22       0                              Denzel Washington
21       0                             Michael Fassbender
20       0                                 Angelina Jolie
0        1                                       HTTP 404
9        1                                       LinkedIn
2        1                              Internet Explorer
3        1                                    HTTP cookie
4        1                                  Google Search
5        1                                         Tumblr
6        1    

### Let's reduce the dimension of articles using NMF
<b>Get info about <a href='https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/'>NMF</a></b>

In [38]:
from sklearn.decomposition import NMF

In [39]:
# Create a NMF model
model = NMF(n_components=6)

In [40]:
# fit the model
model.fit(articles)

# transform the feature nmf_features
nmf_features = model.transform(articles)

In [41]:
# print the nmf_features
nmf_features

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.40519498e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 5.66674144e-01],
       [3.82024669e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.98695604e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.81786828e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.85576056e-01],
       [1.29280793e-02, 1.37894216e-02, 7.76328904e-03, 3.34410240e-02,
        0.00000000e+00, 3.34563100e-01],
       [0.00000000e+00, 0.00000000e+00, 2.06743204e-02, 0.00000000e+00,
        6.04397024e-03, 3.59105336e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.91036326e-01],
       [1.54261929e-02, 1.42821634e-02, 3.76635149e-03, 2.37058370e-02,
        2.62581725e-02, 

In [42]:
print('Articles Shape :        >>>    ', articles.shape)
print('NMF Features Shape :    >>>    ', nmf_features.shape)

Articles Shape :        >>>     (60, 13125)
NMF Features Shape :    >>>     (60, 6)


We can clearly see the dimension reduction.

In [43]:
# create a pandas dataframe 
df = pd.DataFrame(nmf_features, index=titles)

In [44]:
df.head()

Unnamed: 0,0,1,2,3,4,5
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.440519
Alexa Internet,0.0,0.0,0.0,0.0,0.0,0.566674
Internet Explorer,0.00382,0.0,0.0,0.0,0.0,0.398696
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.381787
Google Search,0.0,0.0,0.0,0.0,0.0,0.485576


In [45]:
print(df.loc['Anne Hathaway'])
print('----------------------')
print(df.loc['Denzel Washington'])

0    0.003845
1    0.000000
2    0.000000
3    0.575582
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
----------------------
0    0.000000
1    0.005601
2    0.000000
3    0.422286
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


The NMF feature 3 by far has highest value.

### Which article are similar to 'Cristiano Ronaldo'?

In [46]:
# necessary imports
from sklearn.preprocessing import normalize

In [48]:
norm_feature = normalize(nmf_features)

In [49]:
# create a dataframe
new_df = pd.DataFrame(norm_feature, index=titles)

In [51]:
# Select the 'Cristiano Ronaldo' article
article = new_df.loc['Cristiano Ronaldo']

In [52]:
# compute the dot product 
similarity = new_df.dot(article)

In [55]:
# Get the larest similar articles
similarity.nlargest()

Cristiano Ronaldo                1.000000
Franck Ribéry                    0.999972
Radamel Falcao                   0.999942
Zlatan Ibrahimović               0.999942
France national football team    0.999923
dtype: float64