# Word frequency using "TfidfVectorizer"
<b>It transforms a list of documents into a word frequency array, which it outputs as a csr_matrix.</b>

In [2]:
# Let's import TfidVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Sample data in list
documnets = ['cats say meow', 'dogs say woof', 'dogs chase cats']

In [5]:
# intialize the "TfidfVectorizer"
tfidv = TfidfVectorizer()

In [6]:
# apply fit_transfer to tfidv
doc_fre = tfidv.fit_transform(documnets)

In [13]:
# print result using .toarray()
doc_fre.toarray()

array([[0.51785612, 0.        , 0.        , 0.68091856, 0.51785612,
        0.        ],
       [0.        , 0.        , 0.51785612, 0.        , 0.51785612,
        0.68091856],
       [0.51785612, 0.68091856, 0.51785612, 0.        , 0.        ,
        0.        ]])

In [14]:
# get the word
words = tfidv.get_feature_names()

In [12]:
words

['cats', 'chase', 'dogs', 'meow', 'say', 'woof']

# Get Word frequency from wikipedia page 

In [54]:
import requests

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [52]:
df = pd.read_csv('https://raw.githubusercontent.com/sawrupesh04/DataCamp/master/Machine%20Learning/Unsupervised-learning/dataset/Wikipedia%20articles/wikipedia-vectors.csv', index_col=0)

In [55]:
articles = csr_matrix(df.transpose())

In [56]:
titles = list(df.columns)

In [57]:
# Let's create TruncatedSVD instance
tsvd = TruncatedSVD(n_components=50)

# Let's create Kmean instance with cluster=5
kmean = KMeans(n_clusters=5)

In [59]:
# Let's create pipeline
pipeline = make_pipeline(tsvd, kmean)

In [60]:
# fit 
pipeline.fit(articles)

Pipeline(memory=None,
         steps=[('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
                              random_state=None, tol=0.0)),
                ('kmeans',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=300, n_clusters=5, n_init=10, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)

In [61]:
labels = pipeline.predict(articles)

In [67]:
# create dateframe
df = pd.DataFrame({'labels': labels, 'articles': titles})

In [68]:
print(df.sort_values('labels'))

    labels                                       articles
34       0                             Zlatan Ibrahimović
31       0                              Cristiano Ronaldo
35       0                Colombia national football team
36       0              2014 FIFA World Cup qualification
37       0                                       Football
38       0                                         Neymar
39       0                                  Franck Ribéry
33       0                                 Radamel Falcao
30       0                  France national football team
32       0                                   Arsenal F.C.
21       1                             Michael Fassbender
22       1                              Denzel Washington
23       1                           Catherine Zeta-Jones
24       1                                   Jessica Biel
25       1                                  Russell Crowe
26       1                                     Mila Kunis
20       1    