In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# A tf-idf word-frequency array 

In [18]:
# Toy corpus: three short documents
documents = ['cats say meow', 'dogs say woof', 'dogs chase cats']

# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()

# Fit the vectorizer on the documents and encode them in one step: csr_mat
csr_mat = tfidf.fit_transform(documents)

# Print the dense form of the result (fine here because the corpus is tiny)
print(csr_mat.toarray())

# Get the words, in the same order as the matrix columns: words
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a plain list of str to keep the printed output unchanged.
words = tfidf.get_feature_names_out().tolist()

# Print words
print(words)

[[0.51785612 0.         0.         0.68091856 0.51785612 0.        ]
 [0.         0.         0.51785612 0.         0.51785612 0.68091856]
 [0.51785612 0.68091856 0.51785612 0.         0.         0.        ]]
['cats', 'chase', 'dogs', 'meow', 'say', 'woof']


# Clustering Wikipedia articles

In [3]:
# Load the Wikipedia vectors CSV from the working directory: df
df = pd.read_csv(filepath_or_buffer='wikipedia-vectors.csv')

In [4]:
# Save the transposed table to a second CSV file ...
df.transpose().to_csv(path_or_buf='wikipedia-vectors2.csv')

# ... and read that file back: articles
# NOTE(review): later cells rely on the 'Unnamed: 0' column that this
# write/read round trip produces, so the round trip is kept as is.
articles = pd.read_csv(filepath_or_buffer='wikipedia-vectors2.csv')

In [5]:
# Both steps below are stochastic (randomized SVD solver, random k-means
# initialisation). Without a fixed random_state the cluster numbering can
# change on every run, so both estimators are seeded.

# Create a TruncatedSVD instance that keeps 50 components: svd
svd = TruncatedSVD(n_components=50, random_state=42)

# Create a KMeans instance with 6 clusters: kmeans
# n_init is set explicitly: 10 was the long-standing default, but newer
# scikit-learn releases changed the default (and warn when it is left unset).
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)

# Chain dimensionality reduction and clustering: pipeline
pipeline = make_pipeline(svd, kmeans)

In [6]:
# Preview the first five rows of the reloaded table
articles.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,13115,13116,13117,13118,13119,13120,13121,13122,13123,13124
0,HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
3,HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [7]:
# Pull out the 'Unnamed: 0' column as a Series: titles
titles = articles['Unnamed: 0']

In [8]:
# Everything except the 'Unnamed: 0' column, as a NumPy array: articles1
articles1 = articles.drop(columns=['Unnamed: 0']).to_numpy()

In [9]:
# Train the pipeline on the article array
pipeline.fit(articles1)

# Assign each article to a cluster: labels
labels = pipeline.predict(articles1)

# Pair every cluster label with its article title: df
df = pd.DataFrame(dict(label=labels, article=titles))

# Show the pairs ordered by cluster label
print(df.sort_values(by='label'))

                                          article  label
29                               Jennifer Aniston      0
28                                  Anne Hathaway      0
27                                 Dakota Fanning      0
26                                     Mila Kunis      0
25                                  Russell Crowe      0
24                                   Jessica Biel      0
23                           Catherine Zeta-Jones      0
22                              Denzel Washington      0
21                             Michael Fassbender      0
20                                 Angelina Jolie      0
0                                        HTTP 404      1
4                                   Google Search      1
1                                  Alexa Internet      1
2                               Internet Explorer      1
3                                     HTTP cookie      1
5                                          Tumblr      1
6                     Hypertext

# NMF applied to Wikipedia articles

In [10]:
# Create an NMF instance with 6 components: model
# random_state is fixed so that repeated runs give the same factorisation;
# a later cell selects a component by its row number, which is only
# meaningful if the component order is repeatable.
model = NMF(n_components=6, random_state=42)

# Fit the model to articles
model.fit(articles1)

# Transform the articles with the fitted model: nmf_features
nmf_features = model.transform(articles1)

# Print the NMF features
print(nmf_features)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.40469565e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 5.66610790e-01]
 [3.82027256e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 3.98650734e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 3.81743552e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.85522092e-01]
 [1.29281754e-02 1.37894214e-02 7.76340609e-03 3.34457763e-02
  0.00000000e+00 3.34525964e-01]
 [0.00000000e+00 0.00000000e+00 2.06748288e-02 0.00000000e+00
  6.04386412e-03 3.59064382e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.90981792e-01]
 [1.54262982e-02 1.42821549e-02 3.76643718e-03 2.37093104e-02
  2.62574982e-02 4.80779509e-01]
 [1.11730893e-02 3.13687747e-02 3.09493836e-02 6.56946174e-02
  1.96642608e-02 3.38292872e-01]
 [0.00000000e+00 0.00000000e+00 5.30734104e-01 0.0

# NMF features of the Wikipedia articles

In [11]:
# Wrap the NMF features in a DataFrame indexed by title: df
df = pd.DataFrame(data=nmf_features, index=titles)

# Print the feature row of each article of interest, in this order
for title in ('Anne Hathaway', 'Denzel Washington'):
    print(df.loc[title])

0    0.003845
1    0.000000
2    0.000000
3    0.575666
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005601
2    0.000000
3    0.422347
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


In [12]:
# Preview the first five rows of the NMF feature table
df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.44047
Alexa Internet,0.0,0.0,0.0,0.0,0.0,0.566611
Internet Explorer,0.00382,0.0,0.0,0.0,0.0,0.398651
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.381744
Google Search,0.0,0.0,0.0,0.0,0.0,0.485522


In [13]:
# Load the vocabulary file, taking its first line as the header: df
df = pd.read_csv('wikipedia_vocabulary_utf8.txt', header=0)

# Keep the 'Words' column on its own: Words
Words = df.loc[:, 'Words']

# Show (rows, columns) as the cell output
df.shape

(13125, 1)

In [14]:
# One row per NMF component, one column per vocabulary word: components_df
components_df = pd.DataFrame(data=model.components_, columns=Words)

# Report (rows, columns) of the component table
print(components_df.shape)

# Take the row at position 3: component
component = components_df.iloc[3]

# Show the five largest entries of that row (five is nlargest's default)
print(component.nlargest(n=5))

(6, 13125)
Words
film       0.627925
award      0.253151
starred    0.245303
role       0.211467
actress    0.186412
Name: 3, dtype: float64


In [15]:
# Preview the first five rows of the component table
components_df.head(n=5)

Words,aaron,abandon,abandoned,abandoning,abandonment,abbas,abbey,abbreviated,abbreviation,abc,...,zealand,zenith,zeppelin,zero,zeus,zimbabwe,zinc,zone,zones,zoo
0,0.011376,0.00121,0.0,0.001739,0.000136,0.0,0.0,0.002463,2.446296e-07,0.000834,...,0.025782,0.0,0.008324,0.0,0.0,0.0,0.0,0.0,0.000424,0.0
1,0.0,1e-05,0.005663,0.0,2e-06,0.0,0.0,0.000566,0.0005002454,0.0,...,0.008106,0.0,0.0,0.00171,0.0,0.0,0.0,0.002813,0.000297,0.0
2,0.0,8e-06,0.0,0.0,0.004691,0.0,0.0,0.000758,1.604213e-05,0.0,...,0.00873,0.0,0.0,0.001317,0.0,0.0,0.0,0.0,0.000143,0.0
3,0.004148,0.0,0.003056,0.0,0.000614,0.0,0.0,0.002436,8.14389e-05,0.003985,...,0.012595,0.0,0.0,0.0,0.0,0.0,0.0,0.001742,0.00672,0.0
4,0.0,0.000568,0.004919,0.0,0.0,0.0,0.0,8.9e-05,4.26041e-05,0.0,...,0.00181,0.0,0.0,1.7e-05,0.0,0.0,0.0,0.000192,0.001352,0.0


# Cosine similarity to find similar articles

In [16]:
# Scale each row of the NMF features to unit L2 length: norm_features
norm_features = normalize(nmf_features, norm='l2')

# Build a title-indexed DataFrame whose index is named 'titles': df
df = pd.DataFrame(data=norm_features, index=titles).rename_axis('titles')

# Preview the first five rows
df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5
titles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTTP 404,0.0,0.0,0.0,0.0,0.0,1.0
Alexa Internet,0.0,0.0,0.0,0.0,0.0,1.0
Internet Explorer,0.009583,0.0,0.0,0.0,0.0,0.999954
HTTP cookie,0.0,0.0,0.0,0.0,0.0,1.0
Google Search,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Pick out the feature row for 'Cristiano Ronaldo': article
article = df.loc['Cristiano Ronaldo']

# Dot every row of df with that row: similarities
# (for rows of unit length the dot product is the cosine similarity)
similarities = df @ article

# Show the five titles with the largest similarity
print(similarities.nlargest(n=5))

titles
Cristiano Ronaldo                1.000000
Franck Ribéry                    0.999972
Radamel Falcao                   0.999942
Zlatan Ibrahimović               0.999942
France national football team    0.999923
dtype: float64
