In [31]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [3]:
# Load the raw news articles; the file is read as latin-1 rather than the default UTF-8.
ARTICLES_CSV = "Articles.csv"
df = pd.read_csv(ARTICLES_CSV, encoding="latin-1")

In [4]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [5]:
# Column dtypes: every column (including Date) is loaded as a generic object/string column.
df.dtypes

Article     object
Date        object
Heading     object
NewsType    object
dtype: object

In [6]:
# Count missing values per column.
missing_mask = df.isna()
missing_mask.sum()

Article     0
Date        0
Heading     0
NewsType    0
dtype: int64

In [23]:
# Article body text: the corpus that gets vectorised below.
articles = df.loc[:, "Article"]

In [28]:
# Number of articles in the corpus.
articles.shape

(2692,)

In [22]:
# TF-IDF vectoriser constructed with the library's default settings (no arguments overridden).
tfidf = TfidfVectorizer()

In [30]:
# `vocabulary_` only exists on a fitted vectoriser. In notebook order this cell comes
# before the fit_transform cell, so on a fresh kernel (Restart & Run All) it would raise
# AttributeError. Fit on the corpus here so the cell works regardless of execution history.
tfidf.fit(articles)

# Number of distinct terms learned from the corpus.
len(tfidf.vocabulary_)

27607

In [24]:
# Learn the vocabulary from the corpus and encode each article as a TF-IDF row vector.
article_matrix = tfidf.fit_transform(articles)

In [25]:
# Show the sparse matrix summary (storage format, stored element count, shape).
article_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 463757 stored elements and shape (2692, 27607)>

In [27]:
# (n_articles, n_terms). Read the shape straight from the sparse matrix: calling
# .toarray() first would allocate the full dense matrix in memory just to report
# the same two numbers.
article_matrix.shape

(2692, 27607)

In [33]:
# Factorise the TF-IDF matrix into 100 latent components ("topics").
# random_state is pinned so that re-running the notebook reproduces the same
# factorisation (and therefore the same similarity scores) every time.
nmf = NMF(n_components=100, random_state=42)

In [37]:
# Fit the NMF model and get the per-article component weights (one row per article).
w = nmf.fit_transform(article_matrix)



In [38]:
# Scale every article's weight vector to unit L2 length, so that a plain dot
# product between two rows equals their cosine similarity.
w_norm = normalize(w, norm="l2", axis=1)

In [39]:
# Component matrix of the fitted NMF model (the second factor of the decomposition).
h = nmf.components_

In [40]:
# (n_articles, n_components)
w_norm.shape

(2692, 100)

In [41]:
# Inspect the normalised weights (NumPy abbreviates the printout of large arrays).
w_norm

array([[0.04201685, 0.0113877 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07758665, 0.13184039, 0.        , ..., 0.12976561, 0.        ,
        0.00057124],
       [0.04368365, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.08849066, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.44168498, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.35852017, 0.        , 0.00659901, ..., 0.        , 0.        ,
        0.        ]], shape=(2692, 100))

In [47]:
# Query vector: row 1 of w_norm, i.e. the SECOND article (indexing is 0-based).
article_1 = w_norm[1, :]

In [48]:
# Inspect the query article's normalised component weights.
article_1

array([7.75866549e-02, 1.31840394e-01, 0.00000000e+00, 5.11451478e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.71485478e-01, 0.00000000e+00, 2.38418211e-02,
       0.00000000e+00, 3.64321847e-03, 0.00000000e+00, 5.76679953e-06,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.50054935e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.65533809e-02,
       0.00000000e+00, 2.54580961e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.31119033e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.57083125e-02,
       0.00000000e+00, 2.42617565e-03, 1.60687220e-01, 1.93439888e-03,
       0.00000000e+00, 1.32993908e-01, 0.00000000e+00, 1.16174165e-03,
       0.00000000e+00, 4.72463043e-02, 9.82494612e-02, 4.30058801e-05,
       4.73816261e-03, 0.00000000e+00, 0.00000000e+00, 5.43932398e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [49]:
# Matrix-vector product of all article rows with the query row: one similarity
# score per article (rows were L2-normalised above, so this is cosine similarity).
similarity_with_others = w_norm @ article_1

In [50]:
# Inspect the scores; the entry at index 1 is the query article compared with itself.
similarity_with_others 

array([0.03022729, 1.        , 0.75855902, ..., 0.01170425, 0.2112389 ,
       0.10216924], shape=(2692,))

In [51]:
# Pair each article's text with its similarity score against the query article.
article_text = df["Article"]
score = similarity_with_others
data = pd.DataFrame(
    {
        "score": score,
        "article_text": article_text,
    }
)

In [57]:
# Rank all articles by similarity (highest first) and keep the five best texts.
# Note: the query article matches itself with score 1.0, so it appears in this list.
ranked = data.sort_values(by="score", ascending=False)
top_articles = ranked["article_text"].iloc[:5]

In [59]:
# Print each recommended article in full, separated by blank lines.
for recommended_text in top_articles:
    print(recommended_text)
    print("\n")

HONG KONG: Asian markets started 2015 on an upswing in limited trading on Friday, with mainland Chinese stocks surging in Hong Kong on speculation Beijing may ease monetary policy to boost slowing growth.Hong Kong rose 1.07 percent, closing 252.78 points higher at 23857.82.Seoul closed up 0.57 percent, rising 10.85 points to 1,926.44, while Sydney gained 0.46 percent, or 24.89 points, to close at 5,435.9.Singapore edged up 0.19 percent, gaining 6.39 points to 3,371.54.Markets in mainland China, Japan, Taiwan, New Zealand, the Philippines, and Thailand remained closed for holidays.With mainland bourses shut until January 5, shares in Chinese developers and financial companies surged in Hong Kong, stoked by hopes that Beijing could ease monetary policy to support lagging growth in the world´s second-largest economy.China Vanke, the country´s biggest developer by sales, leapt 10.8 percent and the People´s Insurance Company (Group) of China Ltd. was up 5.51 percent in afternoon trading.Tra