<a href="https://colab.research.google.com/github/sarvesh237/lfkdsahkhfa/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [97]:
import pandas as pd
import numpy as np
from numpy import linalg
import matplotlib.pyplot as plt

from scipy.optimize import nnls

#for lemmatization
import nltk
nltk.download("wordnet")
nltk.download('stopwords')
#to remove stop words
from nltk.corpus import stopwords

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

#gaussian mixture
from sklearn.mixture import GaussianMixture

from sklearn.decomposition import TruncatedSVD# TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Importing the collected data**

In [98]:
df_1 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv")
df_2 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv")
df_3 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv")
df_4 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv")

# **Preprocessing the csv files.**

In [99]:
del df_1['Unnamed: 0']
del df_2['index']
del df_2['Unnamed: 0']
df_2.rename(columns = {'content':'Content'}, inplace = True)
del df_3['Unnamed: 0']
del df_3['Title']
del df_4['Unnamed: 0']

**Remove capitalization, numbers, stop words, non-english characters, drop empty rows and duplicates.**

In [100]:
news_corpus = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus = news_corpus.dropna() #dropping NaN
news_corpus = news_corpus[news_corpus.Content != ''] #dropping empty rows
news_corpus = news_corpus.drop_duplicates()
news_corpus["Content"] = news_corpus['Content'].str.replace('[^\w\s]',' ')
news_corpus = news_corpus[news_corpus['Content'].map(lambda x: x.isascii())] #remove non-english #find a better way
news_corpus['Content'] = news_corpus['Content'].str.lower() #convert to lowercase
news_corpus.replace('\d+', '', regex=True, inplace=True) #remove numbers

stop_words = stopwords.words('english')
stop_words.extend(['span','class','spacing','href','html','http','title','said','that'])
news_corpus['Content'] = news_corpus['Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_corpus = news_corpus.apply(lambda x: [item for item in x if item not in stop_words])


**Lemmatization**

In [101]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

news_corpus['Content'] = news_corpus.Content.apply(lemmatize_text)
news_corpus['Content'] = news_corpus['Content'].apply(', '.join) #convert list of words to a sentence
news_corpus.replace(',', '', regex=True, inplace=True) # remove commas
news_corpus = news_corpus.reset_index(drop=True)
news_corpus


Unnamed: 0,Content
0,medium report about swedish manufacturer scani...
1,access covid vaccine cooperation technology cl...
2,after severe criticism over holding consultati...
3,former congress president rahul gandhi thursda...
4,enforcement directorate attached three immovab...
...,...
4589,over mughal gold coin dating back early centur...
4590,china planning spend tibet five year plan allo...
4591,supreme court tuesday came with solution stale...
4592,indian american maju varghese previously serve...


# TF-IDF

In [102]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
#denselist = dense.tolist() #very expensive
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aajtak,aakash,aaksha,aamir,aamk,aandolan,aandolanjivi,aane,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,aatmanirbharbharat,aatmanirbharta,aatmiyata,aatmnirbhar,aatmnirbharta,aawam,aaya,...,zhao,zhoigar,zhuoran,ziarul,zila,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,ziyad,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,ztdrktlic,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **LSA : Latent Semantic Analysis**

In [103]:
sv_dec = TruncatedSVD(n_components = 50)  # components(no. of latent factors) < n ; n : no. of docs.
ls_analysis = sv_dec.fit_transform(vectors.T)

sv_dec.singular_values_ # eigenvalues for svd (in decreasing order)

array([13.49665211,  8.19753288,  6.65195647,  6.45052751,  6.13464354,
        5.38660288,  5.15440856,  4.58017184,  4.37335448,  4.2780365 ,
        4.22043377,  4.1236323 ,  4.0632226 ,  4.0383047 ,  3.93007173,
        3.9126631 ,  3.78092851,  3.73162349,  3.6800745 ,  3.6193513 ,
        3.56642955,  3.48141999,  3.40813528,  3.3692762 ,  3.31079839,
        3.29485928,  3.26959259,  3.22718497,  3.14340629,  3.13810491,
        3.09471736,  3.06753421,  2.99802963,  2.98252856,  2.95974143,
        2.94536408,  2.92308606,  2.85707278,  2.80786934,  2.79095077,
        2.7765547 ,  2.76196705,  2.73873558,  2.72680156,  2.70687299,
        2.66853837,  2.6567974 ,  2.63560407,  2.60529831,  2.58774209])

In [104]:
variance =  sv_dec.singular_values_/sum(sv_dec.singular_values_) # checking the variance captured by eigenvalues
variance

array([0.06982043, 0.0424072 , 0.03441168, 0.03336965, 0.03173553,
       0.02786579, 0.02666461, 0.02369399, 0.02262409, 0.02213099,
       0.02183301, 0.02133224, 0.02101973, 0.02089082, 0.02033092,
       0.02024086, 0.01955937, 0.01930431, 0.01903764, 0.01872351,
       0.01844973, 0.01800996, 0.01763085, 0.01742983, 0.01712731,
       0.01704486, 0.01691415, 0.01669476, 0.01626136, 0.01623394,
       0.01600949, 0.01586886, 0.01550931, 0.01542912, 0.01531123,
       0.01523686, 0.01512161, 0.01478011, 0.01452558, 0.01443805,
       0.01436358, 0.01428812, 0.01416793, 0.0141062 , 0.0140031 ,
       0.01380479, 0.01374406, 0.01363442, 0.01347764, 0.01338682])

In [105]:
sv_dec = TruncatedSVD(n_components = 25) # components with accepted variance
las = sv_dec.fit_transform(vectors)

#print(las,las.shape)

Topic - Document matrix

In [106]:
# Representation of each doc in terms of latent topics after dimensionality reduction
col = []
for i in range(las.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df = pd.DataFrame(las, columns = col)

topic_df["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

display(topic_df[l])


Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish manufacturer scani...,0.129416,-0.031666,-0.002058,0.016605,-0.058607,-0.036438,-0.018041,0.010050,0.001868,-0.019372,0.012664,0.013280,0.007438,-0.051521,0.006927,-0.014012,0.069864,-0.026699,-0.010756,0.021812,0.014998,0.006407,0.036542,-0.067759,-0.020104
1,access covid vaccine cooperation technology cl...,0.241852,-0.033736,0.076414,0.116287,-0.063081,-0.107510,-0.079364,-0.060769,-0.053590,-0.139505,-0.009936,-0.105339,0.030463,0.043142,-0.104947,0.016516,0.117557,0.049204,0.014813,-0.011267,0.026371,-0.025765,0.078259,0.104987,0.061162
2,after severe criticism over holding consultati...,0.187095,-0.047152,-0.003864,0.027564,-0.070198,-0.018174,-0.070887,-0.018233,-0.001448,-0.023247,-0.013666,-0.021586,0.017834,-0.001031,0.029915,-0.082822,0.027540,-0.071610,-0.018281,-0.002505,0.082835,0.036118,0.154253,-0.175393,0.038409
3,former congress president rahul gandhi thursda...,0.177060,-0.026037,0.029295,0.010229,-0.039884,-0.040926,-0.059154,0.027677,-0.032572,-0.082365,0.102244,-0.067387,-0.008977,-0.007876,0.026333,0.015575,0.056736,-0.035127,-0.005263,0.038353,0.032116,0.017332,0.028867,-0.024747,0.029983
4,enforcement directorate attached three immovab...,0.134256,-0.048095,-0.066812,-0.030234,-0.111708,0.012899,0.045303,0.013744,0.053604,-0.023226,-0.002070,0.027286,-0.039002,-0.090547,0.039099,-0.004836,0.099150,-0.019609,-0.037162,0.012991,-0.070213,-0.030764,-0.021960,-0.099468,-0.046365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,over mughal gold coin dating back early centur...,0.086212,-0.022274,-0.033224,-0.015101,-0.062881,-0.042085,0.091904,0.036300,0.021548,-0.010066,0.003670,0.001553,-0.013426,-0.038579,0.003760,-0.035780,0.005773,-0.012861,0.020065,-0.006979,0.025123,0.008134,0.004880,0.006797,-0.018818
4590,china planning spend tibet five year plan allo...,0.123348,-0.013562,0.004560,0.000890,-0.026666,-0.072727,-0.033887,-0.033634,-0.018257,-0.023684,-0.066347,-0.000012,0.026591,-0.018268,-0.012336,-0.007417,0.044445,0.001792,-0.016605,0.003226,0.025897,-0.019723,-0.026303,0.014331,0.010079
4591,supreme court tuesday came with solution stale...,0.178528,-0.066109,-0.074735,-0.003266,-0.120676,0.127836,-0.102559,-0.017978,0.033943,-0.019551,-0.024884,0.011380,0.004560,-0.013514,0.029952,0.006285,0.020824,0.015690,0.014373,-0.003805,0.034863,0.016672,-0.050000,-0.034759,-0.013485
4592,indian american maju varghese previously serve...,0.080712,-0.038111,0.006444,0.013016,-0.024379,-0.039534,-0.024926,0.021061,0.019019,-0.051308,-0.049723,-0.038660,-0.011536,0.059215,-0.026464,-0.059757,0.043048,0.007680,0.021817,0.036786,0.012264,-0.021420,-0.010631,0.063626,-0.046910


**Visualizing the topics**

In [107]:
'''import umap
import matplotlib.pyplot as plt

embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )
plt.show()'''

'import umap\nimport matplotlib.pyplot as plt\n\nembedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)\n\nplt.figure(figsize=(7,5))\nplt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )\nplt.show()'

**Topics and terms list**

In [108]:
Sigma = sv_dec.singular_values_ 
V_transpose = sv_dec.components_.T
terms = vectorizer.get_feature_names()

for index, component in enumerate(sv_dec.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)
del Sigma,V_transpose,terms,zipped,top_terms_key

Topic 0:  ['said', 'that', 'covid', 'case', 'will', 'farmer', 'have', 'state', 'with', 'minister']
Topic 1:  ['covid', 'vaccine', 'case', 'death', 'vaccination', 'health', 'total', 'reported', 'dose', 'active']
Topic 2:  ['election', 'assembly', 'party', 'vaccine', 'congress', 'seat', 'will', 'poll', 'minister', 'bengal']
Topic 3:  ['vaccine', 'vaccination', 'farmer', 'dose', 'first', 'covid', 'modi', 'dos', 'law', 'covaxin']
Topic 4:  ['farmer', 'law', 'farm', 'protest', 'congress', 'price', 'protesting', 'death', 'party', 'hour']
Topic 5:  ['court', 'vaccine', 'justice', 'supreme', 'vaccination', 'election', 'dose', 'petition', 'bench', 'hearing']
Topic 6:  ['police', 'vaccine', 'arrested', 'dose', 'farmer', 'vaccination', 'accused', 'bengal', 'west', 'delhi']
Topic 7:  ['sabha', 'price', 'congress', 'rajya', 'budget', 'house', 'party', 'opposition', 'session', 'parliament']
Topic 8:  ['lakh', 'price', 'sabha', 'from', 'fuel', 'rajya', 'budget', 'crore', 'petrol', 'bengal']
Topic 9: 

# **LDA : Latent Dirichlet Allocation**

In [109]:
#lda

lda = LatentDirichletAllocation(n_components=25)
lda_matrix = lda.fit_transform(df_vecs)
lda_topics = lda.fit(df_vecs)
for idx, topic in enumerate(lda.components_):
        print ("Topic ", idx, " ".join(feature_names[i] for i in topic.argsort()[:-10 - 1:-1]))

Topic  0 gelatin milk ganjhu ammk pamela cocaine goswami alaknanda scorpio joke
Topic  1 jarkiholi adhikari chowdhury coal suvendu league ramesh manchester abhishek brigade
Topic  2 ambani vaze hiren mansukh mukesh explosive sachin ansari deshmukh fadnavis
Topic  3 muraleedharan sloganeering sanction juster assaulted liquified terminal fisherman brijendra inaugurate
Topic  4 dalai lama cbdt epfo slaf mody sidhu allen pregnant extension
Topic  5 bandh mauritius edhi kevadia spectrum ashwin nigerian marshal axar kotdwar
Topic  6 nifty pujari khandwa nand dowry condoled medanta saddened psaki sensex
Topic  7 haasan vijayawada tral antigen fisherman unaccounted needhi rapid katihar skiing
Topic  8 maninderjit hoisted khempreet monkey subscription chakka mandsaur burari ombudsman lakhan
Topic  9 mufti whatsapp harry prince meghan goswami royal ceraweek khan interview
Topic  10 tandav blast leak gallantry nirani pampore solar explosive mining posthumously
Topic  11 minister will said that st

In [110]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [111]:
#plot_top_words(lda, feature_names, 10, "t")

In [112]:
col = []
for i in range(lda_matrix.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df_2 = pd.DataFrame(lda_matrix, columns = col)

topic_df_2["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

#display(topic_df_2[l])

**Cosine Similarity**

In [113]:
cos_sim = pd.DataFrame(cosine_similarity(topic_df.iloc[:, :-1]))
cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,1.000000,0.481869,0.781898,0.733791,0.740156,0.349853,0.319120,0.459730,0.372357,0.557769,0.754624,0.110479,0.300891,0.256913,0.267843,0.145527,0.624484,0.440916,0.689252,0.403022,0.447295,0.671660,0.606980,0.633459,0.329534,0.428797,0.331382,0.689356,0.720028,0.645600,0.239458,0.301579,0.525316,0.216048,0.362699,0.315235,0.314279,0.410137,0.529257,0.560953,...,0.479127,0.567417,0.501027,0.149200,0.483580,0.735615,0.483885,0.618147,0.503166,0.659888,0.886091,0.543393,0.647759,0.435482,0.624945,0.444960,0.496506,0.678656,0.518940,0.542045,0.713863,0.278564,0.570336,0.596124,0.772860,0.586481,0.588931,0.490405,0.479299,0.610823,0.611973,0.376158,0.551378,0.472142,0.289541,0.556713,0.622695,0.538106,0.325021,0.120086
1,0.481869,1.000000,0.391739,0.669163,0.069171,0.172461,0.292125,0.279491,0.212501,0.530814,0.483561,0.086451,0.131289,0.140844,0.281629,0.193308,0.893548,0.854921,0.354196,0.150066,0.245893,0.206136,0.199593,0.328056,0.218831,0.294961,0.235627,0.434868,0.200173,0.482127,0.192468,0.403270,0.254507,0.056804,0.547483,0.109891,0.180070,0.514432,0.319374,0.153765,...,0.493727,0.405626,0.515536,0.035274,0.632524,0.704776,0.477156,0.384458,0.345788,0.722332,0.410373,0.621078,0.691147,0.535178,0.538774,0.151359,0.312906,0.476719,0.447010,0.446694,0.481036,0.103994,0.189873,0.357995,0.548462,0.479063,0.418238,0.622784,0.544718,0.535888,0.509247,0.274160,0.331647,0.255163,0.138634,0.186191,0.706456,0.258252,0.633022,0.170633
2,0.781898,0.391739,1.000000,0.619873,0.398953,0.317252,0.334916,0.484109,0.265314,0.463497,0.662093,0.144501,0.198032,0.289061,0.316839,0.181032,0.561868,0.438389,0.703608,0.275936,0.314877,0.585472,0.331870,0.607700,0.314095,0.369380,0.281028,0.605275,0.530100,0.596392,0.212428,0.386014,0.328689,0.221845,0.326552,0.261189,0.255859,0.249397,0.631409,0.307568,...,0.310950,0.450408,0.425495,0.037780,0.287335,0.518122,0.493895,0.686442,0.458147,0.550275,0.775408,0.435848,0.724203,0.430818,0.543255,0.250952,0.538505,0.702077,0.283246,0.462578,0.562035,0.183964,0.604337,0.635212,0.586949,0.420277,0.492222,0.408987,0.417696,0.577788,0.573637,0.274265,0.471226,0.465595,0.289757,0.319313,0.432377,0.475155,0.194522,0.104843
3,0.733791,0.669163,0.619873,1.000000,0.365138,0.322507,0.292347,0.471377,0.576907,0.674688,0.703720,0.222029,0.059517,0.275350,0.276736,0.183605,0.694070,0.549483,0.555971,0.193756,0.614069,0.398869,0.300056,0.617217,0.372977,0.559910,0.166031,0.466494,0.443375,0.478579,0.262323,0.331631,0.391844,0.251368,0.478745,0.259822,0.491285,0.465417,0.471814,0.301086,...,0.446734,0.416967,0.543673,0.137655,0.542455,0.791290,0.467039,0.647580,0.486674,0.656238,0.753623,0.433766,0.718839,0.530563,0.680788,0.288186,0.565056,0.703641,0.443122,0.624960,0.647833,0.285704,0.548597,0.614881,0.644953,0.675844,0.554844,0.527155,0.459202,0.573485,0.624147,0.621247,0.243066,0.518863,0.335410,0.331455,0.494561,0.423869,0.374359,0.043589
4,0.740156,0.069171,0.398953,0.365138,1.000000,0.460146,0.162532,0.431259,0.295715,0.182427,0.442443,0.100731,0.249799,0.259696,0.269549,0.207170,0.266901,0.083303,0.628179,0.397594,0.324459,0.527232,0.687251,0.411302,0.217704,0.293865,0.163073,0.582511,0.668539,0.485214,0.345466,0.278490,0.636350,0.381637,0.062711,0.533574,0.233060,0.180641,0.397742,0.727508,...,0.266434,0.474739,0.327937,0.055367,0.152632,0.505191,0.225958,0.249619,0.309258,0.383254,0.558220,0.237106,0.214389,0.116388,0.317014,0.487592,0.209673,0.238008,0.324829,0.251440,0.528307,0.182256,0.239407,0.354633,0.560079,0.259950,0.366921,0.154551,0.247694,0.306026,0.351980,0.183869,0.499264,0.285767,0.244766,0.594975,0.353689,0.565264,0.150231,0.198516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.556713,0.186191,0.319313,0.331455,0.594975,0.246678,0.145782,0.454685,0.300540,0.190997,0.685987,0.118248,0.321813,0.100035,0.225430,0.067707,0.329789,0.213977,0.435959,0.231368,0.322034,0.526817,0.466431,0.495284,0.297956,0.564464,0.119643,0.420276,0.602645,0.320926,0.061253,0.152820,0.724041,0.081389,0.233942,0.298123,0.241147,0.368855,0.304165,0.398714,...,0.306056,0.254376,0.432437,0.196417,0.328504,0.660826,0.384782,0.391168,0.301244,0.503067,0.621255,0.454148,0.421837,0.361383,0.411246,0.396005,0.264955,0.346356,0.464907,0.409466,0.625536,0.197116,0.552591,0.368126,0.425561,0.402183,0.578943,0.288642,0.415689,0.328034,0.464928,0.268691,0.371511,0.327768,0.146564,1.000000,0.356379,0.259562,0.341503,0.227742
4590,0.622695,0.706456,0.432377,0.494561,0.353689,0.216024,0.266800,0.346346,0.244252,0.366697,0.650536,0.139673,0.446356,0.149042,0.312463,0.194385,0.800830,0.663822,0.490757,0.617227,0.258031,0.492535,0.500305,0.422626,0.196467,0.308078,0.546467,0.670101,0.499060,0.637650,0.223714,0.457220,0.369577,0.061859,0.598402,0.179214,0.194606,0.451289,0.455098,0.322111,...,0.727433,0.538728,0.573605,0.210292,0.671875,0.601539,0.691839,0.440677,0.440434,0.723202,0.475344,0.837321,0.674128,0.637123,0.566878,0.460373,0.420836,0.528300,0.604010,0.499557,0.558665,0.202601,0.215997,0.424937,0.754441,0.456157,0.599147,0.740564,0.692263,0.759047,0.638600,0.302750,0.751351,0.286452,0.318732,0.356379,1.000000,0.422256,0.583569,0.269229
4591,0.538106,0.258252,0.475155,0.423869,0.565264,0.890066,0.177280,0.652809,0.315357,0.283840,0.565364,0.154891,0.233539,0.726169,0.488580,0.612295,0.435175,0.244975,0.711009,0.388545,0.328220,0.411661,0.585197,0.462252,0.262735,0.260170,0.267065,0.634401,0.479800,0.477152,0.888449,0.616279,0.369696,0.824328,0.283007,0.862199,0.276073,0.204879,0.781362,0.322119,...,0.461798,0.833297,0.324995,0.099046,0.366525,0.440773,0.415221,0.413695,0.363155,0.479697,0.487194,0.350158,0.416247,0.382768,0.441872,0.520445,0.368514,0.465035,0.440302,0.418230,0.536861,0.191099,0.290420,0.772955,0.559851,0.392291,0.460746,0.416655,0.436399,0.478943,0.588406,0.264333,0.528808,0.329742,0.265314,0.259562,0.422256,1.000000,0.291272,0.203858
4592,0.325021,0.633022,0.194522,0.374359,0.150231,0.191660,0.067878,0.324401,0.351140,0.147767,0.418410,0.084237,0.090084,0.223951,0.549593,0.225391,0.717688,0.642273,0.446752,0.228746,0.321742,0.323890,0.202826,0.378259,0.565460,0.521399,0.139077,0.487220,0.299802,0.509477,0.266968,0.318639,0.236231,0.087692,0.280129,0.164211,0.100312,0.325857,0.318216,0.115304,...,0.457275,0.547168,0.534205,0.209645,0.767978,0.487564,0.394457,0.250577,0.262255,0.683016,0.355210,0.509852,0.425899,0.453418,0.450945,0.280629,0.284013,0.311501,0.702610,0.565974,0.472051,0.119781,0.361013,0.290317,0.428622,0.647874,0.469392,0.477627,0.517827,0.439467,0.445205,0.335914,0.309193,0.252203,0.191805,0.341503,0.583569,0.291272,1.000000,0.170044


# **Creating Users and initial ratings**

**Generating Users**

In [114]:
#fitting GMM to column "topic_i"

num_cols_topic_df = topic_df.shape[1]-1
gm = []

for i in range(num_cols_topic_df):
  gm.append(GaussianMixture(n_components=10).fit(topic_df.iloc[:, [i]]))

In [115]:
#generating gmm based topic values for each topic for 50 users
#24 x 50 matrix

Users = []

for i in range(50):
  temp = []
  for i in range(num_cols_topic_df):
    temp.append(gm[i].sample(1)[0][0][0])
  Users.append(temp)
Users=np.array(Users)
Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

In [116]:
#run the above cell before running this twice
slc = list(range(topic_df.shape[1]))
slc.remove(25)
temp_topic_df = topic_df.iloc[:, slc]
Users_temp = Users.T.set_index(temp_topic_df.T.index)

result = temp_topic_df.dot(Users_temp);
#result.T

**Generating User ratings** 




In [117]:
rank_matrix = result.rank().T/4593*10
rank_matrix = rank_matrix.round(0).astype(int)
rank_matrix.columns =np.linspace(0,4593,4594).astype(int)

#not every reader reads all news
#remove random elements from the rank matrix
for i in range(4594):
  random_entries = np.random.randint(0,50,25)
  rank_matrix.loc[random_entries,i] = "No rating"
rank_matrix = rank_matrix.apply(pd.to_numeric, errors='coerce')
rank_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,1.0,,1.0,6.0,,9.0,1.0,5.0,5.0,,4.0,0.0,0.0,4.0,,10.0,,6.0,3.0,2.0,4.0,,,1.0,,1.0,1.0,2.0,1.0,3.0,10.0,9.0,10.0,9.0,6.0,10.0,4.0,,7.0,5.0,...,4.0,,4.0,6.0,5.0,,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,,2.0,4.0,3.0,,4.0,,,2.0,,,2.0,4.0,4.0,3.0,4.0,3.0,,,,9.0,2.0,,,,6.0
1,,7.0,,,,2.0,8.0,4.0,,,5.0,7.0,,1.0,2.0,,6.0,8.0,,,9.0,7.0,7.0,,9.0,,2.0,,7.0,,2.0,4.0,,,,,8.0,5.0,4.0,3.0,...,0.0,,2.0,1.0,3.0,,0.0,,,1.0,,0.0,1.0,,0.0,1.0,3.0,,,2.0,2.0,,1.0,,0.0,2.0,,,1.0,1.0,1.0,,1.0,,4.0,4.0,,,4.0,4.0
2,,9.0,6.0,,,,9.0,4.0,9.0,,6.0,,3.0,2.0,,4.0,,8.0,3.0,4.0,10.0,5.0,7.0,3.0,7.0,7.0,4.0,,,,1.0,5.0,3.0,3.0,,,9.0,6.0,,,...,2.0,4.0,5.0,,,3.0,,2.0,3.0,,,,2.0,2.0,1.0,1.0,2.0,4.0,4.0,6.0,4.0,2.0,2.0,3.0,2.0,,,3.0,2.0,,,,4.0,9.0,,4.0,4.0,,,8.0
3,,5.0,5.0,,,8.0,1.0,6.0,,2.0,5.0,9.0,,4.0,,,,,,5.0,7.0,6.0,,,7.0,5.0,2.0,,7.0,,9.0,7.0,10.0,,,,6.0,,6.0,5.0,...,,,,,,,0.0,0.0,1.0,,,1.0,1.0,0.0,,2.0,2.0,1.0,2.0,1.0,3.0,3.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,,1.0,0.0,,,7.0,,1.0
4,6.0,9.0,8.0,,,5.0,5.0,4.0,,2.0,,,2.0,1.0,3.0,1.0,8.0,,5.0,7.0,,5.0,7.0,4.0,10.0,,1.0,2.0,,,5.0,,10.0,,5.0,6.0,,,,3.0,...,1.0,,1.0,0.0,4.0,1.0,0.0,,,1.0,,,1.0,0.0,0.0,0.0,,1.0,2.0,2.0,3.0,0.0,,1.0,1.0,2.0,1.0,1.0,1.0,,1.0,,3.0,6.0,3.0,4.0,3.0,7.0,4.0,2.0
5,5.0,3.0,8.0,3.0,,,,4.0,9.0,1.0,6.0,10.0,7.0,,,3.0,,4.0,,,,8.0,,5.0,,9.0,5.0,,8.0,3.0,,3.0,,,6.0,7.0,,,,2.0,...,1.0,4.0,2.0,,4.0,1.0,,,1.0,,,1.0,1.0,,,2.0,,,3.0,3.0,3.0,,,,1.0,2.0,2.0,1.0,,1.0,1.0,2.0,5.0,,7.0,3.0,5.0,7.0,4.0,
6,3.0,,4.0,3.0,3.0,3.0,9.0,2.0,8.0,,,10.0,6.0,,3.0,,4.0,,2.0,7.0,9.0,5.0,7.0,,5.0,,,2.0,,,6.0,,,4.0,,4.0,6.0,,,,...,0.0,1.0,1.0,,1.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,,0.0,,,,,,0.0,0.0,,,,1.0,,0.0,1.0,0.0,2.0,,5.0,,1.0,3.0,,1.0,
7,6.0,,6.0,8.0,8.0,,1.0,8.0,,,,0.0,,6.0,,9.0,7.0,,8.0,9.0,10.0,6.0,,5.0,,10.0,,,8.0,3.0,,,,9.0,,,,6.0,6.0,3.0,...,2.0,7.0,3.0,2.0,,3.0,2.0,2.0,3.0,,,2.0,3.0,,,2.0,2.0,4.0,5.0,4.0,4.0,4.0,,,3.0,,,,2.0,3.0,2.0,,,5.0,3.0,,5.0,8.0,7.0,1.0
8,6.0,6.0,,6.0,8.0,10.0,10.0,7.0,4.0,,,1.0,7.0,6.0,,,5.0,5.0,7.0,5.0,,,10.0,5.0,1.0,4.0,4.0,4.0,,,10.0,8.0,,,5.0,10.0,,,5.0,3.0,...,,,2.0,2.0,,,1.0,1.0,3.0,2.0,4.0,1.0,2.0,,,,0.0,,2.0,2.0,4.0,3.0,,2.0,,2.0,,1.0,1.0,,2.0,,5.0,5.0,2.0,5.0,3.0,,1.0,9.0
9,,,,8.0,,1.0,,,,7.0,5.0,2.0,1.0,2.0,,,9.0,,4.0,0.0,9.0,4.0,,4.0,,7.0,1.0,2.0,,4.0,4.0,4.0,,,7.0,1.0,4.0,5.0,5.0,,...,2.0,4.0,4.0,2.0,6.0,3.0,,,2.0,3.0,,,4.0,2.0,1.0,1.0,,3.0,3.0,4.0,3.0,1.0,,2.0,,,,,3.0,,2.0,,,,7.0,1.0,6.0,3.0,,5.0


# **Content based recommender function**

In [118]:
def content_recommender(rank_matrix,cos_sim):
  
  #find top 5 ratings
  top_five = rank_matrix.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=5)

  #best two similar docs for every doc
  best_two = cos_sim.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=3)
  best_two.columns =['0', '1','2']
  del best_two['0']
  best_two['combined']= best_two.values.tolist()
  best_two
  best_two_dict_1 = best_two['1'].to_dict()
  best_two_dict_2 = best_two['2'].to_dict()

  selected_docs_1 = top_five.replace(best_two_dict_1)
  selected_docs_2 = top_five.replace(best_two_dict_2)
  selected_docs = pd.concat([selected_docs_1,selected_docs_2],axis=1,ignore_index=True)
  #display(selected_docs)

  #replace value by the actual news
  doc_dict = news_corpus.Content.to_dict()

  selected_docs = selected_docs.replace(doc_dict)
  return selected_docs

# **Top 10 news : Content based recommender.**

In [119]:
selected_docs_content = content_recommender(rank_matrix,cos_sim)
selected_docs_content

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,supreme court friday urged government provide ...,supreme court initiative begin hybrid physical...,with this total number arrest made connection ...,taken over probe into identical pils filed all...,sushil batranew delhi india march hour after c...,petition supreme court challenged entry woman ...,order resolve rising number cheque bounce case...,delhi police made more arrest fort violence ca...,panaji india march hotel cctv footage will dem...,delhi india march delhi court will pronounce j...
1,congress leader rahul gandhi sunday compared o...,delhi india march cumulative number covid vacc...,delhi india march indian medical association m...,india inoculated lakh healthcare worker tuesda...,national investigation agency last week examin...,urging centre make agri law prestige issue sen...,delhi india march cumulative number covid vacc...,thiruvananthapuram kerala india march kerala c...,total crore covid vaccine dos have been admini...,national investigation agency summoned around ...
2,siddharth sharmanew delhi india march congress...,former congress president rahul gandhi tuesday...,aiman khannew delhi india march bharatiya jana...,delhi india march bharatiya janta party tuesda...,delhi india march congress leader party former...,face between political rival bharatiya janata ...,distancing himself from jammu meet leader seni...,guwahati assam india february rashtriya janta ...,siddharth sharmanew delhi india march congress...,leader jyotiraditya scindia quit congress last...
3,with this total number arrest made connection ...,nearly three woman worldwide subjected physica...,woman likely play major role ensuring that pro...,amaravati andhra pradesh india march andhra pr...,congress leader rahul gandhi sunday compared o...,delhi police made more arrest fort violence ca...,covid pandemic negatively affected woman incom...,delhi reuters thousand woman joined protest fa...,delhi india march national commission woman fr...,urging centre make agri law prestige issue sen...
4,rajya sabha chairman venkaiah naidu tuesday as...,with this total number arrest made connection ...,petrol price tuesday neared litre mark nationa...,delhi india march rajya sabha been adjourned a...,tirath singh rawat been sworn chief minister u...,several opposition party including congress ha...,delhi police made more arrest fort violence ca...,shailesh yadavnew delhi india march surging di...,delhi india march rajya sabha adjourned till m...,dehradun uttarakhand india march predicting ch...
5,daily covid case india registered increase fou...,daily coronavirus covid case fell below third ...,eighteen state including assam rajasthan odish...,maharashtra kerala punjab tamil nadu gujarat k...,india total tally covid case surged with infec...,daily coronavirus covid case fell below third ...,daily covid case india registered increase fou...,covid fatality have been reported state union ...,maharashtra kerala punjab tamil nadu gujarat w...,daily infection fell below fourth time this mo...
6,daily covid case india registered increase fou...,daily coronavirus covid case fell below third ...,delhi india march union minister piyush goyal ...,delhi india march bharat biotech serum institu...,delhi india march cumulative number covid vacc...,daily coronavirus covid case fell below third ...,daily covid case india registered increase fou...,delhi india march prime minister narendra modi...,court said rather selfish petitioner seek vacc...,delhi india march cumulative number covid vacc...
7,siddharth sharmanew delhi india march congress...,delhi india march aadmi party rajya sabha sanj...,congress monday demanded rollback increase pri...,delhi india march rajya sabha chairman venkaia...,delhi india march rajya sabha chairman venkaia...,face between political rival bharatiya janata ...,first budget session parliament obituary notic...,cornering government spiralling fuel price con...,rajya sabha chairman venkaiah naidu urged chai...,delhi india march sabha speaker birla monday s...
8,supreme court thursday asked group young lawye...,assembly election trinamool called photo certi...,delhi india march national investigation agenc...,supreme court initiative begin hybrid physical...,taken over probe into identical pils filed all...,educated youngster showing forward reduce cast...,covid vaccination certificate poll bound state...,vikasnagar dehradun india march case registere...,order resolve rising number cheque bounce case...,panaji india march hotel cctv footage will dem...
9,british parliament monday discussed farmer pro...,congress leader rahul gandhi sunday compared o...,farmer leader rakesh tikait monday took union ...,sonipat palwal haryana india march farmer have...,group gandhian organisation called farmer move...,high commission india london condemned debate ...,urging centre make agri law prestige issue sen...,more than faculty member various educational i...,meerut uttar pradesh india february extending ...,shiromani akali monday asked central governmen...


# **Collaborative recommender function: Predict missing ratings using Matrix factorization**

In [120]:
#collaborative recommender function

def collaborative_recommender(rank_matrix,num_iter,news_corpus):
  A = np.array(rank_matrix)
  M = A.shape[0]
  N = A.shape[1]
  A_df = pd.DataFrame(A)
  K = 25
  W = np.abs(np.random.uniform(low=0, high=1, size=(M, K)))
  H = np.abs(np.random.uniform(low=0, high=1, size=(K, N)))
  W = np.divide(W, K*W.max())
  H = np.divide(H, K*H.max())

  #cost func
  def cost(A, W, H):
    mask = pd.DataFrame(A).notnull().values
    WH = np.dot(W, H)
    WH_mask = WH[mask]
    A_mask = A[mask]
    A_WH_mask = A_mask-WH_mask
    return linalg.norm(A_WH_mask, 2)
  
  #optimization loop
  num_display_cost = max(int(num_iter/10), 1)

  for i in range(num_iter):
      if i%2 ==0:
          # Learn H, given A and W
          for j in range(N):
              mask_rows = pd.Series(A[:,j]).notnull()
              H[:,j] = nnls(W[mask_rows], A[:,j][mask_rows])[0]
      else:
          for j in range(M):
              mask_rows = pd.Series(A[j,:]).notnull()
              W[j,:] = nnls(H.transpose()[mask_rows], A[j,:][mask_rows])[0]
              
      WH = np.dot(W, H)
      c = cost(A, W, H)
      #if i%num_display_cost==0:
      print(i, c)
  W = pd.DataFrame(W)
  H = pd.DataFrame(H)
  A = W.dot(H)
  
  #new rank matrix
  A = A.round(0).astype(int)
  A = np.clip(A,1,10)
  #finding the top 10 documents
  selected_docs = A.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=10)
  selected_docs.columns =['D1', 'D2', 'D3', 'D4', 'D5','D6','D7','D8','D9','D10']
  doc_dict = news_corpus.Content.to_dict()
  selected_docs = selected_docs.replace(doc_dict)
  return selected_docs

# **Top 10 news : Collaborative recommender.**

In [121]:
selected_docs_collab = collaborative_recommender(rank_matrix,num_iter,news_corpus)
selected_docs_collab

0 726.2268657461658
1 582.4669510476019
2 479.03908530658276
3 423.16275918689195
4 387.2697093191862
5 362.9311655281337
6 345.35855726791095
7 331.6910201787042
8 321.7109034448361
9 313.7376772432454


Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10
0,supreme court wednesday asked government respo...,constitution bench chief justice india sharad ...,supreme court wednesday sought response from c...,person including year assaulted policeman duty...,supreme court said march would hear march plea...,exercise dustlik between india uzbekistan from...,court delhi march convicted ariz khan allegedl...,supreme court asked centre much financial liab...,supreme court initiative begin hybrid physical...,andhra pradesh government orally stated suprem...
1,person including year assaulted policeman duty...,high commission india london condemned debate ...,maintaining that agricultural law internal mat...,india resolute response border helped positive...,prime minister narendra modi will address conf...,ministry home affair asked national investigat...,describing contentious farm law death warrant ...,narendra modi prime minister poor farmer charg...,india thursday said conceded territory part di...,national investigation agency sought detail so...
2,congress always stood internal discussion ther...,prime minister narendra modi monday said celeb...,crisis over continuation uttarakhand chief min...,leader jyotiraditya scindia quit congress last...,congress changing selects candidate assembly e...,centre friday high level committee planning co...,week after show strength jammu section group d...,former congress president rahul gandhi tuesday...,senior congress leader adhir ranjan chowdhury ...,distancing himself from jammu meet leader seni...
3,person including year assaulted policeman duty...,union home ministry monday said crore been san...,national investigation agency taken over case ...,congress general secretary priyanka gandhi vad...,covid fatality have been reported state union ...,andhra pradesh government orally stated suprem...,delhi high court march asked centre explain ra...,maharashtra kerala punjab tamil nadu gujarat k...,maharashtra kerala punjab tamil nadu gujarat k...,with several state country continuing report i...
4,congress always stood internal discussion ther...,third both house parliament could barely busin...,inclusion india united state latest strategy r...,high commission india london condemned debate ...,union home ministry monday said crore been san...,leader opposition rajya sabha mallikarjun khar...,rajya sabha chairman venkaiah naidu march said...,finance minister nirmala sitharaman lashed thi...,describing contentious farm law death warrant ...,jammu kashmir pradesh congress committee jkpcc...
5,india recorded highest daily tally covid case ...,association democratic reform represented advo...,crisis over continuation uttarakhand chief min...,case coronavirus infection india were recorded...,india covid tally rose with over fresh case be...,covid fatality have been reported state union ...,election commission india friday expressed ful...,maharashtra kerala punjab tamil nadu gujarat k...,india total tally covid case rose with infecti...,former congress president rahul gandhi tuesday...
6,india recorded highest daily tally covid case ...,congress always stood internal discussion ther...,leader opposition rajya sabha mallikarjun khar...,case coronavirus infection india were recorded...,prime minister narendra modi march said poor n...,india covid tally rose with over fresh case be...,congress changing selects candidate assembly e...,india total tally covid case rose with infecti...,former congress president rahul gandhi tuesday...,senior congress leader adhir ranjan chowdhury ...
7,supreme court wednesday asked government respo...,congress always stood internal discussion ther...,terming death mohan delkar strike dignity parl...,supreme court wednesday sought response from c...,participation congress member meeting standing...,high commission india london condemned debate ...,international woman which celebrated with much...,noted classical dancer nominated rajya sabha m...,union home ministry monday said crore been san...,thick with tension army soldier surrounded kas...
8,petition been filed supreme court highlighting...,health ministry tuesday informed election comm...,informed supreme court wednesday that mamata b...,constitution bench chief justice india sharad ...,supreme court said march would hear march plea...,senior congress leader saifuddin tuesday terme...,supreme court tuesday agreed request made cent...,delhi prime minister narendra modi tuesday exp...,government said amendment introduced right inf...,supreme court which hearing case through video...
9,access covid vaccine cooperation technology cl...,close heel plan united nation regional confere...,discussion farm law british parliament amount ...,crisis over continuation uttarakhand chief min...,congress general secretary priyanka gandhi vad...,agriculture minister narendra singh tomar satu...,hundred farmer including woman held blockade f...,farmer agitation over farm law completed day s...,congress changing selects candidate assembly e...,farmer protest completed day border delhi seni...


# **Hydrid recommender**

In [124]:
def hybrid(collaborative_recommender,content_recommender,rank_matrix,cos_sim,news_corpus):
  selected_docs_collab = collaborative_recommender(rank_matrix,10,news_corpus)
  selected_docs_content = content_recommender(rank_matrix,cos_sim)
  selected_docs_collab.reset_index(drop=True, inplace=True)
  selected_docs_content.reset_index(drop=True, inplace=True)
  selected_docs_final = pd.concat([selected_docs_content.iloc[:,[0,1,2,3,4]],selected_docs_collab.iloc[:,[5,6,7,8,9]]],axis=1)

  return selected_docs_final

# **Top 10 news : Hybrid recommender.**

In [125]:
selected_docs_final = hybrid(collaborative_recommender,content_recommender,rank_matrix,cos_sim,news_corpus)
selected_docs_final

0 736.611815553904
1 593.4377576229934
2 480.6891834837512
3 421.06330199363
4 383.2711618297497
5 359.81629028503346
6 343.5259183419325
7 331.05963986321905
8 321.3779448685254
9 313.28192652770997


Unnamed: 0,0,1,2,3,4,D6,D7,D8,D9,D10
0,supreme court friday urged government provide ...,supreme court initiative begin hybrid physical...,with this total number arrest made connection ...,taken over probe into identical pils filed all...,sushil batranew delhi india march hour after c...,supreme court initiative begin hybrid physical...,andhra pradesh government orally stated suprem...,constitution bench supreme court thursday prop...,delhi high court march asked centre explain ra...,uttar pradesh government scoffed punjab argume...
1,congress leader rahul gandhi sunday compared o...,delhi india march cumulative number covid vacc...,delhi india march indian medical association m...,india inoculated lakh healthcare worker tuesda...,national investigation agency last week examin...,health minister harsh vardhan took first dose ...,national investigation agency sought detail so...,large part medium today functioning propaganda...,agitating farmer union february objected delhi...,congress general secretary priyanka gandhi vad...
2,siddharth sharmanew delhi india march congress...,former congress president rahul gandhi tuesday...,aiman khannew delhi india march bharatiya jana...,delhi india march bharatiya janta party tuesda...,delhi india march congress leader party former...,leader jyotiraditya scindia quit congress last...,prime minister narendra modi march said poor n...,congress changing selects candidate assembly e...,week after show strength jammu section group d...,external affair minister jaishankar will visit...
3,with this total number arrest made connection ...,nearly three woman worldwide subjected physica...,woman likely play major role ensuring that pro...,amaravati andhra pradesh india march andhra pr...,congress leader rahul gandhi sunday compared o...,year farmer from haryana hisar district march ...,case coronavirus infection india were recorded...,india covid tally rose with over fresh case be...,andhra pradesh government orally stated suprem...,constitution bench supreme court thursday prop...
4,rajya sabha chairman venkaiah naidu tuesday as...,with this total number arrest made connection ...,petrol price tuesday neared litre mark nationa...,delhi india march rajya sabha been adjourned a...,tirath singh rawat been sworn chief minister u...,high commission india london condemned debate ...,multiple adjournment motion marred working raj...,leader opposition rajya sabha mallikarjun khar...,rajya sabha chairman venkaiah naidu march said...,finance minister nirmala sitharaman lashed thi...
5,daily covid case india registered increase fou...,daily coronavirus covid case fell below third ...,eighteen state including assam rajasthan odish...,maharashtra kerala punjab tamil nadu gujarat k...,india total tally covid case surged with infec...,maharashtra kerala punjab tamil nadu gujarat k...,maharashtra kerala punjab tamil nadu gujarat k...,maharashtra kerala punjab tamil nadu gujarat w...,with several state country continuing report i...,covid vaccination session would scheduled this...
6,daily covid case india registered increase fou...,daily coronavirus covid case fell below third ...,delhi india march union minister piyush goyal ...,delhi india march bharat biotech serum institu...,delhi india march cumulative number covid vacc...,india total tally covid case rose with infecti...,cumulative number covid vaccine dos administer...,india covid case rose with infection being rep...,nand kumar singh chauhan sabha member from kha...,india covid tally surpassed crore march with i...
7,siddharth sharmanew delhi india march congress...,delhi india march aadmi party rajya sabha sanj...,congress monday demanded rollback increase pri...,delhi india march rajya sabha chairman venkaia...,delhi india march rajya sabha chairman venkaia...,former congress president rahul gandhi tuesday...,trinamool congress mahua moitra served notice ...,leader congress sabha adhir ranjan chowdhury m...,participation congress member meeting standing...,price domestic cooking doubled cylinder last s...
8,supreme court thursday asked group young lawye...,assembly election trinamool called photo certi...,delhi india march national investigation agenc...,supreme court initiative begin hybrid physical...,taken over probe into identical pils filed all...,supreme court tuesday agreed request made cent...,supreme court initiative begin hybrid physical...,covid fatality have been reported state union ...,finance minister nirmala sitharaman lashed thi...,election commission india friday expressed ful...
9,british parliament monday discussed farmer pro...,congress leader rahul gandhi sunday compared o...,farmer leader rakesh tikait monday took union ...,sonipat palwal haryana india march farmer have...,group gandhian organisation called farmer move...,high commission india london condemned debate ...,crisis over continuation uttarakhand chief min...,congress general secretary priyanka gandhi vad...,conveying strong appreciation resolute dedicat...,agriculture minister narendra singh tomar satu...


# **User Profile Updater**

In [77]:
def user_profile_updater(userid,articles_read,time_spent,rank_matrix):
  #updated code here
  
  rank_matrix = updated_rank_matrix
  return updated_rank_matrix

# **Implement ALS based matrix factorization instead of NNLS**

In [None]:
'''from pyspark.ml.recommendation import ALS 
from pyspark.sql.types import FloatType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

X_train, X_test = rank_matrix.randomSplit([0.6, 0.4])'''

In [None]:
'''als = mlALS(rank=5, maxIter=10, seed=0)
model = als.fit(rank_matrix)'''