<a href="https://colab.research.google.com/github/sarvesh237/lfkdsahkhfa/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [34]:
import pandas as pd
import numpy as np
from numpy import linalg
import matplotlib.pyplot as plt

from scipy.optimize import nnls

#for lemmatization
import nltk
nltk.download("wordnet")
nltk.download('stopwords')
#to remove stop words
from nltk.corpus import stopwords

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

#gaussian mixture
from sklearn.mixture import GaussianMixture

from sklearn.decomposition import TruncatedSVD# TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Importing the collected data**

In [35]:
df_1 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv")
df_2 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv")
df_3 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv")
df_4 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv")

# **Preprocessing the csv files.**

In [36]:
del df_1['Unnamed: 0']
del df_2['index']
del df_2['Unnamed: 0']
df_2.rename(columns = {'content':'Content'}, inplace = True)
del df_3['Unnamed: 0']
del df_3['Title']
del df_4['Unnamed: 0']

**Remove capitalization, numbers, stop words, non-english characters, drop empty rows and duplicates.**

In [37]:
news_corpus = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus = news_corpus.dropna() #dropping NaN
news_corpus = news_corpus[news_corpus.Content != ''] #dropping empty rows
news_corpus = news_corpus.drop_duplicates()
news_corpus["Content"] = news_corpus['Content'].str.replace('[^\w\s]',' ')
news_corpus = news_corpus[news_corpus['Content'].map(lambda x: x.isascii())] #remove non-english #find a better way
news_corpus['Content'] = news_corpus['Content'].str.lower() #convert to lowercase
news_corpus.replace('\d+', '', regex=True, inplace=True) #remove numbers

stop_words = stopwords.words('english')
stop_words.extend(['span','class','spacing','href','html','http','title','said','that'])
news_corpus['Content'] = news_corpus['Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_corpus = news_corpus.apply(lambda x: [item for item in x if item not in stop_words])


**Lemmatization**

In [38]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

news_corpus['Content'] = news_corpus.Content.apply(lemmatize_text)
news_corpus['Content'] = news_corpus['Content'].apply(', '.join) #convert list of words to a sentence
news_corpus.replace(',', '', regex=True, inplace=True) # remove commas
news_corpus = news_corpus.reset_index(drop=True)
news_corpus


Unnamed: 0,Content
0,medium report about swedish manufacturer scani...
1,access covid vaccine cooperation technology cl...
2,after severe criticism over holding consultati...
3,former congress president rahul gandhi thursda...
4,enforcement directorate attached three immovab...
...,...
4589,over mughal gold coin dating back early centur...
4590,china planning spend tibet five year plan allo...
4591,supreme court tuesday came with solution stale...
4592,indian american maju varghese previously serve...


# TF-IDF

In [39]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
#denselist = dense.tolist() #very expensive
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aajtak,aakash,aaksha,aamir,aamk,aandolan,aandolanjivi,aane,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,aatmanirbharbharat,aatmanirbharta,aatmiyata,aatmnirbhar,aatmnirbharta,aawam,aaya,...,zhao,zhoigar,zhuoran,ziarul,zila,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,ziyad,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,ztdrktlic,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **LSA : Latent Semantic Analysis**

In [40]:
sv_dec = TruncatedSVD(n_components = 50)  # components(no. of latent factors) < n ; n : no. of docs.
ls_analysis = sv_dec.fit_transform(vectors.T)

sv_dec.singular_values_ # eigenvalues for svd (in decreasing order)

array([13.49665211,  8.19753288,  6.65195647,  6.45052751,  6.13464354,
        5.3866029 ,  5.1544087 ,  4.58016842,  4.37335062,  4.27803473,
        4.22041165,  4.12362786,  4.06317868,  4.03833576,  3.93002883,
        3.91264698,  3.78113391,  3.73165397,  3.67994086,  3.61953162,
        3.56648486,  3.48162871,  3.40908799,  3.36779463,  3.31102424,
        3.29833405,  3.26750724,  3.22559759,  3.14200538,  3.14121446,
        3.10081487,  3.06484782,  3.01295513,  2.97587089,  2.96303914,
        2.94151938,  2.91285791,  2.85973445,  2.8541473 ,  2.78291206,
        2.76785189,  2.75939958,  2.74471575,  2.71160494,  2.70239922,
        2.69821118,  2.64889199,  2.63667048,  2.60485393,  2.58594331])

In [41]:
variance =  sv_dec.singular_values_/sum(sv_dec.singular_values_) # checking the variance captured by eigenvalues
variance

array([0.06980631, 0.04239863, 0.03440472, 0.03336291, 0.03172912,
       0.02786016, 0.02665922, 0.02368918, 0.0226195 , 0.02212651,
       0.02182848, 0.0213279 , 0.02101525, 0.02088676, 0.02032658,
       0.02023668, 0.01955648, 0.01930056, 0.0190331 , 0.01872065,
       0.01844629, 0.0180074 , 0.01763221, 0.01741864, 0.01712502,
       0.01705938, 0.01689994, 0.01668318, 0.01625083, 0.01624674,
       0.01603779, 0.01585176, 0.01558337, 0.01539156, 0.0153252 ,
       0.01521389, 0.01506565, 0.01479089, 0.01476199, 0.01439356,
       0.01431566, 0.01427195, 0.014196  , 0.01402475, 0.01397714,
       0.01395547, 0.01370039, 0.01363718, 0.01347262, 0.01337481])

In [42]:
sv_dec = TruncatedSVD(n_components = 25) # components with accepted variance
las = sv_dec.fit_transform(vectors)

#print(las,las.shape)

Topic - Document matrix

In [43]:
# Representation of each doc in terms of latent topics after dimensionality reduction
col = []
for i in range(las.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df = pd.DataFrame(las, columns = col)

topic_df["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

display(topic_df[l])


Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish manufacturer scani...,0.129416,-0.031667,-0.002052,0.016608,-0.058600,-0.036412,-0.017966,0.009526,0.001886,-0.019121,0.012417,0.012730,0.007330,-0.052761,0.006340,-0.013016,0.062350,-0.032589,-0.010976,0.028752,0.011422,-0.000659,0.042983,0.050679,0.070125
1,access covid vaccine cooperation technology cl...,0.241852,-0.033739,0.076434,0.116314,-0.063071,-0.107563,-0.079258,-0.060588,-0.053811,-0.137497,-0.011318,-0.103440,0.026861,0.043906,-0.112343,0.010974,0.109430,0.032084,0.035961,-0.008249,0.043540,-0.000916,0.062485,-0.117903,0.023479
2,after severe criticism over holding consultati...,0.187095,-0.047152,-0.003877,0.027577,-0.070233,-0.017883,-0.071044,-0.018298,-0.002325,-0.026658,-0.014115,-0.021932,0.024492,-0.003253,0.033506,-0.090496,0.028628,-0.069034,-0.065035,0.015018,0.053196,0.010428,0.190313,0.098050,0.181508
3,former congress president rahul gandhi thursda...,0.177060,-0.026037,0.029299,0.010230,-0.039880,-0.040962,-0.059132,0.027616,-0.032300,-0.081537,0.102603,-0.067950,-0.007325,-0.009988,0.019601,0.012926,0.060433,-0.029800,-0.008441,0.038934,0.033928,-0.015798,0.035852,0.017622,0.029977
4,enforcement directorate attached three immovab...,0.134256,-0.048097,-0.066813,-0.030225,-0.111698,0.012863,0.045541,0.013152,0.054149,-0.022086,-0.001665,0.027616,-0.039700,-0.094221,0.035515,-0.000566,0.088052,-0.025350,-0.025545,0.016818,-0.062682,-0.009904,-0.021882,0.096982,0.016655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,over mughal gold coin dating back early centur...,0.086212,-0.022274,-0.033223,-0.015105,-0.062886,-0.042075,0.091939,0.036286,0.021188,-0.010039,0.003195,0.002414,-0.012130,-0.038637,0.008666,-0.034364,0.005560,-0.016432,0.011398,-0.007968,0.022936,0.005730,0.000860,-0.007780,0.001602
4590,china planning spend tibet five year plan allo...,0.123348,-0.013563,0.004556,0.000889,-0.026682,-0.072795,-0.033792,-0.033587,-0.018982,-0.022736,-0.066732,0.000858,0.023816,-0.017320,-0.012072,-0.007920,0.037834,0.001625,-0.010407,0.010290,0.023971,-0.023967,-0.021558,-0.016908,-0.000608
4591,supreme court tuesday came with solution stale...,0.178528,-0.066110,-0.074740,-0.003264,-0.120685,0.127821,-0.102538,-0.017795,0.033247,-0.020366,-0.024598,0.011120,0.003855,-0.011865,0.029977,0.007692,0.020960,0.009755,0.012907,-0.003666,0.040183,0.011335,-0.036130,0.027570,0.021042
4592,indian american maju varghese previously serve...,0.080712,-0.038110,0.006437,0.013013,-0.024384,-0.039498,-0.024826,0.021066,0.018704,-0.050514,-0.050533,-0.039881,-0.010402,0.058180,-0.022844,-0.058033,0.037833,-0.008622,0.027783,0.032803,0.022525,0.004944,-0.034470,-0.074459,-0.003863


**Visualizing the topics**

In [44]:
'''import umap
import matplotlib.pyplot as plt

embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )
plt.show()'''

'import umap\nimport matplotlib.pyplot as plt\n\nembedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)\n\nplt.figure(figsize=(7,5))\nplt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )\nplt.show()'

**Topics and terms list**

In [45]:
Sigma = sv_dec.singular_values_ 
V_transpose = sv_dec.components_.T
terms = vectorizer.get_feature_names()

for index, component in enumerate(sv_dec.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)
del Sigma,V_transpose,terms,zipped,top_terms_key

Topic 0:  ['said', 'that', 'covid', 'case', 'will', 'farmer', 'have', 'state', 'with', 'minister']
Topic 1:  ['covid', 'vaccine', 'case', 'death', 'vaccination', 'health', 'total', 'reported', 'dose', 'active']
Topic 2:  ['election', 'assembly', 'party', 'vaccine', 'congress', 'seat', 'will', 'poll', 'minister', 'bengal']
Topic 3:  ['vaccine', 'vaccination', 'farmer', 'dose', 'first', 'covid', 'modi', 'dos', 'law', 'covaxin']
Topic 4:  ['farmer', 'law', 'farm', 'protest', 'congress', 'price', 'protesting', 'death', 'party', 'hour']
Topic 5:  ['court', 'vaccine', 'justice', 'supreme', 'vaccination', 'election', 'dose', 'petition', 'bench', 'hearing']
Topic 6:  ['police', 'vaccine', 'arrested', 'dose', 'farmer', 'vaccination', 'accused', 'bengal', 'west', 'delhi']
Topic 7:  ['sabha', 'price', 'congress', 'rajya', 'budget', 'house', 'party', 'opposition', 'session', 'parliament']
Topic 8:  ['lakh', 'price', 'sabha', 'from', 'fuel', 'rajya', 'budget', 'crore', 'petrol', 'bengal']
Topic 9: 

# **LDA : Latent Dirichlet Allocation**

In [46]:
#lda

lda = LatentDirichletAllocation(n_components=25)
lda_matrix = lda.fit_transform(df_vecs)
lda_topics = lda.fit(df_vecs)
for idx, topic in enumerate(lda.components_):
        print ("Topic ", idx, " ".join(feature_names[i] for i in topic.argsort()[:-10 - 1:-1]))

Topic  0 stadium hostel ranaut russian drdo anantnag sardar motera sasikala hideout
Topic  1 balakot pujari pampore sopore malik chaduni internship kishor academy muthoot
Topic  2 railway lakh banerjee mamata kolkata girl october bengal university nandigram
Topic  3 ansari dave lankan saint siddipet iffco tracked emhoff kidney retd
Topic  4 satellite space dravida pslv kazhagam munnetra amazonia isro aiadmk tamil
Topic  5 chamoli glacier tibetan rescue tapovan lokpal burst tunnel flood dalai
Topic  6 lottery earthquake magnitude richter seismology nirani ticket church quake prize
Topic  7 england ravidas trump disengagement match iran wicket energy tejas rathod
Topic  8 upsc aspirant prelim parganas burn junaid mirzapur dhaba trump aprilia
Topic  9 royal harry meghan prince timing interview freight ruckus mallikarjun winfrey
Topic  10 nifty index poor forecasted moderate quality disengagement bamboo category pangong
Topic  11 playerscript pageyoffset playerscriptcounter window sewer ma

In [47]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [48]:
#plot_top_words(lda, feature_names, 10, "t")

In [49]:
col = []
for i in range(lda_matrix.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df_2 = pd.DataFrame(lda_matrix, columns = col)

topic_df_2["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

#display(topic_df_2[l])

**Cosine Similarity**

In [50]:
cos_sim = pd.DataFrame(cosine_similarity(topic_df.iloc[:, :-1]))
cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,1.000000,0.502392,0.817552,0.754306,0.708120,0.340929,0.341186,0.470275,0.392267,0.555020,0.664313,0.111445,0.233904,0.229714,0.289948,0.112106,0.620342,0.367879,0.660780,0.334731,0.440284,0.673302,0.584561,0.538100,0.305321,0.413417,0.326805,0.710224,0.675698,0.673669,0.247396,0.240804,0.484881,0.226802,0.378255,0.308451,0.329677,0.410540,0.546659,0.529807,...,0.446785,0.579219,0.520651,0.174647,0.419471,0.710871,0.461244,0.508305,0.510588,0.615302,0.878986,0.479955,0.664697,0.401418,0.573489,0.418330,0.439010,0.643835,0.462155,0.500203,0.661420,0.270848,0.530606,0.553902,0.732477,0.532038,0.519751,0.454927,0.468708,0.564114,0.541266,0.372331,0.554820,0.433822,0.299242,0.536170,0.603626,0.528218,0.272636,0.133854
1,0.502392,1.000000,0.416474,0.684332,0.060839,0.176571,0.319980,0.328434,0.193063,0.595666,0.510520,0.091225,0.128465,0.150061,0.306429,0.186125,0.897469,0.800856,0.339552,0.051266,0.235044,0.250146,0.216415,0.353272,0.235938,0.284561,0.212153,0.419147,0.211850,0.475854,0.209448,0.357388,0.232545,0.092071,0.536416,0.110463,0.233845,0.550211,0.329796,0.137244,...,0.539167,0.490638,0.532780,0.002222,0.705012,0.689657,0.520595,0.347830,0.370729,0.738314,0.457962,0.625811,0.697660,0.535518,0.550020,0.181049,0.391638,0.560150,0.495467,0.495714,0.515535,0.108818,0.295837,0.409559,0.582555,0.569684,0.455798,0.642723,0.575152,0.570231,0.518541,0.251948,0.332801,0.305627,0.172065,0.208561,0.710168,0.294703,0.680827,0.163286
2,0.817552,0.416474,1.000000,0.603083,0.385907,0.299371,0.379610,0.476382,0.278581,0.406893,0.510513,0.122415,0.156611,0.222865,0.359168,0.100970,0.565768,0.352948,0.658655,0.141116,0.297497,0.548143,0.327571,0.499481,0.278821,0.331088,0.296882,0.604112,0.486320,0.598222,0.209969,0.226516,0.282510,0.198894,0.262037,0.233051,0.269367,0.236365,0.617517,0.252227,...,0.260294,0.535052,0.470344,-0.024533,0.303806,0.484750,0.379113,0.504657,0.409140,0.494162,0.774198,0.372423,0.688251,0.356430,0.480085,0.307327,0.447448,0.628774,0.306702,0.436063,0.524122,0.167596,0.545760,0.530287,0.525672,0.425898,0.404281,0.335930,0.382230,0.495316,0.470588,0.289282,0.456537,0.407945,0.260676,0.294088,0.411395,0.444311,0.201892,0.150821
3,0.754306,0.684332,0.603083,1.000000,0.398469,0.329240,0.321212,0.438363,0.582974,0.665954,0.675446,0.219016,0.039090,0.280241,0.313606,0.202952,0.721724,0.548811,0.567974,0.123527,0.626167,0.427648,0.324438,0.596817,0.369484,0.549811,0.159035,0.463061,0.464100,0.490051,0.273719,0.333302,0.391731,0.236090,0.438625,0.265940,0.447638,0.472075,0.484567,0.324941,...,0.456101,0.477112,0.589945,0.128389,0.575363,0.813500,0.455383,0.590350,0.499562,0.700644,0.776155,0.420758,0.696207,0.506917,0.672188,0.349590,0.581406,0.706298,0.454921,0.664249,0.678939,0.303169,0.561803,0.583996,0.685041,0.716975,0.541693,0.503044,0.463258,0.588283,0.612523,0.629890,0.235037,0.538357,0.358050,0.344526,0.510740,0.437166,0.407763,0.079320
4,0.708120,0.060839,0.385907,0.398469,1.000000,0.500816,0.154921,0.443711,0.333592,0.229532,0.452689,0.114849,0.212373,0.282222,0.253705,0.213896,0.253753,0.054102,0.629721,0.483498,0.335691,0.496833,0.675767,0.396124,0.216804,0.321867,0.191116,0.600236,0.685259,0.440148,0.368238,0.301533,0.650163,0.382175,0.093923,0.563735,0.233164,0.219296,0.339284,0.721860,...,0.260692,0.442876,0.328416,0.109315,0.074064,0.522996,0.246240,0.272871,0.311928,0.369608,0.556509,0.185362,0.222212,0.113852,0.292133,0.479265,0.176475,0.216499,0.287062,0.218623,0.507604,0.173452,0.217861,0.353570,0.542234,0.216137,0.369699,0.166339,0.261102,0.269964,0.353573,0.201037,0.511087,0.286549,0.266172,0.631339,0.348796,0.576916,0.071676,0.207702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.536170,0.208561,0.294088,0.344526,0.631339,0.238522,0.134690,0.433798,0.317839,0.202199,0.680095,0.118144,0.331120,0.083846,0.211200,0.058510,0.333092,0.194932,0.425413,0.252311,0.326772,0.493043,0.466070,0.470788,0.293277,0.566428,0.126232,0.441659,0.576221,0.317380,0.058983,0.158617,0.750933,0.087363,0.273135,0.302179,0.238871,0.357228,0.282442,0.426073,...,0.337741,0.229629,0.416267,0.211507,0.314932,0.672573,0.435043,0.383281,0.326087,0.491748,0.606459,0.477781,0.433056,0.366226,0.427351,0.380426,0.271192,0.350526,0.457144,0.399312,0.638235,0.194084,0.561043,0.378808,0.460541,0.391067,0.588342,0.313385,0.448564,0.352163,0.469482,0.279460,0.396182,0.301400,0.147273,1.000000,0.377360,0.263977,0.333025,0.221740
4590,0.603626,0.710168,0.411395,0.510740,0.348796,0.225008,0.312066,0.357764,0.234992,0.382295,0.654765,0.145991,0.448637,0.161202,0.351056,0.210082,0.809553,0.639803,0.495089,0.549696,0.260890,0.545150,0.531613,0.456355,0.208400,0.301987,0.541399,0.648429,0.515650,0.613412,0.228665,0.467184,0.370467,0.061223,0.608091,0.181437,0.192866,0.492298,0.445034,0.332009,...,0.718410,0.590578,0.604060,0.226693,0.708235,0.597591,0.701153,0.383341,0.454762,0.758148,0.479345,0.813439,0.666686,0.594419,0.535008,0.480044,0.463501,0.540264,0.613680,0.525132,0.613574,0.205618,0.258739,0.420618,0.757803,0.513878,0.594271,0.718941,0.673845,0.759339,0.610698,0.275209,0.739479,0.307980,0.326806,0.377360,1.000000,0.423232,0.618771,0.289187
4591,0.528218,0.294703,0.444311,0.437166,0.576916,0.893890,0.198358,0.655352,0.317456,0.267795,0.546491,0.162997,0.233522,0.721423,0.501502,0.606215,0.455673,0.239659,0.699108,0.354722,0.328412,0.401239,0.583048,0.437299,0.260521,0.259132,0.268370,0.632963,0.457896,0.459773,0.893242,0.608054,0.380589,0.814213,0.317010,0.866822,0.273863,0.215150,0.745084,0.333096,...,0.458494,0.877834,0.339959,0.100870,0.393732,0.461908,0.425015,0.379754,0.367365,0.493356,0.487852,0.362546,0.437298,0.383693,0.436103,0.526234,0.357226,0.455186,0.445092,0.428622,0.563424,0.181007,0.286101,0.770409,0.567499,0.432408,0.435886,0.417601,0.443077,0.473606,0.568506,0.259817,0.520382,0.316918,0.246136,0.263977,0.423232,1.000000,0.312341,0.209777
4592,0.272636,0.680827,0.201892,0.407763,0.071676,0.166134,0.013550,0.361481,0.335023,0.189126,0.447753,0.101564,0.082686,0.192133,0.517119,0.193926,0.699359,0.548382,0.373272,0.212797,0.293239,0.331978,0.163977,0.339978,0.568464,0.516682,0.077978,0.504913,0.218430,0.515203,0.277657,0.305675,0.222873,0.144222,0.450513,0.159042,0.156708,0.297097,0.301058,0.060845,...,0.578618,0.516631,0.469361,0.243567,0.776504,0.465956,0.561288,0.237560,0.329309,0.626263,0.340457,0.593999,0.488111,0.540944,0.500198,0.156033,0.320146,0.366702,0.695379,0.544179,0.445592,0.092146,0.400404,0.398374,0.468607,0.659418,0.507141,0.615647,0.630801,0.520896,0.469098,0.289481,0.374935,0.183141,0.201573,0.333025,0.618771,0.312341,1.000000,0.114257


# **Creating Users and initial ratings**

**Generating Users**

In [51]:
#fitting GMM to column "topic_i"

num_cols_topic_df = topic_df.shape[1]-1
gm = []

for i in range(num_cols_topic_df):
  gm.append(GaussianMixture(n_components=10).fit(topic_df.iloc[:, [i]]))

In [52]:
#generating gmm based topic values for each topic for 50 users
#24 x 50 matrix

Users = []

for i in range(50):
  temp = []
  for i in range(num_cols_topic_df):
    temp.append(gm[i].sample(1)[0][0][0])
  Users.append(temp)
Users=np.array(Users)
Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

In [53]:
#run the above cell before running this twice
slc = list(range(topic_df.shape[1]))
slc.remove(25)
temp_topic_df = topic_df.iloc[:, slc]
Users_temp = Users.T.set_index(temp_topic_df.T.index)

result = temp_topic_df.dot(Users_temp);
#result.T

**Generating User ratings** 




In [54]:
rank_matrix = result.rank().T/4593*10
rank_matrix = rank_matrix.round(0).astype(int)
rank_matrix.columns =np.linspace(0,4593,4594).astype(int)

#not every reader reads all news
#remove random elements from the rank matrix
for i in range(4594):
  random_entries = np.random.randint(0,50,25)
  rank_matrix.loc[random_entries,i] = "No rating"
rank_matrix = rank_matrix.apply(pd.to_numeric, errors='coerce')
rank_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,2.0,10.0,,,,,4.0,3.0,,,5.0,10.0,,,,,,,,,,3.0,6.0,4.0,,6.0,,1.0,4.0,2.0,,5.0,8.0,,,0.0,,,1.0,2.0,...,1.0,,,,,1.0,0.0,0.0,,,,1.0,1.0,0.0,,,2.0,,,,2.0,4.0,,,1.0,,,2.0,1.0,1.0,,0.0,2.0,,1.0,,,1.0,4.0,2.0
1,,10.0,7.0,6.0,,,9.0,4.0,2.0,7.0,6.0,0.0,7.0,1.0,,,8.0,9.0,2.0,7.0,7.0,8.0,,,,,,2.0,5.0,2.0,2.0,4.0,4.0,1.0,,2.0,8.0,,3.0,0.0,...,1.0,3.0,,4.0,4.0,,,0.0,1.0,1.0,,,1.0,0.0,,0.0,1.0,2.0,,1.0,2.0,1.0,0.0,,1.0,,,,1.0,,,,5.0,5.0,1.0,,,4.0,3.0,3.0
2,,3.0,9.0,4.0,8.0,5.0,7.0,,5.0,4.0,,10.0,4.0,1.0,2.0,1.0,4.0,,,,10.0,,8.0,5.0,10.0,,,2.0,7.0,,4.0,3.0,,3.0,2.0,6.0,,,4.0,4.0,...,1.0,,1.0,,,,0.0,,0.0,0.0,,0.0,1.0,0.0,,1.0,,,1.0,1.0,2.0,,0.0,1.0,1.0,,1.0,,,1.0,1.0,,4.0,4.0,5.0,2.0,2.0,,1.0,3.0
3,,,,7.0,,1.0,6.0,2.0,,8.0,,,,,0.0,,5.0,5.0,,8.0,,,3.0,,6.0,5.0,5.0,,5.0,,2.0,,7.0,,,1.0,7.0,5.0,2.0,,...,0.0,,0.0,0.0,2.0,,0.0,0.0,,0.0,,1.0,1.0,0.0,0.0,0.0,,,1.0,,1.0,1.0,0.0,0.0,,1.0,0.0,1.0,0.0,,0.0,,2.0,,6.0,1.0,,,,
4,3.0,,6.0,6.0,,,8.0,,6.0,,3.0,10.0,4.0,,2.0,7.0,,,,7.0,,4.0,,,9.0,,4.0,1.0,,1.0,9.0,,,6.0,,8.0,,2.0,3.0,2.0,...,,2.0,,0.0,1.0,,0.0,0.0,,0.0,1.0,,0.0,0.0,0.0,,1.0,,,1.0,,,0.0,,,0.0,,,,0.0,0.0,0.0,,4.0,2.0,1.0,2.0,7.0,1.0,2.0
5,,,5.0,,4.0,6.0,,4.0,6.0,2.0,,,7.0,,3.0,,,,3.0,7.0,9.0,,9.0,,5.0,5.0,,1.0,4.0,,9.0,7.0,,,,,7.0,2.0,,0.0,...,,,1.0,,2.0,0.0,,0.0,1.0,,,0.0,0.0,0.0,,0.0,,1.0,,,,1.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,,2.0,3.0,8.0,2.0,
6,,4.0,6.0,5.0,4.0,,,,9.0,1.0,6.0,,1.0,,5.0,,7.0,9.0,6.0,,,4.0,,5.0,,9.0,,,8.0,,,,,8.0,6.0,,4.0,3.0,6.0,6.0,...,1.0,4.0,2.0,1.0,,2.0,2.0,1.0,,,,2.0,,,1.0,3.0,3.0,,1.0,4.0,,,,1.0,1.0,2.0,,,,2.0,1.0,3.0,3.0,,7.0,3.0,,8.0,5.0,7.0
7,,,6.0,,,,0.0,6.0,,3.0,5.0,10.0,3.0,3.0,1.0,7.0,,5.0,,4.0,9.0,5.0,,5.0,5.0,6.0,4.0,,,3.0,,5.0,,,,,,,,,...,1.0,3.0,1.0,1.0,2.0,1.0,1.0,,1.0,1.0,2.0,1.0,1.0,1.0,0.0,2.0,3.0,3.0,,1.0,2.0,2.0,,1.0,1.0,,1.0,,,1.0,,,,0.0,0.0,2.0,,,,
8,4.0,6.0,,7.0,,2.0,,3.0,8.0,4.0,5.0,,,,1.0,,,5.0,2.0,5.0,,2.0,6.0,4.0,9.0,,,1.0,6.0,3.0,4.0,2.0,8.0,1.0,,,7.0,,2.0,5.0,...,1.0,,1.0,0.0,,1.0,0.0,,2.0,0.0,,1.0,1.0,1.0,,1.0,3.0,1.0,,1.0,2.0,,,1.0,1.0,1.0,,2.0,,1.0,,2.0,1.0,,,,4.0,6.0,1.0,0.0
9,5.0,,7.0,7.0,1.0,0.0,,,9.0,7.0,5.0,,3.0,3.0,6.0,1.0,,8.0,,,10.0,,1.0,,,,,,5.0,4.0,,,3.0,1.0,8.0,0.0,7.0,,2.0,,...,,2.0,5.0,1.0,,2.0,2.0,3.0,3.0,,,,,3.0,2.0,,4.0,4.0,,,3.0,3.0,,,3.0,4.0,3.0,,,,,5.0,,9.0,9.0,1.0,,3.0,4.0,7.0


# **Content based recommender function**

In [55]:
def content_recommender(rank_matrix,cos_sim):
  
  #find top 5 ratings
  top_five = rank_matrix.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=5)

  #best two similar docs for every doc
  best_two = cos_sim.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=3)
  best_two.columns =['0', '1','2']
  del best_two['0']
  best_two['combined']= best_two.values.tolist()
  best_two
  best_two_dict_1 = best_two['1'].to_dict()
  best_two_dict_2 = best_two['2'].to_dict()

  selected_docs_1 = top_five.replace(best_two_dict_1)
  selected_docs_2 = top_five.replace(best_two_dict_2)
  selected_docs_with_ID = pd.concat([selected_docs_1,selected_docs_2],axis=1,ignore_index=True)

  #replace value by the actual news
  doc_dict = news_corpus.Content.to_dict()

  selected_docs = selected_docs_with_ID.replace(doc_dict)
  return selected_docs, selected_docs_with_ID

# **Top 10 news : Content based recommender.**

In [56]:
selected_docs_content,selected_docs_content_with_ID = content_recommender(rank_matrix,cos_sim)
selected_docs_content

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,washington reuters president biden will hold o...,daily covid case india registered increase fou...,british parliament monday discussed farmer pro...,nearly three woman worldwide subjected physica...,delhi india march rajya sabha been adjourned a...,moscowindia strategic tie with russia very dee...,case coronavirus infection india were recorded...,indian high commission london condemned debate...,covid pandemic negatively affected woman incom...,delhi india march rajya sabha adjourned till m...
1,washington reuters president biden will hold o...,lauding india vaccine leadership woman officia...,delhi india march prime minister narendra modi...,british parliament monday discussed farmer pro...,amaravati andhra pradesh india march andhra pr...,moscowindia strategic tie with russia very dee...,international monetary fund chief economist gi...,kolkata west bengal india march kolkata police...,indian high commission london condemned debate...,delhi india march ministry science technology ...
2,daily covid case india registered increase fou...,siddharth sharmanew delhi india march congress...,several opposition party including congress ha...,delhi india march rajya sabha been adjourned a...,delhi india march rajya sabha chairman venkaia...,case coronavirus infection india were recorded...,face between political rival bharatiya janata ...,parliament will have extended four break this ...,delhi india march rajya sabha adjourned till m...,after nearly year work merger sabha rajya sabh...
3,british parliament monday discussed farmer pro...,tirath singh rawat been sworn chief minister u...,congress leader rahul gandhi sunday compared o...,daily coronavirus covid case fell below third ...,delhi india march union minister piyush goyal ...,indian high commission london condemned debate...,dehradun uttarakhand india march predicting ch...,meerut uttar pradesh india march after farmer ...,daily covid case india registered increase fou...,delhi india march prime minister narendra modi...
4,daily covid case india registered increase fou...,british parliament monday discussed farmer pro...,daily coronavirus covid case fell below third ...,delhi india march union minister piyush goyal ...,maharashtra kerala punjab tamil nadu gujarat k...,case coronavirus infection india were recorded...,indian high commission london condemned debate...,daily covid case india registered increase fou...,delhi india march prime minister narendra modi...,maharashtra kerala punjab tamil nadu gujarat w...
5,british parliament monday discussed farmer pro...,delhi india march cumulative number covid vacc...,india will begin administer second dose covid ...,delhi india march private hospital under ayush...,india inoculated lakh healthcare worker tuesda...,indian high commission london condemned debate...,delhi india march cumulative number covid vacc...,health secretary rajesh bhushan written chief ...,delhi india march review status progress covid...,total crore covid vaccine dos have been admini...
6,suspense loom over fate leadership uttarakhand...,tirath singh rawat been sworn chief minister u...,daily coronavirus covid case fell below third ...,supreme court judge have informally told chief...,case coronavirus infection india were recorded...,dehradun uttarakhand india march shortly after...,dehradun uttarakhand india march predicting ch...,daily covid case india registered increase fou...,several lawyer have written chief justice indi...,daily rise coronavirus infection india recorde...
7,daily covid case india registered increase fou...,tirath singh rawat been sworn chief minister u...,daily coronavirus covid case fell below third ...,india total tally covid case rose with infecti...,farmer ready stay delhi border protest against...,case coronavirus infection india were recorded...,dehradun uttarakhand india march predicting ch...,daily covid case india registered increase fou...,daily covid case india registered increase fou...,delhi india february minister state agricultur...
8,woman likely play major role ensuring that pro...,narendra modi prime minister poor farmer charg...,second round meeting between police farmer uni...,daily coronavirus covid case fell below third ...,farmer leader rakesh tikait monday took union ...,delhi reuters thousand woman joined protest fa...,delhi india february minister state agricultur...,iron nail studded road ghazipur border restric...,daily covid case india registered increase fou...,more than faculty member various educational i...
9,siddharth sharmanew delhi india march congress...,british parliament monday discussed farmer pro...,congress leader rahul gandhi sunday compared o...,delhi india march congress leader party former...,chandigarh punjab india march congress leader ...,face between political rival bharatiya janata ...,indian high commission london condemned debate...,meerut uttar pradesh india march after farmer ...,leader jyotiraditya scindia quit congress last...,narendra modi prime minister poor farmer charg...


# **Collaborative recommender function: Predict missing ratings using Matrix factorization**

In [57]:
#collaborative recommender function

def collaborative_recommender(rank_matrix,num_iter,news_corpus):
  A = np.array(rank_matrix)
  M = A.shape[0]
  N = A.shape[1]
  A_df = pd.DataFrame(A)
  K = 25
  W = np.abs(np.random.uniform(low=0, high=1, size=(M, K)))
  H = np.abs(np.random.uniform(low=0, high=1, size=(K, N)))
  W = np.divide(W, K*W.max())
  H = np.divide(H, K*H.max())

  #cost func
  def cost(A, W, H):
    mask = pd.DataFrame(A).notnull().values
    WH = np.dot(W, H)
    WH_mask = WH[mask]
    A_mask = A[mask]
    A_WH_mask = A_mask-WH_mask
    return linalg.norm(A_WH_mask, 2)
  
  #optimization loop
  num_display_cost = max(int(num_iter/10), 1)

  for i in range(num_iter):
      if i%2 ==0:
          # Learn H, given A and W
          for j in range(N):
              mask_rows = pd.Series(A[:,j]).notnull()
              H[:,j] = nnls(W[mask_rows], A[:,j][mask_rows])[0]
      else:
          for j in range(M):
              mask_rows = pd.Series(A[j,:]).notnull()
              W[j,:] = nnls(H.transpose()[mask_rows], A[j,:][mask_rows])[0]
              
      WH = np.dot(W, H)
      c = cost(A, W, H)
      #if i%num_display_cost==0:
      print(i, c)
  W = pd.DataFrame(W)
  H = pd.DataFrame(H)
  A = W.dot(H)
  
  #new rank matrix
  A = A.round(0).astype(int)
  A = np.clip(A,1,10)
  #finding the top 10 documents
  selected_docs_with_ID = A.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=10)
  selected_docs_with_ID.columns =['D1', 'D2', 'D3', 'D4', 'D5','D6','D7','D8','D9','D10']
  doc_dict = news_corpus.Content.to_dict()
  selected_docs = selected_docs_with_ID.replace(doc_dict)
  return selected_docs, selected_docs_with_ID

# **Top 10 news : Collaborative recommender.**

In [58]:
num_iter = 10
selected_docs_collab,selected_docs_collab_with_ID = collaborative_recommender(rank_matrix,num_iter,news_corpus)
selected_docs_collab

0 744.9192559902333
1 620.9726278325913
2 513.4470017946236
3 443.32133684991135
4 395.49017596471185
5 367.76598651333074
6 349.4575886637778
7 336.1950480053227
8 326.1852617327083
9 318.04545920210325


Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10
0,india recorded highest daily tally covid case ...,india will continue dialogue with china till d...,high commission india london condemned debate ...,international woman which celebrated with much...,multiple adjournment motion marred working raj...,union home ministry monday said crore been san...,covid pandemic negatively affected woman incom...,year farmer from haryana hisar district march ...,case coronavirus infection india were recorded...,covid fatality have been reported state union ...
1,access covid vaccine cooperation technology cl...,first ever summit leader quadrilateral framewo...,prime minister narendra modi tuesday virtually...,lauding india vaccine leadership woman officia...,delhi prime minister narendra modi tuesday exp...,high commission india london condemned debate ...,department science technology expressed concer...,prime minister narendra modi monday said celeb...,rajya sabha chairman venkaiah naidu march said...,prime minister narendra modi march said poor n...
2,parliamentary standing committee education rec...,parliamentary committee recommended that minis...,prime minister narendra modi march said poor n...,agriculture minister narendra singh tomar satu...,prime minister narendra modi friday said produ...,delhi high court march asked centre explain ra...,congress leader rahul gandhi used three popula...,health minister harsh vardhan took first dose ...,national investigation agency took over probe ...,time come increase private sector participatio...
3,india recorded highest daily tally covid case ...,congress always stood internal discussion ther...,noting that budgetary allocation agriculture b...,high commission india london condemned debate ...,prime minister narendra modi monday said celeb...,crisis over continuation uttarakhand chief min...,congress general secretary priyanka gandhi vad...,case coronavirus infection india were recorded...,prime minister narendra modi march said poor n...,india covid tally rose with over fresh case be...
4,india recorded highest daily tally covid case ...,lauding india vaccine leadership woman officia...,high commission india london condemned debate ...,case coronavirus infection india were recorded...,prime minister narendra modi march said poor n...,india covid tally rose with over fresh case be...,congress changing selects candidate assembly e...,covid fatality have been reported state union ...,delhi high court march asked centre explain ra...,cumulative number covid vaccine dos administer...
5,supreme court wednesday asked government respo...,congress always stood internal discussion ther...,constitution bench chief justice india sharad ...,association democratic reform represented advo...,high commission india london condemned debate ...,union home ministry monday said crore been san...,prime minister narendra modi march said poor n...,supreme court initiative begin hybrid physical...,congress changing selects candidate assembly e...,centre friday high level committee planning co...
6,supreme court wednesday asked government respo...,congress always stood internal discussion ther...,informed supreme court wednesday that mamata b...,constitution bench chief justice india sharad ...,supreme court wednesday sought response from c...,exercise dustlik between india uzbekistan from...,noted classical dancer nominated rajya sabha m...,union home ministry monday said crore been san...,national investigation agency taken over case ...,leader jyotiraditya scindia quit congress last...
7,person including year assaulted policeman duty...,prime minister narendra modi monday said celeb...,crisis over continuation uttarakhand chief min...,congress general secretary priyanka gandhi vad...,case coronavirus infection india were recorded...,prime minister narendra modi march said poor n...,conveying strong appreciation resolute dedicat...,agriculture minister narendra singh tomar satu...,farmer agitation over farm law completed day s...,india covid tally rose with over fresh case be...
8,india recorded highest daily tally covid case ...,congress always stood internal discussion ther...,third both house parliament could barely busin...,former congress president rahul gandhi tuesday...,sabha early adjournment tuesday after frequent...,price domestic cooking doubled cylinder last s...,leader opposition rajya sabha mallikarjun khar...,congress general secretary priyanka gandhi vad...,continuing with attack government rashtriya vi...,year farmer from haryana hisar district march ...
9,congress always stood internal discussion ther...,third both house parliament could barely busin...,former congress president rahul gandhi tuesday...,high commission india london condemned debate ...,thousand woman farmer held protest march deliv...,rajya sabha chairman venkaiah naidu march said...,leader jyotiraditya scindia quit congress last...,congress general secretary priyanka gandhi vad...,continuing with attack government rashtriya vi...,agriculture minister narendra singh tomar satu...


# **Hydrid recommender**

In [59]:
def hybrid(collaborative_recommender,content_recommender,rank_matrix,cos_sim,news_corpus):
  selected_docs_collab,selected_docs_collab_with_ID = collaborative_recommender(rank_matrix,num_iter,news_corpus)
  selected_docs_content,selected_docs_content_with_ID = content_recommender(rank_matrix,cos_sim)
  
  selected_docs_collab.reset_index(drop=True, inplace=True)
  selected_docs_content.reset_index(drop=True, inplace=True)
  
  selected_docs_collab_with_ID.reset_index(drop=True, inplace=True)
  selected_docs_content_with_ID.reset_index(drop=True, inplace=True)
  
  selected_docs_final = pd.concat([selected_docs_content.iloc[:,[0,1,2,3,4]],selected_docs_collab.iloc[:,[5,6,7,8,9]]],axis=1)
  selected_docs_final_with_ID = pd.concat([selected_docs_content_with_ID.iloc[:,[0,1,2,3,4]],selected_docs_collab_with_ID.iloc[:,[5,6,7,8,9]]],axis=1)
  
  return selected_docs_final,selected_docs_final_with_ID

# **Top 10 news : Hybrid recommender.**

In [60]:
selected_docs_final,selected_docs_final_with_ID = hybrid(collaborative_recommender,content_recommender,rank_matrix,cos_sim,news_corpus)
selected_docs_final

0 761.7231098650902
1 632.6300735778132
2 521.9599457007373
3 447.7771970346495
4 400.8915166469912
5 373.50174727226886
6 354.43799494084817
7 340.2550983468785
8 329.3720928265241
9 320.6810141517886


Unnamed: 0,0,1,2,3,4,D6,D7,D8,D9,D10
0,washington reuters president biden will hold o...,daily covid case india registered increase fou...,british parliament monday discussed farmer pro...,nearly three woman worldwide subjected physica...,delhi india march rajya sabha been adjourned a...,congress general secretary priyanka gandhi vad...,covid pandemic negatively affected woman incom...,case coronavirus infection india were recorded...,conveying strong appreciation resolute dedicat...,farmer agitation over farm law completed day s...
1,washington reuters president biden will hold o...,lauding india vaccine leadership woman officia...,delhi india march prime minister narendra modi...,british parliament monday discussed farmer pro...,amaravati andhra pradesh india march andhra pr...,after devastating blaze claimed nine life koil...,prime minister narendra modi tuesday virtually...,inclusion india united state latest strategy r...,delhi prime minister narendra modi tuesday exp...,union home ministry monday said crore been san...
2,daily covid case india registered increase fou...,siddharth sharmanew delhi india march congress...,several opposition party including congress ha...,delhi india march rajya sabha been adjourned a...,delhi india march rajya sabha chairman venkaia...,prime minister narendra modi monday said celeb...,continuing with attack government rashtriya vi...,case coronavirus infection india were recorded...,prime minister narendra modi march said poor n...,prime minister narendra modi friday said produ...
3,british parliament monday discussed farmer pro...,tirath singh rawat been sworn chief minister u...,congress leader rahul gandhi sunday compared o...,daily coronavirus covid case fell below third ...,delhi india march union minister piyush goyal ...,prime minister narendra modi march said poor n...,agriculture minister narendra singh tomar satu...,prime minister narendra modi friday said produ...,based human right watchdog freedom house accus...,time come increase private sector participatio...
4,daily covid case india registered increase fou...,british parliament monday discussed farmer pro...,daily coronavirus covid case fell below third ...,delhi india march union minister piyush goyal ...,maharashtra kerala punjab tamil nadu gujarat k...,congress changing selects candidate assembly e...,covid fatality have been reported state union ...,farmer protest completed day border delhi seni...,finance minister nirmala sitharaman lashed thi...,prime minister narendra modi friday said produ...
5,british parliament monday discussed farmer pro...,delhi india march cumulative number covid vacc...,india will begin administer second dose covid ...,delhi india march private hospital under ayush...,india inoculated lakh healthcare worker tuesda...,andhra pradesh government orally stated suprem...,prime minister narendra modi friday said produ...,delhi high court march asked centre explain ra...,cumulative number covid vaccine dos administer...,health ministry urged state utilise capacity p...
6,suspense loom over fate leadership uttarakhand...,tirath singh rawat been sworn chief minister u...,daily coronavirus covid case fell below third ...,supreme court judge have informally told chief...,case coronavirus infection india were recorded...,delhi prime minister narendra modi tuesday exp...,crisis over continuation uttarakhand chief min...,supreme court initiative begin hybrid physical...,india covid tally rose with over fresh case be...,andhra pradesh government orally stated suprem...
7,daily covid case india registered increase fou...,tirath singh rawat been sworn chief minister u...,daily coronavirus covid case fell below third ...,india total tally covid case rose with infecti...,farmer ready stay delhi border protest against...,india covid tally rose with over fresh case be...,covid fatality have been reported state union ...,farmer protest completed day border delhi seni...,delhi high court march asked centre explain ra...,maharashtra kerala punjab tamil nadu gujarat k...
8,woman likely play major role ensuring that pro...,narendra modi prime minister poor farmer charg...,second round meeting between police farmer uni...,daily coronavirus covid case fell below third ...,farmer leader rakesh tikait monday took union ...,leader opposition rajya sabha mallikarjun khar...,congress general secretary priyanka gandhi vad...,continuing with attack government rashtriya vi...,year farmer from haryana hisar district march ...,case coronavirus infection india were recorded...
9,siddharth sharmanew delhi india march congress...,british parliament monday discussed farmer pro...,congress leader rahul gandhi sunday compared o...,delhi india march congress leader party former...,chandigarh punjab india march congress leader ...,multiple adjournment motion marred working raj...,thousand woman farmer held protest march deliv...,prime minister narendra modi monday said celeb...,congress general secretary priyanka gandhi vad...,continuing with attack government rashtriya vi...


# **User Profile Updater**

Implemented in the flask app.

# **Implement ALS based matrix factorization instead of NNLS**

In [62]:
'''from pyspark.ml.recommendation import ALS 
from pyspark.sql.types import FloatType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

X_train, X_test = rank_matrix.randomSplit([0.6, 0.4])'''

'from pyspark.ml.recommendation import ALS \nfrom pyspark.sql.types import FloatType\nfrom pyspark.ml.evaluation import RegressionEvaluator\nfrom pyspark.sql.functions import col\n\nX_train, X_test = rank_matrix.randomSplit([0.6, 0.4])'

In [63]:
'''als = mlALS(rank=5, maxIter=10, seed=0)
model = als.fit(rank_matrix)'''

'als = mlALS(rank=5, maxIter=10, seed=0)\nmodel = als.fit(rank_matrix)'