<a href="https://colab.research.google.com/github/sarvesh237/lfkdsahkhfa/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [128]:
import pandas as pd
import numpy as np
from numpy import linalg
import matplotlib.pyplot as plt

from scipy.optimize import nnls

#for lemmatization
import nltk
nltk.download("wordnet")
nltk.download('stopwords')
#to remove stop words
from nltk.corpus import stopwords

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

#gaussian mixture
from sklearn.mixture import GaussianMixture

from sklearn.decomposition import TruncatedSVD# TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Importing the collected data**

In [129]:
df_1 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv")
df_2 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv")
df_3 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv")
df_4 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv")

# **Preprocessing the csv files.**

In [130]:
del df_1['Unnamed: 0']
del df_2['index']
del df_2['Unnamed: 0']
df_2.rename(columns = {'content':'Content'}, inplace = True)
del df_3['Unnamed: 0']
del df_3['Title']
del df_4['Unnamed: 0']

**Remove capitalization, numbers, stop words, non-english characters, drop empty rows and duplicates.**

In [131]:
news_corpus = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus = news_corpus.dropna() #dropping NaN
news_corpus = news_corpus[news_corpus.Content != ''] #dropping empty rows
news_corpus = news_corpus.drop_duplicates()
news_corpus["Content"] = news_corpus['Content'].str.replace('[^\w\s]',' ')
news_corpus = news_corpus[news_corpus['Content'].map(lambda x: x.isascii())] #remove non-english #find a better way
news_corpus['Content'] = news_corpus['Content'].str.lower() #convert to lowercase
news_corpus.replace('\d+', '', regex=True, inplace=True) #remove numbers

stop_words = stopwords.words('english')
stop_words.extend(['span','class','spacing','href','html','http','title','said','that'])
news_corpus['Content'] = news_corpus['Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_corpus = news_corpus.apply(lambda x: [item for item in x if item not in stop_words])


Storing a second news corpus to show the original news without preprocessing.

In [132]:
news_corpus_org = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus_org = news_corpus_org.dropna() #dropping NaN
news_corpus_org = news_corpus_org[news_corpus_org.Content != ''] #dropping empty rows
news_corpus_org = news_corpus_org.drop_duplicates()
news_corpus_org["Content"] = news_corpus_org['Content'].str.replace('[^\w\s]',' ')
news_corpus_org = news_corpus_org[news_corpus_org['Content'].map(lambda x: x.isascii())]

news_corpus_org_temp = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus_org_temp = news_corpus_org_temp.dropna() #dropping NaN
news_corpus_org_temp = news_corpus_org_temp[news_corpus_org_temp.Content != ''] #dropping empty rows
news_corpus_org_temp = news_corpus_org_temp.drop_duplicates()

org_news_dict = news_corpus_org_temp.Content.to_dict()
news_corpus_org = news_corpus_org.replace(org_news_dict)
news_corpus_org = news_corpus_org.reset_index(drop=True)

In [133]:
news_corpus_org

Unnamed: 0,Content
0,Media reports about Swedish bus manufacturer S...
1,Access to COVID 19 vaccines cooperation on te...
2,After severe criticism over not holding consul...
3,Former Congress president Rahul Gandhi on Thur...
4,The Enforcement Directorate has attached three...
...,...
4589,Over 200 Mughal era gold coins dating back to...
4590,China is planning to spend big in Tibet as it...
4591,The Supreme Court Tuesday came out with a solu...
4592,Indian American Maju Varghese who previously ...


**Lemmatization**

In [134]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

news_corpus['Content'] = news_corpus.Content.apply(lemmatize_text)
news_corpus['Content'] = news_corpus['Content'].apply(', '.join) #convert list of words to a sentence
news_corpus.replace(',', '', regex=True, inplace=True) # remove commas
news_corpus = news_corpus.reset_index(drop=True)
news_corpus

Unnamed: 0,Content
0,medium report about swedish manufacturer scani...
1,access covid vaccine cooperation technology cl...
2,after severe criticism over holding consultati...
3,former congress president rahul gandhi thursda...
4,enforcement directorate attached three immovab...
...,...
4589,over mughal gold coin dating back early centur...
4590,china planning spend tibet five year plan allo...
4591,supreme court tuesday came with solution stale...
4592,indian american maju varghese previously serve...


# TF-IDF

In [135]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
#denselist = dense.tolist() #very expensive
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aajtak,aakash,aaksha,aamir,aamk,aandolan,aandolanjivi,aane,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,aatmanirbharbharat,aatmanirbharta,aatmiyata,aatmnirbhar,aatmnirbharta,aawam,aaya,...,zhao,zhoigar,zhuoran,ziarul,zila,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,ziyad,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,ztdrktlic,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **LSA : Latent Semantic Analysis**

In [136]:
sv_dec = TruncatedSVD(n_components = 50)  # components(no. of latent factors) < n ; n : no. of docs.
ls_analysis = sv_dec.fit_transform(vectors.T)

sv_dec.singular_values_ # eigenvalues for svd (in decreasing order)

array([13.49665211,  8.19753288,  6.65195647,  6.45052751,  6.13464354,
        5.38660293,  5.15440847,  4.58017139,  4.37335818,  4.2780283 ,
        4.22044797,  4.12360549,  4.06317411,  4.03834038,  3.93003051,
        3.91274862,  3.78093644,  3.73163756,  3.68002078,  3.61972012,
        3.56659977,  3.48168166,  3.408241  ,  3.36860646,  3.31004809,
        3.29664291,  3.26523697,  3.22167059,  3.14591254,  3.13518902,
        3.09942404,  3.06320379,  3.00366759,  2.98335575,  2.95415626,
        2.93693343,  2.91755421,  2.8775497 ,  2.83674051,  2.79190267,
        2.78202929,  2.7620429 ,  2.73294853,  2.71873928,  2.71456128,
        2.68901031,  2.66753527,  2.65519384,  2.61136247,  2.59100024])

In [137]:
variance =  sv_dec.singular_values_/sum(sv_dec.singular_values_) # checking the variance captured by eigenvalues
variance

array([0.06978863, 0.04238789, 0.03439601, 0.03335445, 0.03172108,
       0.0278531 , 0.02665247, 0.0236832 , 0.02261381, 0.02212087,
       0.02182314, 0.02132238, 0.0210099 , 0.02088149, 0.02032144,
       0.02023208, 0.01955051, 0.01929559, 0.01902869, 0.01871689,
       0.01844221, 0.01800312, 0.01762337, 0.01741843, 0.01711563,
       0.01704632, 0.01688392, 0.01665865, 0.01626692, 0.01621147,
       0.01602653, 0.01583925, 0.0155314 , 0.01542637, 0.01527538,
       0.01518633, 0.01508612, 0.01487926, 0.01466825, 0.0144364 ,
       0.01438535, 0.014282  , 0.01413156, 0.01405809, 0.01403648,
       0.01390436, 0.01379332, 0.0137295 , 0.01350286, 0.01339757])

In [138]:
sv_dec = TruncatedSVD(n_components = 25) # components with accepted variance
las = sv_dec.fit_transform(vectors)

#print(las,las.shape)

Topic - Document matrix

In [139]:
# Representation of each doc in terms of latent topics after dimensionality reduction
col = []
for i in range(las.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df = pd.DataFrame(las, columns = col)

topic_df["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

#display(topic_df[l])


Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish manufacturer scani...,0.129416,-0.031665,-0.002044,0.016616,-0.058626,-0.036387,-0.018073,0.009407,0.001573,-0.020310,0.011206,0.011709,0.009639,-0.052951,0.009387,-0.011292,0.065675,-0.036746,-0.011712,0.017897,0.001792,0.006180,-0.041241,-0.047876,0.017119
1,access covid vaccine cooperation technology cl...,0.241852,-0.033735,0.076402,0.116281,-0.063050,-0.107387,-0.079549,-0.059717,-0.053851,-0.141224,-0.008260,-0.107245,0.028668,0.050513,-0.110135,0.006742,0.119714,0.037130,0.031863,0.003244,0.052344,-0.040560,-0.068649,0.092410,0.106989
2,after severe criticism over holding consultati...,0.187095,-0.047153,-0.003871,0.027602,-0.070254,-0.018248,-0.070942,-0.019196,-0.003131,-0.022006,-0.016976,-0.021076,0.017538,-0.006655,0.034366,-0.072964,0.013113,-0.073350,-0.024956,-0.033308,0.044701,0.049106,-0.164749,-0.142543,0.158238
3,former congress president rahul gandhi thursda...,0.177060,-0.026037,0.029299,0.010233,-0.039886,-0.040989,-0.059030,0.027473,-0.033055,-0.081083,0.101577,-0.065349,-0.008102,-0.009909,0.024679,0.017538,0.055959,-0.035931,-0.005492,0.029772,0.027677,0.010480,-0.027654,-0.054769,0.083129
4,enforcement directorate attached three immovab...,0.134256,-0.048095,-0.066807,-0.030231,-0.111699,0.013014,0.045302,0.013598,0.053417,-0.024709,-0.002205,0.026191,-0.036438,-0.091741,0.039611,-0.001708,0.100758,-0.027998,-0.037899,0.026215,-0.069957,-0.007333,0.008031,-0.065848,-0.126043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,over mughal gold coin dating back early centur...,0.086212,-0.022274,-0.033222,-0.015101,-0.062891,-0.042111,0.091876,0.036135,0.021458,-0.009708,0.003565,0.002258,-0.011424,-0.040063,0.008464,-0.035179,0.003282,-0.014990,0.016194,-0.011605,0.019021,0.002386,-0.004402,0.020397,-0.003340
4590,china planning spend tibet five year plan allo...,0.123348,-0.013562,0.004563,0.000897,-0.026670,-0.072700,-0.033863,-0.033571,-0.018346,-0.023547,-0.066934,0.000126,0.023595,-0.016524,-0.011382,-0.007965,0.036142,-0.003305,-0.018334,-0.000917,0.019481,-0.017093,0.018879,0.014326,0.004757
4591,supreme court tuesday came with solution stale...,0.178528,-0.066109,-0.074732,-0.003261,-0.120678,0.127808,-0.102550,-0.017943,0.033452,-0.020645,-0.025115,0.010295,0.003446,-0.011927,0.030074,0.010819,0.021190,0.011157,0.014006,-0.011774,0.041547,0.015026,0.030108,-0.022771,-0.013507
4592,indian american maju varghese previously serve...,0.080712,-0.038110,0.006435,0.012994,-0.024374,-0.039543,-0.025032,0.021182,0.019076,-0.052122,-0.049790,-0.038690,-0.011171,0.058288,-0.024366,-0.061309,0.041329,0.003296,0.026854,0.043794,0.033352,-0.020197,0.021609,0.072353,-0.011090


**Visualizing the topics**

In [140]:
'''import umap
import matplotlib.pyplot as plt

embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )
plt.show()'''

'import umap\nimport matplotlib.pyplot as plt\n\nembedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)\n\nplt.figure(figsize=(7,5))\nplt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )\nplt.show()'

**Topics and terms list**

In [141]:
Sigma = sv_dec.singular_values_ 
V_transpose = sv_dec.components_.T
terms = vectorizer.get_feature_names()

for index, component in enumerate(sv_dec.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)
del Sigma,V_transpose,terms,zipped,top_terms_key

Topic 0:  ['said', 'that', 'covid', 'case', 'will', 'farmer', 'have', 'state', 'with', 'minister']
Topic 1:  ['covid', 'vaccine', 'case', 'death', 'vaccination', 'health', 'total', 'reported', 'dose', 'active']
Topic 2:  ['election', 'assembly', 'party', 'vaccine', 'congress', 'seat', 'will', 'poll', 'minister', 'bengal']
Topic 3:  ['vaccine', 'vaccination', 'farmer', 'dose', 'first', 'covid', 'modi', 'dos', 'law', 'covaxin']
Topic 4:  ['farmer', 'law', 'farm', 'protest', 'congress', 'price', 'protesting', 'death', 'party', 'hour']
Topic 5:  ['court', 'vaccine', 'justice', 'supreme', 'vaccination', 'election', 'dose', 'petition', 'bench', 'hearing']
Topic 6:  ['police', 'vaccine', 'arrested', 'dose', 'farmer', 'vaccination', 'accused', 'bengal', 'west', 'delhi']
Topic 7:  ['sabha', 'price', 'congress', 'rajya', 'budget', 'house', 'party', 'opposition', 'session', 'parliament']
Topic 8:  ['lakh', 'price', 'sabha', 'from', 'fuel', 'rajya', 'budget', 'crore', 'petrol', 'bengal']
Topic 9: 

# **LDA : Latent Dirichlet Allocation**

In [142]:
#lda

lda = LatentDirichletAllocation(n_components=25)
lda_matrix = lda.fit_transform(df_vecs)
lda_topics = lda.fit(df_vecs)
for idx, topic in enumerate(lda.components_):
        print ("Topic ", idx, " ".join(feature_names[i] for i in topic.argsort()[:-10 - 1:-1]))

Topic  0 death case lakh total recovery covid active fatality reported hour
Topic  1 census caste tejas lokpal secc toy sewage poultry madhavan slaf
Topic  2 kiifb shiva lord coronil ceraweek shivratri cleaning rathod sewer patanjali
Topic  3 janaushadhi timing aushadhi diwas gallery revise pariyojana kendra alwar kendras
Topic  4 blast israeli embassy gyawali explosion chikkaballapur mining nirani israel nepal
Topic  5 delkar facebook purohit rainfall ammk pampore furthering himalayan intermediary pragya
Topic  6 smoking pamela cocaine ndps narcotic goswami alipore rakesh substance ombudsman
Topic  7 harry meghan royal prince interview stabbed oprah adarsh winfrey markle
Topic  8 said that will with minister have india state from covid
Topic  9 chacko forecasted poor owaisi moderate aimim safar majlis muslimeen wind
Topic  10 satellite strain negative space isro detected aayog upsc chowdhury niti
Topic  11 cattle khwaja ganie sopore match fishery virudhunagar bribe factory cruelty
Top

In [143]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [144]:
#plot_top_words(lda, feature_names, 10, "t")

In [145]:
col = []
for i in range(lda_matrix.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df_2 = pd.DataFrame(lda_matrix, columns = col)

topic_df_2["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

#display(topic_df_2[l])

**Cosine Similarity**

In [146]:
cos_sim = pd.DataFrame(cosine_similarity(topic_df.iloc[:, :-1]))
cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,1.000000,0.540730,0.739073,0.770177,0.651602,0.388909,0.356321,0.417554,0.438595,0.615276,0.723603,0.108563,0.399154,0.240313,0.275825,0.132589,0.654597,0.462839,0.679343,0.560660,0.459603,0.613105,0.614099,0.620181,0.327577,0.451332,0.461293,0.729583,0.699478,0.624095,0.262095,0.309768,0.520833,0.199720,0.398818,0.330814,0.291403,0.432116,0.456086,0.485478,...,0.414266,0.551448,0.498635,0.149106,0.428075,0.747384,0.402217,0.549957,0.481442,0.629664,0.853102,0.409086,0.636353,0.287638,0.535847,0.452559,0.445152,0.637302,0.464432,0.491297,0.682218,0.249293,0.530113,0.525543,0.666237,0.517500,0.529444,0.415599,0.417782,0.524130,0.523919,0.400562,0.591168,0.409516,0.265134,0.536725,0.652327,0.528449,0.296047,0.174712
1,0.540730,1.000000,0.463514,0.670312,0.006444,0.183568,0.294418,0.253835,0.227776,0.547776,0.475911,0.084223,0.111158,0.136396,0.338195,0.180949,0.902427,0.788760,0.325694,0.100408,0.247269,0.255149,0.183556,0.338780,0.218235,0.286661,0.235970,0.403016,0.207671,0.444170,0.218554,0.365113,0.215559,0.045337,0.472494,0.091806,0.151668,0.539096,0.297312,0.091093,...,0.455636,0.510315,0.592998,0.054069,0.672599,0.686063,0.378703,0.333551,0.361317,0.733809,0.459885,0.476267,0.665764,0.418674,0.526204,0.198515,0.394737,0.521205,0.409212,0.477219,0.470688,0.117355,0.292747,0.364947,0.486977,0.483328,0.388776,0.552144,0.457279,0.490165,0.449137,0.316034,0.316538,0.308185,0.217323,0.204510,0.697566,0.282366,0.661827,0.185183
2,0.739073,0.463514,1.000000,0.680140,0.176394,0.360762,0.381705,0.412192,0.362366,0.507295,0.535892,0.128789,0.293407,0.244564,0.349480,0.126104,0.604585,0.418734,0.587042,0.393617,0.299556,0.487610,0.299481,0.547827,0.262216,0.355547,0.467619,0.578761,0.448159,0.509582,0.239436,0.311696,0.248087,0.188780,0.250924,0.238621,0.224710,0.254817,0.486464,0.105274,...,0.133630,0.468968,0.434585,-0.006116,0.216573,0.472870,0.228521,0.474922,0.359364,0.442753,0.648347,0.127350,0.600068,0.088600,0.328984,0.275352,0.455254,0.595928,0.173296,0.358704,0.447054,0.137534,0.599537,0.478174,0.316355,0.262603,0.326696,0.192670,0.196988,0.338744,0.343062,0.311585,0.516892,0.360975,0.291228,0.265560,0.428627,0.437386,0.134876,0.197047
3,0.770177,0.670312,0.680140,1.000000,0.267629,0.332632,0.315328,0.427148,0.607872,0.680229,0.625299,0.207232,0.068936,0.252292,0.305833,0.162830,0.693771,0.492016,0.508437,0.226430,0.594093,0.434551,0.298479,0.550051,0.337741,0.518958,0.225639,0.451221,0.423261,0.477314,0.262912,0.312575,0.345608,0.245928,0.416415,0.243682,0.456865,0.461674,0.437030,0.251638,...,0.352737,0.445964,0.577058,0.132339,0.481045,0.727060,0.330669,0.525873,0.473983,0.595438,0.734960,0.238502,0.665057,0.328420,0.569608,0.319735,0.576895,0.680951,0.330184,0.596591,0.570933,0.282206,0.581536,0.538589,0.516766,0.573994,0.436471,0.388273,0.330967,0.470700,0.476280,0.623490,0.276963,0.480619,0.371709,0.294830,0.472394,0.413831,0.320109,0.097031
4,0.651602,0.006444,0.176394,0.267629,1.000000,0.442063,0.137499,0.406864,0.255267,0.191958,0.466737,0.093063,0.295296,0.261317,0.183051,0.212442,0.194310,0.131756,0.629958,0.472488,0.296146,0.388448,0.646627,0.404039,0.225677,0.288557,0.170100,0.563142,0.656580,0.431224,0.311445,0.283829,0.647739,0.347698,0.116703,0.536838,0.208140,0.172474,0.332426,0.737914,...,0.320146,0.352291,0.234251,0.034072,0.115780,0.502805,0.346426,0.334327,0.281445,0.377476,0.547484,0.326740,0.241175,0.235102,0.326564,0.462486,0.157650,0.210994,0.371114,0.246071,0.530225,0.155006,0.146463,0.343545,0.597563,0.300799,0.439653,0.234141,0.361106,0.339064,0.422520,0.158636,0.416747,0.284519,0.163265,0.563908,0.334605,0.523739,0.127428,0.154691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.536725,0.204510,0.265560,0.294830,0.563908,0.238910,0.132140,0.430655,0.295044,0.180624,0.670448,0.121879,0.367632,0.082526,0.215714,0.057746,0.324184,0.198829,0.414728,0.275417,0.323828,0.477768,0.453389,0.486672,0.291295,0.571340,0.138953,0.437003,0.578558,0.308084,0.066210,0.149176,0.737003,0.073878,0.277508,0.295575,0.225721,0.360199,0.272234,0.374236,...,0.303934,0.244868,0.416618,0.192021,0.322221,0.662542,0.371841,0.350870,0.294675,0.489686,0.609186,0.448334,0.420796,0.332563,0.402321,0.389426,0.243937,0.312293,0.452683,0.376453,0.629286,0.181356,0.509503,0.357101,0.410587,0.387375,0.556251,0.294151,0.411581,0.316365,0.444201,0.265289,0.380423,0.298756,0.139266,1.000000,0.375143,0.253502,0.340685,0.222119
4590,0.652327,0.697566,0.428627,0.472394,0.334605,0.224635,0.309591,0.387209,0.234315,0.380769,0.661333,0.147585,0.505513,0.153169,0.344762,0.191491,0.792269,0.628016,0.494214,0.639613,0.261039,0.498424,0.500221,0.449654,0.202867,0.314590,0.598303,0.665414,0.511432,0.625182,0.225424,0.453822,0.373565,0.068786,0.630039,0.178467,0.224072,0.514043,0.447059,0.313942,...,0.691364,0.595458,0.612465,0.178959,0.690332,0.587313,0.653885,0.412233,0.441849,0.727204,0.504987,0.771798,0.708781,0.577558,0.546218,0.497828,0.451975,0.545974,0.593064,0.515558,0.586535,0.191081,0.237019,0.423263,0.711874,0.502863,0.575182,0.704521,0.654475,0.742580,0.595090,0.302292,0.748975,0.316274,0.327053,0.375143,1.000000,0.422122,0.599948,0.285305
4591,0.528449,0.282366,0.437386,0.413831,0.523739,0.904136,0.196960,0.648621,0.313798,0.280122,0.565013,0.163771,0.278724,0.726489,0.480290,0.612602,0.441485,0.259854,0.699750,0.423131,0.326991,0.380864,0.570001,0.448775,0.261202,0.261240,0.299329,0.631984,0.461611,0.464280,0.890932,0.629445,0.385670,0.816443,0.329857,0.871447,0.258860,0.222316,0.762616,0.301860,...,0.452490,0.854144,0.313334,0.080226,0.375396,0.448601,0.413099,0.400622,0.363576,0.476909,0.495141,0.344398,0.445264,0.363683,0.424810,0.530626,0.355433,0.458598,0.433815,0.430226,0.541989,0.172177,0.283347,0.771735,0.539553,0.417647,0.435715,0.415901,0.440823,0.463367,0.560016,0.256651,0.518402,0.316696,0.241214,0.253502,0.422122,1.000000,0.303806,0.210793
4592,0.296047,0.661827,0.134876,0.320109,0.127428,0.162807,-0.002654,0.315500,0.309661,0.157198,0.459790,0.094480,0.063618,0.198886,0.499773,0.206225,0.681737,0.575829,0.393196,0.235546,0.301188,0.284600,0.153254,0.331064,0.559061,0.507274,0.050743,0.478641,0.232977,0.482737,0.270998,0.305273,0.247673,0.105505,0.379380,0.156504,0.092009,0.300717,0.301035,0.116427,...,0.553458,0.527471,0.505844,0.260075,0.788210,0.495701,0.489377,0.322912,0.314898,0.675345,0.387329,0.589777,0.488382,0.546860,0.553888,0.207014,0.317690,0.351802,0.689986,0.562206,0.469416,0.111597,0.335877,0.363006,0.492843,0.654749,0.507413,0.596554,0.605791,0.509116,0.498063,0.329131,0.291331,0.246207,0.206502,0.340685,0.599948,0.303806,1.000000,0.116583


# **Creating Users and initial ratings**

**Generating Users**

In [147]:
#fitting GMM to column "topic_i"

num_cols_topic_df = topic_df.shape[1]-1
gm = []

for i in range(num_cols_topic_df):
  gm.append(GaussianMixture(n_components=10).fit(topic_df.iloc[:, [i]]))

In [148]:
#generating gmm based topic values for each topic for 50 users
#24 x 50 matrix

Users = []

for i in range(50):
  temp = []
  for i in range(num_cols_topic_df):
    temp.append(gm[i].sample(1)[0][0][0])
  Users.append(temp)
Users=np.array(Users)
Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

In [149]:
#run the above cell before running this twice
slc = list(range(topic_df.shape[1]))
slc.remove(25)
temp_topic_df = topic_df.iloc[:, slc]
Users_temp = Users.T.set_index(temp_topic_df.T.index)

result = temp_topic_df.dot(Users_temp);
#result.T

**Generating User ratings** 




In [150]:
rank_matrix = result.rank().T/4593*10
rank_matrix = rank_matrix.round(0).astype(int)
rank_matrix.columns =np.linspace(0,4593,4594).astype(int)

#not every reader reads all news
#remove random elements from the rank matrix
for i in range(4594):
  random_entries = np.random.randint(0,50,25)
  rank_matrix.loc[random_entries,i] = "No rating"
rank_matrix = rank_matrix.apply(pd.to_numeric, errors='coerce')
rank_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,6.0,10.0,,8.0,5.0,4.0,,1.0,4.0,,3.0,6.0,,,,,5.0,,3.0,8.0,,5.0,7.0,2.0,7.0,5.0,3.0,1.0,,2.0,7.0,5.0,8.0,,,6.0,,,3.0,3.0,...,0.0,,,1.0,1.0,,0.0,0.0,,0.0,,,,,0.0,,,,,,1.0,,,,0.0,,,1.0,0.0,,,,4.0,4.0,,,,6.0,1.0,2.0
1,3.0,8.0,,5.0,4.0,4.0,9.0,2.0,,,,,,1.0,,3.0,,7.0,4.0,,9.0,2.0,6.0,4.0,7.0,,5.0,1.0,,1.0,,4.0,,2.0,,5.0,5.0,1.0,3.0,,...,0.0,,1.0,0.0,,0.0,,,0.0,0.0,1.0,,,0.0,0.0,0.0,,1.0,1.0,1.0,,,,,0.0,1.0,,,,1.0,0.0,1.0,3.0,6.0,7.0,,4.0,5.0,1.0,
2,6.0,,8.0,7.0,5.0,,,,,7.0,,7.0,,3.0,2.0,7.0,7.0,8.0,4.0,6.0,8.0,,,,8.0,,4.0,2.0,4.0,3.0,,5.0,8.0,5.0,5.0,,3.0,,,3.0,...,,4.0,1.0,0.0,2.0,,,0.0,,0.0,2.0,0.0,1.0,0.0,0.0,0.0,,,1.0,0.0,,1.0,,1.0,,,,1.0,0.0,1.0,0.0,0.0,4.0,1.0,,1.0,,8.0,,
3,,,,,6.0,8.0,8.0,5.0,6.0,3.0,3.0,,,,4.0,9.0,,,,3.0,10.0,,9.0,3.0,,5.0,2.0,2.0,6.0,2.0,9.0,7.0,8.0,8.0,6.0,,8.0,,5.0,2.0,...,0.0,4.0,1.0,,,,0.0,0.0,,,,1.0,1.0,,0.0,0.0,,,1.0,1.0,2.0,0.0,0.0,,,,,0.0,,0.0,,1.0,,,3.0,1.0,,,1.0,
4,,8.0,9.0,7.0,5.0,7.0,8.0,,,,5.0,,,,1.0,6.0,,6.0,,8.0,1.0,,,6.0,1.0,4.0,,2.0,8.0,,8.0,,,6.0,3.0,,7.0,3.0,7.0,5.0,...,1.0,,,,2.0,1.0,,1.0,2.0,1.0,,,,1.0,1.0,,,,,1.0,3.0,3.0,1.0,,1.0,,2.0,,1.0,2.0,1.0,,4.0,1.0,,2.0,5.0,,1.0,1.0
5,3.0,,3.0,,,,4.0,6.0,3.0,3.0,4.0,9.0,2.0,,1.0,,6.0,7.0,4.0,6.0,9.0,,8.0,,9.0,,2.0,,6.0,1.0,,,9.0,6.0,4.0,,9.0,,,,...,,2.0,0.0,1.0,2.0,0.0,0.0,,0.0,0.0,,0.0,,0.0,,,1.0,,,1.0,1.0,0.0,0.0,0.0,0.0,,1.0,1.0,,1.0,0.0,,2.0,,2.0,,,7.0,2.0,
6,5.0,9.0,5.0,6.0,4.0,,10.0,,,7.0,,0.0,,,,8.0,6.0,,,,8.0,4.0,7.0,,,,3.0,1.0,7.0,,1.0,,,0.0,7.0,2.0,5.0,5.0,,1.0,...,0.0,,1.0,,3.0,,0.0,0.0,2.0,,3.0,1.0,1.0,0.0,0.0,,0.0,2.0,1.0,2.0,2.0,4.0,0.0,,1.0,1.0,1.0,,1.0,,,,2.0,8.0,6.0,3.0,4.0,3.0,2.0,6.0
7,,,,5.0,5.0,10.0,,,,,5.0,6.0,4.0,8.0,5.0,9.0,5.0,,,2.0,5.0,1.0,,,,3.0,5.0,3.0,,,,8.0,6.0,9.0,6.0,10.0,3.0,,8.0,1.0,...,1.0,,,0.0,,,1.0,1.0,,,,2.0,2.0,1.0,1.0,,3.0,3.0,3.0,3.0,3.0,,1.0,4.0,,3.0,,,2.0,2.0,,2.0,3.0,,5.0,1.0,,9.0,,3.0
8,2.0,9.0,,6.0,0.0,4.0,8.0,,7.0,4.0,5.0,9.0,2.0,,,,6.0,,2.0,0.0,7.0,7.0,7.0,3.0,7.0,,,1.0,,,,5.0,,3.0,,,6.0,4.0,,2.0,...,,,1.0,,3.0,1.0,0.0,0.0,2.0,1.0,,,1.0,,0.0,,2.0,1.0,1.0,,2.0,,0.0,1.0,0.0,1.0,,,1.0,0.0,,1.0,0.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0
9,,,2.0,,7.0,,,6.0,,7.0,,0.0,6.0,2.0,6.0,1.0,3.0,,2.0,8.0,9.0,,8.0,4.0,,,6.0,5.0,,4.0,,4.0,,,9.0,2.0,,7.0,3.0,3.0,...,2.0,2.0,,6.0,3.0,,,2.0,2.0,3.0,,4.0,3.0,2.0,2.0,1.0,2.0,4.0,4.0,,4.0,1.0,3.0,3.0,3.0,4.0,4.0,,,,4.0,,,,,3.0,4.0,,,8.0


# **Content based recommender function**

In [151]:
def content_recommender(rank_matrix,cos_sim):
  
  #find top 5 ratings
  top_five = rank_matrix.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=5)

  #best two similar docs for every doc
  best_two = cos_sim.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=3)
  best_two.columns =['0', '1','2']
  del best_two['0']
  best_two['combined']= best_two.values.tolist()
  best_two
  best_two_dict_1 = best_two['1'].to_dict()
  best_two_dict_2 = best_two['2'].to_dict()

  selected_docs_1 = top_five.replace(best_two_dict_1)
  selected_docs_2 = top_five.replace(best_two_dict_2)
  selected_docs_with_ID = pd.concat([selected_docs_1,selected_docs_2],axis=1,ignore_index=True)

  #replace value by the actual news
  doc_dict = news_corpus_org.Content.to_dict()

  selected_docs = selected_docs_with_ID.replace(doc_dict)
  return selected_docs, selected_docs_with_ID

# **Top 10 news : Content based recommender.**

In [152]:
selected_docs_content,selected_docs_content_with_ID = content_recommender(rank_matrix,cos_sim)
selected_docs_content

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,MoscowIndia s strategic ties with Russia has v...,The first Quadrilateral Security Dialogue ...,The British parliament on Monday discussed fa...,Mumbai Maharashtra India March 8 ANI ...,New Delhi India March 7 ANI Union Minis...,WASHINGTON Reuters U S President Joe Bid...,External Affairs Minister S Jaishankar will on...,The Indian High Commission in London has cond...,Mumbai Maharashtra India March 10 ANI ...,New Delhi India March 7 ANI Prime Minis...
1,The British parliament on Monday discussed fa...,The daily new coronavirus COVID 19 cases fe...,New cases of coronavirus infection in India w...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Eighteen States UTs including Assam Rajasth...,The Indian High Commission in London has cond...,Daily COVID 19 cases in India registered an i...,The daily new coronavirus COVID 19 cases fe...,Maharashtra Kerala Punjab Tamil Nadu and G...,With several States in the country continuing ...
2,The British parliament on Monday discussed fa...,Congress leader Rahul Gandhi on Sunday compar...,Farmer leader Rakesh Tikait on Monday took a d...,The Indian High Commission in London has cond...,New Delhi India March 5 ANI Prime Minis...,The Indian High Commission in London has cond...,Urging the Centre not to make the agri laws a ...,More than 850 faculty members of various educa...,The High Commission of India in London has con...,The Union Cabinet on Wednesday approved the Pr...
3,By Siddharth SharmaNew Delhi India March 7...,The British parliament on Monday discussed fa...,Congress leader Rahul Gandhi on Sunday compar...,Farmer leader Rakesh Tikait on Monday took a d...,Congress leader Rahul Gandhi on Sunday compar...,A face off between two political rivals the ...,The Indian High Commission in London has cond...,Urging the Centre not to make the agri laws a ...,More than 850 faculty members of various educa...,Congress General Secretary Priyanka Gandhi Vad...
4,The daily new coronavirus COVID 19 cases fe...,New cases of coronavirus infection in India w...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,New Delhi India March 4 ANI Bharat Biot...,Eighteen States UTs including Assam Rajasth...,Daily COVID 19 cases in India registered an i...,The daily new coronavirus COVID 19 cases fe...,Maharashtra Kerala Punjab Tamil Nadu and G...,Court said it was rather selfish of petitio...,With several States in the country continuing ...
5,The British parliament on Monday discussed fa...,The daily new coronavirus COVID 19 cases fe...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,A group of Gandhian organisations has called ...,New Delhi India March 5 ANI Prime Minis...,The Indian High Commission in London has cond...,Daily COVID 19 cases in India registered an i...,Maharashtra Kerala Punjab Tamil Nadu and G...,The Shiromani Akali Dal SAD on Monday asked ...,The Union Cabinet on Wednesday approved the Pr...
6,Assembly Elections 2021 Trinamool had called...,Nearly one in three women worldwide is subject...,New Delhi India March 5 ANI Prime Minis...,MoscowIndia s strategic ties with Russia has v...,India has become the third topmost country in ...,The Covid 19 vaccination certificates in the p...,The COVID 19 pandemic has negatively affected ...,The Union Cabinet on Wednesday approved the Pr...,Underlining that terrorism continues to pose ...,India had surpassed one crore cumulative COVID...
7,The Supreme Court s public relations departmen...,The CBI has taken over a probe into 26 identic...,The Supreme Court on Tuesday scheduled a heari...,The Supreme Court has agreed to examine whethe...,New Delhi India March 6 ANI The Supreme...,Educated youngsters are showing the way forwar...,Panaji Goa India March 7 ANI Hotel CC...,The Supreme Court on Wednesday said the Rajiv ...,New Delhi India March 8 ANI The Chief J...,New Delhi India March 8 ANI The Supreme...
8,New Delhi India March 9 ANI Prime Minist...,Women are likely to play a major role in ensur...,Mumbai Maharashtra India March 8 ANI ...,New Delhi India March 1 ANI The cumulat...,Eighteen States UTs including Assam Rajasth...,Kolkata West Bengal India March 9 ANI ...,NEW DELHI Reuters Thousands of women join...,Mumbai Maharashtra India March 10 ANI ...,Nearly 17 lakh COVID vaccine doses were admini...,With several States in the country continuing ...
9,New Delhi India March 9 ANI Prime Minist...,Tirath Singh Rawat has been sworn in as the ne...,By Aiman KhanNew Delhi India March 6 ANI ...,New Delhi India March 5 ANI Congress le...,The Congress on Monday demanded a rollback of ...,Kolkata West Bengal India March 9 ANI ...,Dehradun Uttarakhand India March 9 ANI ...,Wayanad Kerala India March 4 ANI Ahea...,Bhadradri Kothagudem Telangana India Mar...,Terming the additional excise burden on fuel a...


# **Collaborative recommender function: Predict missing ratings using Matrix factorization**

In [153]:
#collaborative recommender function

def collaborative_recommender(rank_matrix,num_iter,news_corpus):
  A = np.array(rank_matrix)
  M = A.shape[0]
  N = A.shape[1]
  A_df = pd.DataFrame(A)
  K = 25
  W = np.abs(np.random.uniform(low=0, high=1, size=(M, K)))
  H = np.abs(np.random.uniform(low=0, high=1, size=(K, N)))
  W = np.divide(W, K*W.max())
  H = np.divide(H, K*H.max())

  #cost func
  def cost(A, W, H):
    mask = pd.DataFrame(A).notnull().values
    WH = np.dot(W, H)
    WH_mask = WH[mask]
    A_mask = A[mask]
    A_WH_mask = A_mask-WH_mask
    return linalg.norm(A_WH_mask, 2)
  
  #optimization loop
  num_display_cost = max(int(num_iter/10), 1)

  for i in range(num_iter):
      if i%2 ==0:
          # Learn H, given A and W
          for j in range(N):
              mask_rows = pd.Series(A[:,j]).notnull()
              H[:,j] = nnls(W[mask_rows], A[:,j][mask_rows])[0]
      else:
          for j in range(M):
              mask_rows = pd.Series(A[j,:]).notnull()
              W[j,:] = nnls(H.transpose()[mask_rows], A[j,:][mask_rows])[0]
              
      WH = np.dot(W, H)
      c = cost(A, W, H)
      #if i%num_display_cost==0:
      print(i, c)
  W = pd.DataFrame(W)
  H = pd.DataFrame(H)
  A = W.dot(H)
  
  #new rank matrix
  A = A.round(0).astype(int)
  A = np.clip(A,1,10)
  #finding the top 10 documents
  selected_docs_with_ID = A.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=10)
  selected_docs_with_ID.columns =['1', '2', '3', '4', '5','6','7','8','9','10']
  doc_dict = news_corpus_org.Content.to_dict()
  selected_docs = selected_docs_with_ID.replace(doc_dict)
  return selected_docs, selected_docs_with_ID

# **Top 10 news : Collaborative recommender.**

In [154]:
num_iter = 10
selected_docs_collab,selected_docs_collab_with_ID = collaborative_recommender(rank_matrix,num_iter,news_corpus)
selected_docs_collab

0 713.528329126503
1 589.3421350452194
2 489.9962121643897
3 430.8667847376438
4 392.96539453924737
5 367.9589733130258
6 349.8990463746707
7 336.2615107307151
8 325.942391879531
9 317.43049138476357


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,Access to COVID 19 vaccines cooperation on te...,Lauding India for its vaccine leadership a t...,The Union home ministry on Monday said 200 ...,The National Investigation Agency NIA has ta...,No new COVID 19 fatalities have been reported ...,Maintaining that the new agricultural laws are...,Finance Minister Nirmala Sitharaman lashed out...,Prime Minister Narendra Modi on Friday said p...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Maharashtra Kerala Punjab Tamil Nadu Gujar...
1,Rajya Sabha Chairman M Venkaiah Naidu on Mar...,India s COVID 19 tally rose to 1 11 92 088 wi...,The Congress is changing the way it selects ca...,No new COVID 19 fatalities have been reported ...,As the farmers protest completed 100 days on t...,A week after the show of strength in Jammu by ...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Maharashtra Kerala Punjab Tamil Nadu and G...,India s COVID 19 cases rose to 1 11 24 527 wit...
2,Access to COVID 19 vaccines cooperation on te...,Prime Minister Narendra Modi and his Japanese...,Discussion on the farm laws by the British Par...,Lauding India for its vaccine leadership a t...,The inclusion of India in the United States s...,The High Commission of India in London has con...,Congress General Secretary Priyanka Gandhi Vad...,Agriculture Minister Narendra Singh Tomar on S...,Maintaining that the new agricultural laws are...,Finance Minister Nirmala Sitharaman lashed out...
3,Access to COVID 19 vaccines cooperation on te...,The Congress had always stood for internal dis...,For the third day in a row both Houses of Par...,The Lok Sabha s early adjournment on Tuesday a...,Congress General Secretary Priyanka Gandhi Vad...,Agriculture Minister Narendra Singh Tomar on S...,The Andhra Pradesh government orally stated in...,The Delhi High Court March 4 asked the Centre...,Describing the contentious farm laws as death...,Narendra Modi is not the Prime Minister of th...
4,New Delhi Prime Minister Narendra Modi on Tue...,More than 20 million vaccine doses have been a...,India s COVID 19 tally rose to 1 11 92 088 wi...,No new COVID 19 fatalities have been reported ...,The Delhi High Court March 4 asked the Centre...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Health Minister Harsh Vardhan took his first d...,Maharashtra Kerala Punjab Tamil Nadu and G...,India s COVID 19 cases rose to 1 11 24 527 wit...
5,The High Commission of India in London has con...,India s COVID 19 tally rose to 1 11 92 088 wi...,As the farmers protest completed 100 days on t...,The Delhi High Court March 4 asked the Centre...,The cumulative number of COVID 19 vaccine dose...,India s COVID 19 tally surpassed 1 11 crore o...,Describing the contentious farm laws as death...,The Union Health Ministry will soon issue the ...,The National Investigation Agency NIA has so...,Faced with farm union leader Rakesh Tikait s t...
6,The Health Ministry on Tuesday informed the El...,The first ever summit of the leaders of the Qu...,Prime Minister Narendra Modi and his Japanese...,Prime Minister Narendra Modi on Tuesday virtua...,The High Commission of India in London has con...,Prime Minister Narendra Modi on Monday said th...,Prime Minister Narendra Modi on Friday said p...,External Affairs Minister Jaishankar will visi...,Health Minister Harsh Vardhan on Wednesday rei...,The Election Commission on Tuesday said it had...
7,A petition has been filed in the Supreme Court...,A Constitution Bench led by Chief Justice of I...,The Supreme Court said on March 10 it would h...,The Supreme Court on Monday decided to end the...,The Supreme Court has asked the Centre as to h...,The Supreme Court which is hearing cases thr...,The Supreme Court s initiative to begin hybrid...,The Supreme Court on Friday asked a healthcare...,The Andhra Pradesh government orally stated in...,A Constitution Bench of the Supreme Court on T...
8,A total of 27 590 complaints were received by...,A day after a devastating blaze claimed nine ...,Thousands of women farmers held protest marche...,The National Investigation Agency NIA has ta...,New cases of coronavirus infection in India w...,No new COVID 19 fatalities have been reported ...,Health Minister Harsh Vardhan on Wednesday rei...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,India s total tally of COVID 19 cases rose to...,The cumulative number of COVID 19 vaccine dose...
9,The Health Ministry on Tuesday informed the El...,An analysis of MLAs who switched parties and r...,Senior Congress leader Saifuddin Soz on Tuesd...,Former Congress president Rahul Gandhi on Tues...,The Election Commission EC on Tuesday issued...,A day after a devastating blaze claimed nine ...,Prime Minister Narendra Modi on Tuesday virtua...,New Delhi Prime Minister Narendra Modi on Tue...,Prime Minister Narendra Modi greeted women on ...,A week after the show of strength in Jammu by ...


# **Hydrid recommender**

In [155]:
def hybrid(collaborative_recommender,content_recommender,rank_matrix,cos_sim,news_corpus):
  selected_docs_collab,selected_docs_collab_with_ID = collaborative_recommender(rank_matrix,num_iter,news_corpus)
  selected_docs_content,selected_docs_content_with_ID = content_recommender(rank_matrix,cos_sim)
  
  selected_docs_collab.reset_index(drop=True, inplace=True)
  selected_docs_content.reset_index(drop=True, inplace=True)
  
  selected_docs_collab_with_ID.reset_index(drop=True, inplace=True)
  selected_docs_content_with_ID.reset_index(drop=True, inplace=True)
  
  selected_docs_final = pd.concat([selected_docs_content.iloc[:,[0,1,2,3,4]],selected_docs_collab.iloc[:,[5,6,7,8,9]]],axis=1)
  selected_docs_final_with_ID = pd.concat([selected_docs_content_with_ID.iloc[:,[0,1,2,3,4]],selected_docs_collab_with_ID.iloc[:,[5,6,7,8,9]]],axis=1)
  
  return selected_docs_final,selected_docs_final_with_ID

# **Top 10 news : Hybrid recommender.**

In [156]:
selected_docs_final,selected_docs_final_with_ID = hybrid(collaborative_recommender,content_recommender,rank_matrix,cos_sim,news_corpus)
selected_docs_final

0 710.4784913423131
1 587.5262645356196
2 497.602834614696
3 437.062909408558
4 395.10677838788246
5 368.6069616430496
6 350.7596194014105
7 337.4166803856911
8 327.3707128166906
9 318.9910044595572


Unnamed: 0,0,1,2,3,4,6,7,8,9,10
0,MoscowIndia s strategic ties with Russia has v...,The first Quadrilateral Security Dialogue ...,The British parliament on Monday discussed fa...,Mumbai Maharashtra India March 8 ANI ...,New Delhi India March 7 ANI Union Minis...,Maintaining that the new agricultural laws are...,Prime Minister Narendra Modi on Friday said p...,External Affairs Minister Jaishankar will visi...,The time has come to increase the private sect...,Remarking that a normal bilateral trade relati...
1,The British parliament on Monday discussed fa...,The daily new coronavirus COVID 19 cases fe...,New cases of coronavirus infection in India w...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Eighteen States UTs including Assam Rajasth...,The Congress is changing the way it selects ca...,A week after the show of strength in Jammu by ...,Former Congress president Rahul Gandhi on Tues...,India s COVID 19 tally surpassed 1 11 crore o...,India s COVID 19 tally climbed to 1 10 96 731 ...
2,The British parliament on Monday discussed fa...,Congress leader Rahul Gandhi on Sunday compar...,Farmer leader Rakesh Tikait on Monday took a d...,The Indian High Commission in London has cond...,New Delhi India March 5 ANI Prime Minis...,Prime Minister Narendra Modi on March 7 said t...,Agriculture Minister Narendra Singh Tomar on S...,Maintaining that the new agricultural laws are...,Finance Minister Nirmala Sitharaman lashed out...,Prime Minister Narendra Modi on Friday said p...
3,By Siddharth SharmaNew Delhi India March 7...,The British parliament on Monday discussed fa...,Congress leader Rahul Gandhi on Sunday compar...,Farmer leader Rakesh Tikait on Monday took a d...,Congress leader Rahul Gandhi on Sunday compar...,The Andhra Pradesh government orally stated in...,The Delhi High Court March 4 asked the Centre...,Former Congress president Rahul Gandhi on Tues...,The Delhi High Court has directed Air India t...,Describing the contentious farm laws as death...
4,The daily new coronavirus COVID 19 cases fe...,New cases of coronavirus infection in India w...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,New Delhi India March 4 ANI Bharat Biot...,Eighteen States UTs including Assam Rajasth...,Prime Minister Narendra Modi on Friday said p...,The Delhi High Court March 4 asked the Centre...,Health Minister Harsh Vardhan on Wednesday rei...,Health Minister Harsh Vardhan took his first d...,The cumulative number of COVID 19 vaccine dose...
5,The British parliament on Monday discussed fa...,The daily new coronavirus COVID 19 cases fe...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,A group of Gandhian organisations has called ...,New Delhi India March 5 ANI Prime Minis...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,With several States in the country continuing ...,Prime Minister Narendra Modi early on Monday t...,Remarking that a normal bilateral trade relati...,India s COVID 19 tally climbed to 1 10 96 731 ...
6,Assembly Elections 2021 Trinamool had called...,Nearly one in three women worldwide is subject...,New Delhi India March 5 ANI Prime Minis...,MoscowIndia s strategic ties with Russia has v...,India has become the third topmost country in ...,Prime Minister Narendra Modi on Monday said th...,The crisis over the continuation of Uttarakhan...,Prime Minister Narendra Modi greeted women on ...,Conveying his strong appreciation for the res...,Prime Minister Narendra Modi will address a c...
7,The Supreme Court s public relations departmen...,The CBI has taken over a probe into 26 identic...,The Supreme Court on Tuesday scheduled a heari...,The Supreme Court has agreed to examine whethe...,New Delhi India March 6 ANI The Supreme...,The Supreme Court on Friday asked a healthcare...,The Andhra Pradesh government orally stated in...,A Constitution Bench of the Supreme Court on T...,The Delhi High Court March 4 asked the Centre...,Health Minister Harsh Vardhan on Wednesday rei...
8,New Delhi India March 9 ANI Prime Minist...,Women are likely to play a major role in ensur...,Mumbai Maharashtra India March 8 ANI ...,New Delhi India March 1 ANI The cumulat...,Eighteen States UTs including Assam Rajasth...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Health Minister Harsh Vardhan on Wednesday rei...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Maharashtra Kerala Punjab Tamil Nadu Gujar...,Health Minister Harsh Vardhan took his first d...
9,New Delhi India March 9 ANI Prime Minist...,Tirath Singh Rawat has been sworn in as the ne...,By Aiman KhanNew Delhi India March 6 ANI ...,New Delhi India March 5 ANI Congress le...,The Congress on Monday demanded a rollback of ...,A day after a devastating blaze claimed nine ...,New Delhi Prime Minister Narendra Modi on Tue...,BJP leader Jyotiraditya Scindia who quit the ...,A week after the show of strength in Jammu by ...,The Election Commission of India ECI on Frid...


# **User Profile Updater**

Implemented in the flask app.

# **Implement ALS based matrix factorization instead of NNLS**

In [157]:
'''from pyspark.ml.recommendation import ALS 
from pyspark.sql.types import FloatType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

X_train, X_test = rank_matrix.randomSplit([0.6, 0.4])'''

'from pyspark.ml.recommendation import ALS \nfrom pyspark.sql.types import FloatType\nfrom pyspark.ml.evaluation import RegressionEvaluator\nfrom pyspark.sql.functions import col\n\nX_train, X_test = rank_matrix.randomSplit([0.6, 0.4])'

In [158]:
'''als = mlALS(rank=5, maxIter=10, seed=0)
model = als.fit(rank_matrix)'''

'als = mlALS(rank=5, maxIter=10, seed=0)\nmodel = als.fit(rank_matrix)'