<a href="https://colab.research.google.com/github/sarvesh237/lfkdsahkhfa/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#for lemmatization
import nltk
nltk.download("wordnet")
nltk.download('stopwords')
#to remove stop words
from nltk.corpus import stopwords

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

#gaussian mixture
from sklearn.mixture import GaussianMixture

from sklearn.decomposition import TruncatedSVD# TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Importing the collected data**

In [2]:
df_1 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv")
df_2 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv")
df_3 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv")
df_4 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv")

# **Preprocessing the csv files.**

In [3]:
del df_1['Unnamed: 0']
del df_2['index']
del df_2['Unnamed: 0']
df_2.rename(columns = {'content':'Content'}, inplace = True)
del df_3['Unnamed: 0']
del df_3['Title']
del df_4['Unnamed: 0']

**Remove capitalization, numbers, stop words, non-english characters, drop empty rows and duplicates.**

In [4]:
news_corpus = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus = news_corpus.dropna() #dropping NaN
news_corpus = news_corpus[news_corpus.Content != ''] #dropping empty rows
news_corpus = news_corpus.drop_duplicates()
news_corpus["Content"] = news_corpus['Content'].str.replace('[^\w\s]',' ')
news_corpus = news_corpus[news_corpus['Content'].map(lambda x: x.isascii())] #remove non-english #find a better way
news_corpus['Content'] = news_corpus['Content'].str.lower() #convert to lowercase
news_corpus.replace('\d+', '', regex=True, inplace=True) #remove numbers

stop_words = stopwords.words('english')
stop_words.extend(['span','class','spacing','href','html','http','title','said','that'])
news_corpus['Content'] = news_corpus['Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_corpus = news_corpus.apply(lambda x: [item for item in x if item not in stop_words])


**Lemmatization**

In [5]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

news_corpus['Content'] = news_corpus.Content.apply(lemmatize_text)
news_corpus['Content'] = news_corpus['Content'].apply(', '.join) #convert list of words to a sentence
news_corpus.replace(',', '', regex=True, inplace=True) # remove commas
news_corpus = news_corpus.reset_index(drop=True)
news_corpus


Unnamed: 0,Content
0,medium report about swedish manufacturer scani...
1,access covid vaccine cooperation technology cl...
2,after severe criticism over holding consultati...
3,former congress president rahul gandhi thursda...
4,enforcement directorate attached three immovab...
...,...
4589,over mughal gold coin dating back early centur...
4590,china planning spend tibet five year plan allo...
4591,supreme court tuesday came with solution stale...
4592,indian american maju varghese previously serve...


# TF-IDF

In [6]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
#denselist = dense.tolist() #very expensive
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aajtak,aakash,aaksha,aamir,aamk,aandolan,aandolanjivi,aane,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,aatmanirbharbharat,aatmanirbharta,aatmiyata,aatmnirbhar,aatmnirbharta,aawam,aaya,...,zhao,zhoigar,zhuoran,ziarul,zila,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,ziyad,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,ztdrktlic,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **LSA : Latent Semantic Analysis**

In [7]:
sv_dec = TruncatedSVD(n_components = 50)  # components(no. of latent factors) < n ; n : no. of docs.
ls_analysis = sv_dec.fit_transform(vectors.T)

sv_dec.singular_values_ # eigenvalues for svd (in decreasing order)

array([13.49665211,  8.19753288,  6.65195647,  6.45052751,  6.13464353,
        5.38660293,  5.15440822,  4.58017403,  4.37335128,  4.2780279 ,
        4.22043603,  4.12364123,  4.06323146,  4.03831518,  3.92999499,
        3.91271679,  3.78111143,  3.73173602,  3.680109  ,  3.61969662,
        3.5669012 ,  3.48129213,  3.4089485 ,  3.3678328 ,  3.3106611 ,
        3.29772602,  3.26965902,  3.22474741,  3.1445596 ,  3.13835766,
        3.10090535,  3.06724154,  3.01239997,  2.98002465,  2.96376833,
        2.94560073,  2.92085056,  2.87129651,  2.84717718,  2.79011881,
        2.76876859,  2.7617536 ,  2.74772197,  2.71078336,  2.6955714 ,
        2.6815724 ,  2.64966958,  2.63765628,  2.61467877,  2.59688561])

In [8]:
variance =  sv_dec.singular_values_/sum(sv_dec.singular_values_) # checking the variance captured by eigenvalues
variance

array([0.06979342, 0.0423908 , 0.03439837, 0.03335675, 0.03172326,
       0.02785502, 0.0266543 , 0.02368484, 0.02261532, 0.02212239,
       0.02182457, 0.02132403, 0.02101164, 0.0208828 , 0.02032266,
       0.02023331, 0.01955275, 0.01929743, 0.01903045, 0.01871805,
       0.01844504, 0.01800234, 0.01762824, 0.01741562, 0.01711998,
       0.01705309, 0.01690795, 0.0166757 , 0.01626104, 0.01622897,
       0.0160353 , 0.01586121, 0.01557762, 0.0154102 , 0.01532614,
       0.01523219, 0.0151042 , 0.01484795, 0.01472322, 0.01442817,
       0.01431776, 0.01428149, 0.01420893, 0.01401791, 0.01393925,
       0.01386686, 0.01370188, 0.01363976, 0.01352094, 0.01342893])

In [9]:
sv_dec = TruncatedSVD(n_components = 25) # components with accepted variance
las = sv_dec.fit_transform(vectors)

#print(las,las.shape)

Topic - Document matrix

In [10]:
# Representation of each doc in terms of latent topics after dimensionality reduction
col = []
for i in range(las.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df = pd.DataFrame(las, columns = col)

topic_df["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

display(topic_df[l])


Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish manufacturer scani...,0.129416,-0.031666,-0.002052,0.016608,-0.058608,-0.036321,-0.018136,0.010258,0.001819,-0.020495,0.013433,0.009477,0.008780,-0.053245,0.006271,-0.014699,0.065994,-0.035251,-0.007144,0.023197,0.006538,-0.002164,0.037968,-0.056671,0.008300
1,access covid vaccine cooperation technology cl...,0.241852,-0.033737,0.076395,0.116299,-0.062965,-0.107645,-0.079646,-0.061723,-0.052594,-0.138553,-0.009383,-0.099563,0.026188,0.039240,-0.100622,0.025678,0.102929,0.033599,0.030561,-0.005196,0.035620,-0.059470,0.072818,0.088779,-0.022386
2,after severe criticism over holding consultati...,0.187095,-0.047153,-0.003879,0.027571,-0.070243,-0.017988,-0.070746,-0.017600,-0.002586,-0.023714,-0.015134,-0.024099,0.021822,-0.001804,0.029581,-0.084000,0.029113,-0.064009,-0.015252,-0.010829,0.074787,0.017388,0.180076,-0.165264,-0.098620
3,former congress president rahul gandhi thursda...,0.177060,-0.026038,0.029290,0.010240,-0.039845,-0.040891,-0.059149,0.027383,-0.032841,-0.082667,0.101038,-0.067635,-0.005888,-0.007943,0.026004,0.017563,0.056929,-0.034532,-0.006079,0.026890,0.030128,-0.001188,0.029563,-0.037880,-0.036833
4,enforcement directorate attached three immovab...,0.134256,-0.048094,-0.066805,-0.030236,-0.111718,0.012947,0.045189,0.013325,0.054137,-0.023697,-0.000723,0.025705,-0.041250,-0.094003,0.040071,-0.008177,0.089603,-0.030205,-0.039296,0.034667,-0.064164,-0.018764,-0.007914,-0.090963,0.065135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,over mughal gold coin dating back early centur...,0.086212,-0.022274,-0.033227,-0.015107,-0.062895,-0.042102,0.091870,0.036388,0.021727,-0.009649,0.003722,0.001994,-0.012234,-0.040946,0.004561,-0.037412,0.003669,-0.015475,0.017797,-0.009166,0.023571,0.004948,-0.000086,0.009615,-0.005060
4590,china planning spend tibet five year plan allo...,0.123348,-0.013562,0.004550,0.000898,-0.026678,-0.072730,-0.033836,-0.033183,-0.018665,-0.023777,-0.065839,-0.001094,0.026212,-0.018149,-0.010681,-0.006547,0.038783,-0.003838,-0.016964,-0.008470,0.017331,-0.021145,-0.024500,0.019438,-0.007585
4591,supreme court tuesday came with solution stale...,0.178528,-0.066109,-0.074738,-0.003252,-0.120698,0.127855,-0.102484,-0.017315,0.033196,-0.020357,-0.024550,0.008731,0.003009,-0.009762,0.031555,0.007079,0.023624,0.012471,0.013345,-0.009573,0.033037,0.020550,-0.035007,-0.023479,0.007779
4592,indian american maju varghese previously serve...,0.080712,-0.038109,0.006436,0.013005,-0.024398,-0.039541,-0.025047,0.020961,0.019772,-0.049900,-0.049619,-0.038501,-0.012554,0.060082,-0.025703,-0.060964,0.041822,-0.001901,0.021609,0.038478,0.013627,0.000543,-0.024460,0.067236,0.037107


**Visualizing the topics**

In [11]:
'''import umap
import matplotlib.pyplot as plt

embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )
plt.show()'''

'import umap\nimport matplotlib.pyplot as plt\n\nembedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)\n\nplt.figure(figsize=(7,5))\nplt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )\nplt.show()'

**Topics and terms list**

In [12]:
Sigma = sv_dec.singular_values_ 
V_transpose = sv_dec.components_.T
terms = vectorizer.get_feature_names()

for index, component in enumerate(sv_dec.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)
del Sigma,V_transpose,terms,zipped,top_terms_key

Topic 0:  ['said', 'that', 'covid', 'case', 'will', 'farmer', 'have', 'state', 'with', 'minister']
Topic 1:  ['covid', 'vaccine', 'case', 'death', 'vaccination', 'health', 'total', 'reported', 'dose', 'active']
Topic 2:  ['election', 'assembly', 'party', 'vaccine', 'congress', 'seat', 'will', 'poll', 'minister', 'bengal']
Topic 3:  ['vaccine', 'vaccination', 'farmer', 'dose', 'first', 'covid', 'modi', 'dos', 'law', 'covaxin']
Topic 4:  ['farmer', 'law', 'farm', 'protest', 'congress', 'price', 'protesting', 'death', 'party', 'hour']
Topic 5:  ['court', 'vaccine', 'justice', 'supreme', 'vaccination', 'election', 'dose', 'petition', 'bench', 'hearing']
Topic 6:  ['police', 'vaccine', 'arrested', 'dose', 'farmer', 'vaccination', 'accused', 'bengal', 'west', 'delhi']
Topic 7:  ['sabha', 'price', 'congress', 'rajya', 'budget', 'house', 'party', 'opposition', 'session', 'parliament']
Topic 8:  ['lakh', 'price', 'sabha', 'from', 'fuel', 'rajya', 'budget', 'crore', 'petrol', 'bengal']
Topic 9: 

# **LDA : Latent Dirichlet Allocation**

In [13]:
#lda

lda = LatentDirichletAllocation(n_components=25)
lda_matrix = lda.fit_transform(df_vecs)
lda_topics = lda.fit(df_vecs)
for idx, topic in enumerate(lda.components_):
        print ("Topic ", idx, " ".join(feature_names[i] for i in topic.argsort()[:-10 - 1:-1]))

Topic  0 ambani vaze hiren mansukh playerscript mumbai explosive mukesh found sachin
Topic  1 poor moderate forecasted geeta birthday batla khan bhima koregaon safar
Topic  2 covid vaccine vaccination case health death ministry total dose reported
Topic  3 party congress said state from minister election will that police
Topic  4 england wicket balakot milk spinner test rim hathras mahotsav inning
Topic  5 court police petition case justice plea bench accused filed petitioner
Topic  6 sabha maritime satellite motion birla rajya adjourned port nifty isro
Topic  7 senate trillion owaisi secc bill aimim harris democrat drill dowry
Topic  8 amazon publisher intermediary pregnancy broadcasting tandav harry meghan prince royal
Topic  9 fastag firecracker haasan virudhunagar senator gavi katchi accomplishment kamra mirzapur
Topic  10 farmer law farm protesting kisan tractor tikait repeal haryana agitation
Topic  11 kumbh haridwar mela maha shivratri lord shiva scindia boat warrior
Topic  12 j

In [14]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [15]:
#plot_top_words(lda, feature_names, 10, "t")

In [16]:
col = []
for i in range(lda_matrix.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df_2 = pd.DataFrame(lda_matrix, columns = col)

topic_df_2["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

#display(topic_df_2[l])

**Cosine Similarity**

In [17]:
cos_sim = pd.DataFrame(cosine_similarity(topic_df.iloc[:, :-1]))
cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,1.000000,0.537011,0.738100,0.762278,0.750391,0.389087,0.335747,0.434205,0.409020,0.592790,0.756447,0.106405,0.321861,0.274203,0.263242,0.153695,0.655147,0.436496,0.712884,0.495367,0.467458,0.628062,0.626565,0.641965,0.341338,0.474852,0.386331,0.728570,0.721330,0.555512,0.265281,0.324277,0.540916,0.193099,0.346056,0.339767,0.274816,0.442213,0.468046,0.549690,...,0.448227,0.574454,0.497578,0.163641,0.452120,0.780832,0.439904,0.635996,0.473329,0.663338,0.868045,0.475633,0.614507,0.367278,0.610917,0.446572,0.445386,0.641723,0.515898,0.503075,0.702514,0.270581,0.526784,0.516710,0.769035,0.553426,0.580889,0.445146,0.447636,0.579641,0.588312,0.383072,0.529184,0.445059,0.280804,0.550052,0.623621,0.531634,0.322682,0.149972
1,0.537011,1.000000,0.421765,0.678612,0.089263,0.174572,0.319195,0.283827,0.214466,0.557216,0.517396,0.082749,0.147399,0.146510,0.305605,0.215708,0.891706,0.833279,0.368780,0.089868,0.266556,0.308578,0.241781,0.372969,0.224856,0.290361,0.220824,0.420653,0.213952,0.507982,0.202235,0.432214,0.246043,0.060391,0.526206,0.110567,0.188222,0.554410,0.407507,0.159946,...,0.513202,0.454258,0.561590,0.063637,0.617416,0.693933,0.513147,0.343081,0.377698,0.762797,0.454162,0.619872,0.691684,0.540591,0.549037,0.153578,0.385947,0.538614,0.417519,0.480050,0.502647,0.110575,0.240002,0.396840,0.584716,0.482164,0.439658,0.602303,0.543670,0.589028,0.519699,0.282800,0.324242,0.317503,0.188656,0.201218,0.720000,0.295684,0.626013,0.164325
2,0.738100,0.421765,1.000000,0.632816,0.355278,0.345818,0.370842,0.432197,0.264323,0.482196,0.633272,0.127155,0.199836,0.271465,0.295223,0.145166,0.562004,0.379468,0.723802,0.283759,0.299922,0.579326,0.342508,0.572785,0.304628,0.384394,0.323054,0.633533,0.505196,0.598714,0.228249,0.335379,0.355312,0.177836,0.327334,0.267258,0.190537,0.271183,0.634815,0.287395,...,0.272360,0.438046,0.386525,-0.010723,0.205573,0.540390,0.461878,0.673809,0.383550,0.503727,0.741970,0.374141,0.718078,0.394801,0.528387,0.216221,0.446100,0.645608,0.248332,0.392658,0.494947,0.153759,0.524657,0.551053,0.565854,0.323655,0.505750,0.374011,0.419793,0.532420,0.515310,0.219772,0.477429,0.378660,0.262579,0.306139,0.410736,0.451595,0.144016,0.140447
3,0.762278,0.678612,0.632816,1.000000,0.387255,0.334034,0.312138,0.469011,0.572708,0.672091,0.696235,0.212600,0.050916,0.283182,0.279855,0.194307,0.694333,0.505425,0.577890,0.169838,0.618962,0.445908,0.329512,0.609736,0.368832,0.559449,0.172338,0.484159,0.452114,0.492064,0.274334,0.350622,0.399686,0.251381,0.460116,0.272294,0.473597,0.492191,0.506150,0.328461,...,0.455961,0.440945,0.550974,0.109430,0.503081,0.794665,0.483218,0.640771,0.481738,0.652898,0.767417,0.411503,0.708108,0.529378,0.687666,0.290358,0.566197,0.703168,0.410869,0.611766,0.625794,0.288489,0.520815,0.594485,0.681957,0.631373,0.561679,0.508845,0.467517,0.596989,0.611925,0.593772,0.254369,0.497310,0.354949,0.331423,0.492971,0.437007,0.350717,0.057358
4,0.750391,0.089263,0.355278,0.387255,1.000000,0.492043,0.163400,0.425569,0.328459,0.193588,0.454582,0.103401,0.256376,0.276532,0.266807,0.201481,0.298743,0.119457,0.618321,0.506364,0.334324,0.454350,0.669141,0.427208,0.214981,0.314932,0.217271,0.572599,0.689118,0.320181,0.341149,0.282766,0.644377,0.343782,0.003644,0.540492,0.187311,0.188014,0.282693,0.714517,...,0.223653,0.482270,0.344609,0.095490,0.164813,0.539379,0.157182,0.335229,0.271891,0.409510,0.571004,0.158586,0.169363,0.041204,0.287352,0.524696,0.185122,0.209632,0.352516,0.252952,0.538581,0.175197,0.264550,0.282940,0.538757,0.287913,0.349383,0.098916,0.182032,0.246926,0.340558,0.226089,0.432975,0.318852,0.247075,0.586558,0.333825,0.546695,0.155818,0.215914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.550052,0.201218,0.306139,0.331423,0.586558,0.236375,0.132496,0.433181,0.311950,0.201961,0.681103,0.121733,0.325787,0.080784,0.213277,0.057450,0.317410,0.182501,0.424804,0.275509,0.321265,0.530660,0.460526,0.483653,0.294619,0.568470,0.127437,0.439595,0.563111,0.310798,0.061171,0.168081,0.737955,0.076028,0.270912,0.296034,0.218946,0.357745,0.288712,0.392635,...,0.335498,0.218534,0.413555,0.205710,0.292680,0.655595,0.430287,0.380163,0.325053,0.473481,0.606262,0.472604,0.418848,0.378108,0.423801,0.347605,0.262469,0.342491,0.434269,0.378201,0.603813,0.185657,0.530902,0.378525,0.450614,0.351592,0.580664,0.316199,0.444704,0.355091,0.468194,0.270705,0.390785,0.290394,0.147964,1.000000,0.374355,0.257983,0.318475,0.220428
4590,0.623621,0.720000,0.410736,0.492971,0.333825,0.222311,0.310078,0.371990,0.244598,0.368049,0.639916,0.143500,0.473016,0.158619,0.334903,0.212707,0.801005,0.638408,0.492292,0.618439,0.270841,0.529582,0.528646,0.458696,0.202038,0.312975,0.578966,0.648514,0.491693,0.559719,0.226967,0.491213,0.353929,0.059660,0.583907,0.180131,0.215217,0.502428,0.427910,0.300098,...,0.702736,0.577633,0.601160,0.162902,0.657987,0.579243,0.671545,0.376022,0.441880,0.723157,0.454437,0.790791,0.634888,0.580547,0.523728,0.459733,0.426713,0.511346,0.568760,0.480135,0.567796,0.193684,0.194842,0.409854,0.734276,0.455397,0.570268,0.698100,0.649438,0.743450,0.601975,0.293848,0.738578,0.283135,0.310835,0.374355,1.000000,0.416008,0.578028,0.295469
4591,0.531634,0.295684,0.451595,0.437007,0.546695,0.914113,0.205826,0.668501,0.321004,0.288840,0.559159,0.162854,0.247407,0.736225,0.502393,0.624209,0.456921,0.253580,0.710670,0.412903,0.333478,0.386494,0.587662,0.483011,0.269964,0.267623,0.293911,0.625948,0.471613,0.379570,0.897413,0.649554,0.377451,0.812351,0.273954,0.874313,0.265511,0.229955,0.702988,0.295272,...,0.451982,0.885188,0.330937,0.074344,0.386153,0.451772,0.394274,0.410254,0.358418,0.483277,0.480886,0.327206,0.396351,0.350011,0.425767,0.522232,0.356057,0.452668,0.445468,0.415419,0.547264,0.180985,0.283874,0.756535,0.557725,0.411051,0.444229,0.400509,0.420071,0.457727,0.577810,0.260397,0.501434,0.316003,0.246039,0.257983,0.416008,1.000000,0.312527,0.227805
4592,0.322682,0.626013,0.144016,0.350717,0.155818,0.164824,-0.001491,0.350699,0.361622,0.191877,0.455595,0.093050,0.074337,0.220178,0.523565,0.233044,0.668966,0.581670,0.394305,0.266538,0.300213,0.256507,0.156620,0.384037,0.577220,0.514282,0.061580,0.464211,0.260583,0.428991,0.256420,0.356252,0.209375,0.119847,0.308162,0.157364,0.138685,0.294404,0.253434,0.094392,...,0.566125,0.515901,0.500885,0.233500,0.768887,0.450656,0.516677,0.263808,0.340541,0.644442,0.363956,0.574191,0.425455,0.526250,0.499748,0.229500,0.352073,0.367693,0.706960,0.583883,0.470081,0.121988,0.409254,0.360524,0.479458,0.659835,0.504863,0.559185,0.587076,0.526448,0.504483,0.347150,0.274588,0.264359,0.225883,0.318475,0.578028,0.312527,1.000000,0.122300


# **Creating Users and initial ratings**

**Generating Users**

In [18]:
#fitting GMM to column "topic_i"

num_cols_topic_df = topic_df.shape[1]-1
gm = []

for i in range(num_cols_topic_df):
  gm.append(GaussianMixture(n_components=10).fit(topic_df.iloc[:, [i]]))

In [19]:
#generating gmm based topic values for each topic for 50 users
#24 x 50 matrix

Users = []

for i in range(50):
  temp = []
  for i in range(num_cols_topic_df):
    temp.append(gm[i].sample(1)[0][0][0])
  Users.append(temp)
Users=np.array(Users)
Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

Option 2 (Not using): Assigning random 1-10 ratings to each topic based on normal distribution. This assumes that the topics are independant.

In [20]:
'''Users = np.random.randint(0,10,[25,50])
Users = Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

Users'''

'Users = np.random.randint(0,10,[25,50])\nUsers = Users.T\nl2norm = np.sqrt((Users * Users).sum(axis=1))\nUsers = pd.DataFrame(Users/l2norm.reshape(50,1))\n\nUsers'

In [21]:
#run the above cell before running this twice
slc = list(range(topic_df.shape[1]))
slc.remove(25)
temp_topic_df = topic_df.iloc[:, slc]
Users_temp = Users.T.set_index(temp_topic_df.T.index)

result = temp_topic_df.dot(Users_temp);
#result.T

**Generating User ratings** 




In [22]:
rank_matrix = result.rank().T/4593*10
rank_matrix = rank_matrix.round(0).astype(int)
rank_matrix.columns =np.linspace(0,4593,4594).astype(int)

#not every reader reads all news
#remove random elements from the rank matrix
for i in range(4594):
  random_entries = np.random.randint(0,50,25)
  rank_matrix.loc[random_entries,i] = "No rating"
rank_matrix = rank_matrix.apply(pd.to_numeric, errors='coerce')
rank_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,6.0,,7.0,5.0,,9.0,9.0,4.0,,,4.0,,1.0,6.0,6.0,9.0,5.0,,7.0,,,,,3.0,8.0,6.0,,,6.0,3.0,10.0,,,,,10.0,,,,3.0,...,1.0,,,1.0,,,1.0,,1.0,,,1.0,1.0,1.0,,2.0,,,3.0,3.0,3.0,1.0,1.0,3.0,2.0,,,,,2.0,,2.0,5.0,7.0,8.0,,,,4.0,
1,5.0,,9.0,,2.0,,,6.0,6.0,7.0,6.0,10.0,2.0,5.0,,9.0,8.0,,5.0,2.0,,,4.0,,8.0,6.0,2.0,1.0,6.0,,,8.0,5.0,5.0,,6.0,,4.0,,,...,,3.0,1.0,0.0,,1.0,1.0,1.0,3.0,1.0,,1.0,2.0,1.0,0.0,,,,,,3.0,,,2.0,1.0,2.0,1.0,,1.0,2.0,1.0,,1.0,3.0,1.0,,3.0,,3.0,0.0
2,3.0,8.0,,6.0,0.0,,8.0,2.0,8.0,5.0,4.0,,,3.0,,1.0,,7.0,3.0,5.0,10.0,1.0,1.0,5.0,9.0,,5.0,,,,1.0,2.0,,1.0,7.0,0.0,5.0,2.0,3.0,2.0,...,3.0,,4.0,,,2.0,2.0,2.0,3.0,,3.0,,,,2.0,2.0,5.0,4.0,3.0,,,3.0,,2.0,3.0,4.0,3.0,,3.0,,3.0,,,,,1.0,5.0,3.0,4.0,
3,3.0,6.0,1.0,7.0,2.0,5.0,5.0,1.0,,,4.0,3.0,1.0,6.0,2.0,10.0,3.0,,2.0,2.0,10.0,6.0,,,7.0,7.0,2.0,1.0,,,,9.0,,,8.0,,,,5.0,2.0,...,,,,4.0,1.0,1.0,0.0,,4.0,,1.0,0.0,1.0,,0.0,1.0,1.0,1.0,,,1.0,8.0,0.0,1.0,0.0,,,,,,1.0,,,4.0,3.0,2.0,,4.0,,
4,5.0,9.0,,8.0,4.0,,,3.0,2.0,,,10.0,9.0,1.0,2.0,0.0,5.0,5.0,,5.0,,5.0,2.0,,,,8.0,,8.0,,1.0,,8.0,1.0,5.0,1.0,1.0,3.0,3.0,8.0,...,5.0,,3.0,,4.0,,3.0,3.0,,4.0,4.0,,4.0,,3.0,5.0,8.0,6.0,4.0,3.0,,,,4.0,4.0,3.0,5.0,6.0,5.0,6.0,,3.0,3.0,4.0,6.0,6.0,,1.0,4.0,1.0
5,4.0,8.0,6.0,,,,,5.0,,,4.0,9.0,3.0,3.0,,,,,4.0,,10.0,5.0,6.0,4.0,9.0,7.0,3.0,,5.0,2.0,7.0,,9.0,5.0,,7.0,,,,1.0,...,0.0,2.0,1.0,,1.0,1.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,0.0,,,,,1.0,2.0,,0.0,1.0,1.0,,1.0,1.0,,1.0,,,2.0,,2.0,3.0,2.0,,,1.0
6,4.0,9.0,,,3.0,,,2.0,4.0,,3.0,,,1.0,,6.0,,8.0,,6.0,9.0,,,3.0,,5.0,,1.0,4.0,,6.0,5.0,7.0,2.0,,,5.0,,,,...,0.0,2.0,,0.0,,1.0,,,1.0,,2.0,,,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,,4.0,,,3.0,,,3.0
7,6.0,9.0,8.0,6.0,,,,,0.0,,,4.0,4.0,2.0,,3.0,6.0,6.0,4.0,7.0,2.0,,,3.0,,2.0,5.0,1.0,,2.0,8.0,,9.0,3.0,,7.0,1.0,,4.0,3.0,...,,,,1.0,,,,1.0,1.0,,,1.0,1.0,1.0,,1.0,2.0,2.0,,1.0,2.0,1.0,,1.0,1.0,,1.0,,1.0,1.0,,0.0,,,2.0,,,,,0.0
8,,9.0,4.0,6.0,,1.0,8.0,1.0,3.0,4.0,,,8.0,2.0,,,8.0,9.0,4.0,,,8.0,7.0,7.0,0.0,7.0,,4.0,,6.0,,,5.0,1.0,,1.0,0.0,,6.0,,...,3.0,2.0,,8.0,4.0,,2.0,2.0,4.0,4.0,2.0,3.0,4.0,3.0,2.0,4.0,7.0,,2.0,3.0,4.0,6.0,2.0,3.0,,2.0,,,3.0,,3.0,3.0,2.0,1.0,1.0,3.0,7.0,1.0,6.0,
9,,,5.0,,7.0,9.0,4.0,,1.0,4.0,,10.0,7.0,5.0,4.0,8.0,,3.0,7.0,,0.0,6.0,8.0,,,,4.0,4.0,7.0,4.0,9.0,,,8.0,1.0,9.0,1.0,2.0,7.0,,...,4.0,6.0,,3.0,3.0,4.0,3.0,,4.0,,4.0,3.0,4.0,,3.0,,,3.0,4.0,2.0,5.0,5.0,,4.0,,,4.0,4.0,,4.0,,1.0,,,,7.0,4.0,7.0,4.0,3.0


Find top 5 ratings of a user.

In [23]:
top_five = rank_matrix.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=5)
top_five

Unnamed: 0,0,1,2,3,4
0,30,35,135,157,163
1,11,64,74,86,90
2,20,69,71,112,126
3,15,20,86,100,101
4,11,51,127,137,141
5,20,51,76,181,193
6,147,166,181,210,239
7,71,148,181,239,245
8,127,137,181,193,199
9,11,95,166,183,192


# **Top 10 news : Content based recommender.**

In [24]:
#best two similar docs for every doc
best_two = cos_sim.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=3)
best_two.columns =['0', '1','2']
del best_two['0']
best_two['combined']= best_two.values.tolist()
best_two
best_two_dict_1 = best_two['1'].to_dict()
best_two_dict_2 = best_two['2'].to_dict()

In [25]:
selected_docs_1 = top_five.replace(best_two_dict_1)
selected_docs_2 = top_five.replace(best_two_dict_2)
selected_docs = pd.concat([selected_docs_1,selected_docs_2],axis=1,ignore_index=True)
#selected_docs = result.T.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=10)
#selected_docs.columns =['D1', 'D2', 'D3', 'D4', 'D5','D6','D7','D8','D9','D10']
display(selected_docs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,952,775,413,2687,30,163,5,1306,3585,952
1,404,1855,108,1213,2404,602,1877,4519,1933,2437
2,2491,357,1722,355,368,783,4034,1873,1159,1573
3,954,2491,1213,2373,2412,1697,783,1933,2463,2465
4,404,141,3198,117,2157,602,2275,76,321,2275
5,2491,141,2157,2389,340,783,2275,2275,2794,527
6,902,2892,2389,3094,3253,749,1704,2794,2268,2716
7,1722,1873,2389,3253,436,1873,71,2794,2716,1139
8,3198,117,2389,340,3234,76,321,2794,527,2955
9,404,2411,2892,192,183,602,2221,1704,217,228


**Replacing index values by the actual news.**

In [26]:
doc_dict = news_corpus.Content.to_dict()

selected_docs = selected_docs.replace(doc_dict)
selected_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,supreme court friday asked government respond ...,taken over probe into identical pils filed all...,supreme court judge have informally told chief...,jodhpur rajasthan india march rajasthan high c...,constitution bench chief justice india sharad ...,constitution bench supreme court thursday prop...,petition been filed supreme court highlighting...,several lawyer have written chief justice indi...,mumbai maharashtra india february supreme cour...,supreme court friday asked government respond ...
1,daily covid case india registered increase fou...,lauding india vaccine leadership woman officia...,covid pandemic negatively affected woman incom...,woman likely play major role ensuring that pro...,amaravati andhra pradesh india march andhra pr...,daily coronavirus covid case fell below third ...,international monetary fund chief economist gi...,nearly three woman worldwide subjected physica...,delhi reuters thousand woman joined protest fa...,delhi india march ministry science technology ...
2,siddharth sharmanew delhi india march congress...,petrol price tuesday neared litre mark nationa...,british parliament monday discussed farmer pro...,agitating farmer union february objected delhi...,farmer leader rakesh tikait monday took union ...,face between political rival bharatiya janata ...,kohima nagaland india february amid rise fuel ...,indian high commission london condemned debate...,second round meeting between police farmer uni...,more than faculty member various educational i...
3,supreme court friday urged government provide ...,siddharth sharmanew delhi india march congress...,woman likely play major role ensuring that pro...,delhi india march occasion international woman...,delhi india march prime minister narendra modi...,petition supreme court challenged entry woman ...,face between political rival bharatiya janata ...,delhi reuters thousand woman joined protest fa...,bhopal madhya pradesh india march madhya prade...,delhi india march occasion international woman...
4,daily covid case india registered increase fou...,cumulative dos covid vaccine administered coun...,delhi india march cumulative number covid vacc...,case coronavirus infection india were recorded...,delhi india march nationwide covid vaccination...,daily coronavirus covid case fell below third ...,delhi india march nationwide covid vaccination...,nearly lakh covid vaccine dos were administere...,daily rise coronavirus infection india recorde...,delhi india march nationwide covid vaccination...
5,siddharth sharmanew delhi india march congress...,cumulative dos covid vaccine administered coun...,delhi india march nationwide covid vaccination...,delhi india march cumulative number covid vacc...,india total tally covid case surged with infec...,face between political rival bharatiya janata ...,delhi india march nationwide covid vaccination...,delhi india march nationwide covid vaccination...,pune maharashtra india march union minister in...,daily infection fell below fourth time this mo...
6,group gandhian organisation called farmer move...,delhi india march bharat biotech serum institu...,delhi india march cumulative number covid vacc...,delhi india march private hospital under ayush...,delhi india march prime minister narendra modi...,three contentious agricultural reform law shou...,court said rather selfish petitioner seek vacc...,pune maharashtra india march union minister in...,delhi india march review status progress covid...,lucknow uttar pradesh india march eminent shia...
7,british parliament monday discussed farmer pro...,indian high commission london condemned debate...,delhi india march cumulative number covid vacc...,delhi india march prime minister narendra modi...,moscowindia strategic tie with russia very dee...,indian high commission london condemned debate...,high commission india london condemned debate ...,pune maharashtra india march union minister in...,lucknow uttar pradesh india march eminent shia...,underlining that terrorism continues pose crit...
8,delhi india march cumulative number covid vacc...,case coronavirus infection india were recorded...,delhi india march cumulative number covid vacc...,india total tally covid case surged with infec...,delhi india march government monday opened cov...,nearly lakh covid vaccine dos were administere...,daily rise coronavirus infection india recorde...,pune maharashtra india march union minister in...,daily infection fell below fourth time this mo...,delhi india march president kovind along with ...
9,daily covid case india registered increase fou...,mumbai maharashtra india march national invest...,delhi india march bharat biotech serum institu...,maharashtra kerala punjab tamil nadu gujarat k...,maharashtra kerala punjab tamil nadu gujarat k...,daily coronavirus covid case fell below third ...,mumbai maharashtra india march national invest...,court said rather selfish petitioner seek vacc...,maharashtra kerala punjab tamil nadu gujarat w...,with several state country continuing report i...


# **Collaborative model**

User vs User matrix

In [27]:
'''user_similarity = Users.dot(Users.T);
#user_similarity'''

'user_similarity = Users.dot(Users.T);\n#user_similarity'

Top 5 similar users to every user

In [28]:
'''similar_users = user_similarity.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=6)
similar_users.columns =['USER0','USER1', 'USER2', 'USER3', 'USER4', 'USER5']
del similar_users['USER0']
#similar_users'''

"similar_users = user_similarity.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=6)\nsimilar_users.columns =['USER0','USER1', 'USER2', 'USER3', 'USER4', 'USER5']\ndel similar_users['USER0']\n#similar_users"

# **Predict missing ratings using Matrix factorization**

In [38]:
A = np.array(rank_matrix)
M = A.shape[0]
N = A.shape[1]
A_df = pd.DataFrame(A)
K = 25
W = np.abs(np.random.uniform(low=0, high=1, size=(M, K)))
H = np.abs(np.random.uniform(low=0, high=1, size=(K, N)))
W = np.divide(W, K*W.max())
H = np.divide(H, K*H.max())

Cost function : NNLS

In [39]:
def cost(A, W, H):
    from numpy import linalg
    mask = pd.DataFrame(A).notnull().values
    WH = np.dot(W, H)
    WH_mask = WH[mask]
    A_mask = A[mask]
    A_WH_mask = A_mask-WH_mask
    return linalg.norm(A_WH_mask, 2)

In [40]:
num_iter = 1000
num_display_cost = max(int(num_iter/10), 1)
from scipy.optimize import nnls

for i in range(num_iter):
    if i%2 ==0:
        # Learn H, given A and W
        for j in range(N):
            mask_rows = pd.Series(A[:,j]).notnull()
            H[:,j] = nnls(W[mask_rows], A[:,j][mask_rows])[0]
    else:
        for j in range(M):
            mask_rows = pd.Series(A[j,:]).notnull()
            W[j,:] = nnls(H.transpose()[mask_rows], A[j,:][mask_rows])[0]
    WH = np.dot(W, H)
    c = cost(A, W, H)
    #if i%num_display_cost==0:
    print(i, c)

0 780.4229018024697
1 618.6912495068716
2 502.46853008452416
3 441.70438690997713
4 403.10704206665196
5 378.61117805356196
6 360.5408754490684
7 345.9065535758132
8 334.2897843256209
9 324.85743076965923
10 317.3731214176457
11 311.4220421657149
12 306.70597178466625
13 302.6365665303328
14 299.00526800393624
15 295.65004302418447
16 292.58615937702086
17 289.76645504005927
18 287.14393177750924
19 284.742648252452
20 282.64208927247444
21 280.77987663859716
22 279.106496249318
23 277.57490876685625
24 276.21370650245433
25 275.0369296874666
26 273.97873244030916
27 273.038614613885
28 272.176858857912
29 271.38339141767653
30 270.6543352878077
31 269.9975311994219
32 269.3955350472979
33 268.82456025656694
34 268.2823255896905
35 267.7675531903794
36 267.2804125974753
37 266.812140167591
38 266.36022983607353
39 265.9269009586617
40 265.5101581608899
41 265.0999667538445
42 264.69896547330774
43 264.3030000683345
44 263.9214314583463
45 263.5759788765263
46 263.23868488353247
47 262.

In [41]:
W = pd.DataFrame(W)
H = pd.DataFrame(H)
A = W.dot(H)
A = A.round(0).astype(int)
A

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,6,5,6,5,7,9,9,4,7,4,4,1,2,6,6,9,5,1,7,8,6,5,10,3,7,6,4,4,6,2,10,11,5,10,6,10,3,2,6,3,...,1,6,1,2,3,2,1,1,1,1,3,1,1,1,1,2,2,2,3,3,3,1,1,3,2,1,2,2,1,2,1,2,5,8,8,1,3,10,3,8
1,5,7,9,6,2,5,3,6,5,7,6,9,2,4,1,9,8,10,5,2,13,4,5,2,8,6,2,1,6,5,10,8,5,5,2,6,2,4,14,4,...,1,3,1,1,2,1,1,1,3,1,3,1,2,1,0,1,3,3,1,4562,3,2,1,2,1,2,1,7,1,2,1,7,1,4,2,1,3,3,3,0
2,3,8,3,6,1,1,7,2,8,5,4,9,5,3,6,2,6,7,3,5,10,2,2,5,10,6,5,2,3,4,1,3,3,1,7,1,6,2,3,2,...,3,2,4,6,6,2,2,2,3,2,2,4,3,3,2,2,5,4,3,6,3,2,2,2,3,4,3,4,3,2,3,3,3,6,8,1,5,3,4,5
3,3,6,2,6,2,5,6,2,7,5,4,4,2,5,2,10,3,1,2,3,10,5,7,3,6,7,2,1,6,2,7,9,7,4,8,6,7,5,5,3,...,1,1,2,3,1,1,0,1,3,1,2,1,1,1,1,1,2,2,1,6893,2,6,1,2,1,1,2,1,1,2,1,2,2,4,3,2,3,4,0,5
4,5,8,1,8,4,2,12,3,2,4,8,10,9,1,2,2,6,5,5,4,1,5,2,6,0,8,8,3,8,4,1,5,8,1,4,2,2,3,3,8,...,5,2,3,4,4,3,3,3,5,4,3,4,4,5,3,6,8,6,4,3,5,6,3,4,4,3,5,6,5,6,5,3,3,4,5,5,7,1,4,1
5,4,8,7,5,6,5,5,5,6,2,3,8,3,3,2,7,5,9,4,6,10,4,7,4,9,7,2,2,4,2,7,5,10,5,5,7,6,3,4,2,...,0,3,1,1,1,1,0,0,1,1,3,0,1,0,0,1,1,2,1,1,2,0,0,1,1,1,1,0,0,1,1,2,2,4,3,3,2,5,2,1
6,4,9,7,6,4,4,9,3,5,4,4,10,4,1,2,4,5,7,3,6,9,5,6,4,7,5,4,1,5,1,6,5,7,2,7,5,5,3,4,2,...,0,2,1,1,2,1,0,0,1,0,2,1,1,0,0,0,2,2,1,1,1,1,0,1,0,1,1,1,1,1,0,1,2,4,3,2,3,5,1,3
7,5,9,7,6,5,9,6,2,1,7,5,6,4,2,2,5,6,6,3,6,3,6,4,3,6,2,5,1,4,1,8,5,9,3,6,7,2,6,5,3,...,1,3,1,0,3,2,1,1,1,1,4,1,1,1,1,1,1,2,3,2,2,3,1,2,1,2,2,2,1,2,1,1,3,3,2,3,5,7,3,1
8,4,9,3,6,2,1,8,2,3,3,7,7,8,2,1,7,8,9,4,2,4,8,7,7,2,6,5,4,7,6,1,9,5,1,5,1,1,6,6,6,...,3,2,2,7,4,2,2,2,4,4,2,3,4,3,2,4,7,4,2,3,4,7,2,3,3,2,4,6,4,5,3,3,2,2,3,3,7,1,5,3
9,4,2,4,6,8,9,4,7,1,3,4,9,8,5,4,8,4,3,7,4,0,6,8,7,1,5,5,4,7,4,9,5,10,8,2,9,4,2,7,8,...,4,5,2,4,3,4,3,2,3,3,3,3,3,3,2,5,6,3,4,2,5,5,3,3,2,2,4,3,2,4,3,2,2,4,4,6,4,7,3,3


# **Top 10 news : Collaborative recommender.**

In [42]:
selected_docs = A.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=10)
selected_docs.columns =['D1', 'D2', 'D3', 'D4', 'D5','D6','D7','D8','D9','D10']
doc_dict = news_corpus.Content.to_dict()
selected_docs = selected_docs.replace(doc_dict)
selected_docs

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10
0,supreme court decided examine contempt petitio...,chennai tamil nadu india march voter above tam...,petition been filed supreme court prohibit con...,delhi india march supreme court tuesday decide...,delhi india march delhi deputy chief minister ...,chennai tamil nadu india march ahead assembly ...,delhi india march election commission india we...,delhi india february first contingent border s...,tirunelveli tamil nadu india february congress...,delhi india february delhi high court issued n...
1,delhi india february delhi court tuesday grant...,delhi india february many covid case death wer...,promising development india russia relationshi...,hour after pakistan court convicted lashkar ta...,national commission woman sought immediate int...,fallon eric greitens political future seemed d...,amaravati andhra pradesh india february telugu...,india should able roll covid vaccination progr...,delhi india february supreme court friday refu...,defence minister rajnath singh sunday said tha...
2,delhi india february delhi court tuesday grant...,promising development india russia relationshi...,hour after pakistan court convicted lashkar ta...,national commission woman sought immediate int...,amaravati andhra pradesh india february telugu...,india should able roll covid vaccination progr...,delhi india february supreme court friday refu...,delhi india march delhi reported covid case la...,defence minister rajnath singh sunday said tha...,several opposition party including congress ha...
3,delhi india february many covid case death wer...,fallon eric greitens political future seemed d...,chandigarh punjab india february punjab chief ...,government informed rajya sabha wednesday that...,number fresh covid case recorded below india j...,chandigarh punjab india march congress leader ...,vaccination based vulnerability infection prof...,delhi india march prime minister narendra modi...,washim maharashtra india march washim police m...,hyderabad telangana india february telangana r...
4,delhi reuters india federal government saturda...,patna bihar india march expressing grief over ...,vijayawada andhra pradesh india march voting m...,joymala bagchinew delhi india march year citiz...,delhi reuters temporary export critical materi...,delhi india march union minister piyush goyal ...,delhi india march union finance minister nirma...,delhi india march ministry health family welfa...,nearly lakh covid vaccine dos were administere...,national investigation agency friday filed cha...
5,india provided lakh dos covid vaccine various ...,ashok gehlot rajasthan government union health...,delhi india march president kovind along with ...,delhi india march supreme court judge their fa...,delhi india march laga pata nahi chala already...,lauding india vaccine leadership woman officia...,remarking that normal bilateral trade relation...,india supplied covid vaccine country another n...,plea filed wednesday supreme court seeking set...,protesting farm union have welcomed supreme co...
6,effort silence voice said delhi riot lawyer me...,india provided lakh dos covid vaccine various ...,congress leader rahul gandhi raised sharp atta...,country began administering made india vaccine...,ashok gehlot rajasthan government union health...,bengaluru karnataka india march congress leade...,delhi india march prime minister narendra modi...,delhi india february second phase covid vaccin...,lauding india vaccine leadership woman officia...,union health secretary rajesh bhushan said tue...
7,national conference president farooq abdullah ...,effort silence voice said delhi riot lawyer me...,delhi india february second phase covid vaccin...,delhi india march second phase nationwide vacc...,bengaluru karnataka india february soon after ...,covid fatality have been reported state union ...,mumbai maharashtra india march maharashtra rep...,delhi india march prime minister narendra modi...,delhi india february urdu publication their th...,high level committee headed union home ministe...
8,delhi reuters india federal government saturda...,delhi india february delhi court tuesday grant...,patna bihar india march expressing grief over ...,promising development india russia relationshi...,hour after pakistan court convicted lashkar ta...,national commission woman sought immediate int...,amaravati andhra pradesh india february telugu...,india should able roll covid vaccination progr...,vijayawada andhra pradesh india march voting m...,delhi india february supreme court friday refu...
9,india china withdrew troop from ladakh last mo...,protesting farmer union saturday demanded high...,delhi reuters india federal government saturda...,patna bihar india march expressing grief over ...,vijayawada andhra pradesh india march voting m...,congress tuesday said president india should i...,coming across case where education system bein...,hyderabad telangana india march bharat biotech...,delhi india march special investigation team b...,security force january recovered cache arm amm...


# **Implement ALS based matrix factorization instead of NNLS**

In [None]:
from pyspark.ml.recommendation import ALS 
from pyspark.sql.types import FloatType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

X_train, X_test = rank_matrix.randomSplit([0.6, 0.4])

In [None]:
als = mlALS(rank=5, maxIter=10, seed=0)
model = als.fit(rank_matrix)