<a href="https://colab.research.google.com/github/sarvesh237/lfkdsahkhfa/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#for lemmatization
import nltk
nltk.download("wordnet")
nltk.download('stopwords')
#to remove stop words
from nltk.corpus import stopwords

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

#gaussian mixture
from sklearn.mixture import GaussianMixture

from sklearn.decomposition import TruncatedSVD# TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Importing the collected data**

In [26]:
df_1 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv")
df_2 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv")
df_3 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv")
df_4 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv")

# **Preprocessing the csv files.**

In [27]:
del df_1['Unnamed: 0']
del df_2['index']
del df_2['Unnamed: 0']
df_2.rename(columns = {'content':'Content'}, inplace = True)
del df_3['Unnamed: 0']
del df_3['Title']
del df_4['Unnamed: 0']

**Remove capitalization, numbers, stop words, non-english characters, drop empty rows and duplicates.**

In [28]:
news_corpus = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus = news_corpus.dropna() #dropping NaN
news_corpus = news_corpus[news_corpus.Content != ''] #dropping empty rows
news_corpus = news_corpus.drop_duplicates()
news_corpus["Content"] = news_corpus['Content'].str.replace('[^\w\s]',' ')
news_corpus = news_corpus[news_corpus['Content'].map(lambda x: x.isascii())] #remove non-english #find a better way
news_corpus['Content'] = news_corpus['Content'].str.lower() #convert to lowercase
news_corpus.replace('\d+', '', regex=True, inplace=True) #remove numbers

stop_words = stopwords.words('english')
stop_words.extend(['span','class','spacing','href','html','http','title','said','that'])
news_corpus['Content'] = news_corpus['Content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_corpus = news_corpus.apply(lambda x: [item for item in x if item not in stop_words])


**Lemmatization**

In [29]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

news_corpus['Content'] = news_corpus.Content.apply(lemmatize_text)
news_corpus['Content'] = news_corpus['Content'].apply(', '.join) #convert list of words to a sentence
news_corpus.replace(',', '', regex=True, inplace=True) # remove commas
news_corpus = news_corpus.reset_index(drop=True)
news_corpus


Unnamed: 0,Content
0,medium report about swedish manufacturer scani...
1,access covid vaccine cooperation technology cl...
2,after severe criticism over holding consultati...
3,former congress president rahul gandhi thursda...
4,enforcement directorate attached three immovab...
...,...
4589,over mughal gold coin dating back early centur...
4590,china planning spend tibet five year plan allo...
4591,supreme court tuesday came with solution stale...
4592,indian american maju varghese previously serve...


# TF-IDF

In [30]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
#denselist = dense.tolist() #very expensive
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aajtak,aakash,aaksha,aamir,aamk,aandolan,aandolanjivi,aane,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,aatmanirbharbharat,aatmanirbharta,aatmiyata,aatmnirbhar,aatmnirbharta,aawam,aaya,...,zhao,zhoigar,zhuoran,ziarul,zila,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,ziyad,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,ztdrktlic,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **LSA : Latent Semantic Analysis**

In [31]:
sv_dec = TruncatedSVD(n_components = 50)  # components(no. of latent factors) < n ; n : no. of docs.
ls_analysis = sv_dec.fit_transform(vectors.T)

sv_dec.singular_values_ # eigenvalues for svd (in decreasing order)

array([13.49665211,  8.19753288,  6.65195647,  6.45052751,  6.13464354,
        5.38660292,  5.15440854,  4.58017062,  4.37335669,  4.27803738,
        4.22045592,  4.12362618,  4.06321463,  4.038303  ,  3.93009434,
        3.91268762,  3.78109412,  3.73147078,  3.67994501,  3.61964622,
        3.56624964,  3.48166575,  3.40908751,  3.36903764,  3.30665106,
        3.29693609,  3.27028653,  3.22424747,  3.14306192,  3.13968751,
        3.10078889,  3.06935438,  3.00977077,  2.97234813,  2.96484476,
        2.94580717,  2.92401779,  2.86812987,  2.84187874,  2.78930585,
        2.75382618,  2.74328769,  2.73570171,  2.72109242,  2.70242924,
        2.6787682 ,  2.66589898,  2.65019088,  2.63717162,  2.62089126])

In [32]:
variance =  sv_dec.singular_values_/sum(sv_dec.singular_values_) # checking the variance captured by eigenvalues
variance

array([0.06978374, 0.04238492, 0.03439359, 0.03335212, 0.03171885,
       0.02785115, 0.0266506 , 0.02368153, 0.02261221, 0.02211937,
       0.02182165, 0.02132099, 0.02100864, 0.02087984, 0.02032035,
       0.02023035, 0.01954995, 0.01929338, 0.01902696, 0.01871519,
       0.01843911, 0.01800177, 0.01762651, 0.01741943, 0.01709687,
       0.01704664, 0.01690885, 0.0166708 , 0.01625104, 0.01623359,
       0.01603247, 0.01586994, 0.01556186, 0.01536837, 0.01532958,
       0.01523114, 0.01511848, 0.01482952, 0.01469379, 0.01442196,
       0.01423851, 0.01418403, 0.0141448 , 0.01406927, 0.01397277,
       0.01385043, 0.01378389, 0.01370267, 0.01363536, 0.01355118])

In [33]:
sv_dec = TruncatedSVD(n_components = 25) # components with accepted variance
las = sv_dec.fit_transform(vectors)

#print(las,las.shape)

Topic - Document matrix

In [34]:
# Representation of each doc in terms of latent topics after dimensionality reduction
col = []
for i in range(las.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df = pd.DataFrame(las, columns = col)

topic_df["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

display(topic_df[l])


Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish manufacturer scani...,0.129416,-0.031667,-0.002055,0.016614,-0.058615,-0.036406,-0.018097,0.010017,0.002911,-0.019765,0.011678,0.012328,0.009564,-0.050988,0.008091,-0.014729,0.066409,-0.036158,-0.004868,0.028574,0.015415,-0.005126,-0.023794,-0.054817,-0.039013
1,access covid vaccine cooperation technology cl...,0.241852,-0.033735,0.076413,0.116303,-0.063051,-0.107551,-0.079475,-0.060669,-0.053875,-0.140988,-0.005923,-0.106192,0.027071,0.047388,-0.103446,0.017664,0.114827,0.028572,0.025137,-0.005908,0.024445,-0.009910,-0.122915,0.081199,0.043887
2,after severe criticism over holding consultati...,0.187095,-0.047154,-0.003887,0.027581,-0.070269,-0.018086,-0.071016,-0.017926,-0.001756,-0.023133,-0.017239,-0.021874,0.020321,0.001610,0.024914,-0.082908,0.022911,-0.068445,-0.003593,-0.007393,0.080492,-0.007639,-0.135591,-0.182564,-0.040880
3,former congress president rahul gandhi thursda...,0.177060,-0.026037,0.029296,0.010239,-0.039889,-0.040937,-0.059202,0.027852,-0.032318,-0.081254,0.100714,-0.066793,-0.007584,-0.010843,0.028020,0.012935,0.054207,-0.037241,-0.000651,0.033014,0.039847,-0.013405,-0.008395,-0.037623,0.019109
4,enforcement directorate attached three immovab...,0.134256,-0.048096,-0.066811,-0.030223,-0.111696,0.012965,0.045206,0.013607,0.054012,-0.024715,-0.000727,0.027726,-0.031851,-0.085735,0.038797,-0.006458,0.104773,-0.027587,-0.032683,0.034110,-0.062375,0.006984,0.027979,-0.035797,-0.085410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,over mughal gold coin dating back early centur...,0.086212,-0.022274,-0.033226,-0.015106,-0.062893,-0.042098,0.091921,0.036214,0.021558,-0.010165,0.003841,0.002328,-0.011556,-0.038983,0.006495,-0.035991,0.001256,-0.012749,0.017105,-0.009987,0.019431,0.008811,-0.002618,0.012304,-0.007261
4590,china planning spend tibet five year plan allo...,0.123348,-0.013562,0.004551,0.000892,-0.026683,-0.072787,-0.033823,-0.033918,-0.018590,-0.024326,-0.065753,-0.000197,0.023878,-0.016490,-0.008183,-0.006624,0.036920,-0.002206,-0.013584,0.006375,0.016976,-0.018475,0.012962,0.021135,0.006548
4591,supreme court tuesday came with solution stale...,0.178528,-0.066110,-0.074740,-0.003261,-0.120682,0.127786,-0.102596,-0.017767,0.033514,-0.020176,-0.024953,0.010885,0.003273,-0.010736,0.030053,0.008747,0.020489,0.011038,0.016076,-0.006044,0.035399,0.025902,0.042161,0.005921,-0.028666
4592,indian american maju varghese previously serve...,0.080712,-0.038110,0.006441,0.013000,-0.024383,-0.039532,-0.024894,0.020867,0.019397,-0.050987,-0.049623,-0.039385,-0.013006,0.058917,-0.030036,-0.058182,0.042178,-0.003551,0.029194,0.043524,0.017813,0.005983,0.002972,0.093063,-0.029936


**Visualizing the topics**

In [35]:
'''import umap
import matplotlib.pyplot as plt

embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )
plt.show()'''

'import umap\nimport matplotlib.pyplot as plt\n\nembedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(las)\n\nplt.figure(figsize=(7,5))\nplt.scatter(embedding[:, 0], embedding[:, 1], c = range(4594),s=15 )\nplt.show()'

**Topics and terms list**

In [36]:
Sigma = sv_dec.singular_values_ 
V_transpose = sv_dec.components_.T
terms = vectorizer.get_feature_names()

for index, component in enumerate(sv_dec.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)
del Sigma,V_transpose,terms,zipped,top_terms_key

Topic 0:  ['said', 'that', 'covid', 'case', 'will', 'farmer', 'have', 'state', 'with', 'minister']
Topic 1:  ['covid', 'vaccine', 'case', 'death', 'vaccination', 'health', 'total', 'reported', 'dose', 'active']
Topic 2:  ['election', 'assembly', 'party', 'vaccine', 'congress', 'seat', 'will', 'poll', 'minister', 'bengal']
Topic 3:  ['vaccine', 'vaccination', 'farmer', 'dose', 'first', 'covid', 'modi', 'dos', 'law', 'covaxin']
Topic 4:  ['farmer', 'law', 'farm', 'protest', 'congress', 'price', 'protesting', 'death', 'party', 'hour']
Topic 5:  ['court', 'vaccine', 'justice', 'supreme', 'vaccination', 'election', 'dose', 'petition', 'bench', 'hearing']
Topic 6:  ['police', 'vaccine', 'arrested', 'dose', 'farmer', 'vaccination', 'accused', 'bengal', 'west', 'delhi']
Topic 7:  ['sabha', 'price', 'congress', 'rajya', 'budget', 'house', 'party', 'opposition', 'session', 'parliament']
Topic 8:  ['lakh', 'price', 'sabha', 'from', 'fuel', 'rajya', 'budget', 'crore', 'petrol', 'bengal']
Topic 9: 

# **LDA : Latent Dirichlet Allocation**

In [37]:
#lda

lda = LatentDirichletAllocation(n_components=25)
lda_matrix = lda.fit_transform(df_vecs)
lda_topics = lda.fit(df_vecs)
for idx, topic in enumerate(lda.components_):
        print ("Topic ", idx, " ".join(feature_names[i] for i in topic.argsort()[:-10 - 1:-1]))

Topic  0 gita ashram rathod swami bhagavad chidbhavananda apple version german kindle
Topic  1 tapovan similipal tunnel rishiganga lake debris burst chamoli dhauliganga padma
Topic  2 jarkiholi milk kallahalli sarma tape himanta biswa sexually scandal kumaraswamy
Topic  3 delkar vista mohan ganjhu puri namboothiri monkey poet redevelopment vishnu
Topic  4 ravidas pujari furthering sant pooja saint pamela cocaine goswami anniversary
Topic  5 farmer will said that minister india with this woman prime
Topic  6 virudhunagar firecracker artisan elephant factory gadchiroli koregaon bhima surgeon bamboo
Topic  7 ceraweek cbse tejas drdo madhavan kozhikode riyas bach olympic railtel
Topic  8 mufti fastag mehbooba dictionary joke pmla wrongful fcra nirav boeing
Topic  9 maninderjit khempreet kamra genome seafarer spear solan bigg dog sequencing
Topic  10 senate democrat biden white republican harris senator bhainsa trillion psaki
Topic  11 terrorist police pakistan kashmir myanmar encounter ter

In [38]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [39]:
#plot_top_words(lda, feature_names, 10, "t")

In [40]:
col = []
for i in range(lda_matrix.shape[1]):
    col.append("topic_{}".format(i)) 

topic_df_2 = pd.DataFrame(lda_matrix, columns = col)

topic_df_2["Docs"] = news_corpus.Content

l = ["Docs"]
for i in col:
    l.append(i)

display(topic_df_2[l])

Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish manufacturer scani...,0.004586,0.004586,0.011774,0.004586,0.004586,0.004586,0.004586,0.004586,0.004586,0.004586,0.004586,0.563291,0.004586,0.004586,0.123166,0.004586,0.004586,0.004586,0.004586,0.205469,0.004586,0.004586,0.004586,0.004586,0.004586
1,access covid vaccine cooperation technology cl...,0.030222,0.002871,0.002871,0.002871,0.002871,0.025141,0.002871,0.002871,0.068023,0.002871,0.002871,0.542614,0.002871,0.002871,0.257316,0.002871,0.002871,0.002871,0.002871,0.002871,0.022140,0.002871,0.002871,0.002871,0.002871
2,after severe criticism over holding consultati...,0.004112,0.004112,0.004112,0.004112,0.004112,0.004112,0.054591,0.004112,0.004112,0.004112,0.004112,0.628771,0.004112,0.004112,0.163890,0.004112,0.004112,0.004112,0.004112,0.004112,0.066388,0.004112,0.004112,0.004112,0.004112
3,former congress president rahul gandhi thursda...,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178,0.036084,0.004178,0.004178,0.539046,0.004178,0.004178,0.209651,0.127491,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178,0.004178
4,enforcement directorate attached three immovab...,0.004818,0.004818,0.004818,0.004818,0.004818,0.004818,0.004818,0.004818,0.071403,0.004818,0.004818,0.615515,0.004818,0.004818,0.004818,0.004818,0.004818,0.004818,0.207078,0.004818,0.004818,0.004818,0.004818,0.004818,0.004818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,over mughal gold coin dating back early centur...,0.006132,0.006132,0.006132,0.006132,0.006132,0.006132,0.006132,0.374991,0.110165,0.006132,0.006132,0.334256,0.006132,0.006132,0.006132,0.006132,0.006132,0.006132,0.006132,0.006132,0.051827,0.006132,0.006132,0.006132,0.006132
4590,china planning spend tibet five year plan allo...,0.004949,0.004949,0.004949,0.004949,0.004949,0.004949,0.004949,0.004949,0.004949,0.004949,0.004949,0.487270,0.004949,0.004949,0.181021,0.184373,0.004949,0.004949,0.004949,0.004949,0.043414,0.004949,0.004949,0.004949,0.004949
4591,supreme court tuesday came with solution stale...,0.003091,0.003091,0.003091,0.003091,0.003091,0.003091,0.003091,0.003091,0.138107,0.003091,0.003091,0.509836,0.003091,0.080999,0.137857,0.003091,0.003091,0.003091,0.003091,0.003091,0.071387,0.003091,0.003091,0.003091,0.003091
4592,indian american maju varghese previously serve...,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.229999,0.005681,0.432824,0.005681,0.005681,0.212190,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681,0.005681


# **Creating Users**

Using GMM

In [41]:
#fitting GMM to column "topic_i"

num_cols_topic_df = topic_df.shape[1]-1
gm = []

for i in range(num_cols_topic_df):
  gm.append(GaussianMixture(n_components=10).fit(topic_df.iloc[:, [i]]))

In [42]:
#generating gmm based topic values for each topic for 50 users
#24 x 50 matrix

Users = []

for i in range(50):
  temp = []
  for i in range(num_cols_topic_df):
    temp.append(gm[i].sample(1)[0][0][0])
  Users.append(temp)
Users=np.array(Users)
Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

Option 2 (Not using): Assigning random 1-10 ratings to each topic based on normal distribution. This assumes that the topics are independant.

In [43]:
'''Users = np.random.randint(0,10,[25,50])
Users = Users.T
l2norm = np.sqrt((Users * Users).sum(axis=1))
Users = pd.DataFrame(Users/l2norm.reshape(50,1))

Users'''

'Users = np.random.randint(0,100,[25,50])\nUsers = Users.T\nl2norm = np.sqrt((Users * Users).sum(axis=1))\nUsers = pd.DataFrame(Users/l2norm.reshape(50,1))\n\nUsers'

# **Cosine Similarity between Users and Docs**

In [44]:
#run the above cell before running this twice
slc = list(range(topic_df.shape[1]))
slc.remove(25)
temp_topic_df = topic_df.iloc[:, slc]
Users_temp = Users.T.set_index(temp_topic_df.T.index)

result = temp_topic_df.dot(Users_temp);
result.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,0.082931,0.106975,0.095388,0.124679,0.073993,0.031535,0.05639,0.071179,0.12777,0.099465,0.115673,0.104725,0.023054,0.038014,3.1e-05,0.047127,0.095275,0.082651,0.05391,0.121954,0.177509,0.099779,0.129961,0.056643,0.117071,0.072198,0.025635,0.02751,0.091129,0.038129,0.019317,0.074087,0.10256,0.005779,0.047118,0.033161,0.128422,0.037367,0.064759,0.043136,...,0.05154,0.03845,0.042093,0.043813,0.090585,0.039036,0.032478,0.04631,0.050355,0.027665,0.083006,0.043585,0.03599,0.042201,0.015392,0.031317,0.090934,0.07156,0.063549,0.061673,0.053584,0.057937,0.022886,0.036865,0.051994,0.04225,0.071731,0.070116,0.05286,0.0472,0.052761,0.04914,0.043408,0.062669,0.105808,0.050208,0.089775,0.089364,0.031601,0.082221
1,0.088155,0.096523,0.095727,0.113237,0.127472,0.095953,0.237315,0.067119,0.17061,0.077427,0.088764,0.448332,0.079685,0.020174,0.08684,0.069097,0.086273,0.054178,0.093523,0.167234,0.232848,0.109233,0.173763,0.084274,0.120577,0.125086,0.071517,0.049509,0.111922,0.0767,0.117727,0.09246,0.135531,0.072681,0.111024,0.129567,0.082938,0.025754,0.089421,0.056354,...,0.034387,0.051342,0.052457,0.040736,0.062038,0.03494,0.023325,0.017831,0.035398,0.018202,0.059103,0.031739,0.02742,0.025999,0.00743,0.028527,0.070313,0.046086,0.047835,0.061546,0.053632,0.021186,0.012602,0.029801,0.036946,0.029808,0.038292,0.043052,0.034845,0.036872,0.034012,0.043748,0.087141,0.16097,0.217882,0.065674,0.079219,0.134479,0.052015,0.183674
2,0.10502,0.18013,0.147988,0.151022,0.141548,0.152645,0.141655,0.137755,0.156669,0.064962,0.118728,0.117573,0.138236,0.094391,0.0752,0.231448,0.149692,0.151677,0.112887,0.221715,0.242434,0.110006,0.157546,0.089605,0.131767,0.092386,0.117086,0.05529,0.115112,0.068034,0.185808,0.190721,0.179334,0.111714,0.119722,0.176924,0.168025,0.052993,0.132729,0.045647,...,0.045621,0.095907,0.053581,-0.010307,0.09959,0.044304,0.030334,0.034675,0.070875,0.038277,0.080715,0.044855,0.046367,0.03446,0.012889,0.032825,0.055066,0.0594,0.079379,0.066663,0.077171,0.066403,0.012421,0.053667,0.048844,0.060024,0.049113,0.067139,0.048218,0.053304,0.05108,0.050545,0.114453,0.099692,0.106109,0.061276,0.11103,0.192631,0.074166,0.099631
3,0.017609,0.012573,-0.002377,0.041471,-1.2e-05,-0.053891,0.113409,-0.018545,-0.007923,0.048066,0.012704,0.200809,0.031852,-0.06246,-0.006349,-0.125081,-0.020692,-0.046511,0.006724,-0.046305,-0.074097,0.074088,0.013346,0.029177,-0.068406,0.007811,0.017897,-0.007705,0.06701,-0.004215,-0.114863,-0.074523,0.061577,-0.079151,-0.011794,-0.072124,-0.001505,0.002493,-0.015773,0.052698,...,0.009138,-0.03527,0.00902,0.14022,0.014219,-0.001847,-0.000254,0.000936,0.010837,0.002982,0.022093,0.001176,0.007794,0.011562,0.000392,0.025396,0.054083,0.01796,0.012913,0.006394,0.012583,0.041526,0.007721,-0.002969,0.001366,0.002992,0.020766,0.012688,0.001805,0.018246,0.005146,-0.000273,-0.030874,0.039482,0.056087,0.014849,0.011664,-0.078857,-0.004319,0.004854
4,0.087591,0.186682,0.134874,0.121352,0.068606,0.080726,0.1806,0.107087,0.133794,0.136045,0.095959,0.117398,0.099208,0.084446,0.09187,0.163877,0.146367,0.173571,0.109337,0.236408,0.268115,0.07399,0.12123,0.097465,0.280537,0.17148,0.149968,0.071942,0.116971,0.074392,0.160124,0.149895,0.059339,0.110257,0.178226,0.090882,0.250358,0.117969,0.094658,0.024976,...,0.023874,0.089856,0.048357,0.024626,0.092895,0.023551,0.021871,0.016221,0.039487,0.026499,0.043362,0.035804,0.042628,0.020668,0.006163,0.024106,0.055742,0.068767,0.057024,0.052927,0.048081,0.0471,0.01489,0.030551,0.028388,0.055884,0.03554,0.038457,0.02973,0.046535,0.029081,0.042857,0.11526,0.12595,0.12961,0.027043,0.118472,0.130487,0.097268,0.082626
5,0.050002,0.107796,0.01246,0.071173,0.099518,0.002753,0.062638,0.100177,0.13067,0.013876,0.080436,-0.111243,0.104726,0.040693,0.036437,0.082532,0.09416,0.093038,0.02371,0.080296,0.192568,0.07067,0.0958,0.036451,0.11831,0.077116,0.028819,0.025701,0.036811,0.028914,-0.025088,0.07441,0.145465,-0.010494,0.084621,0.017945,0.177044,0.056412,0.03122,0.005801,...,0.024427,0.03713,0.047006,0.000589,0.081744,0.037058,0.021297,0.019752,0.048298,0.028073,0.051751,0.040664,0.028706,0.020706,0.010319,0.000393,0.000399,0.017539,0.069355,0.044593,0.059565,0.046304,0.011584,0.024045,0.030292,0.051155,0.037343,0.040309,0.03753,0.016325,0.033448,0.042522,0.064822,0.05408,0.033019,0.086167,0.06293,0.041644,0.068126,0.077845
6,0.103271,0.23832,0.161078,0.107599,0.095856,0.067394,0.211061,0.087779,0.103315,0.12477,0.102409,0.045218,0.117342,0.06594,0.08779,0.136239,0.150837,0.18285,0.091335,0.071319,0.120477,0.108559,0.102573,0.075259,0.177618,0.131947,0.090796,0.060111,0.121576,0.057833,0.104599,0.127574,0.1444,0.057662,0.12265,0.089357,0.121133,0.101161,0.082864,0.044911,...,0.024249,0.075398,0.058104,0.051113,0.095887,0.03324,0.021754,0.018866,0.046699,0.03658,0.07203,0.035576,0.039327,0.025894,0.009029,0.01993,0.060976,0.070086,0.066685,0.065158,0.061841,0.038257,0.028505,0.043456,0.029726,0.054149,0.049612,0.04481,0.040863,0.042158,0.040156,0.039507,0.080535,0.124853,0.099086,0.078418,0.089863,0.108905,0.113438,0.111948
7,0.064584,0.038634,0.024174,0.015674,0.080558,0.014718,0.076083,0.039699,0.034846,0.035627,0.051033,0.152268,0.145012,-0.031035,-0.007998,-0.092044,0.048324,0.039771,0.009729,0.15767,0.087075,0.112544,0.20169,0.055029,-0.044601,0.007881,0.114939,0.037848,0.105448,0.049329,0.003617,-0.017739,0.10687,0.02556,0.087698,0.017107,0.110603,0.052129,0.037555,0.070621,...,0.013241,0.032182,0.005296,0.060003,0.028622,0.011711,0.006826,-0.003587,-0.012563,0.010862,0.024321,0.02704,0.009181,0.004227,0.001063,0.026026,0.022117,0.023146,0.02766,0.008906,0.045127,-0.026005,0.003942,0.009387,0.014688,0.008493,0.021527,0.009662,0.009156,0.015599,0.012154,0.019663,0.069341,-0.029061,-0.057374,0.040896,0.068938,0.050702,-0.002379,0.031038
8,0.057618,-0.03545,0.047436,0.01034,0.152827,0.075159,0.065676,0.057419,0.041572,0.018561,0.04104,0.069039,0.017918,0.024008,0.034202,0.026067,0.001128,-0.013246,0.024739,0.019564,0.119376,0.07748,0.123144,0.026264,0.120153,0.046362,-0.014886,0.021239,0.076977,0.021545,0.032498,0.020302,0.269594,0.069411,0.034614,0.124212,0.05479,0.056677,0.024584,0.069932,...,0.000916,0.014522,0.02156,0.029806,-0.012272,0.022497,0.004937,0.004989,0.005951,0.00763,0.045487,0.006451,0.006234,0.002597,0.001965,0.004592,-0.003878,0.012859,0.011072,0.013545,0.027103,-0.018523,0.015981,0.020563,0.013234,0.009068,0.02627,-0.006413,0.017411,-0.007223,0.016003,0.011803,0.039586,0.064129,0.031381,0.104262,-0.008849,0.079859,0.003436,0.097936
9,-0.022996,-0.064158,-0.046906,0.019565,-0.057197,-0.092211,0.131627,-0.056053,0.118602,-0.001395,-0.009864,0.303851,0.017165,-0.017613,0.02436,-0.067718,-0.035966,-0.05342,-0.015808,0.086237,0.230036,-0.039843,-0.044448,0.015516,0.157583,0.060482,0.041927,-0.008199,-0.007471,0.024146,-0.109245,-0.028987,-0.052101,-0.094088,0.071778,-0.122154,-0.00065,-0.015992,-0.024762,-0.026097,...,0.002211,-0.051318,0.0115,-0.002848,-0.023218,-0.014327,-0.002907,-0.004359,0.003482,-0.015745,-0.026411,0.001764,-0.002283,0.001004,-0.004912,-0.011253,0.018165,-0.007434,-0.028012,0.008939,-0.019372,0.019821,-0.004975,-0.012798,0.00097,-0.00976,-0.008801,-0.001866,0.006038,-0.00013,-0.004581,0.027814,0.002006,0.081747,0.160834,-0.036544,0.005352,-0.059453,-0.033982,0.062183


# **Find the best 10 documents for each user.**

In [45]:
selected_docs = result.T.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=10)
selected_docs.columns =['D1', 'D2', 'D3', 'D4', 'D5','D6','D7','D8','D9','D10']
selected_docs

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10
0,2137,2414,2210,1629,2496,1762,2126,1706,2082,1783
1,3568,3584,890,544,238,1725,804,915,193,871
2,1688,166,308,127,3283,3323,3317,181,2248,2892
3,3910,3447,2559,2635,2352,2454,3048,3559,2739,3091
4,693,3568,934,755,1312,1195,613,817,1438,791
5,3474,2393,3185,3426,1794,615,74,3314,1213,2383
6,3196,2928,3161,1277,3029,181,1273,2882,3076,2918
7,1852,2269,2295,1851,1767,1849,2368,1891,1797,2177
8,3974,3983,2835,3543,522,573,3572,1017,3957,2112
9,3568,2899,3494,2762,3991,745,2554,2657,2658,368


**Replacing index values by the actual news.**

In [46]:
doc_dict = news_corpus.Content.to_dict()

selected_docs = selected_docs.replace(doc_dict)
selected_docs

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10
0,kolkata west bengal india march trinamool cong...,kolkata west bengal india march bharatiya jana...,kolkata west bengal india march west bengal ch...,bengal election mamata banerjee said attacked ...,delhi india march taking jibe west bengal chie...,west bengal election mamata banerjee make nand...,kolkata west bengal india march trinamool cong...,west bengal election mamata banerjee filed nom...,kolkata west bengal india march after alleged ...,hour after west bengal chief minister mamata b...
1,delhi india february total assembly constituen...,delhi india february election commission india...,with coronavirus case being reported india cov...,india tally covid case rose with infection whi...,india covid tally surpassed crore march with i...,india logged crore coronavirus case india covi...,with people testing positive coronavirus infec...,with coronavirus case india covid tally climbe...,india total tally covid case rose with infecti...,india registered coronavirus infection lowest ...
2,vaccination based vulnerability infection prof...,delhi high court march asked centre explain ra...,union health ministry will soon issue format m...,more than million vaccine dos have been admini...,delhi india march second phase nationwide covi...,delhi india february ahead commencement second...,delhi india february next phase world largest ...,health minister harsh vardhan wednesday reiter...,bengaluru karnataka india march year woman rec...,delhi india march bharat biotech serum institu...
3,delhi india february india reported case coron...,mumbai maharashtra india february maharashtra ...,delhi india march number covid case country sl...,delhi india march union minister narendra sing...,delhi india march many covid case death were r...,delhi india march india reported covid case re...,delhi india march many covid case death were r...,thiruvananthapuram kerala india february keral...,delhi india march delhi reported covid case re...,thiruvananthapuram kerala india march kerala r...
4,prime minister narendra modi said monday that ...,delhi india february total assembly constituen...,prime minister narendra modi saturday said par...,congress leader anand sharma friday described ...,protesting farmer leader january said they wil...,under attack from protesting union their gover...,congress leader rahul gandhi raised sharp atta...,congress leader ghulam nabi azad appealed gove...,eighth round talk between centre farm union wh...,government will provide lakh crore additional ...
5,jammu jammu kashmir india february senior cong...,delhi india march occasion international woman...,siddharth sharmanew delhi india march congress...,delhi india february andhra pradesh congress l...,former union minister veteran congress leader ...,urging centre make agri law prestige issue sen...,international woman which celebrated with much...,chandigarh haryana india february amidst inter...,woman likely play major role ensuring that pro...,siddharth sharmanew delhi india march indian y...
6,delhi india march covid vaccination concluded ...,delhi india march chief minister arvind kejriw...,hyderabad telangana india march union minister...,launching world biggest covid vaccination driv...,delhi india march president nath kovind wednes...,health minister harsh vardhan wednesday reiter...,hailing prime minister narendra modi launch wo...,delhi india march second phase vaccination cov...,delhi india march several union minister polit...,delhi india march delhi lieutenant governor an...
7,uttarakhand chief minister trivendra singh raw...,dehradun uttarakhand india march shortly after...,dehradun uttarakhand india march predicting ch...,trivendra singh rawat resigned chief minister ...,dhan singh rawat will likely chosen chief mini...,trivendra singh rawat resigned chief minister ...,delhi india march amid speculation about leade...,suspense loom over fate leadership uttarakhand...,bharatiya janata party tirath singh rawat took...,dehradun uttarakhand india march after taking ...
8,delhi india february delhi session court tuesd...,delhi india february delhi court tuesday grant...,delhi india march delhi high court thursday is...,delhi india february delhi uttar pradesh polic...,delhi court february extended seven day police...,three more have been arrested connection with ...,delhi india february delhi high court thursday...,delhi police given instruction firearm agitato...,delhi india february delhi court remanded acto...,delhi india march with view motivate rank delh...
9,delhi india february total assembly constituen...,ghazipur uttar pradesh india march amid ongoin...,delhi india february criticising centre over s...,chennai tamil nadu india march with just month...,delhi india february union home ministry exten...,ahead nationwide chakka called farmer union pr...,chennai tamil nadu india march tamil nadu cong...,chennai tamil nadu india march viduthalai chir...,chennai tamil nadu india march india anna drav...,farmer leader rakesh tikait monday took union ...


# **Collaborative model**

User vs User matrix

In [47]:
user_similarity = Users.dot(Users.T);
user_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,1.0,0.387115,0.427371,0.057771,0.305585,0.334064,0.235316,0.286643,0.103003,0.20624,0.030616,-0.009648,0.13478,-0.042593,0.452451,0.223956,0.382826,-0.306287,0.528937,0.320043,-0.025366,-0.097073,0.302401,0.1382,0.280993,-0.043895,0.323131,0.385908,0.271778,-0.07741,-0.049429,0.297944,0.42399,0.222715,0.382615,0.213136,0.142746,0.289755,-0.084718,0.087826,-0.324539,0.493775,0.337217,-0.033627,0.301097,-0.139901,0.415652,0.366255,0.252082,0.159296
1,0.387115,1.0,0.440297,0.065654,0.380574,0.122696,0.207498,0.298636,0.277926,0.314676,0.328874,0.339524,0.053213,-0.459062,0.308974,0.160549,0.333371,0.108463,0.416389,0.25402,0.423103,0.090776,0.430216,0.120535,0.098287,0.245815,0.355118,0.238243,0.170829,0.11093,0.24378,0.425327,0.169661,0.202545,0.37472,0.529846,0.283648,0.397414,0.377465,0.166107,-0.020531,-0.057213,0.304122,0.066897,0.313282,0.36901,0.622863,0.487421,0.418002,0.423277
2,0.427371,0.440297,1.0,-0.053974,0.43275,0.65584,0.405363,0.152492,0.145506,-0.034893,0.342844,0.26519,0.34652,0.036995,0.303693,0.126385,0.341949,-0.004759,0.480908,0.355419,0.415419,0.301608,0.179727,-0.078152,-0.103186,0.321505,0.444817,0.317583,0.577765,0.119017,0.172904,0.408527,0.479266,0.194179,0.529975,0.451187,0.325835,0.361223,0.335693,0.026879,0.057468,0.292549,0.250581,0.005561,0.198912,0.265528,0.718034,0.371995,0.177328,0.38919
3,0.057771,0.065654,-0.053974,1.0,-0.158289,-0.153298,0.141598,-0.014162,-0.189638,0.079955,0.51291,0.146776,0.096932,-0.009974,0.294018,0.299749,-0.296449,0.548778,-0.021684,0.057581,0.053692,0.137044,0.601797,-0.029566,0.088649,0.009879,0.07631,-0.094779,0.311403,0.081286,-0.168779,0.018526,0.161098,-0.005828,-0.032166,-0.062421,0.051646,0.099993,-0.142952,0.439409,0.11646,0.246525,0.176901,0.668942,0.015009,-0.039435,0.103518,0.055552,0.002067,0.061492
4,0.305585,0.380574,0.43275,-0.158289,1.0,0.227451,0.495441,0.167291,-0.036799,0.314787,0.334699,0.351144,0.326239,0.182825,0.217477,0.423983,0.439661,0.040546,0.433541,0.157922,0.277363,0.163195,0.079215,0.296652,0.258921,0.285679,0.479892,0.501025,0.280953,0.159229,0.328609,0.354305,0.038716,0.422233,0.506928,0.674242,0.42213,0.4496,0.264035,-0.038284,-0.094388,0.149365,0.330643,-0.106211,0.436302,0.406053,0.245268,0.20456,0.307236,0.563753
5,0.334064,0.122696,0.65584,-0.153298,0.227451,1.0,0.359758,0.128238,0.313917,-0.054054,-0.059062,0.043528,0.033068,-0.026157,0.500924,-0.078078,0.231731,-0.29131,0.362533,0.07529,-0.046865,0.354675,-0.179256,0.119273,0.016689,0.006777,0.426816,0.069798,0.630282,0.066745,0.193554,0.241656,0.635469,-0.132187,0.132422,0.244596,0.419094,0.171702,-0.015387,-0.444591,0.155039,0.338597,0.25362,-0.259604,0.256744,-0.05829,0.443274,0.160219,0.154033,0.107563
6,0.235316,0.207498,0.405363,0.141598,0.495441,0.359758,1.0,0.097292,0.29586,-0.173383,0.416594,-0.044944,0.283279,0.343275,0.29216,0.279482,0.059922,0.068664,0.442835,0.241283,0.48959,0.293476,0.17694,0.375294,0.023398,0.248731,0.565612,0.236391,0.713431,0.204499,0.457498,0.3927,0.364705,0.304545,0.382914,0.443469,0.444313,0.222779,0.266978,-0.017945,0.161762,0.307466,0.3105,-0.013659,0.118366,0.40128,0.295855,0.042015,0.300103,0.547439
7,0.286643,0.298636,0.152492,-0.014162,0.167291,0.128238,0.097292,1.0,0.241499,-0.089118,0.032245,0.096273,0.37388,-0.058942,0.21788,0.271892,0.125369,0.145858,0.254608,0.13806,-0.048614,0.202525,0.196684,0.246979,0.144493,0.091751,0.125936,0.253065,0.091761,0.412773,0.35134,0.335413,0.278371,-0.054012,-0.11937,0.259642,0.1332,0.386351,0.002276,-0.105902,-0.404097,0.187305,0.311966,0.213986,0.079153,-0.055557,0.369919,0.257457,0.189217,0.203231
8,0.103003,0.277926,0.145506,-0.189638,-0.036799,0.313917,0.29586,0.241499,1.0,-0.093447,-0.209579,-0.048207,-0.000472,-0.233825,0.434963,-0.091457,0.182567,-0.34749,0.379212,0.180583,0.048849,0.170694,-0.215911,0.109934,0.123507,0.189586,0.469779,0.005992,0.256328,0.300497,0.390947,0.307633,0.432089,0.006901,-0.095097,0.233807,-0.097807,0.025984,0.300064,-0.125214,0.379354,0.135402,0.293774,-0.43249,0.263854,-0.071245,0.194263,0.122163,0.157771,0.082666
9,0.20624,0.314676,-0.034893,0.079955,0.314787,-0.054054,-0.173383,-0.089118,-0.093447,1.0,0.091253,0.207043,-0.14607,-0.098328,0.408817,0.249535,0.098208,-0.032215,-0.148977,0.049324,0.117132,0.199091,0.023629,-0.228848,0.673873,0.147074,0.007377,0.4515,-0.065517,-0.002236,-0.126099,-0.064755,-0.191271,0.221668,0.226046,0.072903,0.042767,0.555426,-0.157162,0.091685,0.022783,-0.291555,0.587193,-0.025557,0.619068,-0.12935,-0.083549,0.262925,0.004105,-0.074039


Top 5 similar users to every user

In [48]:
similar_users = user_similarity.apply(lambda s, n: pd.Series(s.nlargest(n).index), axis=1, n=6)
similar_users.columns =['USER0','USER1', 'USER2', 'USER3', 'USER4', 'USER5']
del similar_users['USER0']
similar_users

Unnamed: 0,USER1,USER2,USER3,USER4,USER5
0,18,41,14,2,32
1,46,35,47,2,22
2,46,5,28,34,18
3,43,22,17,10,39
4,35,49,34,27,6
5,2,32,28,14,46
6,28,26,49,4,20
7,29,37,12,46,30
8,26,14,32,30,40
9,24,44,42,37,27
