In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import textwrap
from IPython.display import clear_output

In [3]:
userId=np.load('userId.npy')
userEmbedding=np.load('user_embedding.npy')
newsId=np.load('itemId.npy')
newsEmbedding=np.load('news_embeddings_numpy.npy')
news_data=pd.read_csv('preprocessed_news.csv')
category_embeddings=np.load('category_embeddings.npy')
subcategory_embeddings=np.load('subcategory_embeddings.npy')
title_embeddings=np.load('title_embeddings.npy')
abstract_embeddings=np.load('abstract_embeddings.npy')

In [6]:
# creating news embeddings using weighted combinations of different column embeddings
weighted_news_embeddings=0.3*category_embeddings+0.2*subcategory_embeddings+0.3*title_embeddings+0.2*abstract_embeddings

In [7]:
weighted_dict={}
for i in range(len(newsId)):
    weighted_dict[newsId[i]]=weighted_news_embeddings[i]

In [8]:
dict={}
for i in range(len(newsId)):
    dict[newsId[i]]=newsEmbedding[i]

In [9]:
random_news=news_data.groupby('category').sample(n=1)

In [10]:
preview_news=random_news[['category','subcategory','itemId','title','abstract']]

In [57]:
preview_news=preview_news.reset_index()

In [58]:
#displaying these for new user
preview_news

Unnamed: 0,index,category,subcategory,itemId,title,abstract
0,21408,autos,autosnews,N4685,Route 51 Crash Causing Heavy Delays,A crash along Route 51 is causing significant ...
1,7681,entertainment,entertainment-celebrity,N9134,"Jennifer Aniston, Hilary Duff, Megan Fox and M...",
2,16234,finance,finance-top-stocks,N47211,Microsoft (MSFT) to Report Q1 Earnings: What t...,Microsoft Corp. MSFT is set to report first-qu...
3,1864,foodanddrink,foodnews,N41490,World's best bar for 2019 revealed,The world's best bar awards have been handed o...
4,12481,health,nutrition,N20145,"Everyone Is Talking About Ruby Chocolate, But ...",The recently-discovered fourth chocolate is fr...
5,38560,kids,people-places,N44039,Strange disease threatens Caribbean coral reef,"The breathtaking reds, yellows and purples of ..."
6,31617,lifestyle,shop-home-goods,N4291,Organize Your Home With These Cute Small-Space...,Mustering up the energy to organize can be qui...
7,8215,middleeast,middleeast-top-stories,N54746,Saudi says Baghdadi 'distorted' image of Islam...,Saudi Arabia said Monday that Islamic State le...
8,47246,movies,movies-gallery,N27936,50 Worst Movie Sequels of All Time,Movie sequels and prequels are becoming more c...
9,43037,music,musicnews,N60249,Cleveland Orchestra to perform 'Harry Potter a...,


In [59]:
#user selecting these articles to read
user_read_articles=[0,5,8,10]

In [5]:
news_data['itemId'][48540]

'N60855'

In [60]:
def L2(u,a):
    return np.linalg.norm(u-a)

## weighted embeddings of news items

In [61]:
#calculating distances between selected articles and all other non selected articles
weighted_indices_to_recommend=[]
for i in user_read_articles:
    distances=[]
    for j in range(len(weighted_news_embeddings)):
        distances.append((L2(weighted_dict[preview_news['itemId'][i]],weighted_news_embeddings[j]),j))
    distances=sorted(distances)
    for k in distances:
        if k[0]!=0:
            weighted_indices_to_recommend.append(k)
            break

In [62]:
#taking average of article embeddings which user selected
weighter_user_embedding=np.zeros(384)
for i in user_read_articles:
    weighter_user_embedding+=weighted_dict[preview_news['itemId'][i]]
weighter_user_embedding/=len(user_read_articles)

In [63]:
#finding distance between user embedding and all the news article embeddings
weighted_distances=[]
for i in range(len(newsEmbedding)):
    weighted_distances.append((L2(weighter_user_embedding,weighted_news_embeddings[i]),i))

weighted_distances=sorted(weighted_distances)


## news item embeddings without weight

In [64]:
indices_to_recommend=[]

In [65]:
#finding user embeddings by taking average of all the articles user read
new_user_embedding=np.zeros(384)
for i in user_read_articles:
    new_user_embedding+=dict[preview_news['itemId'][i]]
new_user_embedding/=len(user_read_articles)
    

In [66]:
# calculating the distances between user read articles and all the remaining articles user didnt read
for i in user_read_articles:
    distances=[]
    for j in range(len(newsEmbedding)):
        distances.append((L2(dict[preview_news['itemId'][i]],newsEmbedding[j]),j))
    distances=sorted(distances)
    for k in distances:
        if k[0]!=0:
            indices_to_recommend.append(k)
            break
    
distances=[]
for i in range(len(newsEmbedding)):
    distances.append((L2(new_user_embedding,newsEmbedding[i]),i))

distances=sorted(distances)


In [67]:
indices_to_recommend

[(4.499967, 50073), (5.239134, 6567), (5.2826395, 22170), (4.8313417, 25525)]

In [68]:
#without taking average
for i in indices_to_recommend:
    print("category: ",news_data['category'][i[1]],"  subcategory: ",news_data['subcategory'][i[1]],'  title: ',news_data['title'][i[1]],'  abstract: ',news_data['abstract'][i[1]])

category:  autos   subcategory:  autosnews   title:  Interstate 83: Morning crash cleared, but causing backed up traffic in both directions   abstract:  Backed up traffic is from a crash that happened shortly after 5 a.m.
category:  news   subcategory:  newsscienceandtechnology   title:  These Coral Have a Unique Survival Strategy That Lets Them Recover After Deadly Warming Events   abstract:  The findings are some rare good news for corals around the world, which are facing numerous severe threats.
category:  movies   subcategory:  movies-gallery   title:  The 47 best scary movies of all time, ranked by critics   abstract:  From classics like "Carrie" and "Rosemary's Baby" to more contemporary titles like "Hereditary," these films are guaranteed to keep you up at night.
category:  news   subcategory:  newsus   title:  Sanitation worker caught on camera going out of his way to help 88-year-old metro woman   abstract:  INDEPENDENCE, Mo. -- A kind gesture led to an unlikely friendship be

In [69]:
#using average of articles read
for i in distances[:5]:
    print("category: ",news_data['category'][i[1]],"  subcategory: ",news_data['subcategory'][i[1]],'  title: ',news_data['title'][i[1]],'  abstract: ',news_data['abstract'][i[1]])

category:  news   subcategory:  newsus   title:  Traffic back to normal on Route 283 near Spooky Nook Road   abstract:  There were severe backlogs this morning on Route 283 in Lancaster County, but the crash has been cleared. - The delays were in the eastbound lanes. Several miles of traffic backed up. Sign up for our Newsletters - PennDOT said it was all caused by a fender-bender. - Again, the crash has been cleared and traffic is flowing normally. TOP STORIES FROM WGAL: Up to 50 shots fired at police in morning standoff Mother charged in infant's death...
category:  travel   subcategory:  travelarticle   title:  I-94 south reopens between Highway 20, CTH E   abstract:  All lanes of Interstate 94 southbound have reopened between Highway 20 and County Highway E after a crash. The crash was affecting drivers in Racine and Kenosha counties. It's not yet known what caused the crash. No other details were immediately released. READ MORE:I-94 south reopens between Highway 20, CTH E CHECK OU

In [70]:
#without taking average using weighted embbedings news
for i in weighted_indices_to_recommend:
    print("category: ",news_data['category'][i[1]],"  subcategory: ",news_data['subcategory'][i[1]],'  title: ',news_data['title'][i[1]],'  abstract: ',news_data['abstract'][i[1]])

category:  autos   subcategory:  autosnews   title:  Crashes on 280, 17 block lanes, slow morning commute   abstract:  A crash on one critical commute route was cleared early Thursday on Interstate 580 west but two more are creating traffic troubles on I-280 in South San Francisco and on Highway 17 north in Los Gatos, where all lanes are blocked. Two lanes of northbound I-280 are closed north of Avalon Drive in South San Francisco, slowing traffic and prompting the California Highway Patrol to issue a severe traffic alert at 6:27 a.m. and advise drivers to take...
category:  movies   subcategory:  movies-gallery   title:  50 Best Movie Sequels of All Time   abstract:  A successful film franchise can ensure box office earnings like few other movie offerings can.
category:  news   subcategory:  newsus   title:  Mark 'Munky' Berman Honored With Memorial Ride: 'My Dad Was A Great Man'   abstract:  Hundreds of bikers from around Colorado gathered for a memorial ride, Saturday, to honor the 

In [71]:
#using average of articles read using weighted news embeddings
for i in weighted_distances[:5]:
    print("category: ",news_data['category'][i[1]],"  subcategory: ",news_data['subcategory'][i[1]],'  title: ',news_data['title'][i[1]],'  abstract: ',news_data['abstract'][i[1]])

category:  news   subcategory:  newsworld   title:  Driver charged with OVI after pickup hits home, pinning girl, injuring another   abstract:  Police are investigating a car that went into a home Thursday night in Kenton County.
category:  news   subcategory:  newsus   title:  Motorcyclists find new 'dangerous' way to defiantly beat I-95 traffic   abstract:  It's as if some motorcyclists think the red carpet has been rolled out for them. Traffic is clipping along on I-95 and then seemingly out of nowhere the landscape ahead is a vast sea of red brake lights. But not all travelers are stuck in traffic hell: Some motorcyclists look at the interstate's latest upgrade -- the creation of express lanes, which are set apart with a nice 3-foot-or-so ...
category:  news   subcategory:  newsworld   title:  'We will never, ever recover' - victims lash out at suspect in Esplanade crash that killed 2, injured others   abstract:  When Tashonty Toney got behind the wheel of his vehicle on March 2 an