## Building a Content Based News Recommendation System


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the scraped news dataset from the Excel workbook in the working directory;
# later cells rely on its 'Titles' and 'Links' columns.
news = pd.read_excel("news-english.xlsx")

In [3]:
# Show every row when a frame/series is displayed.
pd.set_option('display.max_rows', None)
# Show complete (non-truncated) cell content. `None` is the supported way to
# disable truncation; the old sentinel -1 is deprecated since pandas 1.0 and
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)

  


In [4]:
# Preview the first five rows to sanity-check the loaded columns
news.head(n=5)

Unnamed: 0,Titles,Links
0,"Gold price hits fresh record high, crossing the six-digit mark in the domestic market",https://kathmandupost.com/money/2020/08/05/gold-price-hits-fresh-record-high-crossing-the-six-digit-mark-in-the-domestic-market
1,"Nepal may see 28.7 percent contraction in remittance in 2020, highest in the developing Asia, ADB says",https://kathmandupost.com/money/2020/08/04/nepal-may-see-28-7-percent-contraction-in-remittance-in-2020-highest-in-the-developing-asia-adb-says
2,Sugar shortage hits Valley as festival season nears,https://kathmandupost.com/money/2020/08/03/sugar-shortage-hits-valley-as-festival-season-nears
3,Nepal received less than one third of committed foreign loans last fiscal year,https://kathmandupost.com/money/2020/08/03/nepal-received-less-than-one-third-of-committed-foreign-loans-last-fiscal-year
4,Book imports at a trickle as issues remain unresolved,https://kathmandupost.com/money/2020/08/03/book-imports-at-a-trickle-as-issues-remain-unresolved


In [5]:
# Report the dataset dimensions as (rows, columns)
print(f"News: {news.shape}")

News: (128, 2)


In [6]:
# Summarise column dtypes and non-null counts to check for missing values
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Titles  128 non-null    object
 1   Links   128 non-null    object
dtypes: object(2)
memory usage: 2.1+ KB


### Content Based News Recommendation System

Now let's make recommendations based on the news titles given in the Titles column. If a user gives us a news title, the goal is to recommend news items with similar titles.

In [7]:
# Trim leading/trailing whitespace so titles can be matched exactly later on
news["Titles"] = news["Titles"].str.strip()

In [8]:
# Peek at the first title after cleaning
news['Titles'].head(1)

0    Gold price hits fresh record high, crossing the six-digit mark in the domestic market
Name: Titles, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams, bigrams and trigrams.
# Accents are folded to their unicode base form, English stop words are removed,
# and a term must occur in at least 3 titles to enter the vocabulary.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)

# Replace missing titles with empty strings so the vectorizer only receives text
news['Titles'] = news['Titles'].fillna('')

In [10]:
# Learn the vocabulary/IDF weights from the titles and vectorise them in one pass
tfv_matrix = tfv.fit_transform(raw_documents=news['Titles'])

In [11]:
# Display the sparse matrix summary (dimensions, dtype, number of stored elements)
tfv_matrix

<128x78 sparse matrix of type '<class 'numpy.float64'>'
	with 357 stored elements in Compressed Sparse Row format>

In [12]:
# Matrix dimensions: (number of titles, number of TF-IDF features)
tfv_matrix.shape

(128, 78)

In [13]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity of every title against every other title;
# sig[i][j] is the score between title i and title j.
sig = sigmoid_kernel(X=tfv_matrix, Y=tfv_matrix)

In [14]:
# Kernel scores between the first title (row 0) and every title in the dataset
sig[0]

array([0.76692609, 0.76159416, 0.76255778, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76402222, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.7640468 , 0.76159416, 0.76159416, 0.76159416,
       0.76311519, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.7628959 ,
       0.76254108, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76410979, 0.76159416, 0.76159416, 0.76297059,
       0.76301098, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76338822, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76322848, 0.76159416, 0.76159416, 0.76496007,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76419414, 0.76159416, 0.76159416, 0.76159

In [15]:
# Reverse mapping: news title -> row index in `news`.
# Series.drop_duplicates() would de-duplicate the *values* (the row numbers,
# which are already unique) and leave repeated titles in place, so a lookup for
# a repeated title would return a Series instead of one index. De-duplicate on
# the index (the titles) instead, keeping the first occurrence.
indices = pd.Series(news.index, index=news['Titles'])
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
# Inspect the title -> row index mapping
indices

Titles
Gold price hits fresh record high, crossing the six-digit mark in the domestic market                     0  
Nepal may see 28.7 percent contraction in remittance in 2020, highest in the developing Asia, ADB says    1  
Sugar shortage hits Valley as festival season nears                                                       2  
Nepal received less than one third of committed foreign loans last fiscal year                            3  
Book imports at a trickle as issues remain unresolved                                                     4  
Over 70,000 youth entrepreneurs to benefit from reduced interest rate on loans                            5  
Already struggling to meet targets, Nepal’s footwear export numbers look at an uncertain future           6  
As gold price soars to six-digit figure, sales plummet                                                    7  
Nepal-made clothing brands were enjoying a great run and then the pandemic hit                            8  
Wen

In [17]:
# Look up the row index of a known headline by its exact title
indices.loc['Gold import at five-year low last FY']

45

In [18]:
# Kernel scores for row 45 (the index returned for 'Gold import at five-year low last FY')
sig[45]

array([0.76301098, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76471241, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76320267, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76310675, 0.76159416, 0.76159416, 0.76336281,
       0.76692609, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76287459, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76384328,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159416,
       0.76159416, 0.76159416, 0.76159416, 0.76159416, 0.76159

# Insert Keyword

In [19]:
# Ask the reader for a keyword to search the headlines with, then echo it back
headline_keyword = input("enter headline keyword")
print(headline_keyword)

enter headline keywordgold
gold


In [20]:
import re

# Case-insensitive search for the keyword inside the titles.
# The keyword is free-form user input, so escape it before handing it to the
# regex engine: characters such as '(', '+', '?' or '.' would otherwise raise
# re.error or match unintended headlines.
keyword_pattern = re.escape(headline_keyword)
news_headline = news[
    news['Titles'].str.contains(keyword_pattern, na=False, flags=re.IGNORECASE, regex=True)
]

In [21]:
# Show the headlines that matched the keyword
news_headline

Unnamed: 0,Titles,Links
0,"Gold price hits fresh record high, crossing the six-digit mark in the domestic market",https://kathmandupost.com/money/2020/08/05/gold-price-hits-fresh-record-high-crossing-the-six-digit-mark-in-the-domestic-market
7,"As gold price soars to six-digit figure, sales plummet",https://kathmandupost.com/money/2020/08/01/as-gold-price-soars-to-six-digit-figure-sales-plummet
21,Gold price rings ‘historic’ all time high; set at record 101400/tola on Wednesday,https://thehimalayantimes.com/business/gold-price-rings-historic-all-time-high-set-at-record-101400-tola-on-wednesday/
41,"Gold price soars to a fresh all-time high in Nepal, nears 100k",https://thehimalayantimes.com/business/gold-price-approximates-to-100k-silver-price-surges-too/
44,"Gold price nears Rs 100,000 per tola",https://thehimalayantimes.com/business/gold-price-nears-rs-100000-per-tola/
45,Gold import at five-year low last FY,https://thehimalayantimes.com/business/gold-import-at-five-year-low-last-fy/
55,"Gold price hits Rs 102,500; silver price reaches Rs 1,345",https://thehimalayantimes.com/business/gold-price-hits-rs-102500-silver-price-reaches-rs-1345/
64,Gold hits fresh record,https://thehimalayantimes.com/business/gold-hits-fresh-record/
98,"Gold price drops below the 100k mark, silver also drops value",https://thehimalayantimes.com/business/gold-price-drops-below-the-100k-mark-silver-also-drops-value/
101,"Gold price declines, silver hits new record",https://thehimalayantimes.com/business/gold-price-declines-silver-hits-new-record/


In [22]:
def give_rec(title, sig=sig, top_n=10):
    """Recommend the news items whose titles are most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title of a news item present in the `indices` mapping.
    sig : array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of news item
        ``i`` against every item. Defaults to the matrix computed earlier.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Titles and links of the `top_n` most similar news items, ordered from
        most to least similar.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the row index corresponding to the given title
    idx = indices[title]
    # A title that occurs more than once yields a Series of indices; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores against every other item. The queried item is
    # excluded explicitly rather than by dropping the first sorted entry: on
    # tied scores the query is not guaranteed to sort first, so slicing [1:]
    # could drop a real match and recommend the query itself.
    sig_scores = [
        (position, score)
        for position, score in enumerate(sig[idx])
        if position != idx
    ]

    # Sort from most to least similar (stable, so ties keep dataset order)
    sig_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar news items
    news_indices = [position for position, _ in sig_scores[:top_n]]

    # Titles and links of the most similar news items
    return news['Titles'].iloc[news_indices], news['Links'].iloc[news_indices]
   

In [24]:
# Smoke-test the recommender with one of the gold-price headlines
give_rec(title='Gold price rings ‘historic’ all time high; set at record 101400/tola on Wednesday')

(41     Gold price soars to a fresh all-time high in Nepal, nears 100k                            
 7      As gold price soars to six-digit figure, sales plummet                                    
 44     Gold price nears Rs 100,000 per tola                                                      
 0      Gold price hits fresh record high, crossing the six-digit mark in the domestic market     
 106    Bullion price edges down slightly after record streak                                     
 101    Gold price declines, silver hits new record                                               
 125    Grocery shoppers scramble due to small window of time                                     
 119    Gold price drops to Rs 98,200/tola; silver is at Rs 1,315                                 
 71     Precious metals set new record price in domestic market                                   
 61     The festival season gets underway with high prices, low footfall and weak price monitoring
 Name: Tit