## Building Content Based News Recommendation System (Nepali News)


In [1]:
import pandas as pd
import numpy as np

In [2]:
news = pd.read_excel("news.xlsx")

In [3]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_colwidth', -1) #display complete(non-truncated) content inside a cell

  


In [4]:
news.head()

Unnamed: 0,Titles,Links
0,सांग्रिलाले दियो २० प्रतिशत ब्याज छुट,https://ekantipur.com/business/2020/07/31/159620832078358957.html
1,एनएमबी–सिप्रदी सहकार्य,https://ekantipur.com/business/2020/07/31/159620763017328351.html
2,एसबीआईका ग्राहकलाई चिरायुमा छुट,https://ekantipur.com/business/2020/07/31/159620219640947046.html
3,बजारमा अकमिस्ट प्लस र मिनी,https://ekantipur.com/business/2020/07/31/159620200979434866.html
4,‘अब देशमै छ रोजगारी’ अभियान सुरु,https://ekantipur.com/business/2020/07/31/159620191901889287.html


In [5]:
print("News:",news.shape)

News: (595, 2)


In [6]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595 entries, 0 to 594
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Titles  595 non-null    object
 1   Links   595 non-null    object
dtypes: object(2)
memory usage: 9.4+ KB


### Content Based News Recommendation System

Now lets make a recommendations based on the news' titles given in the Titles column. So if our user gives us a news title, the goal is to recommend news that share similar titles.

In [7]:
news['Titles']=news['Titles'].str.strip()

In [8]:
news.head(1)['Titles']

0    सांग्रिलाले दियो २० प्रतिशत ब्याज छुट
Name: Titles, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

# Filling NaNs with empty string
news['Titles'] = news['Titles'].fillna('')

In [10]:
# Fitting the TF-IDF on the 'Titles' text
tfv_matrix = tfv.fit_transform(news['Titles'])

In [11]:
tfv_matrix

<595x1152 sparse matrix of type '<class 'numpy.float64'>'
	with 11397 stored elements in Compressed Sparse Row format>

In [12]:
tfv_matrix.shape

(595, 1152)

In [13]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [14]:
sig[0]

array([0.76195848, 0.76159922, 0.7616481 , 0.76159416, 0.76161556,
       0.76163085, 0.76159416, 0.76160866, 0.76159416, 0.76159416,
       0.76159416, 0.76160546, 0.76160671, 0.76162045, 0.76160766,
       0.7616081 , 0.76160242, 0.76160076, 0.76160428, 0.76160955,
       0.76159416, 0.76160426, 0.76161518, 0.76160772, 0.7616273 ,
       0.76160001, 0.76159729, 0.76159824, 0.7616111 , 0.76159912,
       0.7616197 , 0.76161143, 0.76161279, 0.76160357, 0.76161294,
       0.76160831, 0.76159416, 0.7616255 , 0.76161959, 0.76163027,
       0.7616142 , 0.76160488, 0.76162439, 0.76160748, 0.76159663,
       0.76159416, 0.7616089 , 0.7616015 , 0.76179259, 0.76160364,
       0.76160316, 0.76160401, 0.76162792, 0.76163098, 0.76161372,
       0.76162513, 0.76159757, 0.76160412, 0.76160508, 0.76164463,
       0.76160102, 0.76162122, 0.76161371, 0.76159972, 0.76161222,
       0.7616049 , 0.76161074, 0.76159416, 0.76161319, 0.76160728,
       0.76160015, 0.76160719, 0.76160911, 0.76159416, 0.76159

In [15]:
# Reverse mapping of indices and news titles
indices = pd.Series(news.index, index=news['Titles']).drop_duplicates()

In [16]:
indices

Titles
सांग्रिलाले दियो २० प्रतिशत ब्याज छुट                                                             0  
एनएमबी–सिप्रदी सहकार्य                                                                            1  
एसबीआईका ग्राहकलाई चिरायुमा छुट                                                                   2  
बजारमा अकमिस्ट प्लस र मिनी                                                                        3  
‘अब देशमै छ रोजगारी’ अभियान सुरु                                                                  4  
सुनको भाउले लाख छुनै लाग्यो, प्रतितोला ९९ हजार ३ सय रुपैयाँ                                       5  
नारायणगढ–मुग्लिङ सडक प्रयोग गर्दा शुल्क                                                           6  
कर्जा माग नहुँदा बैंकमा थुप्रियो पैसा                                                             7  
रूपन्देहीका होटलमा पाहुनाको पर्खाइ                                                                8  
‘वार्म अप’ मै पोखराको पर्यटन                                               

In [17]:
indices['सुनको भाउ एक लाख दुई हजार पाँच सय पुग्यो']

186

In [18]:
sig[45]

array([0.76159416, 0.76159416, 0.76159416, 0.76159803, 0.76159652,
       0.76159613, 0.76159743, 0.76159416, 0.76159702, 0.76160257,
       0.7616077 , 0.76159416, 0.76159416, 0.76159779, 0.76159909,
       0.76160054, 0.76160693, 0.76159726, 0.76159605, 0.76159416,
       0.76159655, 0.76159736, 0.76160263, 0.7616025 , 0.76160076,
       0.7616006 , 0.76159701, 0.76159632, 0.76159416, 0.76159643,
       0.76169934, 0.76159631, 0.76159822, 0.76159416, 0.76159884,
       0.76160276, 0.76159416, 0.76159602, 0.76159416, 0.76159732,
       0.76162663, 0.76160465, 0.76159647, 0.76159859, 0.76159641,
       0.76195848, 0.76159416, 0.76160685, 0.76159416, 0.76160405,
       0.76160945, 0.76163447, 0.76159751, 0.76159416, 0.76160545,
       0.76159843, 0.76160128, 0.76159847, 0.76159666, 0.76159999,
       0.7615985 , 0.76159946, 0.76159823, 0.7615967 , 0.76159662,
       0.76160249, 0.76159416, 0.76159416, 0.76159416, 0.76159737,
       0.76159781, 0.76159416, 0.76162107, 0.76159737, 0.76160

In [19]:
list(enumerate(sig[indices['सुनको भाउ एक लाख दुई हजार पाँच सय पुग्यो']]))

[(0, 0.7616049504683933),
 (1, 0.7615983465704065),
 (2, 0.761608268076897),
 (3, 0.7615968292715345),
 (4, 0.7616053662547796),
 (5, 0.7617444684104971),
 (6, 0.7615964156385462),
 (7, 0.7616094726816779),
 (8, 0.7616042902348148),
 (9, 0.7616046195480044),
 (10, 0.7615941559557649),
 (11, 0.7616263853908227),
 (12, 0.7616037928754427),
 (13, 0.7616057674889862),
 (14, 0.7616406058808842),
 (15, 0.7616156804858287),
 (16, 0.7616102250452579),
 (17, 0.7616332500715259),
 (18, 0.7616003710838889),
 (19, 0.761625134616318),
 (20, 0.7616026132989856),
 (21, 0.7615963674323367),
 (22, 0.7616246126923646),
 (23, 0.7616176712765598),
 (24, 0.7616505390259412),
 (25, 0.7616064463203962),
 (26, 0.761598724939355),
 (27, 0.7616094435201304),
 (28, 0.7615941559557649),
 (29, 0.7616031704370316),
 (30, 0.7615961939215832),
 (31, 0.7616189167865155),
 (32, 0.7615992430757401),
 (33, 0.7616075076851104),
 (34, 0.7616020643957784),
 (35, 0.7616109480031628),
 (36, 0.7615941559557649),
 (37, 0.761614

# Insert Keyword

In [20]:
headline_keyword=input("enter headline keyword")

print(headline_keyword)

enter headline keywordसुन
सुन


In [21]:
import re
news_headline=news[news['Titles'].str.contains(headline_keyword, na=False, flags=re.IGNORECASE, regex=True)]

In [22]:
news_headline

Unnamed: 0,Titles,Links
5,"सुनको भाउले लाख छुनै लाग्यो, प्रतितोला ९९ हजार ३ सय रुपैयाँ",https://ekantipur.com/business/2020/07/31/159618504852653322.html
14,"सुन तोलाको एक लाखनजिक, बजारमा ‘डेडलक’ हुने चिन्ता !",https://ekantipur.com/business/2020/08/01/159624602147066331.html
42,"सुनसान छ सौराहा, दशैं लक्षित तयारीमा जुटे व्यवसायी",https://www.onlinekhabar.com/2020/07/885989
46,सुनको भाउ तोलामै लाख नजिक,https://www.onlinekhabar.com/2020/07/885446
52,‘एनआरएन कम्पनीको आईपीओमा प्रतिफल सुनिश्चित छ’,https://www.onlinekhabar.com/2020/07/882948
89,सुनको भाउ प्रतितोला ९९ हजार ६०० पुग्यो,https://www.onlinekhabar.com/2020/08/886368
127,सुनको मूल्य तोलामा ९९ हजार ८ सय पुग्यो,https://www.onlinekhabar.com/2020/08/886867
148,तोलामा चार सयले घट्यो सुनको भाउ,https://gorkhapatraonline.com/economics/2020-06-23-17076
162,"सुन भाउ उकालो लागेको लाग्यै, प्रतितोला १,०२,५०० रुपैयाँ",https://ekantipur.com/business/2020/08/06/15966938813176253.html
171,प्रतितोला सुनको मूल्य १ लाख नाघ्यो,https://ekantipur.com/business/2020/08/05/15966213680902522.html


In [23]:
def give_rec(title, sig=sig):
    # Get the index corresponding to original_title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the news titles 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar news
    sig_scores = sig_scores[1:11]

    # News title indices
    news_indices = [i[0] for i in sig_scores]

    # Top 10 most similar news
    return news['Titles'].iloc[news_indices],news['Links'].iloc[news_indices]
   

In [25]:
# Testing our news recommendation system 
give_rec('सुनकाण्ड : छानविन समितिले कारबाही सिफारिसविनै बुझायो प्रतिवेदन')

(576    युरिया मल नपाएपछि किसानको आक्रोश : कृषि मन्त्रीलाई कारबाही गर      
 532    सुन तोलाको फेरि एक लाख कट्यो                                       
 54     संसदीय समितिले सरकारलाई सोध्यो-फोरजी सेवा विस्तारमा किन ढिलाइ ?    
 530    बाँकेमा मास्क नलगाउने र जोर बिजोर उल्लङ्घन गर्ने १७ सयलाई कारबाही  
 454    संसदीय समितिले भन्यो-पूर्वतयारी सकिएपछि मात्रै रेलको ठेक्का लगाउनू 
 14     सुन तोलाको एक लाखनजिक, बजारमा ‘डेडलक’ हुने चिन्ता !                
 556    बढ्ता नबोल्नुस् ! समातेर लैजाम् ?                                  
 371    मर्मत गरेका सवारी फेरि थन्किए                                      
 261    संघीयताको पुनर्व्याख्या : भौगोलिक र गैरभौगोलिक स्वायत्तताको समिश्रण
 492    प्रिमियर लिगको खेल तालिका सार्वजनिक                                
 Name: Titles, dtype: object,
 576    https://www.onlinekhabar.com/2020/08/891335                      
 532    https://gorkhapatraonline.com/economics/2020-08-18-20772         
 54     https://www.onlinekhabar.com/2020/07/886118           