# Load Dataset

In [2]:
import pandas as pd
merged_df = pd.read_csv("./merged_df.csv")
merged_df.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,user_id,clicked,content
0,N37243,finance,finance-real-estate,the 25 most desirable places to live in the us...,check out where u s residents would live if th...,https://assets.msn.com/labs/mind/AABvlID.html,[],"[{""Label"": ""United States"", ""Type"": ""G"", ""Wiki...",unknown,1.0,the 25 most desirable places to live in the us...
1,N25540,finance,finance-saving-investing,take heart millennials investing is within you...,news headlines might lead you to believe that ...,https://assets.msn.com/labs/mind/AAEmGBr.html,[],[],unknown,1.0,take heart millennials investing is within you...
2,N37129,finance,finance-taxes,don t be like these celebrities convicted of t...,these celebs were in hot water with the taxman,https://assets.msn.com/labs/mind/AAEGGF9.html,"[{""Label"": ""Tax evasion"", ""Type"": ""C"", ""Wikida...",[],U73032,0.0,don t be like these celebrities convicted of t...
3,N36064,finance,finance-companies,more store closings coming the list of retaile...,more than 10 months into 2019 more than 8 600 ...,https://assets.msn.com/labs/mind/AADN84N.html,[],[],unknown,1.0,more store closings coming the list of retaile...
4,N63006,finance,finance-savemoney,17 surprising ways penny pinching costs you more,frugal living could end up costing you in the ...,https://assets.msn.com/labs/mind/AAB4M2y.html,[],[],U67894,0.0,17 surprising ways penny pinching costs you mo...


# Content Base Recommendation System 

In [7]:
#TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer(stop_words = 'english' , max_df = 0.8  , min_df = 5 , ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform(merged_df['content'])
tfidf_matrix.toarray()


 ## cosine similariy 
con_sim =cosine_similarity(tfidf_matrix , tfidf_matrix)
merged_df=merged_df.reset_index(drop=True)

In [11]:
## Recommendation Function
import re
def content_base_rec(title , top_n=5):
    title = re.sub(r"\w+" , " "  , title)
    title  = title.lower().strip()

    title_Vec = vectorizer.transform([title])

    cosine_scores=cosine_similarity(title_Vec,tfidf_matrix).flatten()
    top_indices=cosine_scores.argsort()[::-1][:top_n]

    return merged_df.loc[top_indices, ["news_id" , "title" , "category" , "subcategory" , "url" , "abstract"]]

content_base_rec("Top investment #$@#$@ strategies  in 2025")

Unnamed: 0,news_id,title,category,subcategory,url,abstract
0,N37243,the 25 most desirable places to live in the us...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AABvlID.html,check out where u s residents would live if th...
3106,N6940,kaiser permanente ceo and affordable health ca...,finance,finance-career-education,https://assets.msn.com/labs/mind/BBWzANI.html,health care provider kaiser permanente said su...
3105,N241,are stores open on veterans day target aldi wa...,finance,finance-companies,https://assets.msn.com/labs/mind/BBWz0up.html,will shoppers be able to make the most of vete...
16,N9947,10 job skills worth six figure salaries,finance,finance-career-education,https://assets.msn.com/labs/mind/AAEzJ4H.html,learn these skills to get you closer to six fi...
17,N15651,the penthouse of nyc s woolworth building just...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AAJJaaX.html,the condo comes as a white box unit meaning it...


In [12]:
content_base_rec("How to save more money on taxes this year")

Unnamed: 0,news_id,title,category,subcategory,url,abstract
0,N37243,the 25 most desirable places to live in the us...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AABvlID.html,check out where u s residents would live if th...
3106,N6940,kaiser permanente ceo and affordable health ca...,finance,finance-career-education,https://assets.msn.com/labs/mind/BBWzANI.html,health care provider kaiser permanente said su...
3105,N241,are stores open on veterans day target aldi wa...,finance,finance-companies,https://assets.msn.com/labs/mind/BBWz0up.html,will shoppers be able to make the most of vete...
16,N9947,10 job skills worth six figure salaries,finance,finance-career-education,https://assets.msn.com/labs/mind/AAEzJ4H.html,learn these skills to get you closer to six fi...
17,N15651,the penthouse of nyc s woolworth building just...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AAJJaaX.html,the condo comes as a white box unit meaning it...


In [14]:
# Colloborative filtering
from sklearn.preprocessing import LabelEncoder
user_encode = LabelEncoder()
item_encode = LabelEncoder()

merged_df['user_enc']  = user_encode.fit_transform(merged_df['user_id'])
merged_df['news_enc'] = item_encode.fit_transform(merged_df['news_id'])



user_item_matrix = merged_df.pivot_table(index='user_enc' , columns='news_id' , values='clicked' , fill_value=0)


user_similarity = cosine_similarity(user_item_matrix)
user_similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], shape=(845, 845))

In [15]:
## Recommendation Fucntion

def colaborative_base_rec(input_user , df , top_k=5):
    user_idx = user_encode.transform([input_user])[0]
    sim_scores = user_similarity[user_idx]
    user_clicked = user_item_matrix.iloc[user_idx]
    weighted_scores = sim_scores @ user_item_matrix.values
    weighted_scores[user_clicked  == 1] = 0
    top_new_indicies = weighted_scores.argsort()[::-1][:top_k]
    recommendation_news_id = item_encode.inverse_transform(top_new_indicies)
    recommend_news = df[df['news_id'].isin(recommendation_news_id)][[
        'news_id' , 'title' , 'category' , 'subcategory' , 'url' , 'abstract'
    ]].drop_duplicates(subset='news_id')
    return  recommend_news.reset_index(drop=True)


recommendations = colaborative_base_rec(input_user='U91836' ,df = merged_df)
recommendations

Unnamed: 0,news_id,title,category,subcategory,url,abstract
0,N1037,ceo departures hit a new high in october on pa...,finance,finance-companies,https://assets.msn.com/labs/mind/AAJWqVs.html,october marked the highest month on record wit...
1,N9988,everybody is calling twin snow storms mean goo...,finance,financenews,https://assets.msn.com/labs/mind/AAJytd9.html,with slick snowy roads leading to a lot of sli...
2,N1039,supreme court rejects charter appeal of sprint...,finance,finance-companies,https://assets.msn.com/labs/mind/AAJOpA2.html,usa court charter commns sprint corp update 1 ...
3,N1004,small business lessons from blue collar millio...,finance,finance-small-business,https://assets.msn.com/labs/mind/BBWpEsR.html,not all millionaires wear a suit
4,N9989,how should billionaires spend their money to f...,finance,markets,https://assets.msn.com/labs/mind/BBWE4pA.html,is it better to invest in developing clean ene...


In [17]:
# Hybrid Recommendation System 
def hybrid_recommendations(user_id,title, df,top_n=5):
    # clean title
    title = re.sub(r'\W+', ' ', title)  # Remove special chars
    title = title.lower().strip()
    
    
    # Get content-based recommendations
    content_recs = content_base_rec(title)
    
    # Get collaborative-based recommendations
    collab_recs = colaborative_base_rec(user_id, df)

    # Merge the recommendations
    combined_recs = pd.concat([content_recs, collab_recs]).drop_duplicates().reset_index(drop=True)
    
    # Limit the number of recommendations to the top 'n' 
    combined_recs = combined_recs.head(top_n)
    
    return combined_recs

hybrid_recommendations(
    user_id='U91836',
    title="The 1 reason you shouldnâ€™t hesitate to claim Social Security",
    df=merged_df,
)

Unnamed: 0,news_id,title,category,subcategory,url,abstract
0,N37243,the 25 most desirable places to live in the us...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AABvlID.html,check out where u s residents would live if th...
1,N6940,kaiser permanente ceo and affordable health ca...,finance,finance-career-education,https://assets.msn.com/labs/mind/BBWzANI.html,health care provider kaiser permanente said su...
2,N241,are stores open on veterans day target aldi wa...,finance,finance-companies,https://assets.msn.com/labs/mind/BBWz0up.html,will shoppers be able to make the most of vete...
3,N9947,10 job skills worth six figure salaries,finance,finance-career-education,https://assets.msn.com/labs/mind/AAEzJ4H.html,learn these skills to get you closer to six fi...
4,N15651,the penthouse of nyc s woolworth building just...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AAJJaaX.html,the condo comes as a white box unit meaning it...
