# How to find similar documents using a doc2vec model

# Read news dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('preprocessed_news_218posts.csv', sep='|')

# Get the latest news in a category for display on the Django website

In [3]:
df.iloc[0].item_id

'aipl_20220929_1'

In [4]:
cate = '娛樂'
df_cate = df[df.category == cate]

In [5]:
df_cate.iloc[0]

item_id                                                  amov_20220929_1
date                                                          2022-09-29
title                                              紀曉君當天兵媽太自然  曝戲外愛對女兒耍寶
content                歌手紀曉君和演員姚坤君在茁劇場新戲「誰說媽媽像月亮」是小情侶的冤家媽媽們，兩人今天在媒體茶敘...
sentiment                                                            1.0
summary                ['」紀曉君在「誰說媽媽像月亮」是名常對兒子搗亂的天兵媽媽', '歌手紀曉君和演員姚坤君在茁...
top_key_freq           [('紀曉君', 14), ('媽媽', 10), ('姚坤君', 6), ('老師', 5...
tokens                 ['歌手', '紀曉君', '和', '演員', '姚坤君', '在', '茁劇場', '新...
tokens_rm_stopwords    ['歌手', '紀曉君', '演員', '姚坤君', '茁劇場', '媽媽', '月亮', ...
entities               {(147, 150, 'PERSON', '紀曉君'), (37, 38, 'CARDIN...
token_pos              [('歌手', 'Na'), ('紀曉君', 'Nb'), ('和', 'Caa'), ('...
link                   https://www.cna.com.tw/news/amov/202209290356....
photo_link             https://imgcdn.cna.com.tw/www/WebPhotos/200/20...
category                                           

## Define function: get_cate_latest_news()

In [6]:
#-- Given a category, get the latest news
def get_cate_latest_news(cate, topn=5):
    """Return the last ``topn`` news items of one category.

    Rows are read from the module-level DataFrame ``df``. The rows whose
    ``category`` equals ``cate`` are selected and the last ``topn`` of them
    are returned in their original order.
    NOTE(review): "latest" relies on ``df`` being ordered so that newer
    rows come last -- confirm against how the dataset is built.

    Args:
        cate: Category label compared against ``df.category``.
        topn: Maximum number of items to return (default 5).

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``title``,
        ``link`` and ``photo_link``. The list is empty when no row matches.
    """
    wanted_columns = ["item_id", "category", "title", "link", "photo_link"]
    df_latest = df.loc[df.category == cate, wanted_columns].tail(topn)

    items = []
    for row in df_latest.itertuples(index=False):
        # A missing photo is stored as NaN; hand back an empty string instead.
        photo_link = "" if pd.isna(row.photo_link) else row.photo_link
        items.append({
            "id": row.item_id,
            "category": row.category,
            "title": row.title,
            "link": row.link,
            "photo_link": photo_link,
        })

    return items

In [7]:
df_cate.iloc[0]

item_id                                                  amov_20220929_1
date                                                          2022-09-29
title                                              紀曉君當天兵媽太自然  曝戲外愛對女兒耍寶
content                歌手紀曉君和演員姚坤君在茁劇場新戲「誰說媽媽像月亮」是小情侶的冤家媽媽們，兩人今天在媒體茶敘...
sentiment                                                            1.0
summary                ['」紀曉君在「誰說媽媽像月亮」是名常對兒子搗亂的天兵媽媽', '歌手紀曉君和演員姚坤君在茁...
top_key_freq           [('紀曉君', 14), ('媽媽', 10), ('姚坤君', 6), ('老師', 5...
tokens                 ['歌手', '紀曉君', '和', '演員', '姚坤君', '在', '茁劇場', '新...
tokens_rm_stopwords    ['歌手', '紀曉君', '演員', '姚坤君', '茁劇場', '媽媽', '月亮', ...
entities               {(147, 150, 'PERSON', '紀曉君'), (37, 38, 'CARDIN...
token_pos              [('歌手', 'Na'), ('紀曉君', 'Nb'), ('和', 'Caa'), ('...
link                   https://www.cna.com.tw/news/amov/202209290356....
photo_link             https://imgcdn.cna.com.tw/www/WebPhotos/200/20...
category                                           

In [8]:
type(df_cate.iloc[0].item_id)

str

In [9]:
get_cate_latest_news("政治")

[{'id': 'aipl_20220929_16',
  'category': '政治',
  'title': '林為洲提竹北舊城區新地標  推增建國民運動中心',
  'link': 'https://www.cna.com.tw/news/aipl/202209290239.aspx',
  'photo_link': 'https://imgcdn.cna.com.tw/www/WebPhotos/200/20220929/1051x768_20220929000161.jpg'},
 {'id': 'aipl_20220929_17',
  'category': '政治',
  'title': '成立農漁會後援會 楊文科續推做好基層農村建設',
  'link': 'https://www.cna.com.tw/news/aipl/202209290237.aspx',
  'photo_link': 'https://imgcdn.cna.com.tw/www/WebPhotos/200/20220929/1188x768_20220929000168.jpg'},
 {'id': 'aipl_20220929_18',
  'category': '政治',
  'title': '18歲公民權修憲案公告期滿 11/26公民複決',
  'link': 'https://www.cna.com.tw/news/aipl/202209290232.aspx',
  'photo_link': 'https://imgcdn.cna.com.tw/www/webphotos/WebCover/420/20220929/800x600_643333227861.jpg'},
 {'id': 'aipl_20220929_19',
  'category': '政治',
  'title': '世界台商聯合總會閉幕  賴清德籲替民主台灣發聲',
  'link': 'https://www.cna.com.tw/news/aipl/202209290224.aspx',
  'photo_link': 'https://imgcdn.cna.com.tw/www/WebPhotos/200/20220929/1365x768_2022092900016

# Get news content for display on the Django website

In [10]:
itemid = 'amov_20220929_1'

In [11]:
df.loc[df.item_id == itemid]

Unnamed: 0,item_id,date,title,content,sentiment,summary,top_key_freq,tokens,tokens_rm_stopwords,entities,token_pos,link,photo_link,category
100,amov_20220929_1,2022-09-29,紀曉君當天兵媽太自然 曝戲外愛對女兒耍寶,歌手紀曉君和演員姚坤君在茁劇場新戲「誰說媽媽像月亮」是小情侶的冤家媽媽們，兩人今天在媒體茶敘...,1.0,"['」紀曉君在「誰說媽媽像月亮」是名常對兒子搗亂的天兵媽媽', '歌手紀曉君和演員姚坤君在茁...","[('紀曉君', 14), ('媽媽', 10), ('姚坤君', 6), ('老師', 5...","['歌手', '紀曉君', '和', '演員', '姚坤君', '在', '茁劇場', '新...","['歌手', '紀曉君', '演員', '姚坤君', '茁劇場', '媽媽', '月亮', ...","{(147, 150, 'PERSON', '紀曉君'), (37, 38, 'CARDIN...","[('歌手', 'Na'), ('紀曉君', 'Nb'), ('和', 'Caa'), ('...",https://www.cna.com.tw/news/amov/202209290356....,https://imgcdn.cna.com.tw/www/WebPhotos/200/20...,娛樂


In [12]:
df.loc[df.item_id == itemid].item_id

100    amov_20220929_1
Name: item_id, dtype: object

In [13]:
df.loc[df.item_id == itemid].item_id.tolist()

['amov_20220929_1']

In [14]:
df.loc[df.item_id == itemid].item_id.tolist()[0]

'amov_20220929_1'

## Define function: get_news_content()

In [15]:
# -- Given an item_id, get document information
def get_news_content(item_id):
    """Look up one news item in the module-level ``df`` and return its details.

    The first row whose ``item_id`` column equals ``item_id`` is used; an
    ``IndexError`` propagates from ``.iloc[0]`` when no row matches.

    Returns:
        A dict with the keys ``id``, ``category``, ``title``, ``content``,
        ``link``, ``date`` and ``photo_link`` (``''`` when the stored photo
        link is NaN).
    """
    row = df[df.item_id == item_id].iloc[0]

    # A missing photo is stored as NaN; expose it as an empty string.
    photo = row.photo_link
    if pd.isna(photo):
        photo = ''

    return {
        "id": item_id,
        "category": row.category,
        "title": row.title,
        "content": row.content,
        "link": row.link,
        "date": row.date,
        "photo_link": photo,
    }

In [16]:
get_news_content(itemid)

{'id': 'amov_20220929_1',
 'category': '娛樂',
 'title': '紀曉君當天兵媽太自然  曝戲外愛對女兒耍寶',
 'content': '歌手紀曉君和演員姚坤君在茁劇場新戲「誰說媽媽像月亮」是小情侶的冤家媽媽們，兩人今天在媒體茶敘隔空合體，紀曉君更自曝戲裡戲外都是無厘頭媽媽，常在女兒做功課時旁邊大叫。紀曉君和姚坤君兩人分別飾演張耀仁和葛盈瑄的母親，兩人在戲中一見面就在高級茶館攤牌，姚坤君開了張支票，要張耀仁跟自己女兒分手。沒想到紀曉君根本不買帳，沒拿支票就走，還只付自己的茶錢，害姚坤君被服務生攔下買單。兩人笑言拍完都笑得要命。特別的是，紀曉君是「誰說媽媽像月亮」導演王小棣親自到台東請來演戲。她回憶，聽聞小棣老師要來，還特地煮了燒酒雞迎接貴客。當時表舅和叔公胡德夫都在現場，當她得知小棣老師和胡德夫都是淡江大學畢業，兩人也認識就卸下心防，願意把自己交給小棣老師，「那一刻起，我就把老師當家人，只要被我當家人，很多事我都願意聽。」在跟王小棣對戲及畫面講解時，紀曉君怕自己背不起台詞，乾脆直接把自己歸零，融入角色。身為表演老師的姚坤君也稱讚紀曉君的做法是對的，「拍戲第一要愛上導演，並且愛上自己的角色。如果妳都不愛，那就進入不了劇情和角色，痛苦的會是演員自己。」她也對紀曉君的角色挑戰很羨慕，「如果是我，我會跪求演出這角色。」紀曉君在「誰說媽媽像月亮」是名常對兒子搗亂的天兵媽媽，真實生活裡紀曉君也說自己真的是個無厘頭媽媽，「我女兒在做功課，我會跑去突然大叫或鬧她，她就說，不要這樣啦！有時候女兒還會被我嚇哭，那我就很開心，然後呼呼她！」紀曉君強調會在孩子面前耍神經、耍寶，但嚴格起來孩子是很怕她的，她對孩子有要求。在劇中兩名媽媽都是沒有登記結婚，紀曉君坦言自己很保守，不可能沒結婚就幫對方生孩子，「如果生了，也是跟我姓。」姚坤君說如果不登記，遇到未來誰生病必須對方簽字，就得自負責任。「誰說媽媽像月亮」將於10月1日起，每週六晚上9時在MyVideo、公視同步播出，晚上10時台視播出；隔天在中華電信MOD、Hami Video晚上8時上架。紀曉君紀曉君幽默扮無厘頭媽媽  王小棣赴台東邀戲',
 'link': 'https://www.cna.com.tw/news/amov/202209290356.aspx',
 'date': '202

# Load Doc2vec model

In [17]:
from gensim.models.doc2vec import Doc2Vec
model = Doc2Vec.load("cna_news_doc2vec.model")

# Given new keywords, find similar documents

In [18]:
keywords = ['智慧', '台灣' ]

# infer a vector for the keywords from the doc2vec model (same vector space as the documents)
new_vector = model.infer_vector( keywords )

In [19]:
new_vector

array([-4.7160592e-03,  3.2768238e-03, -2.8133024e-03, -1.1807955e-03,
       -1.1129719e-03, -1.6098281e-03, -7.6177780e-04, -6.0695346e-04,
       -1.1695381e-03,  1.4763763e-03, -5.8485242e-04,  2.0626283e-03,
       -1.7065278e-03, -1.1904420e-03,  4.1843337e-04, -3.8478917e-03,
        1.0454564e-03, -4.3906411e-03, -1.1751567e-03, -2.8224788e-03,
        3.7753198e-03,  7.0818508e-04,  1.9697328e-03,  1.3359085e-03,
        6.2692817e-04,  1.5571113e-03, -1.2470703e-03,  1.7611852e-03,
        7.9061632e-04,  1.4037122e-03,  3.3342693e-04,  1.3315830e-03,
       -8.5431186e-04,  4.7459970e-03,  3.9913342e-03, -3.3691577e-03,
        2.6052987e-04, -4.1371854e-03,  1.9337260e-04,  3.3068380e-03,
        1.7197124e-03, -4.7526825e-03, -1.2931587e-03,  4.3095648e-03,
       -1.3693612e-03,  7.2260341e-04,  4.2667063e-03, -8.4511604e-04,
        2.6757994e-03,  2.4314271e-03,  9.1812562e-04,  6.7808194e-04,
       -1.9683172e-03, -2.2587993e-03, -1.7412434e-03, -2.6634252e-03,
      

In [20]:
# based on the vector, find several nearby tags (item_id) 
model.docvecs.most_similar(positive = [new_vector], topn = 5  )

  model.docvecs.most_similar( positive= [new_vector], topn=5  )


[('ahel_20220929_4', 0.18443980813026428),
 ('ahel_20220929_3', 0.1760752946138382),
 ('asc_20220930_2', 0.16297820210456848),
 ('asoc_20220929_1', 0.14656753838062286),
 ('aie_20220929_7', 0.122268907725811)]

In [22]:
keywords = ['智慧', '農業']

# Infer a vector for the keyword list, then print the category and title of
# the three closest documents. `model.dv` is the gensim 4 name for the
# deprecated `model.docvecs` (the old name only triggers a warning here).
new_vector = model.infer_vector(keywords)
for similar_id, _score in model.dv.most_similar(positive=[new_vector], topn=3):
    row = df[df.item_id == similar_id].iloc[0]
    print(row.category, row.title)

運動 兄弟逆轉秀奪9連勝 與龍頭桃猿勝差剩1場[影]
兩岸 中共20大前收緊輿論 官方掃蕩社群8萬多筆謠言
科技 Meta：監察委員會設立2年來 收到逾百萬件申訴


  for item in model.docvecs.most_similar( positive = [new_vector], topn =3 ):


## All-in-one function: get_keywords_most_similar()

In [23]:
#--Given news keywords, find similar documents 
def get_keywords_most_similar(keywords, topn=5):
    """Find the documents closest to a list of keywords.

    A vector is inferred for ``keywords`` with the module-level Doc2Vec
    ``model``; the ``topn`` nearest document tags are then looked up in the
    module-level DataFrame ``df``.

    Args:
        keywords: List of token strings passed to ``model.infer_vector``.
        topn: Number of similar documents requested from the model
            (default 5).

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``title``,
        ``link``, ``score`` (similarity rounded to two decimals) and
        ``photo_link`` (``''`` when the stored value is NaN). Tags that have
        no matching row in ``df`` are left out, so the list can be shorter
        than ``topn``.
    """
    new_vector = model.infer_vector(keywords)
    # gensim 4 renamed `docvecs` to `dv`; fall back to the old name so the
    # function also runs on older releases.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
    similar_items = doc_vectors.most_similar(positive=[new_vector], topn=topn)

    items = []
    for item_id, score in similar_items:
        matches = df[df.item_id == item_id]
        if matches.empty:
            # The model knows this tag but the loaded dataset does not.
            continue
        row = matches.iloc[0]

        # A missing photo is stored as NaN; hand back an empty string instead.
        photo_link = "" if pd.isna(row.photo_link) else row.photo_link

        items.append({
            "id": item_id,
            "category": row.category,
            "title": row.title,
            "link": row.link,
            "score": round(score, 2),
            "photo_link": photo_link,
        })

    return items

In [24]:
keywords = ['台北', '農業' ]

get_keywords_most_similar(keywords)

  similar_items = model.docvecs.most_similar(positive=[new_vector], topn=5)


[{'id': 'aipl_20220929_18',
  'category': '政治',
  'title': '18歲公民權修憲案公告期滿 11/26公民複決',
  'link': 'https://www.cna.com.tw/news/aipl/202209290232.aspx',
  'score': 0.56,
  'photo_link': 'https://imgcdn.cna.com.tw/www/webphotos/WebCover/420/20220929/800x600_643333227861.jpg'},
 {'id': 'acn_20220929_2',
  'category': '兩岸',
  'title': '刺激房市 中國階段性放寬首套住房貸款利率下限',
  'link': 'https://www.cna.com.tw/news/acn/202209290381.aspx',
  'score': 0.56,
  'photo_link': ''},
 {'id': 'asc_20220929_4',
  'category': '證卷',
  'title': '土洋同步買超華新 外資9月大賣台積電和鴻海',
  'link': 'https://www.cna.com.tw/news/afe/202209290276.aspx',
  'score': 0.56,
  'photo_link': ''},
 {'id': 'aopl_20220929_15',
  'category': '國際',
  'title': '賀錦麗剛離開南韓返美 北韓5天內3度射彈',
  'link': 'https://www.cna.com.tw/news/aopl/202209290361.aspx',
  'score': 0.55,
  'photo_link': 'https://imgcdn.cna.com.tw/www/webphotos/WebCover/420/20220929/1134x851_83859563352.jpg'},
 {'id': 'ahel_20220929_9',
  'category': '生活',
  'title': '嘉義蒜頭糖廠五分車 延駛高鐵站完成履勘',
  'link

# Given an item_id, find similar documents

In [25]:
# Find the top-N most similar documents

# e.g., find similar documents for the first item_id
model.docvecs.most_similar([itemid], topn = 5)

  model.docvecs.most_similar([itemid], topn = 5)


[('aopl_20220929_6', 0.9602535963058472),
 ('ait_20220922_20', 0.9592099785804749),
 ('asc_20220929_6', 0.9583355784416199),
 ('ahel_20220929_10', 0.9576078057289124),
 ('aopl_20220929_8', 0.9561548829078674)]

## All-in-one function: get_itemid_most_similar()

In [26]:
#-- Given item_id, get three similar news based on the doc2vec model
def get_itemid_most_similar(item_id, topn=3):
    """Find the documents most similar to an existing document.

    ``item_id`` is passed to the module-level Doc2Vec ``model`` as a document
    tag; the ``topn`` nearest tags are then looked up in the module-level
    DataFrame ``df``.
    NOTE(review): gensim presumably raises ``KeyError`` when ``item_id`` is
    not a tag the model was trained with -- confirm before relying on it.

    Args:
        item_id: Tag of the reference document.
        topn: Number of similar documents requested from the model
            (default 3).

    Returns:
        A list of dicts with the keys ``category``, ``title``, ``link``,
        ``id``, ``score`` (similarity rounded to two decimals) and
        ``photo_link`` (``''`` when the stored value is NaN). Tags that have
        no matching row in ``df`` are left out.
    """
    # gensim 4 renamed `docvecs` to `dv`; fall back to the old name so the
    # function also runs on older releases.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
    similar_items = doc_vectors.most_similar(positive=[item_id], topn=topn)

    items = []
    # Use a separate loop name so the `item_id` argument is not overwritten.
    for similar_id, score in similar_items:
        matches = df[df.item_id == similar_id]
        if matches.empty:
            # The model knows this tag but the loaded dataset does not.
            continue
        row = matches.iloc[0]

        # A missing photo is stored as NaN; hand back an empty string instead.
        photo_link = "" if pd.isna(row.photo_link) else row.photo_link

        items.append({
            "category": row.category,
            "title": row.title,
            "link": row.link,
            "id": similar_id,
            "score": round(score, 2),
            "photo_link": photo_link,
        })
    return items

In [27]:
item_id =  itemid
get_itemid_most_similar(item_id)

  similar_items = model.docvecs.most_similar(positive=[item_id], topn=3)


[{'category': '國際',
  'title': '北溪天然氣管線疑遭人為破壞  外媒分析一次看',
  'link': 'https://www.cna.com.tw/news/aopl/202209290384.aspx',
  'id': 'aopl_20220929_6',
  'score': 0.96,
  'photo_link': ''},
 {'category': '科技',
  'title': '成功的人不一定最有才華 2022搞笑諾貝爾：運氣才是關鍵',
  'link': 'https://www.cna.com.tw/news/ait/202209225011.aspx',
  'id': 'ait_20220922_20',
  'score': 0.96,
  'photo_link': 'https://imgcdn.cna.com.tw/www/webphotos/WebCover/420/20220922/1778x1333_624960633396.jpg'},
 {'category': '證卷',
  'title': '台股重返13500點 法人：觀察新台幣匯率與護盤政策',
  'link': 'https://www.cna.com.tw/news/afe/202209290225.aspx',
  'id': 'asc_20220929_6',
  'score': 0.96,
  'photo_link': ''}]

# Django views.py

In [None]:
from django.shortcuts import render
from django.http import JsonResponse
import pandas as pd
from django.views.decorators.csrf import csrf_exempt


# Notice: using different gensim version will cause errors
from gensim.models.doc2vec import Doc2Vec

# Load news data
import app_user_keyword.views as userkeyword_views
def load_df_data_v2():
    """Bind this module's global ``df`` to ``userkeyword_views.df``.

    Nothing is read from disk here; the name is simply pointed at the
    DataFrame object already held by the ``app_user_keyword`` views module.
    """
    # Rebind the module-level name that the helper functions in this file read.
    global df
    df = userkeyword_views.df

# call load data function when starting server
load_df_data_v2()

# Load doc2vec model
def load_doc2vec_model():
    """Load the saved Doc2Vec model into this module's global ``model``.

    The file is read from ``dataset/cna_news_doc2vec.model``.
    NOTE(review): the path is relative, presumably to the directory the
    Django server is started from -- confirm.
    """
    # Rebind the module-level name that the similarity helpers in this file read.
    global model
    model = Doc2Vec.load("dataset/cna_news_doc2vec.model")

# call load model function when starting server
load_doc2vec_model()

#-- home page
def home(request):
    """Render the ``app_news_rcmd/home.html`` template for the given request."""
    return render(request, "app_news_rcmd/home.html")

#-- APIs (three endpoints)
#-- API: input category
@csrf_exempt
def api_cate_news(request):
    """Return the latest news of the category given in POST field ``category``.

    The JSON response has a single key, ``latest_news``, holding the list
    produced by ``get_cate_latest_news``.
    """
    latest_news = get_cate_latest_news(request.POST['category'])
    payload = {"latest_news": latest_news}
    return JsonResponse(payload)

#-- API: input keywords, get top 5 similar news 
#@csrf_exempt
def api_keywords_similar_news(request):
    """Return the news most similar to the keywords in POST field ``tokens``.

    ``tokens`` is split on whitespace into a keyword list, which is passed to
    ``get_keywords_most_similar``. The JSON response has a single key,
    ``data``, holding the resulting list.

    When ``tokens`` is missing or contains only whitespace there is nothing
    to search for, so ``data`` is an empty list and the model is not queried.
    """
    keywords = request.POST.get('tokens', '').split()
    if not keywords:
        return JsonResponse({"data": []})

    response = get_keywords_most_similar(keywords)
    return JsonResponse({"data": response})

#-- API: input news_id, and then get news information
@csrf_exempt
def api_news_content(request):
    """Return one news item plus its related news.

    The item is identified by POST field ``news_id``. The JSON response has
    two keys: ``news_content`` (from ``get_news_content``) and
    ``related_news`` (from ``get_itemid_most_similar``).
    """
    news_id = request.POST['news_id']
    payload = {
        "news_content": get_news_content(news_id),
        "related_news": get_itemid_most_similar(news_id),
    }
    return JsonResponse(payload)


# -- Given an item_id, get document information
def get_news_content(item_id):
    """Look up one news item in the module-level ``df`` and return its details.

    The first row whose ``item_id`` column equals ``item_id`` is used; an
    ``IndexError`` propagates from ``.iloc[0]`` when no row matches.

    Returns:
        A dict with the keys ``id``, ``category``, ``title``, ``content``,
        ``link``, ``date`` and ``photo_link`` (``''`` when the stored photo
        link is NaN).
    """
    row = df[df.item_id == item_id].iloc[0]

    # A missing photo is stored as NaN; expose it as an empty string.
    photo = row.photo_link
    if pd.isna(photo):
        photo = ''

    return {
        "id": item_id,
        "category": row.category,
        "title": row.title,
        "content": row.content,
        "link": row.link,
        "date": row.date,
        "photo_link": photo,
    }

#-- Given a category, get the latest news
def get_cate_latest_news(cate, topn=5):
    """Return the last ``topn`` news items of one category.

    Rows are read from the module-level DataFrame ``df``. The rows whose
    ``category`` equals ``cate`` are selected and the last ``topn`` of them
    are returned in their original order.
    NOTE(review): "latest" relies on ``df`` being ordered so that newer
    rows come last -- confirm against how the dataset is built.

    Args:
        cate: Category label compared against ``df.category``.
        topn: Maximum number of items to return (default 5).

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``title``,
        ``content``, ``link`` and ``photo_link``. The list is empty when no
        row matches.
    """
    wanted_columns = ["item_id", "category", "title", "content", "link", "photo_link"]
    df_latest = df.loc[df.category == cate, wanted_columns].tail(topn)

    items = []
    for row in df_latest.itertuples(index=False):
        # A missing photo is stored as NaN; hand back an empty string instead.
        photo_link = "" if pd.isna(row.photo_link) else row.photo_link
        items.append({
            "id": row.item_id,
            "category": row.category,
            "title": row.title,
            "content": row.content,
            "link": row.link,
            "photo_link": photo_link,
        })

    return items

#--Given news keywords, find similar documents 
def get_keywords_most_similar(keywords, topn=5):
    """Find the documents closest to a list of keywords.

    A vector is inferred for ``keywords`` with the module-level Doc2Vec
    ``model``; the ``topn`` nearest document tags are then looked up in the
    module-level DataFrame ``df``.

    Args:
        keywords: List of token strings passed to ``model.infer_vector``.
        topn: Number of similar documents requested from the model
            (default 5).

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``title``,
        ``link``, ``score`` (similarity rounded to two decimals) and
        ``photo_link`` (``''`` when the stored value is NaN). Tags that have
        no matching row in ``df`` are left out, so the list can be shorter
        than ``topn``.
    """
    new_vector = model.infer_vector(keywords)
    # gensim 4 renamed `docvecs` to `dv`; fall back to the old name so the
    # function also runs on older releases.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
    similar_items = doc_vectors.most_similar(positive=[new_vector], topn=topn)

    items = []
    for item_id, score in similar_items:
        matches = df[df.item_id == item_id]
        if matches.empty:
            # The model knows this tag but the loaded dataset does not.
            continue
        row = matches.iloc[0]

        # A missing photo is stored as NaN; hand back an empty string instead.
        photo_link = "" if pd.isna(row.photo_link) else row.photo_link

        items.append({
            "id": item_id,
            "category": row.category,
            "title": row.title,
            "link": row.link,
            "score": round(score, 2),
            "photo_link": photo_link,
        })

    return items

#-- Given item_id, get three similar news based on the doc2vec model
def get_itemid_most_similar(item_id, topn=3):
    """Find the documents most similar to an existing document.

    ``item_id`` is passed to the module-level Doc2Vec ``model`` as a document
    tag; the ``topn`` nearest tags are then looked up in the module-level
    DataFrame ``df``.
    NOTE(review): gensim presumably raises ``KeyError`` when ``item_id`` is
    not a tag the model was trained with -- confirm before relying on it.

    Args:
        item_id: Tag of the reference document.
        topn: Number of similar documents requested from the model
            (default 3).

    Returns:
        A list of dicts with the keys ``category``, ``title``, ``link``,
        ``id``, ``score`` (similarity rounded to two decimals) and
        ``photo_link`` (``''`` when the stored value is NaN). Tags that have
        no matching row in ``df`` are left out.
    """
    # gensim 4 renamed `docvecs` to `dv`; fall back to the old name so the
    # function also runs on older releases.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
    similar_items = doc_vectors.most_similar(positive=[item_id], topn=topn)

    items = []
    # Use a separate loop name so the `item_id` argument is not overwritten.
    for similar_id, score in similar_items:
        matches = df[df.item_id == similar_id]
        if matches.empty:
            # The model knows this tag but the loaded dataset does not.
            continue
        row = matches.iloc[0]

        # A missing photo is stored as NaN; hand back an empty string instead.
        photo_link = "" if pd.isna(row.photo_link) else row.photo_link

        items.append({
            "category": row.category,
            "title": row.title,
            "link": row.link,
            "id": similar_id,
            "score": round(score, 2),
            "photo_link": photo_link,
        })
    return items

print("app_doc2vec -- 今日新聞瀏覽與新聞推薦載入成功!")
