# Initialize Database

In [None]:
# Load (or reload) the `sql` IPython extension so the %sql / %%sql magics used
# by the cells below are available.
%reload_ext sql
# Run lib.py inside this kernel so its top-level names become available here.
# Presumably it supplies the names this notebook uses without importing them
# (pd, is_not_blank, is_pos_to_extract, executeQuery, get_news_html, display,
# HTML) - TODO confirm against lib.py.
%run lib.py

# Open the database connection that the later %%sql cells run against.
# NOTE(review): host, user and the (empty) password are hard-coded in this URL;
# consider reading the connection string from an environment variable.
%sql postgresql+psycopg2://postgres:@127.0.0.1:5432/fcrec

# Import NLP libraries

In [None]:
import json
import nltk
import numpy as np

# NLTK resources fetched by this notebook. The tokenizer ('punkt') and the POS
# tagger are needed by the word-extraction cell; the NE chunker and word list
# are downloaded exactly as before.
NLTK_PACKAGES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
)

# Attempt every download (a list, not a generator, so all() cannot stop early)
# and display a single True only when all of them succeeded. Previously only
# the result of the last download was shown, hiding earlier failures.
all([nltk.download(package) for package in NLTK_PACKAGES])

[nltk_data] Downloading package punkt to /Users/dj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/dj/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/dj/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Embedding News Articles using Doc2Vec

## Extract Words from News

In [None]:
import gensim

# Load the MIND training news catalogue. The TSV has no header row, so the
# column names are supplied here.
df = pd.read_table("mind/train/news.tsv", sep='\t', header=None, names=[
    'id', 'category', 'subcategory', 'title', 'abstract', 'url',
    'title_entities', 'abstract_entities'
])

# One TaggedDocument per news article, tagged with the article id.
docs = []

for row_no, row in enumerate(df.itertuples(index=False), start=1):
    # Pool the POS-tagged tokens of the title AND the abstract. The previous
    # version assigned the abstract's tags over the title's, so the title was
    # silently dropped whenever an abstract was present.
    tagged = []
    for text in (row.title, row.abstract):
        # is_not_blank comes from outside this cell; it is used here to skip
        # missing/empty text exactly as before.
        if is_not_blank(text):
            tagged.extend(nltk.pos_tag(nltk.word_tokenize(text)))

    # Keep only the (word, tag) pairs accepted by is_pos_to_extract and
    # lower-case the surviving words.
    words = [t[0].lower() for t in tagged if is_pos_to_extract(t)]

    docs.append(gensim.models.doc2vec.TaggedDocument(words, [row.id]))

    # Progress marker every 1000 articles; counts rows processed rather than
    # relying on the DataFrame index being 0..n-1.
    if row_no % 1000 == 0:
        print(row_no)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000


## Train Doc2Vec Model 

In [None]:
import logging

# Surface gensim's INFO-level progress messages while the model is built.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train the Doc2Vec model that the following cells use as `model`; without
# these statements the notebook fails on a fresh kernel because `model` is
# never defined. The hyper-parameters mirror the ones reported in this cell's
# recorded log (20-dimensional vectors, min_count 2, 3 workers).
# NOTE(review): the number of epochs is not visible in the recorded log, so the
# library default is used here - confirm against the original run.
model = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=2, workers=3)
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

2021-11-03 23:07:02,071 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d20,n5,w5,mc2,s0.001,t3)', 'datetime': '2021-11-03T23:07:02.038140', 'gensim': '4.1.0', 'python': '3.9.7 (default, Oct 13 2021, 06:45:31) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platform': 'macOS-11.1-x86_64-i386-64bit', 'event': 'created'}
2021-11-03 23:07:02,074 : INFO : collecting all words and their counts
2021-11-03 23:07:02,075 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-11-03 23:07:02,156 : INFO : PROGRESS: at example #10000, processed 197305 words (2449664/s), 25834 word types, 10000 tags
2021-11-03 23:07:02,245 : INFO : PROGRESS: at example #20000, processed 414307 words (2461667/s), 36599 word types, 20000 tags
2021-11-03 23:07:02,333 : INFO : PROGRESS: at example #30000, processed 633617 words (2508880/s), 44745 word types, 30000 tags
2021-11-03 23:07:02,420 : INFO : PROGRESS: at example #40000, processed 863781 words (2679169/s), 52454 word types, 40000

## Assign Topics to Each News Article

In [None]:
from pathlib import Path

# Rows of [news_id, vector component index, unit-normalised component value].
news_topic_arr = []

for td in docs:
    news_id = td.tags[0]
    topics = model.infer_vector(td.words)
    norm = np.linalg.norm(topics)

    # A zero (or non-finite) norm cannot be normalised: dividing would produce
    # NaN, which is written as an empty CSV field and then rejected by the
    # NOT NULL column when the file is loaded. Skip such vectors instead.
    if not norm > 0:
        continue

    news_topic_arr.extend(
        [news_id, idx, t] for idx, t in enumerate(topics / norm)
    )

topic_df = pd.DataFrame(
    columns=['news_id', 'topic', 'w'],
    data=news_topic_arr
)

# Make sure the output directory exists so a fresh checkout can run this cell.
out_path = Path('temp/news_d2v_w.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
topic_df.to_csv(out_path, index=False)

## Load News Topic

In [None]:
%%sql
DROP TABLE IF EXISTS mind_train_news_d2v_w;

-- One row per (news article, vector component) with its normalised weight.
CREATE TABLE mind_train_news_d2v_w (
    news_id  varchar(10) NOT NULL,
    topic_id int         NOT NULL,
    w        float       NOT NULL
);

-- Server-side bulk load of the CSV written by the previous cell. The path is
-- absolute because the database server process reads the file. Columns are
-- matched by position and the header line is skipped.
COPY mind_train_news_d2v_w
FROM '/Users/dj/fcrec2021/03news/temp/news_d2v_w.csv'
DELIMITER ',' CSV HEADER;

-- Lookup by article first, then by component.
CREATE INDEX idx_mind_train_news_d2v_w_1
    ON mind_train_news_d2v_w (news_id, topic_id, w);

-- Lookup by component first, then by article.
CREATE INDEX idx_mind_train_news_d2v_w_2
    ON mind_train_news_d2v_w (topic_id, news_id, w);

 * postgresql+psycopg2://postgres:***@127.0.0.1:5432/fcrec
Done.
Done.
1025640 rows affected.
Done.
Done.


[]

# Create User Topic Interest Profile

In [None]:
%%sql
DROP TABLE IF EXISTS mind_train_user_d2v_w;

-- Per-user interest vector. The inner query sums the article weights over
-- every article in the user history, component by component. The outer query
-- divides each component by the length of that user vector.
CREATE TABLE mind_train_user_d2v_w AS
WITH user_d2v_w AS (
    SELECT a.user_id,
           b.topic_id,
           sum(b.w) AS w
    FROM mind_train_user_history a
        JOIN mind_train_news_d2v_w b ON b.news_id = a.news_id
    GROUP BY a.user_id, b.topic_id
)
SELECT user_id,
       topic_id,
       w / sqrt(sum(pow(w, 2)) OVER (PARTITION BY user_id)) AS w
FROM user_d2v_w;

CREATE INDEX idx_mind_train_user_d2v_w_1
    ON mind_train_user_d2v_w (user_id, topic_id, w);


 * postgresql+psycopg2://postgres:***@127.0.0.1:5432/fcrec
Done.
982160 rows affected.
Done.


[]

# Recommend Using Topics

In [None]:
# User to recommend for and number of articles to return.
user_id = 'U89675'
size = 20

# Score every article by summing three similarity signals between the user's
# profile and the article: shared words, shared entities, and the Doc2Vec
# vector components. The third branch now joins the Doc2Vec user profile with
# the Doc2Vec news table built in this notebook; it previously joined against
# mind_train_news_lda_w, whose topic ids belong to a different topic space.
# NOTE(review): user_id and size are interpolated straight into the SQL text.
# That is acceptable for the hard-coded values above, but switch to bound
# parameters if these ever come from user input.
query = f'''
    with rec as (
    select b.news_id, sum(a.w*b.w) sim
    from mind_train_user_word_w a
        join mind_train_news_word_w b on b.word = a.word
    where a.user_id = '{user_id}'
    group by b.news_id
    union all 
    select b.news_id, sum(a.w*b.w) sim
    from mind_train_user_entity_w a
        join mind_train_news_entity_w b on b.label = a.label
    where a.user_id = '{user_id}'
    group by b.news_id 
    union all 
    select b.news_id, sum(a.w*b.w) sim
    from mind_train_user_d2v_w a
        join mind_train_news_d2v_w b on b.topic_id = a.topic_id
    where a.user_id = '{user_id}'
    group by b.news_id 
    )
    select t.sim, s.id, s.category1, s.category2, s.title, s.abstract, s.url
    from (
        select news_id, sum(sim) sim
        from rec
        group by news_id
        order by sim desc
        limit {size} ) t join mind_train_news s on s.id = t.news_id
    order by t.sim desc;
    '''

# executeQuery and get_news_html are defined outside this cell.
result = executeQuery(query)

# Render every recommended article and show them as one HTML fragment.
html = "".join(get_news_html(news) for news in result)
display(HTML(html))