In [1]:
# importing necessary libraries
import glob
import numpy as np
import re
import requests
import pandas as pd
from pyvi.ViTokenizer import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import warnings
warnings.filterwarnings("ignore")


In [2]:
# read all file csv in raw folder
path = r'raw'
all_files = glob.glob(path + "/*.csv")
history = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)


In [3]:
# split attributes about video base on url
for i in range(len(history)):
    text = history.loc[i, 'url'].split('/')[-1]
    info_video = re.findall(r'\d+', text)
    history.loc[i, 'id'] = info_video[0]  # id video
    history.loc[i, 'from_webapp'] = info_video[1]  # from_webapp
    history.loc[i, 'web_id'] = info_video[2]  # from_webapp

history.drop_duplicates(subset='id', inplace=True)
history.reset_index(inplace=True, drop=True)


In [4]:
history.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643 entries, 0 to 642
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   url             643 non-null    object 
 1   desc_video      623 non-null    object 
 2   like_count      643 non-null    object 
 3   comment_count   643 non-null    object 
 4   like            643 non-null    int64  
 5   time_container  643 non-null    object 
 6   timestamp       643 non-null    float64
 7   user            643 non-null    object 
 8   id              643 non-null    object 
 9   from_webapp     643 non-null    object 
 10  web_id          643 non-null    object 
dtypes: float64(1), int64(1), object(9)
memory usage: 55.4+ KB


In [5]:
history['desc_video'] = history['desc_video'].astype(str)


In [6]:
def download_stopwords():
    """
    Get stopwords from GitHub
    """
    stopwords_url = 'https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt'
    response = requests.get(stopwords_url)
    stopwords = response.text.split('\n')
    stopwords = set(stopwords)
    return stopwords


stopwords = download_stopwords()


In [7]:
def filter_stop_words(sentences, stop_words):
    new_sent = [word for word in sentences.split() if word not in stop_words]
    sentences = ' '.join(new_sent)
    return sentences


def de_emojify(text):
    regrex_pattern = re.compile(pattern="["
                                        u"\U0001F600-\U0001F64F"  # emoticons
                                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                        "]+", flags=re.UNICODE)
    return regrex_pattern.sub(r'', text)


def preprocess(text, tokenized=True, lowercased=True):
    text = text.lower() if lowercased else text
    text = de_emojify(text)
    text = ViTokenizer.tokenize(text) if tokenized else text
    text = re.sub(r'[^\w]', ' ', text)
    text = filter_stop_words(text, stopwords)
    return text


def pre_process_features(sentences, tokenized=True, lowercased=True):
    sentences = [preprocess(str(p), tokenized=tokenized,
                            lowercased=lowercased) for p in list(sentences)]
    for idx, ele in enumerate(sentences):
        if not ele:
            np.delete(sentences, idx)
    return sentences


In [8]:
# Preprocess description
descr_video = history['desc_video'].values
descr_video_processed = pre_process_features(descr_video)


# TF-IDF


In [9]:
# Term Frequency-Inverse Document frequency
tfidf = TfidfVectorizer()
descr_matrix = tfidf.fit_transform(descr_video_processed)


In [10]:
# measure the similarity between 2 description vector base on Cosine_Similarity
similarity_matrix = linear_kernel(descr_matrix)


In [11]:
def recommend_video_based_on_desc(video_id, num_video, similarity_matrix):
    # get similarity values with other video
    # similarity_score is the list of index and similarity matrix
    vid_idx = history.index[history['id'] == video_id].to_list()[0]
    similarity_score = list(enumerate(similarity_matrix[vid_idx]))
    # sort in descending order the similarity score of movie inputted with all the other videos
    similarity_score = sorted(
        similarity_score, key=lambda x: x[1], reverse=True)
    # Get the scores of the 15 most similar videos. Ignore the first movie.
    similarity_score = similarity_score[1:num_video + 1]
    # return movie names using the mapping series
    video_indices = [(history.loc[idx[0], 'id'], idx[1])
                     for idx in similarity_score]
    print(f'Recommend {num_video} video are similarity for user watched video has id: \"{video_id}\"')
    for i, v in video_indices:
        print(f"Video has id \"{i}\" with similarity: {v}")


video_id = history.loc[1, 'id']
num_video = 10
recommend_video_based_on_desc(video_id, num_video, similarity_matrix)

Recommend 10 video are similarity for user watched video has id: "7133533031737920811"
Video has id "7166864341277396251" with similarity: 0.35229262867658223
Video has id "7140522031476460826" with similarity: 0.2902933145638706
Video has id "7164310529899629851" with similarity: 0.24341507360668957
Video has id "7159574479822146817" with similarity: 0.17350065195605596
Video has id "7142146157400132890" with similarity: 0.14973074882596585
Video has id "7141371459607252251" with similarity: 0.10018816041276869
Video has id "7166101487418871067" with similarity: 0.0
Video has id "7160241510816976154" with similarity: 0.0
Video has id "7161707438633127194" with similarity: 0.0
Video has id "7161372876992466203" with similarity: 0.0


# Word to Vec


In [12]:
from gensim import models
# GitHub: https://test.ocom.vn/?url=github.com/sonvx/word2vecVN
word2vec_path = 'word2vec_model/baomoi.window2.vn.model.bin.gz'
w2v_model = models.KeyedVectors.load_word2vec_format(
    word2vec_path, binary=True)


In [13]:
vocab = w2v_model.wv.vocab
print("The total number of words:", len(vocab))


The total number of words: 439056


In [14]:
word_vec_dict = {}
for word in vocab:
    word_vec_dict[word] = w2v_model.wv.get_vector(word)
print("The number of key-value pairs:", len(word_vec_dict))


The number of key-value pairs: 439056


## Base on embedding vector (Developing)

In [15]:
max_length = 0
for val in descr_video:
    len_sentence = len(str(val).split())
    if len_sentence > max_length:
        max_length = len_sentence
print("Sentences have max %d lengths" % max_length)


Sentences have max 70 lengths


In [16]:
dim = 300  # dimension of a word vector
enc_descr_processed = []
for descr in descr_video_processed:
    descr_list = descr.split()
    w2v = []
    for val in descr_list:
        try:
            _ = word_vec_dict[val]
        except KeyError:
            # if word isn't in vocab,
            # assign it to zeros vector have 300 dim
            _ = np.zeros((dim,))
        w2v.append(_)
    enc_descr_processed.append(w2v)


In [17]:
from keras.utils import pad_sequences
# padding sentences to have equal length
pad_cont = pad_sequences(enc_descr_processed, maxlen=max_length, dtype='float',
                         padding='post', value=np.zeros(dim))
pad_cont.shape


(643, 70, 300)

## Remove word not in vocab, then calc similarity between 2 sentences

In [19]:
descr_in_vocab = []
for descr in descr_video_processed:
    descr_list = [_ for _ in descr.split() if _ in vocab]
    descr_in_vocab.append(descr_list)

In [20]:
sim_matrix = [[0]*len(history)]*len(history)
for idx1, val1 in enumerate(descr_in_vocab):
    for idx2, val2 in enumerate(descr_in_vocab):
        try:
            sim = w2v_model.wv.n_similarity(val1, val2)
        except:
            sim = 0
        sim_matrix[idx1][idx2] = sim


In [21]:
def recommend_video_based_on_desc(video_id, num_video, similarity_matrix):
    # get similarity values with other video
    # similarity_score is the list of index and similarity matrix
    vid_idx = history.index[history['id'] == video_id].to_list()[0]
    similarity_score = list(enumerate(similarity_matrix[vid_idx]))
    # sort in descending order the similarity score of movie inputted with all the other videos
    similarity_score = sorted(
        similarity_score, key=lambda x: x[1], reverse=True)
    # Get the scores of the 15 most similar videos. Ignore the first movie.
    similarity_score = similarity_score[1:num_video + 1]
    # return movie names using the mapping series
    video_indices = [(history.loc[idx[0], 'id'], idx[1])
                     for idx in similarity_score]
    print(f'Recommend {num_video} video are similarity for user watched video has id: \"{video_id}\"')
    for i, v in video_indices:
        print(f"Video has id \"{i}\" with similarity: {v}")


video_id = history.loc[1, 'id']
num_video = 10
recommend_video_based_on_desc(video_id, num_video, sim_matrix)

Recommend 10 video are similarity for user watched video has id: "7133533031737920811"
Video has id "7159525365185039642" with similarity: 0.5890825986862183
Video has id "7140131471540489498" with similarity: 0.5462914109230042
Video has id "7166590557802761499" with similarity: 0.5410798192024231
Video has id "7151337992114818330" with similarity: 0.5267125368118286
Video has id "7156818902658403611" with similarity: 0.5265475511550903
Video has id "7165370368742411547" with similarity: 0.5189292430877686
Video has id "7155123858289495342" with similarity: 0.5100876688957214
Video has id "7165461787595377946" with similarity: 0.49493736028671265
Video has id "7159542834763336986" with similarity: 0.4862213432788849
Video has id "7155372076080041242" with similarity: 0.47996985912323
