In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import elmo
%load_ext autoreload

In [102]:
from tqdm import tqdm
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

In [3]:
%autoreload 1
import importlib
from utils import tokenize
from utils import Utils
from utils import find_word
from utils import find_word_root
import utils
from fetched_word import FetchedWord
import config
from webpage import Webpage
import inference_tools

In [4]:
# Shared handles used by the cells below.
driver = config.DRIVER
# Resolve the `elmo` module through importlib instead of the bare name: this
# cell rebinds `elmo` to the model instance, so on a re-run `elmo.Elmo` would
# be looked up on the instance rather than on the module.
elmo = importlib.import_module("elmo").Elmo()
tools = Utils()

Using real Elmo?


### Opens the file that contains rare skill words. 

- Returns a list of custom objects called SkillWords. 
- Each object is an uncommon (rare) word extracted from the knowledge base fetched from Wikipedia.
- Along with the word itself, each object contains the following:
  * "Word root",
  * "Sentence in which it was used",
  * "Embeddings from ELMo",
  * "Indices of their presence in sentences",
  * "Which skill it belongs to",
  * "Number of times it occurred (count)"

In [5]:
# Deserialize the rare skill words from the pickle file at config.SKILL_WORD_PATH.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(config.SKILL_WORD_PATH, mode='rb') as skill_words_file:
    skill_rare_words = pickle.load(skill_words_file)

#### Opens the file that contains a synthetically prepared dataset of URLs related to topics such as:

- Shopping
- Entertainment 
- Travel 

In [6]:
# Browsing-history export for the non-skill pages.
other_df = pd.read_excel(io="../data/web_history/other_website_history_xl.xls")

In [7]:
# Preview the first five rows of the raw history export.
other_df.head(n=5)

Unnamed: 0,URL,Title,Visit Time,Visit Count,Visited From,Visit Type,Web Browser,User Profile,Browser Profile,URL Length,Typed Count,History File,Record ID
0,https://ohthatfilmblog.com/,Oh! That Film Blog | The ramblings of a self c...,2021-12-03 14:04:00,1,,Link,Chrome,welcome,Default,27,0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12382
1,https://www.themovieblog.com/,The Movie Blog | The Official Home of Correct ...,2021-12-03 14:04:00,1,,Link,Chrome,welcome,Default,29,0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12381
2,https://www.movie-blogger.com/,Movie-Blogger.com | The No.1 Independent Movie...,2021-12-03 14:04:00,1,,Link,Chrome,welcome,Default,30,0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12380
3,https://blogging.org/top-movie-blogs/,Top 10 Movie Blogs on the Internet Today | Mov...,2021-12-03 14:04:00,1,,Link,Chrome,welcome,Default,37,0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12379
4,https://thefilm.blog/,The Film Blog | The official blog of everythin...,2021-12-03 14:04:00,1,,Link,Chrome,welcome,Default,21,0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12378


#### Opens the file that contains a synthetically prepared dataset of URLs related to an example skill set

In [8]:
# Browsing-history export for the skill-related pages.
skills_df = pd.read_excel(io="../data/web_history/synthetic_data_490_xl.xls")

In [9]:
# Preview the first five rows of the skill-related history export.
skills_df.head(n=5)

Unnamed: 0,URL,TITILE,Visited_time,1,2,3,4,5,6,7,8,9,10
0,https://www.tensorflow.org/tutorials/images/cnn,Convolutional Neural Network (CNN) | TensorF...,2021-12-03 03:39:00,2.0,,Link,Chrome,welcome,Default,47.0,0.0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12121
1,https://www.tensorflow.org/,TensorFlow,2021-12-03 03:39:00,2.0,,Link,Chrome,welcome,Default,27.0,0.0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12120
2,https://en.wikipedia.org/wiki/TensorFlow,TensorFlow - Wikipedia,2021-12-03 03:38:00,1.0,,Link,Chrome,welcome,Default,40.0,0.0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12119
3,https://github.com/tensorflow/tensorflow,tensorflow/tensorflow: An Open Source Machine ...,2021-12-03 03:38:00,1.0,,Link,Chrome,welcome,Default,40.0,0.0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12118
4,https://www.tensorflow.org/,TensorFlow,2021-12-03 03:38:00,2.0,,Link,Chrome,welcome,Default,27.0,0.0,C:\Users\welcome\AppData\Local\Google\Chrome\U...,12117


In [10]:
# Keep only the visit time, title and URL, under the names TIME / TITLE / URL.
# rename() ignores labels that are absent, so re-running this cell after
# `other_df` has already been projected is a no-op instead of a KeyError on
# 'Visit Time' / 'Title'.
other_df = (
    other_df
    .rename(columns={"Visit Time": "TIME", "Title": "TITLE"})
    .loc[:, ["TIME", "TITLE", "URL"]]
)
other_df.head()

Unnamed: 0,TIME,TITLE,URL
0,2021-12-03 14:04:00,Oh! That Film Blog | The ramblings of a self c...,https://ohthatfilmblog.com/
1,2021-12-03 14:04:00,The Movie Blog | The Official Home of Correct ...,https://www.themovieblog.com/
2,2021-12-03 14:04:00,Movie-Blogger.com | The No.1 Independent Movie...,https://www.movie-blogger.com/
3,2021-12-03 14:04:00,Top 10 Movie Blogs on the Internet Today | Mov...,https://blogging.org/top-movie-blogs/
4,2021-12-03 14:04:00,The Film Blog | The official blog of everythin...,https://thefilm.blog/


In [11]:
def get_web_embedding(f_words, top_k=50):
    """Build a fixed-size page embedding from a page's most frequent words.

    The ``top_k`` words with the highest ``count`` are selected and each one is
    represented by the mean of its ``embeddings`` along axis 0.

    Parameters
    ----------
    f_words : list
        Word objects exposing ``count`` (used as the sort key) and
        ``embeddings`` (array-like that is averaged over axis 0).
        The list is NOT modified; a sorted copy is used internally.
    top_k : int, default 50
        Number of words that make up the page embedding.

    Returns
    -------
    numpy.ndarray
        Array with one row per selected word (``top_k`` rows), or an empty
        array of shape ``(0,)`` when the page has fewer than ``top_k`` words.
    """
    # Pages with too few words cannot fill a fixed-size embedding; callers
    # recognise the empty (0,) array as "no embedding available".
    if len(f_words) < top_k:
        return np.array([])

    # sorted() leaves the caller's list untouched (list.sort() would reorder it).
    top_words = sorted(f_words, key=lambda word: word.count, reverse=True)[:top_k]
    return np.array([np.mean(word.embeddings, axis=0) for word in top_words])

In [12]:
# Fetch the first five URLs, keep the fetched words for each page and build
# one embedding per page.
urls = list(other_df['URL'])
sample_fetched_words = []
page_embedding_list = []
for i, url in enumerate(tqdm(urls[:5])):
    print(i)
    # test_url returns a 3-tuple; only its second element (the fetched words)
    # is used here.
    _, fetched_words, _ = inference_tools.test_url(elmo, url)
    sample_fetched_words.append(fetched_words)
    emb = get_web_embedding(fetched_words)
    # Accept a full (50, 1024) embedding or the empty (0,) placeholder.
    # An explicit check replaces the previous assert-inside-try: asserts are
    # stripped under `python -O`, and the caught AssertionError carried no
    # message, so the report did not say what the shape actually was.
    if emb.shape in ((50, 1024), (0,)):
        page_embedding_list.append(emb)
    else:
        print("Invalid embedding shape", emb.shape, "at url", url)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

0
Processing Text from the web page


 20%|████████████████▊                                                                   | 1/5 [00:26<01:46, 26.68s/it]

1
Processing Text from the web page


 40%|█████████████████████████████████▌                                                  | 2/5 [00:46<01:07, 22.53s/it]

2
Processing Text from the web page


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [01:03<00:40, 20.16s/it]

3
Processing Text from the web page


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [02:31<00:46, 46.99s/it]

4
Processing Text from the web page


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:49<00:00, 33.91s/it]


In [26]:
# Cluster the flattened page embeddings with DBSCAN and show the label of each
# page (DBSCAN labels noise points as -1).
clustering = DBSCAN(min_samples=2, eps=0.1)
clustering.fit(embeddings)
clustering.labels_

array([-1, -1, -1, -1, -1], dtype=int64)

In [18]:
# Inspect the fitted estimator. `clustering` is only bound by the DBSCAN/KMeans
# cells, so this raises NameError if it runs before any of them in a fresh kernel.
clustering

NameError: name 'clustering' is not defined

In [None]:
# Scratch cell: builds an unfitted DBSCAN estimator; the result is not stored.
DBSCAN(min_samples=2, eps=3)

In [11]:
# Shuffle the rows and renumber the index from 0. The fixed random_state makes
# the shuffle reproducible across kernel restarts (same seed as the KMeans cell).
other_df = other_df.sample(frac=1, random_state=0).reset_index(drop=True)

In [20]:
# Flatten every page embedding into a single 1-D vector for clustering.
embeddings = [page_emb.flatten() for page_emb in page_embedding_list]
np.array(embeddings).shape

(5, 51200)

In [28]:
# Look up the URL stored under index label 25.
other_df.loc[25, 'URL']

'https://www.google.com/search?q=cricket+sport&sxsrf=AOaemvK0mvPWfGjVTn5n7sYdgjBhMda0hw%3A1638558134424&ei=tmmqYd-qGamnptQPqsuVoA0&oq=cricket+s&gs_lcp=Cgdnd3Mtd2l6EAMYADIICAAQgAQQsQMyCAgAEIAEELEDMhEILhCABBCxAxDHARCjAhDJAzIFCAAQkgMyCwgAEIAEELEDEIMBMgsIABCABBCxAxCDATIFCAAQgAQyCwgAEIAEELEDEIMBMggILhCABBCxAzIFCAAQgAQ6BwgjEOoCECc6BAgjECc6BQgAEJECOggILhCxAxCDAToFCC4QgAQ6CAgAELEDEIMBOgUILhCRAjoLCC4QgAQQsQMQgwE6DgguEIAEELEDEMcBENEDOgsILhDHARCvARCRAjoOCC4QgAQQsQMQxwEQowJKBAhBGABKBAhGGABQssQBWLnhAWDr7AFoAXABeAGAAdEEiAGOEZIBCzMuMi4wLjIuMS4xmAEAoAEBsAEKwAEB&sclient=gws-wiz'

In [34]:
# Load the saved page embeddings. allow_pickle=True lets numpy unpickle object
# arrays, which can execute arbitrary code; only load files you trust.
embeddings = np.load(file="../data/page_embeddings.npy", allow_pickle=True)

In [36]:
# Replace every embedding that is not exactly (50, 1024) with an all-zero
# array of that shape, so all pages end up with the same dimensions.
embeddings_fixed = [
    page_emb if page_emb.shape == (50, 1024) else np.zeros((50, 1024))
    for page_emb in embeddings
]
        

In [38]:
# Flatten each shape-corrected page embedding into one feature vector.
cluster_embeddings = [fixed_emb.flatten() for fixed_emb in embeddings_fixed]
np.array(cluster_embeddings).shape

(50, 51200)

In [98]:
# Cluster the flattened page embeddings with DBSCAN and print the distinct
# labels that were assigned.
clustering = DBSCAN(eps=0.21, min_samples=3).fit(cluster_embeddings)
print(np.unique(clustering.labels_))
# Take an explicit copy of the first 50 rows: assigning a new column to a bare
# slice triggers pandas' SettingWithCopyWarning.
other_df = other_df.iloc[:50].copy()
# pd.Series(labels) carries index labels 0..n-1 and is aligned on index, so this
# relies on other_df having the default integer index.
other_df['CLUSTER'] = pd.Series(clustering.labels_)

[-1  0  1  2  3  4  5]


In [111]:
# Cluster the flattened page embeddings into five groups with KMeans (seeded
# for reproducibility) and print the distinct labels that were assigned.
clustering = KMeans(n_clusters=5, random_state=0).fit(cluster_embeddings)
print(np.unique(clustering.labels_))
# Take an explicit copy of the first 50 rows: assigning a new column to a bare
# slice triggers pandas' SettingWithCopyWarning.
other_df = other_df.iloc[:50].copy()
# pd.Series(labels) carries index labels 0..n-1 and is aligned on index, so this
# relies on other_df having the default integer index.
other_df['CLUSTER'] = pd.Series(clustering.labels_)

[0 1 2 3 4]


In [110]:
# Display the page table, including the CLUSTER column added by a clustering cell.
other_df

Unnamed: 0,TIME,TITLE,URL,CLUSTER
0,2021-12-03 14:04:00,Oh! That Film Blog | The ramblings of a self c...,https://ohthatfilmblog.com/,0
1,2021-12-03 14:04:00,The Movie Blog | The Official Home of Correct ...,https://www.themovieblog.com/,0
2,2021-12-03 14:04:00,Movie-Blogger.com | The No.1 Independent Movie...,https://www.movie-blogger.com/,0
3,2021-12-03 14:04:00,Top 10 Movie Blogs on the Internet Today | Mov...,https://blogging.org/top-movie-blogs/,0
4,2021-12-03 14:04:00,The Film Blog | The official blog of everythin...,https://thefilm.blog/,0
5,2021-12-03 14:04:00,Film Blogs | Movie Blogs | NME,https://www.nme.com/blogs/the-movies-blog,0
6,2021-12-03 14:04:00,Jason's Movie Blog | A Movie Blog for the Late...,https://jasonsmovieblog.com/,0
7,2021-12-03 14:04:00,Top 100 Movie Review Blogs and Websites To Fol...,https://blog.feedspot.com/movie_review_blogs/,0
8,2021-12-03 14:04:00,Learn how to review movies at the kidsfirst bo...,https://www.googleadservices.com/pagead/aclk?s...,0
9,2021-12-03 14:04:00,Learn how to review movies at the kidsfirst bo...,https://www.kidsfirst.org/become-a-juror/?gcli...,0
