In [1]:
import numpy as np
import pandas as pd
import faiss

import pickle
import re
from tqdm.notebook import tqdm

import spacy
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_lg')

In [2]:
df = pd.read_pickle('./data/twitter_parler_text_url_data_pkl4.pkl')
df['source'] = 'website'
df.head()

Unnamed: 0,url,text,source
0,https://www.ajc.com/politics/kemp-condemns-pro...,Credit: WSBTV Videos Credit: WSBTV Videos Gov...,website
1,https://uscouriertoday.com/ken-starr-pulls-bac...,exception occurred with html parsing,website
2,https://www.thegatewaypundit.com/2021/01/pro-t...,Advertisement Police have been using a very h...,website
3,https://www.thegatewaypundit.com/2021/01/fulto...,Advertisement The Georgia twin senate runoff...,website
4,https://www.pscp.tv/w/csSSPDFsWktweXJveHB2am58...,unable to extract meaningful text,website


In [3]:
df_filtered = df[df['text'] != 'exception occurred with html parsing']
df_filtered = df_filtered[df_filtered['text'] != 'unable to extract meaningful text']
df_filtered.head()

Unnamed: 0,url,text,source
0,https://www.ajc.com/politics/kemp-condemns-pro...,Credit: WSBTV Videos Credit: WSBTV Videos Gov...,website
2,https://www.thegatewaypundit.com/2021/01/pro-t...,Advertisement Police have been using a very h...,website
3,https://www.thegatewaypundit.com/2021/01/fulto...,Advertisement The Georgia twin senate runoff...,website
5,https://www.zerohedge.com/geopolitical/chinas-...,Authored by Doug Dodge via AmericanThinker.co...,website
6,https://www.thegatewaypundit.com/2021/01/ignor...,Advertisement In late November we reported v...,website


In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
word_list = ['Advertisement']
stop_words.extend(word_list)

def preprocess_text(text):
    if text == '' or text == None:
        return ''
    
    text_cleaned = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", text).split())
    text_cleaned = re.sub(r'^https?:\/\/.*[\r\n]*', '', text_cleaned)
    text_cleaned = ' '.join([w for w in text_cleaned.split() if not w.lower() in stop_words])
    text_cleaned = text_cleaned.replace('\"', '')
    text_cleaned = text_cleaned.replace('\'', '')
    text_cleaned = text_cleaned.strip()
    
    return text_cleaned 

In [6]:
df_filtered['text_processed'] = df_filtered['text'].apply(preprocess_text)

In [7]:
doc_list = {}

for idx, row in tqdm(df_filtered.iterrows()):
    text = row['text_processed']
    text = text[:1000000] if len(text) > 1000000 else text
    doc_list[idx] = nlp(text)

0it [00:00, ?it/s]

In [9]:
dimensions = 300
hyperplanes = 20 # to find out by PCA
plane_norms = np.random.rand(hyperplanes, dimensions) - 0.5

In [27]:
index_map = {}
index = faiss.IndexLSH(hyperplanes, hyperplanes)

In [18]:
plane_norms.shape

(20, 300)

In [26]:
index.add(vec_reshape.astype('float32'))

In [29]:
for k,v in tqdm(doc_list.items()):
    vec = v.vector
    vec_dot = np.dot(vec, plane_norms.T)
    vec_bin = vec_dot.astype(int)
#     print(vec_bin.shape)
    vec_reshape = vec_bin.reshape(1, hyperplanes).astype('float32')
#     print(vec_reshape.shape)
    
    text = v.text
    index_map[k] = {'text': text, 'vector': vec_reshape}
    index.add(vec_reshape)

  0%|          | 0/2125 [00:00<?, ?it/s]

In [None]:
fh = open('./data/text_sim.csv', 'w')

In [42]:
similarity_array = []

for num in tqdm(range(0, len(doc_list))):
    vec = doc_list[num].vector
    vec_dot = np.dot(vec, plane_norms.T)
    vec_reshape = vec_dot.reshape(1, 20).astype('float32')
    
    D, I = index.search(vec_reshape, k=5)
    indices = I[0]
    distances = D[0]
    
    for idx in range(0, len(indices)):
        if indices[idx] != -1 and idx != num: 
            arr_obj = {}
            arr_obj['orig_text'] = doc_list[num].text
            arr_obj['matched_text'] = index_map[i]['text']
            arr_obj['distance'] = distances[idx]
            
            similarity_array.append(arr_obj)

  0%|          | 0/2125 [00:00<?, ?it/s]

In [43]:
df_sim = pd.DataFrame(similarity_array)
df_sim.head()

Unnamed: 0,orig_text,matched_text,distance
0,Thanks contacting us received submission DAs c...,Technology Many President Donald Trump support...,3.0
1,Thanks contacting us received submission DAs c...,Technology Many President Donald Trump support...,3.0
2,Thanks contacting us received submission DAs c...,Technology Many President Donald Trump support...,3.0
3,Thanks contacting us received submission DAs c...,Technology Many President Donald Trump support...,3.0
4,Monday 11 January 2021 05 30 PM Facebook inten...,Technology Many President Donald Trump support...,2.0


In [45]:
df_sim.to_csv('./data/website_sim_text.csv', index=False)

In [None]:
# TEST SEARCH CODE 
num = 2
vec = doc_list[num].vector
vec_dot = np.dot(vec, plane_norms.T)
vec_reshape = vec_dot.reshape(1, 20).astype('float32')
D, I = index.search(vec_reshape, k=2)
indices = I[0]
distances = D[0]

print(doc_list[num].text, '\n\n')

for idx in range(0, len(indices)):
    if indices[idx] != -1:
        print(index_map[i]['text'], distances[idx], '\n')