In [1]:
# ---------------
# Imports
# ---------------

# !pip install googlesearch-python
# from googlesearch import search
import pandas as pd
import spacy
from spacy import displacy
import requests
from bs4 import BeautifulSoup
import requests
import re
import time

pd.options.display.max_colwidth = 2000 # for viewing text
NER = spacy.load("en_core_web_sm")

# Mount drive - needed for GBQ creds
# from google.colab import drive
# drive.mount('/content/drive')

# ---------
# Data
# ---------

imdb_reviews = pd.read_csv('https://raw.githubusercontent.com/naserahmadi/TDmatch/main/data/imdb/IMDB_reviews.csv', names = ['text'])
imdb_reviews['row_number'] = imdb_reviews.reset_index().index



In [7]:
# ------------------------------
# NER & functions to search
# ------------------------------

def run_NER(text, text_id):

  ner_text = NER(text)
  ner_result = [pd.DataFrame({'id': text_id, 'word': x.text, 'entity_detected': x.label_}, index = [0]) for x in ner_text.ents]

  return ner_result


def url_title(url):
  headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0"}

  try:
    reqs = requests.get(url, headers = headers)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    url_title = soup.find('title').get_text()

  except:
    url_title = ''

  return url_title

def title_NER(title):

  try:
    ner_results = NER(title)
    ner_df = [pd.DataFrame({'word': x.text, 'entity_detected': x.label_}, index = [0]) for x in ner_results.ents]
    ner_all = pd.concat(ner_df)

    return ner_all

  except:
    pass

In [109]:
# ------------------------------
# Functions for NER & search
# ------------------------------

def ner_helper(NER_text):
  '''
  Attributes:
    NER_text: a list of 
  '''
  ner_results = []
  count_var = 0

  for i in NER_text:
    
    ner_results += list(map(lambda x: {'id': count_var, 'word': x.text, 'entity_detected': x.label_}, i.ents))
      
    count_var +=1

  ner_df = pd.DataFrame(ner_results)
  return ner_df

def create_search_text(NER_text, entity_list: list = None, incl_entities: bool = True):

  ner_df = ner_helper(NER_text)

  if entity_list is not None:
    not_operator = '' if incl_entities else 'not'
    ner_df = ner_df.query(f'entity_detected {not_operator} in @entity_list')

  ner_unique = ner_df.drop_duplicates()
  ner_search = ner_unique.groupby(['id'])['word'].apply(' '.join).reset_index()

  return ner_search

def run_search(search_query: str, num_results: int = 10, engine: str = 'duckduckgo'):

  time.sleep(0.25)
  # Rotate headers
  headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0"}

  if engine == 'duckduckgo':

    page = requests.get(f'https://duckduckgo.com/html/?q={search_query}', headers = headers).text
    soup = BeautifulSoup(page, 'html.parser').find_all("a", class_ = "result__url", href = True, limit = num_results)

    urls = [re.sub('\n|</a|.html', '', str(x).split('>')[1]).strip() for x in soup]
    urls_split = [x.split('/')[1:] for x in urls]
    # Select longest string as title
    url_titles = [re.sub('-|_', ' ', max(x, key=len)) for x in urls_split] # .replace('-', ' ')
    # title_df = pd.DataFrame({'search_term': search_query, 'url_title': url_titles})

  elif engine == 'google':

    params = {"q": search_query,"gl": "us","hl": "en", "num": "10"}
    page_html = requests.get("https://www.google.com/search", headers = headers, params = params)
    soup = BeautifulSoup(page_html.text).find_all("a", href = True)

    # Is this stable?
    url_strings = [str(x).split('-line-clamp:2">') for x in soup]

    url_titles = []

    for i in url_strings:
      if len(i) > 1:
        url_titles.append(i[1].split('<')[0])

  # title_df = pd.DataFrame({'search_term': search_query, 'url_title': url_titles})
  return {'search_term': search_query, 'url_title': url_titles}

def count_tokens(text: list, search_term: str):
  '''
  Attributes:
    text: a list of words
    search_term: the search term that generated the list
  '''

  title_tokens = {}

  for i in text:
    if i not in title_tokens.keys():
      title_tokens[i] = 1
    
    else:
      title_tokens[i] += 1

  title_tokens_df = pd.DataFrame.from_dict(title_tokens, orient = 'index').reset_index()
  title_tokens_df.columns = ['word', 'count']
  title_tokens_df['search_term'] = search_term

  return title_tokens_df


In [None]:
# displacy.render(text2, style = "ent", jupyter = True)

# NER on unstructured text, Googling entities

In [27]:
# Workflow for multiple entities

# NER for each review
tryme = imdb_reviews.iloc[:10] # select 4th row to test
ner_text = list(map(NER, list(tryme.text)))

# DDG search
ner_search2 = create_search_text(ner_text, entity_list = ['ORDINAL', 'CARDINAL'], incl_entities = False)
search_results = list(map(run_search, ner_search2.word.values))
search_results_df = pd.concat(search_results)

# Pile all results into one long string (don't consider separately for counting tokens)
search_bow_df = search_results_df.groupby(['search_term'])['url_title'].apply(' '.join).reset_index()
url_splits = [x.replace('_', ' ').title().split(' ') for x in search_bow_df.url_title.values]

# Count tokens
final_guess_ls = list(map(count_tokens, url_splits, search_bow_df.search_term.values))
final_guess_df = pd.concat(final_guess_ls)

final_guess_df['word'] = final_guess_df['word'].str.lower()
final_guess_df['overlapping'] = list(map(lambda x,y: x in y, final_guess_df.word.values, final_guess_df.search_term.values))
final_guess_blocked = final_guess_df.query('overlapping == False') # Get rid of any terms that are in the search (spaCy doesn't really recognize movie names)
idx = final_guess_blocked.groupby(['search_term'])['count'].transform(max) == final_guess_blocked['count'] # select max tokens
title_guess = final_guess_blocked[idx].groupby(['search_term'])['word'].apply(' '.join).reset_index()

In [193]:
title_guess

Unnamed: 0,search_term,word
0,al al pacino robert de niro michael,and
1,america vito james caan john cazale al pacino tom hagen robert duvall michael 1970 hollywood francis ford coppola,the godfather
2,california peter jackson's december 2001 new zealand peter jackson ian sean astin sam mortensen bernard hill the twenty-first century tolkien,of
3,christopher nolan jonathan nolan deserve gore christian gotham bravo,the
4,coppola the 1940s young vito corleone the 20th century 1917 italian new york city michael al pacino al pacino's the beginning of the cuba waspish de niro's italy new york don pacino lee strasberg g. d. spradlin michael v. gazzo unforgettable john cazale diane keaton robert duvall,godfather
5,frank darabont novella rita hayworth tim robbins morgan freeman andy dufresne andy bob gunton norton james whitmore byron hadley william sadler heywood & mark rolston roger deakins thomas newman's hank williams,shawshank redemption
6,kane michael al pacino,hunters
7,stephen king andy dufrane 20 years andy morgan freeman bob gunton tommy '90's,shawshank redemption
8,tonight nearly 15 years them.i,and new moon is closest in


In [None]:
# workflow for one entity

tryme = imdb_reviews.iloc[4]
# Google approach for one example
tryme_res = run_NER(tryme.text, tryme.row_number)
tryme_df = pd.concat(tryme_res)
tryme_unique = tryme_df.drop_duplicates()
tryme_search = tryme_unique.groupby(['id'])['word'].apply(' '.join).reset_index()

# Feed entities into Google
tryme_google = search(tryme_search.word.values[0])
google_results = list(tryme_google)
article_titles = list(map(url_title, google_results))

# NER on Google titles
title_entities = list(map(title_NER, article_titles))
title_ents_df = pd.concat(title_entities)
title_ents_df.query('entity_detected in ["ORG", "PERSON", "WORK_OF_ART"]').groupby('word').count()

# Optional: parse URL and return full text for NER/majority voting

In [None]:
# !pip install newspaper3k
# import newspaper
# import json

In [None]:
# article = newspaper.Article(url = 'https://en.wikipedia.org/wiki/The_Dark_Knight_Rises', language='en')
# article.download()
# article.parse()

In [None]:
# article ={
#     "title": str(article.title),
#     "text": str(article.text),
#     "authors": article.authors,
#     "published_date": str(article.publish_date),
#     "top_image": str(article.top_image),
#     "videos": article.movies,
#     "keywords": article.keywords,
#     "summary": str(article.summary)
# }

In [None]:
dk_knight = NER(article['text'])

In [None]:
dk_knight_entities = [pd.DataFrame({'word': x.text, 'entity_detected': x.label_}, index = [0]) for x in dk_knight.ents]

# Alternatives

## BeautifulSoup alt.

In [None]:
# Alt way to get titles, but they have ...
# soup_alt = BeautifulSoup(page, 'html.parser')
# soup_class = soup_alt.find_all("a", class_ = 'result__a', href = True)
# url_titles_alt = [str(x).split('>')[1] for x in soup_class]
# title_df_alt = pd.DataFrame({'search_term': q, 'url_title': url_titles_alt})
# title_ner = list(map(NER, title_df_alt.url_title.values))
# title_results = []
# count_var = 0

# for i in title_ner:
  
#   title_results += list(map(lambda x: {'id': count_var, 'word': x.text, 'entity_detected': x.label_}, i.ents))
    
#   count_var +=1

# title_df_alt_final = pd.DataFrame(title_results)

## Gnews

In [None]:
def get_Gnews(search_term, geo = False):
  
  time.sleep(0.25)
  gn = GoogleNews()
  top = gn.search(search_term)

  entries = top["entries"]

  news_df = pd.DataFrame({'search_term': search_term,
                          'title': [x['title'] for x in entries],
                          'published': [x['published'] for x in entries],
                          'link': [x['links'][0]['href'] for x in entries],
                          'load_dttm': datetime.now()})
  

  return news_df

In [None]:
# g_results = [get_Gnews(x) for x in ner_search.word.values]

## DDGo APIs/scrapers

In [None]:
# Experiments

# DuckDuckpy option
# pip install duckduckpy
# https://github.com/ivankliuk/duckduckpy
# from duckduckpy import query
# response = query('Python', container='dict')

# from websearch import search
# keywords = 'what is machine learning'
# results = search(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=4)
# print(results.__next__().__dict__)