In [18]:
# Install required libraries

!pip install -q goose3
!pip install -q spacy
!pip install -q spacy-transformers
!python3 -m spacy download en_core_web_trf
!pip install -q google

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-trf==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.1/en_core_web_trf-3.4.1-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [19]:
from goose3 import Goose
import pandas as pd
from googlesearch import search
import spacy
from spacy import displacy


In [20]:
def create_search_query(risk_terms, companies_list):
    """
    Build one search query for every (risk term, company) combination.

    Parameters
    ----------
    risk_terms : DataFrame
        Frame with a "Risk Terms" column.
    companies_list : DataFrame
        Frame with a "Companies" column.

    Returns
    -------
    search_queries : DataFrame
        Cross join of both inputs plus a "Search Query" column made of the
        company followed by a space and the risk term.
    """
    # Cross join: every row of risk_terms is paired with every company row.
    pairs = risk_terms.merge(companies_list, how="cross")
    query_text = pairs["Companies"] + " " + pairs["Risk Terms"]
    return pairs.assign(**{"Search Query": query_text})


def get_url(query):
    """Fetch the top URL from Google Search for a query.

    Args:
        query (string): Search Query

    Returns:
       url (string or None): URL of the first search result, or None when
       the search returns no results.
    """
    results = search(query, tld="co.in", num=1, stop=1, pause=2)
    # Take the first result and fall back to None: the previous loop left its
    # variable unassigned on an empty result set and raised UnboundLocalError.
    return next(iter(results), None)


def goose_text_extraction(url):
    """
    Extract text from given URL

    Parameters
    ----------
    url: string
      URL of the article from which the text needs to be extracted.
      None or an empty string is accepted and yields an empty result.

    Returns
    -------
    cleaned_text: string
      Text that has been extracted from the URL, or "" when no URL is given
    """
    # Nothing to fetch: return empty text rather than failing inside the crawler.
    if not url:
        return ""

    # The context manager closes the Goose instance once extraction is done,
    # so a new instance per call does not leave resources open.
    with Goose() as g:
        article = g.extract(url)
        return article.cleaned_text


In [21]:
# Sample inputs: the companies to screen and the risk terms to pair them with
companies = pd.DataFrame({"Companies": ["Costco", "Goldman Sachs", "Delta"]})
risk_terms = pd.DataFrame({"Risk Terms": ["Layoffs", "Stock Price", "Fraud"]})

# One search query per (risk term, company) pair
risk_queries = create_search_query(risk_terms, companies)


In [24]:
# Getting urls for search queries
# get_url issues one Google search per row (it passes pause=2 to `search`),
# so this step needs network access.
risk_queries["url"] = risk_queries["Search Query"].apply(get_url)

# Extract text from the URLS
# goose_text_extraction fetches each URL and keeps the article's cleaned text.
risk_queries["Text"] = risk_queries["url"].apply(goose_text_extraction)


In [25]:
# Preview up to 20 rows of the queries with their URLs and extracted text
risk_queries.head(20)

Unnamed: 0,Risk Terms,Companies,Search Query,url,Text
0,Layoffs,Costco,Costco Layoffs,https://www.thelayoff.com/costco-wholesale,They treat there employees like c-ap. They jus...
1,Layoffs,Goldman Sachs,Goldman Sachs Layoffs,https://www.washingtonpost.com/business/2023/0...,Wall Street giant Goldman Sachs began laying o...
2,Layoffs,Delta,Delta Layoffs,https://www.paddleyourownkanoo.com/2022/12/01/...,"Delta Air Lines has hired a record 4,300 new f..."
3,Stock Price,Costco,Costco Stock Price,https://logos-download.com/212-costco-wholesal...,Some logos are clickable and available in larg...
4,Stock Price,Goldman Sachs,Goldman Sachs Stock Price,https://en.wikipedia.org/wiki/Goldman_Sachs,Goldman Sachs ( ) is an American multinational...
5,Stock Price,Delta,Delta Stock Price,https://news.delta.com/delta-air-lines-logos-b...,Downloads available below; just click the arro...
6,Fraud,Costco,Costco Fraud,https://customerservice.costco.com/app/answers...,It is an unfortunate fact of the Internet that...
7,Fraud,Goldman Sachs,Goldman Sachs Fraud,https://www.goldmansachs.com/security/,As our reliance on the internet and digital de...
8,Fraud,Delta,Delta Fraud,http://content.delta.com/content/dam/delta-www...,


In [26]:
# Saving Extracted Text and URLs
# index=False leaves the DataFrame index out of the written CSV file.
risk_queries.to_csv("risk_queries.csv", index=False)


In [27]:
# Load the en_core_web_trf pipeline once at module level; ner_get_entities
# below reads this global `nlp` on every call.
nlp = spacy.load("en_core_web_trf")


def ner_get_entities(
    raw_text, ner_entities=False, org_entities=False, span_start=False, span_end=False
):
    """
    Run the ``nlp`` pipeline on a text and return one view of its entities.

    The four flags act as a selector: they are checked in the order
    ``ner_entities``, ``org_entities``, ``span_start``, ``span_end`` and the
    first truthy one decides what is returned. Every call runs ``nlp`` on
    ``raw_text`` again, regardless of which flag is set.

    Parameters
    ----------
    raw_text : string
        String on which NER is applied.
    ner_entities : bool, default False
        Return the parsed document produced by ``nlp`` itself.
    org_entities : bool, default False
        Return the texts of the entities labelled "ORG".
    span_start : bool, default False
        Return the start character offsets of the "ORG" entities.
    span_end : bool, default False
        Return the end character offsets of the "ORG" entities.

    Returns
    -------
    object, list or None
        The parsed document when ``ner_entities`` is truthy, otherwise the
        list selected by the first truthy remaining flag. None when no flag
        is set.
    """
    text_ner = nlp(raw_text)

    if ner_entities:
        return text_ner

    # Only organisation entities feed the three list outputs.
    org_spans = [ent for ent in text_ner.ents if ent.label_ == "ORG"]

    if org_entities:
        return [ent.text for ent in org_spans]

    if span_start:
        return [ent.start_char for ent in org_spans]

    if span_end:
        return [ent.end_char for ent in org_spans]

    # No selector flag was set.
    return None


In [28]:
# Run the pipeline once per text and keep the parsed document.
risk_queries["NER"] = risk_queries["Text"].apply(
    ner_get_entities, args=(True, False, False, False)
)

# Derive the organisation columns from the stored documents. Calling
# ner_get_entities once per column would re-run the model on every text each time.
org_spans = risk_queries["NER"].apply(
    lambda doc: [ent for ent in doc.ents if ent.label_ == "ORG"]
)
risk_queries["Organization"] = org_spans.apply(
    lambda spans: [ent.text for ent in spans]
)
risk_queries["Start"] = org_spans.apply(
    lambda spans: [ent.start_char for ent in spans]
)
risk_queries["End"] = org_spans.apply(
    lambda spans: [ent.end_char for ent in spans]
)


In [29]:
# Display the frame including the NER, Organization, Start and End columns
risk_queries

Unnamed: 0,Risk Terms,Companies,Search Query,url,Text,NER,Organization,Start,End
0,Layoffs,Costco,Costco Layoffs,https://www.thelayoff.com/costco-wholesale,They treat there employees like c-ap. They jus...,"(They, treat, there, employees, like, c, -, ap...",[],[],[]
1,Layoffs,Goldman Sachs,Goldman Sachs Layoffs,https://www.washingtonpost.com/business/2023/0...,Wall Street giant Goldman Sachs began laying o...,"(Wall, Street, giant, Goldman, Sachs, began, l...","[Goldman Sachs, Goldman, Goldman, Amazon, Meta...","[18, 289, 509, 638, 646, 655, 665, 772, 813, 8...","[31, 296, 516, 644, 650, 663, 675, 778, 817, 8..."
2,Layoffs,Delta,Delta Layoffs,https://www.paddleyourownkanoo.com/2022/12/01/...,"Delta Air Lines has hired a record 4,300 new f...","(Delta, Air, Lines, has, hired, a, record, 4,3...","[Delta Air Lines, Delta, Delta, Delta, Delta, ...","[0, 323, 602, 753, 1084, 1130, 1267, 1307, 158...","[15, 328, 607, 758, 1089, 1149, 1272, 1312, 15..."
3,Stock Price,Costco,Costco Stock Price,https://logos-download.com/212-costco-wholesal...,Some logos are clickable and available in larg...,"(Some, logos, are, clickable, and, available, ...",[],[],[]
4,Stock Price,Goldman Sachs,Goldman Sachs Stock Price,https://en.wikipedia.org/wiki/Goldman_Sachs,Goldman Sachs ( ) is an American multinational...,"(Goldman, Sachs, (, ), is, an, American, multi...","[Goldman Sachs, Goldman Sachs, Goldman Sachs, ...","[0, 112, 352, 604, 999, 1563, 1635, 1643, 1735...","[13, 125, 365, 633, 1006, 1576, 1641, 1668, 17..."
5,Stock Price,Delta,Delta Stock Price,https://news.delta.com/delta-air-lines-logos-b...,Downloads available below; just click the arro...,"(Downloads, available, below, ;, just, click, ...","[Delta, Delta, Delta, Delta, Delta, Delta Air ...","[66, 217, 356, 615, 647, 725, 828, 1261, 1400,...","[71, 222, 361, 620, 652, 740, 833, 1266, 1436,..."
6,Fraud,Costco,Costco Fraud,https://customerservice.costco.com/app/answers...,It is an unfortunate fact of the Internet that...,"(It, is, an, unfortunate, fact, of, the, Inter...","[Costco, Costco, Costco, Social Security, Cost...","[215, 434, 784, 899, 1201, 1415, 1489, 1552, 1...","[221, 440, 790, 914, 1207, 1421, 1514, 1577, 1..."
7,Fraud,Goldman Sachs,Goldman Sachs Fraud,https://www.goldmansachs.com/security/,As our reliance on the internet and digital de...,"(As, our, reliance, on, the, internet, and, di...","[Goldman Sachs, Goldman Sachs, Dropbox, iCloud...","[479, 3868, 5771, 5780, 6874, 6884, 6899, 6919...","[492, 3881, 5778, 5786, 6882, 6894, 6906, 6926..."
8,Fraud,Delta,Delta Fraud,http://content.delta.com/content/dam/delta-www...,,(),[],[],[]


In [30]:
# Render the entities of the row at position 5. The parsed document is taken
# from the "NER" column by name, so adding or reordering columns cannot make
# this silently pick a different column.
displacy.render(risk_queries["NER"].iloc[5], style="ent", jupyter=True)