# **Lock numpy and genism version**
+ Lock numpy version: 1.26.4
+ Lock gensim version: 4.3.3


In [None]:
# Lock numpy version to prevent compatibility issues with gensim
!pip install gensim
import os
os._exit(0)

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [1]:
# Install gensim and lock numpy version to avoid compatibility issues
import gensim
import numpy

print(f"Gensim version: {gensim.__version__}")
print(f"Numpy version: {numpy.__version__}")

Gensim version: 4.3.3
Numpy version: 1.26.4


# **Ner_utils**
**1. Imports & Setup**
+ Loads Flair NER model and necessary libraries.
+ Imports company-to-ticker mapping and relative year dictionaries.
+ Sets a fixed current year (2024) for reference.

**2. extract_company_ticker(text)**
+ Uses Flair NER to detect organization names in the input text.
+ Checks for an exact match in the NASDAQ company list.
+ Applies fuzzy matching if no exact match is found.

**3. extract_year(text)**
+ Detects DATE entities using Flair or regex fallback.
+ Maps relative time expressions (e.g. "last year") to actual years.
+ Handles patterns like “5 years ago” and “2 years later”.

**4. Test Cases**
+ Covers valid company names with/without year info.
+ Includes typos to test fuzzy matching.
+ Handles unknown companies or missing time references.

In [2]:
!pip install flair



In [3]:
# A dictionary mapping company names and tickers (case-insensitive) to their official ticker symbols
company_to_ticker = {
    "apple": "AAPL",
    "aapl": "AAPL",
    "airbnb": "ABNB",
    "abnb": "ABNB",
    "adobe": "ADBE",
    "adbe": "ADBE",
    "analog devices": "ADI",
    "adi": "ADI",
    "automatic data processing": "ADP",
    "adp": "ADP",
    "autodesk": "ADSK",
    "adsk": "ADSK",
    "american electric power": "AEP",
    "aep": "AEP",
    "applied materials": "AMAT",
    "amat": "AMAT",
    "advanced micro devices": "AMD",
    "amd": "AMD",
    "amgen": "AMGN",
    "amgn": "AMGN",
    "amazon": "AMZN",
    "amzn": "AMZN",
    "ansys": "ANSS",
    "anss": "ANSS",
    "applovin": "APP",
    "app": "APP",
    "arm holdings": "ARM",
    "arm": "ARM",
    "asml holding": "ASML",
    "asml": "ASML",
    "broadcom": "AVGO",
    "avgo": "AVGO",
    "axon enterprise": "AXON",
    "axon": "AXON",
    "astrazeneca": "AZN",
    "azn": "AZN",
    "biogen": "BIIB",
    "biib": "BIIB",
    "booking holdings": "BKNG",
    "bkng": "BKNG",
    "baker hughes": "BKR",
    "bkr": "BKR",
    "coca-cola europacific partners": "CCEP",
    "ccep": "CCEP",
    "cadence design systems": "CDNS",
    "cdns": "CDNS",
    "cdw corporation": "CDW",
    "cdw": "CDW",
    "constellation energy": "CEG",
    "ceg": "CEG",
    "charter communications": "CHTR",
    "chtr": "CHTR",
    "comcast": "CMCSA",
    "cmcsa": "CMCSA",
    "costco": "COST",
    "cost": "COST",
    "copart": "CPRT",
    "cprt": "CPRT",
    "crowdstrike": "CRWD",
    "crwd": "CRWD",
    "cisco": "CSCO",
    "csco": "CSCO",
    "costar group": "CSGP",
    "csgp": "CSGP",
    "csx corporation": "CSX",
    "csx": "CSX",
    "cintas": "CTAS",
    "ctas": "CTAS",
    "cognizant": "CTSH",
    "ctsh": "CTSH",
    "doordash": "DASH",
    "dash": "DASH",
    "datadog": "DDOG",
    "ddog": "DDOG",
    "dexcom": "DXCM",
    "dxcm": "DXCM",
    "electronic arts": "EA",
    "ea": "EA",
    "exelon": "EXC",
    "exc": "EXC",
    "diamondback energy": "FANG",
    "fang": "FANG",
    "fastenal": "FAST",
    "fast": "FAST",
    "fortinet": "FTNT",
    "ftnt": "FTNT",
    "ge healthcare": "GEHC",
    "gehc": "GEHC",
    "globalfoundries": "GFS",
    "gfs": "GFS",
    "gilead sciences": "GILD",
    "gild": "GILD",
    "alphabet (class c)": "GOOG",
    "goog": "GOOG",
    "alphabet (class a)": "GOOGL",
    "googl": "GOOGL",
    "honeywell": "HON",
    "hon": "HON",
    "idexx laboratories": "IDXX",
    "idxx": "IDXX",
    "intel": "INTC",
    "intc": "INTC",
    "intuit": "INTU",
    "intu": "INTU",
    "intuitive surgical": "ISRG",
    "isrg": "ISRG",
    "keurig dr pepper": "KDP",
    "kdp": "KDP",
    "kraft heinz": "KHC",
    "khc": "KHC",
    "kla corporation": "KLAC",
    "klac": "KLAC",
    "linde": "LIN",
    "lin": "LIN",
    "lam research": "LRCX",
    "lrcx": "LRCX",
    "lululemon": "LULU",
    "lulu": "LULU",
    "marriott international": "MAR",
    "mar": "MAR",
    "microchip technology": "MCHP",
    "mchp": "MCHP",
    "mongodb": "MDB",
    "mdb": "MDB",
    "mondelez": "MDLZ",
    "mdlz": "MDLZ",
    "mercadolibre": "MELI",
    "meli": "MELI",
    "meta platforms": "META",
    "meta": "META",
    "monster beverage": "MNST",
    "mnst": "MNST",
    "marvell technology": "MRVL",
    "mrvl": "MRVL",
    "microsoft": "MSFT",
    "msft": "MSFT",
    "microstrategy": "MSTR",
    "mstr": "MSTR",
    "micron technology": "MU",
    "mu": "MU",
    "netflix": "NFLX",
    "nflx": "NFLX",
    "nvidia": "NVDA",
    "nvda": "NVDA",
    "nxp semiconductors": "NXPI",
    "nxpi": "NXPI",
    "old dominion freight": "ODFL",
    "odfl": "ODFL",
    "on semiconductor": "ON",
    "on": "ON",
    "o'reilly automotive": "ORLY",
    "orly": "ORLY",
    "palo alto networks": "PANW",
    "panw": "PANW",
    "paychex": "PAYX",
    "paccar": "PCAR",
    "pdd holdings": "PDD",
    "pepsico": "PEP",
    "palantir technologies": "PLTR",
    "paypal": "PYPL",
    "qualcomm": "QCOM",
    "regeneron pharmaceuticals": "REGN",
    "roper technologies": "ROP",
    "ross stores": "ROST",
    "starbucks": "SBUX",
    "synopsys": "SNPS",
    "atlassian": "TEAM",
    "t-mobile": "TMUS",
    "tesla": "TSLA",
    "the trade desk": "TTD",
    "take-two interactive": "TTWO",
    "texas instruments": "TXN",
    "verisk analytics": "VRSK",
    "vertex pharmaceuticals": "VRTX",
    "warner bros discovery": "WBD",
    "workday": "WDAY",
    "xcel energy": "XEL",
    "zscaler": "ZS",
    "payx": "PAYX",
    "pcar": "PCAR",
    "pdd": "PDD",
    "pep": "PEP",
    "pltr": "PLTR",
    "pypl": "PYPL",
    "qcom": "QCOM",
    "regn": "REGN",
    "rop": "ROP",
    "rost": "ROST",
    "sbux": "SBUX",
    "snps": "SNPS",
    "team": "TEAM",
    "tmus": "TMUS",
    "tsla": "TSLA",
    "ttd": "TTD",
    "ttwo": "TTWO",
    "txn": "TXN",
    "vrsk": "VRSK",
    "vrtx": "VRTX",
    "wbd": "WBD",
    "wday": "WDAY",
    "xel": "XEL",
    "zs": "ZS"
}

In [4]:
# Mapping of expressions indicating past years to the number of years before the current year
past_years_map = {
    "last year": 1, "the previous year": 1,
    "two years ago": 2, "three years ago": 3,
    "four years ago": 4, "five years ago": 5,
}

In [5]:
# Mapping of expressions indicating future years to negative values
# (relative to the current year; e.g., -1 means 1 year after now)
future_years_map = {
    "next year": -1, "one year later": -1, "in a year": -1,
    "two years later": -2, "in two years": -2,
    "three years later": -3, "in three years": -3,
    "four years later": -4, "in four years": -4,
    "in the future": -1, "future": -1,
    "the year after next": -2, "year after next": -2,
}

In [6]:
from flair.models import SequenceTagger
from flair.data import Sentence
from difflib import get_close_matches
import re

# Get the current year (based on financial report date)
CURRENT_YEAR = 2024

# Load the Flair NER model
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2025-03-24 16:38:34,272 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [7]:
def extract_company_ticker(text):
    """
    Extract company name and convert it to ticker symbol.
    1. Use Flair NER to detect organization names.
    2. Try exact match with known companies.
    3. Use fuzzy matching for close alternatives if direct match fails.
    """
    sentence = Sentence(text)
    tagger.predict(sentence)
    entities = sentence.to_dict(tag_type="ner")["entities"]

    # Extract organization (ORG) entities
    detected_companies = [ent["text"] for ent in entities if ent["labels"][0]["value"] == "ORG"]

    if detected_companies:
        ner_company = " ".join(detected_companies).lower()
        print(f"Detected company by NER: {ner_company}")

        # Check for exact match
        if ner_company in company_to_ticker:
            ticker = company_to_ticker[ner_company]
            print(f"Exact match found: {ticker}")
            return {"company": ner_company, "ticker": ticker}

        # Try fuzzy matching if not found
        closest_match = get_close_matches(ner_company, company_to_ticker.keys(), n=1, cutoff=0.6)
        if closest_match:
            matched_company = closest_match[0]
            ticker = company_to_ticker[matched_company]
            print(f"Fuzzy match: {matched_company}, Ticker：{ticker}")
            return {"company": matched_company, "ticker": ticker}

        print("Company detected but not in NASDAQ-100.")
        return {"company": ner_company, "ticker": None}

    print("No company detected by NER.")
    return {"company": None, "ticker": None}

In [8]:
def extract_year(text):
    """
    Extract year from input text.
    1. Try Flair NER to identify DATE entities.
    2. Use regex to find 4-digit years if NER fails.
    3. Handle relative time phrases like 'last year' or '2 years ago'.
    """
    sentence = Sentence(text)
    tagger.predict(sentence)
    entities = sentence.to_dict(tag_type="ner")["entities"]

    # Extract date entities
    detected_years = [ent["text"] for ent in entities if ent["labels"][0]["value"] == "DATE"]

    # If NER fails, use regex to extract year
    if not detected_years:
        regex_years = re.findall(r"\b(19[0-9]{2}|20[0-9]{2})\b", text)
        if regex_years:
            detected_years = regex_years

    year = detected_years[0] if detected_years else None

    # Check for past time expressions
    for key, past_offset in past_years_map.items():
        if key in text.lower():
            year = CURRENT_YEAR - past_offset

    # Check for future time expressions
    for key, future_offset in future_years_map.items():
        if key in text.lower():
            year = CURRENT_YEAR - future_offset  # Negative offset for future

    # Handle specific patterns like "5 years ago"
    match = re.search(r"(\d+) years ago", text.lower())
    if match:
        years_ago = int(match.group(1))
        year = CURRENT_YEAR - years_ago

    match = re.search(r"(\d+) years later", text.lower())
    if match:
        years_later = int(match.group(1))
        year = CURRENT_YEAR + years_later

    if year:
        match = re.search(r"\b(19[0-9]{2}|20[0-9]{2})\b", str(year))
        if match:
            year = int(match.group(1))
            print(f"Extracted year: {year}")
        else:
            print(f"Invalid year format: {year}")
            year = None
    else:
        print("No year detected.")

    return {"year": year} if year else None

In [9]:
# Test Inputs
test_texts = [
    "The revenue of Airbnb in 2021",  # Expected: ABNB, 2021
    "What was Facebook's revenue in 2023?",  # Facebook → META, 2023
    "Tell me about Tesl's performance",  # Misspelled 'Tesla' → TSLA, year is not specified
    "Nividia stock price forecast",  # Misspelled 'Nvidia' → NVDA, year is not specified
    "How is XYZ Corp performing?",  # Not in NASDAQ-100 → should return None, None
    "Trade Desk's profit in 2021",  # TTD, 2021
    "Analog's business viewpoint in 2026",  # ADI, 2026
    "I found that Amgen company performs better than before. Could you give me the revenue of Amgen?",  # AMGN, year is not specified
]

In [10]:
# Run Tests
for text in test_texts:
    print(f"\n🔎 Test Input: {text}")

    company_result = extract_company_ticker(text)
    year_result = extract_year(text)

    print("Company Extraction Result:", company_result)
    print("Year Extraction Result:", year_result)


🔎 Test Input: The revenue of Airbnb in 2021
Detected company by NER: airbnb
Exact match found: ABNB
Extracted year: 2021
Company Extraction Result: {'company': 'airbnb', 'ticker': 'ABNB'}
Year Extraction Result: {'year': 2021}

🔎 Test Input: What was Facebook's revenue in 2023?
Detected company by NER: facebook
Company detected but not in NASDAQ-100.
Extracted year: 2023
Company Extraction Result: {'company': 'facebook', 'ticker': None}
Year Extraction Result: {'year': 2023}

🔎 Test Input: Tell me about Tesl's performance
Detected company by NER: tesl
Fuzzy match: tesla, Ticker：TSLA
No year detected.
Company Extraction Result: {'company': 'tesla', 'ticker': 'TSLA'}
Year Extraction Result: None

🔎 Test Input: Nividia stock price forecast
Detected company by NER: nividia
Fuzzy match: nvidia, Ticker：NVDA
No year detected.
Company Extraction Result: {'company': 'nvidia', 'ticker': 'NVDA'}
Year Extraction Result: None

🔎 Test Input: How is XYZ Corp performing?
Detected company by NER: xyz 