# Machine Learning for Business - Individual Coursework
## Market Research Assistant

#### Part 3: IPYNB File
Candidate ID: MPZY3

### Installing Libraries and Setting Up

In [None]:
!pip install -q langchain-google-genai
!pip install -q langchain-community
!pip install -q wikipedia

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m29.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m29.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.4 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.16 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5

In [None]:
# Import libraries
import json
import pandas as pd
import requests
import time
import wikipedia
import openai
from langchain_core.prompts import ChatPromptTemplate
from langchain.retrievers import WikipediaRetriever
from langchain_google_genai import ChatGoogleGenerativeAI

In [None]:
wikipedia.set_lang("en")  # Ensures results are in English

# Make `wikipedia.BeautifulSoup(html)` build its soup with the lxml parser.
# The real constructor is captured BEFORE the name is rebound: the previous lambda looked up
# `wikipedia.BeautifulSoup` at call time, which by then was the lambda itself, so any call
# through the patched name recursed until RecursionError.
# NOTE(review): this rebinds the name on the `wikipedia` package namespace only — confirm the
# library resolves BeautifulSoup through this namespace, otherwise the patch has no effect.
if not getattr(wikipedia.BeautifulSoup, "_forces_lxml", False):
    _original_soup = wikipedia.BeautifulSoup

    def _lxml_soup(html):
        """Build a BeautifulSoup object from `html` using the lxml parser."""
        return _original_soup(html, features="lxml")

    _lxml_soup._forces_lxml = True  # marker so re-running this cell does not wrap the wrapper
    wikipedia.__dict__["BeautifulSoup"] = _lxml_soup

In [None]:
# Import API keys
# The keys are fetched at run time through Colab's `userdata.get`, so no secret is hard-coded
# in the notebook. They are exported as environment variables for the rest of the session.
from google.colab import userdata
import os

# NOTE(review): GOOGLE_API_KEY is presumably picked up by ChatGoogleGenerativeAI — confirm.
os.environ['GOOGLE_API_KEY'] = userdata.get('mlb1') # Google Gemini's API key for generating reports
# OPENAI_API_KEY is read back with os.getenv() inside evaluate_response().
os.environ['OPENAI_API_KEY'] = userdata.get('openai') # OpenAI's API key for evaluating reports

### Model Setup

In [None]:
# Selecting LLM for the assistant
# Both values are passed to the ChatGoogleGenerativeAI constructor in the next cell.
model = "gemini-1.5-flash"  # model identifier handed to the constructor's `model` argument
max_output_tokens = 2000  # handed to the constructor's `max_output_tokens` argument

In [None]:
# Defining the LLM
# Module-level chat model instance; generate_industry_report() calls llm.invoke() on it.
# NOTE(review): temperature duplicates Config.LLM_TEMP (0.4), which is defined in a later cell
# and is not referenced here — the two values have to be kept in sync by hand.
llm = ChatGoogleGenerativeAI(
    temperature=0.4,
    model=model,
    max_output_tokens=max_output_tokens,
)

In [None]:
class Config:
    """Tunable settings shared by the Wikipedia helpers and the report generator."""

    # Maximum number of characters kept from a single Wikipedia page's content.
    WIKI_MAX_CHARS = 10_000

    # LLM temperature setting. Not referenced by the visible code: the ChatGoogleGenerativeAI
    # constructor is given 0.4 directly.
    LLM_TEMP = 0.4

    # Number of attempts generate_industry_report() makes before giving up.
    RETRIES = 3

### Defining Functions

#### Extract Information from Wikipedia

In [None]:
def fetch_wikipedia_data(industry: str):
    """Retrieve Wikipedia background material for an industry.

    Three strategies are tried in order; the first one that yields content wins:

    1. A page looked up by the industry name followed by "industry".
    2. The member pages of the matching "Category:<name> industry" category.
    3. A title-filtered Wikipedia search on the industry name as given.

    Args:
        industry: Industry name as supplied by the user, e.g. "Steel" or "Music Industry".

    Returns:
        A dict with the keys "content" (str), "url" (list of str) and "page_title" (str),
        or None when no strategy produced a usable page or an unexpected error occurred.
    """
    request_timeout_seconds = 10  # never let a stalled HTTP call hang the notebook

    try:
        industry_name = industry.strip()
        if not industry_name:
            print("[WARNING] No industry name supplied.")
            return None

        # Callers often pass "Music Industry"; do not build "Music Industry industry".
        if industry_name.lower().endswith("industry"):
            industry_term = industry_name
        else:
            industry_term = f"{industry_name} industry"

        # Step 1: Try finding a direct Wikipedia page for the industry
        try:
            main_page = wikipedia.page(industry_term, auto_suggest=True)
            return {
                "content": main_page.content[:Config.WIKI_MAX_CHARS],
                "url": [main_page.url],
                "page_title": main_page.title
            }
        except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
            print(f"No direct industry page for: {industry}. Trying category search...")

        # Step 2: Check if a Wikipedia category exists for the industry
        category_search_term = f"Category:{industry_term}"
        category_url = f"https://en.wikipedia.org/wiki/{category_search_term.replace(' ', '_')}"
        category_response = requests.get(category_url, timeout=request_timeout_seconds)

        if category_response.status_code == 200:
            category_data = fetch_wikipedia_category_pages(category_search_term)
            if category_data:
                return category_data
            # The category exists but gave nothing usable: keep going instead of giving up.
            print("[WARNING] Industry category had no usable pages. Using filtered Wikipedia search...")
        else:
            print(f"No industry category found. Using filtered Wikipedia search...")

        # Step 3: Use Wikipedia search to find related pages (but filter irrelevant ones)
        return fetch_filtered_wikipedia_page(industry)

    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, then signal "no data".
        print(f"[ERROR] Wikipedia lookup failed for {industry}: {e}")
        return None

In [None]:
def fetch_wikipedia_category_pages(category: str, industry: str = None):
    """Retrieve Wikipedia pages from an industry category.

    Up to five member pages of the category are requested from the MediaWiki API. A page is
    kept when its title contains the industry keyword or the word "industry"; the kept pages'
    content (each truncated to Config.WIKI_MAX_CHARS) is joined with blank lines.

    Args:
        category: Category title including its prefix, e.g. "Category:Steel industry".
        industry: Optional keyword used for the title filter. When omitted it is derived from
            `category` by dropping the "Category:" prefix and a trailing "industry".

    Returns:
        A dict with the keys "content" (str), "url" (list of str) and "page_title" (the
        category title), or None when nothing usable was found or an error occurred.
    """
    request_timeout_seconds = 10  # never let a stalled HTTP call hang the notebook

    try:
        # The title filter needs a keyword. The original code read an undefined name here,
        # which raised NameError and made every category lookup return None.
        if industry is None:
            keyword = category.split(":", 1)[-1]
            if keyword.lower().endswith("industry"):
                keyword = keyword[:-len("industry")]
        else:
            keyword = industry
        keyword = keyword.strip().lower()

        # Wikipedia API request; `params` lets requests encode characters such as "&" safely.
        response = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "format": "json",
                "list": "categorymembers",
                "cmtitle": category.replace(' ', '_'),
                "cmtype": "page",  # articles only; sub-categories and files are not readable pages
                "cmlimit": 5,
            },
            timeout=request_timeout_seconds,
        )
        response.raise_for_status()

        # Retrieve list of pages under the relevant Wikipedia category
        pages = response.json().get("query", {}).get("categorymembers", [])

        # If pages are not found
        if not pages:
            print(f"[WARNING] No pages found in category: {category}")
            return None

        relevant_docs = []
        urls = []

        for page in pages:
            try:
                page_data = wikipedia.page(page["title"], auto_suggest=False)

                # Filter pages that are too generic or unrelated
                title_lower = page_data.title.lower()
                mentions_keyword = bool(keyword) and keyword in title_lower
                if not mentions_keyword and "industry" not in title_lower:
                    print(f"[WARNING] Skipping unrelated page: {page_data.title}")
                    continue

                # Storing data
                relevant_docs.append(page_data.content[:Config.WIKI_MAX_CHARS])
                urls.append(page_data.url)

            # Handling errors
            except wikipedia.exceptions.PageError:
                print(f"[WARNING] Page not found: {page['title']}")
                continue
            except wikipedia.exceptions.DisambiguationError:
                print(f"[WARNING] Disambiguation page detected: {page['title']}. Skipping...")
                continue

        if not relevant_docs:
            return None

        return {
            "content": "\n\n".join(relevant_docs),
            "url": urls,
            "page_title": category
        }

    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, then signal "no data".
        print(f"[ERROR] Category lookup failed for {category}: {e}")
        return None


In [None]:
def fetch_filtered_wikipedia_page(topic: str):
    """Last-resort lookup: search Wikipedia for `topic` and return the first relevant hit.

    At most two search results are requested. A hit counts as relevant when its page title
    contains `topic` (case-insensitively) or the word "industry". Missing and disambiguation
    pages are skipped.

    Returns a dict with "content" (truncated to Config.WIKI_MAX_CHARS), "url" (one-element
    list) and "page_title", or None when nothing relevant is found or any other error occurs.
    """
    try:
        for candidate_title in wikipedia.search(topic, results=2):
            try:
                candidate = wikipedia.page(candidate_title, auto_suggest=False)

                # Relevance check on the resolved page title
                if topic.lower() in candidate.title.lower() or "industry" in candidate.title.lower():
                    return {
                        "content": candidate.content[:Config.WIKI_MAX_CHARS],  # truncated to avoid overload
                        "url": [candidate.url],
                        "page_title": candidate.title
                    }

                print(f"[WARNING] Skipping irrelevant page: {candidate.title}")

            # Unusable search hits are passed over
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
                continue

    except Exception as e:
        return None

    return None

#### Define LLM System Prompt

In [None]:
def create_llm_prompt(industry: str, wiki_content: str):
    """
    Build the report-generation prompt for the LLM.

    The industry name and the Wikipedia text are interpolated into a fixed template that asks
    for a ~600-word market research report returned as JSON with a prescribed set of fields.
    """
    prompt_text = f"""
      You are a market research assistant tasked with writing a comprehensive report (approximately 600 words total) about the {industry} industry.
      Read the Wikipedia page given below to understand the industry.

      Your report MUST be approximately 600 words in total. Do not stop generating early. Use the Wikipedia content below as background.

      Use the Wikipedia content below as the **ONLY** source of information.

      You MUST return only JSON, with no additional text or explanations and must have exactly these fields:
      ```json
      {{
        "industry": "Industry name",
        "industry_overview": "A summary (120-150 words).",
        "competitive_landscape": {{
          "competition_intensity": "Low, Medium, or High. Additionally, give a short explanation (50-60 words)",
          "key_players": [
            "List 3 to 5 dominant companies/key players/manufacturers/organisations with brief explanations (60-70 words each).",
            "If Wikipedia does not mention any companies, state: 'Insufficient data available.' DO NOT GUESS OR INFER COMPANIES."
            ],
          "barriers_to_entry": "List 3 to 5 major obstacles for new companies entering the market (50-60 words each)."
        }},
        "emerging_technologies": [
          "List of 3 to 5 key technologies shaping the industry (100-120 words total)."
        ],
        "consumer_insights": {{
          "target_audience": "Main demographics of customers (70-80 words total).",
          "buying_trends": "Consumer behaviours shaping the market (70-80 words total)."
        }},
        "risk_analysis": {{
          "economic_risks": "Potential economic downturns and their impact (50-60 words).",
          "technological_risks": "Challenges from rapid technological changes (50-60 words).",
          "geopolitical_risks": "Trade restrictions, policy shifts, or global instability (50-60 words)."
        }},
        "high_growth_areas": "Segments of the industry expected to grow rapidly (80-90 words).",
        "wikipedia_sources": "List of URLs referencing Wikipedia pages used."
      }}
      ```

      Requirements:
      1. Your output must be close to **600 words**.
      2. If the output is shorter, expand your explanations. Do NOT shorten responses unnecessarily.
      3. If your response is too long, reduce explanation length per section.
      4. DO NOT exceed 600 words.
      5. The 'wikipedia_sources' field is an array of URL strings referencing the pages used.
      6. Do not include any extra keys beyond the specified JSON fields.
      7. Do NOT create fictional companies, facts or trends.
      8. Verify that the Wikipedia page is actually about the mentioned industry and ensure all responses are accurate and directly supported by Wikipedia data.

      Wikipedia content:
      {wiki_content}
      """
    return prompt_text

#### Generate Industry Report

In [None]:
def generate_industry_report(industry: str):
    """Generate a market research report for an industry and print it if it passes review.

    Steps:
      1. Fetch Wikipedia background with fetch_wikipedia_data().
      2. Ask the LLM for a JSON report, retrying up to Config.RETRIES times on empty
         responses or errors (including unparseable JSON).
      3. Overwrite the report's "wikipedia_sources" with the URLs actually retrieved.
      4. Have evaluate_response() judge the report; print it as a fenced JSON block when the
         verdict is "Yes" or "Ambiguous", otherwise print a "no relevant information" notice.

    Args:
        industry: Industry name supplied by the user.

    Returns:
        None. All results are printed.
    """
    no_info_message = f"No relevant information found for the {industry} industry."

    # Fetch Wikipedia data
    wiki_data = fetch_wikipedia_data(industry)
    if not wiki_data:
        # Tell the user why nothing is shown instead of returning silently.
        print(no_info_message)
        return None

    # Initialise prompt
    prompt = create_llm_prompt(industry, wiki_data["content"])

    # Run LLM
    for attempt in range(Config.RETRIES):
        try:
            # Get a response from the LLM
            response = llm.invoke(prompt)
            if not response or not response.content:
                continue

            # Strip the markdown fence the model is asked to emit, then parse the JSON
            raw_text = response.content.strip()
            report_data = json.loads(raw_text.replace("```json", "").replace("```", "").strip())

            # Replace whatever the model wrote with the URLs that were actually retrieved
            report_data["wikipedia_sources"] = wiki_data["url"]

            # Evaluate the parsed report. Passing the dict (not the fenced string) matches
            # evaluate_response's signature and avoids JSON-encoding an already-encoded string.
            evaluation = evaluate_response(industry, report_data)

            # Tolerate surrounding whitespace, quotes or a trailing full stop in the verdict
            verdict = evaluation.strip().strip("\"'.").lower()
            if verdict in ("yes", "ambiguous"):  # Only printing relevant reports
                print("```json\n" + json.dumps(report_data, indent=2) + "\n```")
            else:
                print(no_info_message)
            break
        except Exception as e:
            print(f"[ERROR] Failed to generate report for {industry}: {str(e)}")
    else:
        # Reached only when no attempt ended in `break`
        print(f"[ERROR] No report produced for {industry} after {Config.RETRIES} attempts.")

    return None

#### Evaluate Report Quality and Relevance

In [None]:
def evaluate_response(industry: str, report: dict):
    """Ask a second LLM whether a generated report is relevant to the industry.

    The grading rubric is sent as the system message and the report as the user message to
    OpenAI's chat completions API (model "gpt-4o-mini").

    Args:
        industry: Industry name the report is supposed to cover.
        report: The generated report. A dict is serialised to indented JSON; a string is
            sent as-is, so an already-serialised report is not JSON-encoded a second time.

    Returns:
        The evaluator's reply with surrounding whitespace removed — expected to be "Yes",
        "No" or "Ambiguous". An empty string is returned when the API reply has no content.
    """
    system_prompt_eval = f"""
      Critically evaluate the provided response to the customer's query regarding market research.
      Assess the following:
      - Does the response provide a factually correct answer based on the query and the relevant Wikipedia page?
      - Is the response actually from a relevant Wikipedia page and does it include specific information about {industry}?
      - Does the response contain sufficient depth and detail?
      - Is the response structured in a way that makes sense for business professionals?
      - Is the AI hallucinating and generating information that is not true?

      **IMPORTANT RULES**
      If the response is appropriate, return "Yes".
      Focus on whether the content in the report is actually related to the {industry} industry. If it is not relevant, return "No".
      If the response is incorrect, lacks details, or is too generic return "No" instead of "Yes".
      If the response is uncertain or lacks sufficient information, return "Ambiguous".

      Respond only with "Yes", "No", or "Ambiguous" with no extra text.
      """

    # The rubric already travels in the system message, so the user message carries only the
    # report (previously the rubric was sent twice).
    report_text = report if isinstance(report, str) else json.dumps(report, indent=2)
    prompt = f"Generated Market Research Report:\n{report_text}"

    # Initialise OpenAI client with the API key
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Generate a response
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Using a different LLM to verify the report generated
        messages=[
            {"role": "system", "content": system_prompt_eval},
            {"role": "user", "content": prompt}
        ],
        max_tokens=50,  # Keeping response short as we only need "Yes", "No", or "Ambiguous"
        temperature=0,  # Classification task: keep the verdict as repeatable as possible
    )

    # `content` can be None; treat that as an empty verdict rather than crashing on .strip()
    evaluation_result = (response.choices[0].message.content or "").strip()
    return evaluation_result

### Generating Reports

In [None]:
generate_industry_report("Real Estate on Jupiter") # Testing on a fake industry

No relevant information found for the Real Estate on Jupiter industry.


In [None]:
generate_industry_report("Financial Services")

```json
{
  "industry": "Financial Services",
  "industry_overview": "Financial services encompass a wide array of economic services related to finance, provided by various institutions.  These services include managing finances, offering consumer finance options, and facilitating transactions at a macro level impacting global politics and economics.  The industry's influence is significant, leading to ongoing discussions about its power and scale, particularly in developed economies.  Key players include commercial banks, investment banks, insurance companies, and various other specialized firms offering products like credit cards, mortgages, and investment management services. The industry's structure and offerings have evolved, particularly due to legislation like the Gramm-Leach-Bliley Act, allowing mergers and diversification of services within holding companies.",
  "competitive_landscape": {
    "competition_intensity": "High. The financial services industry is characterized by 

In [None]:
generate_industry_report("Music Industry")

```json
{
  "industry": "Music Industry",
  "industry_overview": "The music industry encompasses individuals and organizations involved in creating, producing, distributing, and performing music.  This includes songwriters, composers, musicians, singers, record labels, publishers, recording studios, music producers, retailers, digital music stores, concert promoters, and various support professionals like managers, lawyers, and agents.  The industry's structure involves creating compositions, recordings, and media (physical and digital), each with its own ownership and revenue streams.  The industry has undergone significant transformation with the rise of digital distribution, impacting sales and revenue models.",
  "competitive_landscape": {
    "competition_intensity": "High. The industry is dominated by a few major players, but faces intense competition among independent labels and digital platforms for market share and artist acquisition.",
    "key_players": [
      "Universal Mu

In [None]:
generate_industry_report("Aerospace Industry")

```json
{
  "industry": "Aerospace Industry (United Kingdom)",
  "industry_overview": "The UK aerospace industry is the second-largest globally and the largest in Europe by turnover, holding a 17% global market share in 2019.  Employing 116,000 people in 2020, it boasts a rich history of innovation, responsible for pioneering advancements like the first enclosed-cabin aircraft and the first supersonic commercial jetliner.  The industry's success is driven by a blend of domestic and foreign companies,  a strong government partnership, and a focus on both crewed and unmanned aircraft technologies.  The sector's future hinges on adapting to a competitive global landscape and embracing emerging technologies.",
  "competitive_landscape": {
    "competition_intensity": "High. The global aerospace market is intensely competitive, with established players vying for contracts and market share, requiring significant investment and technological expertise.",
    "key_players": [
      "BAE System

In [None]:
generate_industry_report("Automotive Industry")

```json
{
  "industry": "Automotive Industry (India)",
  "industry_overview": "India's automotive industry is the world's fourth-largest by production and valuation (2022), and the third-largest market by sales (2023).  Valued at over US$100 billion (April 2022), it contributes significantly to India's GDP (7.1%) and exports (8%). While car ownership is increasing (60% of households own a vehicle), the rate of car ownership per capita remains relatively low (around 400 per 1,000 people). The industry's history is marked by initial reliance on imports, followed by the emergence of domestic manufacturers and a period of import substitution and government regulation.",
  "competitive_landscape": {
    "competition_intensity": "Medium. While a few dominant players exist, the market shows signs of increasing competition with new entrants and evolving consumer preferences.",
    "key_players": [
      "Hindustan Motors: Initially collaborated with Morris Motors, producing the Ambassador.  La

In [None]:
generate_industry_report("Hollywood")

```json
{
  "industry": "Hollywood Film Industry",
  "industry_overview": "The Hollywood film industry, centered in Los Angeles, California, has been a major global force since the early 20th century.  Initially developing on the East Coast, the industry migrated westward due to favorable weather and lower production costs.  Hollywood's influence is seen in its development of classical filmmaking styles and its global reach, with films often released in multiple languages. While the US is the fourth largest film producer (behind India, Japan, and China), Hollywood's impact far surpasses its production volume, shaping global cinematic trends and influencing filmmaking worldwide. The industry's success is tied to its innovative spirit, technological advancements, and the creation of iconic films and genres.",
  "competitive_landscape": {
    "competition_intensity": "High. The industry is dominated by a few major studios, creating intense competition for market share, talent, and distrib

In [None]:
generate_industry_report("Pharmaceutical Industry")

```json
{
  "industry": "Pharmaceutical Industry",
  "industry_overview": "The pharmaceutical industry encompasses the discovery, development, production, and marketing of medications and medical devices to treat, prevent, or alleviate symptoms of diseases and injuries.  This industry includes companies dealing in both generic and branded drugs, with the latter protected by chemical patents.  Subdivisions exist, such as biologics manufacturing and total synthesis.  Stringent regulations govern patenting, testing, safety, and marketing. In 2020, the global market generated $1,228.45 billion in treatments, showing a 1.8% CAGR in 2021, despite the COVID-19 pandemic.  Its origins trace back to the mid-to-late 1800s in developed nations like Germany, Switzerland, and the U.S., evolving from local apothecaries to large-scale manufacturers.",
  "competitive_landscape": {
    "competition_intensity": "High. The industry is characterized by intense competition due to the high barriers to entry,

In [None]:
generate_industry_report("Chocolate")

```json
{
  "industry": "Chocolate Industry",
  "industry_overview": "The chocolate industry encompasses the production and distribution of chocolate products, derived from roasted and ground cocoa beans.  With a history spanning millennia, starting with the Mayo-Chinchipe culture in Ecuador, chocolate evolved from bitter beverages in Mesoamerica to the diverse range of sweet treats and products consumed globally today.  The industry involves complex processes from bean fermentation and roasting to the creation of various chocolate types, including dark, milk, and white chocolate, along with numerous confectionery items.  Significant production centers shifted from the Americas to West African countries like Ivory Coast and Ghana, which now account for a substantial portion of global cocoa supply.  However, ethical concerns regarding child labor persist within the industry.",
  "competitive_landscape": {
    "competition_intensity": "High. The chocolate market is highly competitive, wi

#### Question 9 Reports

In [None]:
generate_industry_report("Aluminium")

```json
{
  "industry": "Aluminium Industry",
  "industry_overview": "The aluminium industry encompasses the mining of bauxite, its processing into alumina, and the subsequent smelting of alumina into aluminium metal.  The industry's development was significantly accelerated by the Hall-H\u00e9roult process in 1886, enabling mass production and widespread use.  Aluminium's unique properties\u2014lightweight yet strong, excellent conductor of heat and electricity, and corrosion-resistant due to its oxide layer\u2014make it crucial in transportation, engineering, construction, and packaging.  Its abundance in the Earth's crust, though primarily in compounds, makes it a readily available resource.  The industry's growth is tied to global economic activity and technological advancements.",
  "competitive_landscape": {
    "competition_intensity": "Medium.  While the raw material base is relatively abundant, the energy-intensive nature of smelting and the need for specialized technology cre

In [None]:
generate_industry_report("Advertising")

```json
{
  "industry": "Advertising Industry",
  "industry_overview": "The advertising industry is a global, multibillion-dollar business encompassing public relations, marketing companies, media services, and advertising agencies.  It acts as a crucial link between manufacturers and consumers, serving clients ranging from non-profit organizations to Fortune 500 companies.  The industry's scale is substantial; in the US alone, over 65,000 agencies employed nearly 250,000 people, generating $166.8 billion in revenue by 2014. Global advertising sales reached $493 billion in 2016, with digital ad sales surpassing TV advertising in 2017, highlighting the industry's dynamic and ever-evolving nature.",
  "competitive_landscape": {
    "competition_intensity": "High. The industry is characterized by intense competition among large multinational agencies and smaller specialized firms, leading to constant innovation and price pressures.",
    "key_players": [
      "WPP plc: A global advertisi

In [None]:
generate_industry_report("Internet Services & Infrastructure")

```json
{
  "industry": "Internet Services & Infrastructure (IaaS)",
  "industry_overview": "The Internet Services & Infrastructure industry, specifically Infrastructure as a Service (IaaS), provides computing resources like storage, networks, servers, and virtualization to users via cloud services.  Users manage their operating systems and applications but not the underlying infrastructure. IaaS offers scalability on demand and is available in public, private, or hybrid cloud models.  The industry is characterized by high-level APIs for managing resources and is considered the most basic cloud service model by the Internet Engineering Task Force (IETF).  The global market is experiencing significant growth, driven by increased adoption across various sectors.",
  "competitive_landscape": {
    "competition_intensity": "High. The IaaS market is characterized by intense competition among major cloud providers, each striving for market share and offering similar services.  This leads to 

In [None]:
generate_industry_report("Automobile Manufacturers")

```json
{
  "industry": "Automobile Manufacturers Industry (China)",
  "industry_overview": "China boasts the world's largest automotive industry, encompassing hundreds of manufacturers.  These include state-owned enterprises (SOEs), privately owned companies, foreign manufacturers, and joint ventures.  The industry's structure is complex, with varying levels of government control and a wide range of brands and models.  The market is characterized by intense competition, rapid technological advancements, and significant government influence.  The industry's size and complexity make it a dynamic and challenging sector to analyze.",
  "competitive_landscape": {
    "competition_intensity": "High. The presence of numerous state-owned and privately owned manufacturers, along with foreign players and joint ventures, creates a highly competitive market environment.",
    "key_players": [
      "SAIC Motor: A major state-owned manufacturer with diverse brands like Roewe and MG Motor, showcasi

In [None]:
generate_industry_report("Automobile Manufacturers in South Korea")

No direct industry page for: Automobile Manufacturers in South Korea. Trying category search...
No industry category found. Using filtered Wikipedia search...
```json
{
  "industry": "Automobile Manufacturers in South Korea",
  "industry_overview": "South Korea's automotive industry is the fifth largest globally in both production and export volume.  Initially focused on assembling imported parts, it has evolved into a technologically advanced sector.  Annual domestic output surpassed one million units in 1988, and the 1990s saw the development of numerous in-house models, showcasing design, performance, and technological capabilities.  The industry's growth is marked by government support, strategic partnerships, and a focus on export markets, particularly North America.",
  "competitive_landscape": {
    "competition_intensity": "High. The industry has seen intense competition, both domestically and internationally, leading to mergers, acquisitions, and a constant push for innovation