In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Pull variables from a local .env file into the process environment;
# override=True lets values from the file replace ones that are already set.
load_dotenv(override=True)
# None when OPENAI_API_KEY is not defined anywhere.
api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
# Shared OpenAI client and the model name used by the chat-completion calls below.
openai = OpenAI()
MODEL = "gpt-4o-mini"

In [4]:
# HTTP headers sent with every page request made by Website (a Chrome User-Agent string).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}
# Change from the earlier version: the class now also collects every link found on the page into a list (self.links) so the links can be accessed later.
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    The constructor fills in:
      url   - the address that was requested
      body  - the raw response bytes
      title - text of the <title> tag, or "No title found"
      text  - text of <body> with script/style/img/input elements removed
              ("" when the page has no <body>)
      links - every non-empty href found on an <a> tag, exactly as written
              in the page (so relative links, "#", mailto: etc. are kept)
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and extract its title, readable text and links.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before giving up. Without
                a timeout a stalled server would block the notebook forever.

        Raises:
            requests.RequestException: if the request itself fails (bad URL,
                connection error, timeout). HTTP error statuses are not
                raised; whatever body the server returned is parsed.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None for an empty <title> or one containing nested tags,
        # so fall back to the placeholder in that case too.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"
        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        # Anchors without an href (or with an empty one) are discarded.
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and text formatted as a block for an LLM prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# Scrape a sample page and display the links collected from it.
ps = Website("https://prith27.github.io/")
ps.links

['mailto:prithvi.seshadri01@gmail.com',
 'tel:+919940654174',
 'https://www.linkedin.com/in/prithvi-seshadri-b736631b3/',
 'https://scholar.google.com/citations?user=aP4C_hoAAAAJ&hl=en',
 '#',
 'https://ieeexplore.ieee.org/document/10395841',
 'https://onlinelibrary.wiley.com/doi/10.1002/9781119905172.ch6',
 'https://ai.plainenglish.io/a-beginners-guide-to-training-a-yolov5-object-detection-model-91adffe99f79',
 'https://ai.plainenglish.io/building-accurate-object-detection-models-with-retinanet-a-comprehensive-step-by-step-guide-b8a35f435285',
 'https://iopscience.iop.org/article/10.1088/1742-6596/2115/1/012038']

In [6]:
# System prompt for the link-selection call: describes the task and shows the
# JSON shape the model must return. The example has to be valid JSON itself,
# because the model imitates it.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [7]:
# Print the assembled system prompt to check its wording and the JSON example.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [8]:
def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick brochure-worthy links.

    Args:
        website: object exposing `url` (str) and `links` (list of str).

    Returns:
        The instruction text followed by the page's links, one per line.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    heading = "Links (some might be relative links):\n"
    listing = "\n".join(website.links)
    return instructions + heading + listing

In [9]:
# Preview the user prompt generated for the sample page scraped above.
print(get_links_user_prompt(ps))

Here is the list of links on the website of https://prith27.github.io/ - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
mailto:prithvi.seshadri01@gmail.com
tel:+919940654174
https://www.linkedin.com/in/prithvi-seshadri-b736631b3/
https://scholar.google.com/citations?user=aP4C_hoAAAAJ&hl=en
#
https://ieeexplore.ieee.org/document/10395841
https://onlinelibrary.wiley.com/doi/10.1002/9781119905172.ch6
https://ai.plainenglish.io/a-beginners-guide-to-training-a-yolov5-object-detection-model-91adffe99f79
https://ai.plainenglish.io/building-accurate-object-detection-models-with-retinanet-a-comprehensive-step-by-step-guide-b8a35f435285
https://iopscience.iop.org/article/10.1088/1742-6596/2115/1/012038


In [10]:
def get_links(url):
    """
    Ask the model which links on the page at `url` belong in a brochure.

    The page is scraped, its links are sent to the chat model together with
    link_system_prompt, and the reply (requested as a JSON object) is parsed.

    Returns:
        The decoded JSON reply as a Python object.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [11]:
# Try link selection on the sample site; the parsed JSON reply is shown as the cell result.
get_links("https://prith27.github.io/")

{'links': [{'type': 'LinkedIn profile',
   'url': 'https://www.linkedin.com/in/prithvi-seshadri-b736631b3/'},
  {'type': 'Google Scholar profile',
   'url': 'https://scholar.google.com/citations?user=aP4C_hoAAAAJ&hl=en'},
  {'type': 'IEEE document',
   'url': 'https://ieeexplore.ieee.org/document/10395841'},
  {'type': 'Wiley publication',
   'url': 'https://onlinelibrary.wiley.com/doi/10.1002/9781119905172.ch6'},
  {'type': 'AI guide article',
   'url': 'https://ai.plainenglish.io/a-beginners-guide-to-training-a-yolov5-object-detection-model-91adffe99f79'},
  {'type': 'RetinaNet guide article',
   'url': 'https://ai.plainenglish.io/building-accurate-object-detection-models-with-retinanet-a-comprehensive-step-by-step-guide-b8a35f435285'},
  {'type': 'IOPscience article',
   'url': 'https://iopscience.iop.org/article/10.1088/1742-6596/2115/1/012038'}]}

In [12]:
def get_all_details(url):
    """
    Collect the text of the landing page and of every link the model selected.

    The landing page contents come first; each selected page is appended below
    it, headed by the link's "type" label.

    Args:
        url: landing page address.

    Returns:
        One string containing all gathered page contents.

    Notes:
        - A selected page that cannot be fetched is reported and skipped, so a
          single dead or blocked link does not abort the whole brochure.
        - The landing page itself is fetched without that protection; a failure
          there still raises.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    # .get(): tolerate a reply that has no "links" key at all.
    for link in links.get("links", []):
        target = link.get("url")
        if not target:
            continue
        # The model is asked for full URLs but may echo a relative href;
        # resolve it against the landing page (absolute URLs pass through).
        target = urljoin(url, target)
        try:
            contents = Website(target).get_contents()
        except requests.RequestException as error:
            print(f"Skipping {target}: {error}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += contents
    return result

In [13]:
# Gather the landing page and the selected linked pages for the sample site.
get_all_details("https://prith27.github.io/")

Found links: {'links': [{'type': 'linkedin page', 'url': 'https://www.linkedin.com/in/prithvi-seshadri-b736631b3/'}, {'type': 'scholar profile', 'url': 'https://scholar.google.com/citations?user=aP4C_hoAAAAJ&hl=en'}, {'type': 'IEEE document', 'url': 'https://ieeexplore.ieee.org/document/10395841'}, {'type': 'Wiley article', 'url': 'https://onlinelibrary.wiley.com/doi/10.1002/9781119905172.ch6'}, {'type': 'AI blog post 1', 'url': 'https://ai.plainenglish.io/a-beginners-guide-to-training-a-yolov5-object-detection-model-91adffe99f79'}, {'type': 'AI blog post 2', 'url': 'https://ai.plainenglish.io/building-accurate-object-detection-models-with-retinanet-a-comprehensive-step-by-step-guide-b8a35f435285'}, {'type': 'IOPscience article', 'url': 'https://iopscience.iop.org/article/10.1088/1742-6596/2115/1/012038'}]}


'Landing page:\nWebpage Title:\nPrithvi Seshadri\nWebpage Contents:\nPrithvi Seshadri\nData Scientist\nShow Contacts\nEmail\nprithvi.seshadri01@gmail.com\nPhone\n+91 9940654174\nBirthday\nSept 27, 2001\nLocation\nChennai, Tamil Nadu, India\nAbout\nResume\nBlog\nContact\nAbout me\nHey there! I\'m a Data Scientist at Shell, born in the BBQ haven of Houston, Texas, but currently rocking it in Chennai, India. By day, I wrangle data and deploy cutting-edge ML & DL algorithms to solve Shell’s asset-based mysteries with end-to-end CI/CD pipelines, saving the company millions.\nWhen I’m not crunching numbers, you\'ll find me on the football field. I\'ve played Division 1 football and represented my school in national competitions, winning numerous local tournaments along the way.\nOn the nerdy side, I love diving into data science research and have proudly published three papers in the field. Footballer by day, data scientist by... well, also by day. Welcome to my world!\nWhat I do\nData Scien

In [16]:
# System prompt for brochure generation (markdown output). Each continued line
# must end with a space before the backslash, otherwise the sentences are glued
# together in the final string.
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for brochure generation.

    Args:
        company_name: name shown to the model as the company being described.
        url: landing page address; its contents and those of the selected
            linked pages are gathered with get_all_details.
        max_chars: maximum length of the returned prompt, in characters.
            Anything beyond it is cut off. Pass None to disable truncation.

    Returns:
        The prompt text, at most `max_chars` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so max_chars=None means "no limit".
    return user_prompt[:max_chars]

In [17]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the site at `url` and render it in the notebook.

    Sends system_prompt plus the prompt built by get_brochure_user_prompt to
    the chat model and displays the reply as Markdown. Returns nothing.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [18]:
# Generate and display a brochure for the sample site.
create_brochure("Prithvi Website", "https://prith27.github.io/")

Found links: {'links': []}


# Prithvi Seshadri - Data Scientist at Shell

---

## About Me
Hello! I’m Prithvi Seshadri, a Data Scientist currently working with Shell in Chennai, India. Originally from the BBQ haven of Houston, Texas, I’ve transformed my love for analyzing complex data into actionable insights that have saved millions for my company.

When not wrangling data or developing advanced machine learning algorithms, I indulge my passion for football, having played Division 1 and represented my school at national competitions.

---

## What I Do
### Data Science
Utilizing data to uncover patterns and developing advanced machine learning and deep learning solutions tailored for asset management.

### Artificial Intelligence
Research enthusiast with hands-on experience in AI algorithms and large language models.

### MLOps
Specialist in deploying fully automated CI/CD pipelines with expertise in Azure and GitHub Actions.

### Football
A dedicated Manchester United supporter and passionate football player.

---

## Education
- **Vellore Institute of Technology**  
  BTech in Computer Science (AI and ML Specialization)  
  Graduated: 2023 | CGPA: 9.16/10

- **DAV Public School**  
  AISSCE (CBSE) | 96.2% (Ranked 3rd in Computer Science)  
  Captain of the school football team.

---

## Experience
### Data Scientist @ Shell  
*Aug 2023 — Present*  
- Designed automated CI/CD pipelines on Azure for deploying advanced ML & DL algorithms.
- Implemented predictive maintenance models achieving approximately $9 million in cost savings.

### Data Analyst Intern @ Acies Global  
*May 2022 — Jun 2022*  
- Analyzed market trends to forecast sales fluctuations using Amperity and Snowflake.

---

## Publications & Blogs
1. **Computer Vision**  
   *"Hand Detection and Morse Code Translation for Alternative Communication"*  
   IEEE International Conference on System, Computation, Automation and Networking (ICSCAN) - Nov 18, 2023

2. **ML & DL**  
   *"Detection of Phishing URLs Using Machine Learning and Deep Learning Models"*  
   Evolution and Applications of Quantum Computing, Scrivener Publishing LLC - Jun 2, 2022

3. **Computer Vision**  
   *"A Beginner's Guide to Training a YOLOv5 Object Detection Model"* - Medium.com - Feb 24, 2023

4. **Computer Vision**  
   *"Building Accurate Object Detection Models with RetinaNet: A Comprehensive Step-by-Step Guide"* - Medium.com - Feb 20, 2023

5. **AI & DL**  
   *"Web Based COVID Detection System using Deep Learning"*  
   Journal of Physics: Conference Series, IOP Science - Nov 24, 2021

---

## Connect with Me
For collaboration or inquiries, feel free to reach out:
- **Email**: [prithvi.seshadri01@gmail.com](mailto:prithvi.seshadri01@gmail.com)
- **Phone**: +91 9940654174

---

Join me in making data-driven decisions and exploring the exciting intersection of technology and sports!

In [19]:
# Now build the Gradio UI
import gradio as gr

In [25]:
# System prompt used by the Gradio UI. This rebinds the module-level name
# system_prompt, so any function that reads it uses this wording from now on.
# Each continued line must end with a space before the backslash, otherwise the
# sentences are glued together in the final string.
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Give the response in a structured manner which looks good. \
Include details of company culture, customers and careers/jobs if you have the information."

In [26]:
def stream_gpt(prompt):
    """
    Stream a chat completion for `prompt`, yielding the text accumulated so far.

    Each yielded value is the complete reply up to that point (not just the new
    fragment), which is what a Gradio output component needs to redraw itself.

    Args:
        prompt: user message sent together with the module-level system_prompt.

    Yields:
        str: the response text received so far.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    stream = openai.chat.completions.create(
        model=MODEL,  # same model constant as the non-streaming calls
        messages=messages,
        stream=True,
    )
    result = ""
    for chunk in stream:
        # Some chunks carry no choices; indexing [0] on them would raise.
        if not chunk.choices:
            continue
        # delta.content can be None, hence the `or ""`.
        result += chunk.choices[0].delta.content or ""
        yield result

In [27]:
def stream_brochure(company_name, url):
    """
    Generator used as the Gradio callback: builds the brochure prompt for the
    given company and URL, then relays every partial result from stream_gpt.
    """
    yield from stream_gpt(get_brochure_user_prompt(company_name, url))

In [28]:
# Two text inputs (company name, landing page URL) feed stream_brochure;
# its streamed output is rendered in a Markdown component.
view = gr.Interface(
    fn=stream_brochure,
    inputs=[
        gr.Textbox(label="Company name:"),
        gr.Textbox(label="Landing page URL including http:// or https://"),
    ],
    outputs=[
        gr.Markdown(label="Brochure:"),
    ],
    flagging_mode="never",
)
view.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




Found links: {'links': [{'type': 'about page', 'url': 'https://www.shell.com/about-us.html'}, {'type': 'careers page', 'url': 'https://www.shell.com/careers.html'}, {'type': 'company page', 'url': 'https://www.shell.com/company.html'}, {'type': 'sustainability page', 'url': 'https://www.shell.com/sustainability.html'}, {'type': 'news page', 'url': 'https://www.shell.com/media.html'}]}
