In [None]:
import gradio as gr 
import requests
import json
import re
import pandas as pd
from io import BytesIO

def fetch_url_info(url):
    """Fetch *url* through the reader endpoint at ``https://r.jina.ai/``.

    Args:
        url: Target page address; it is appended verbatim to the reader
            endpoint.

    Returns:
        The decoded JSON body on an HTTP 200 response. Otherwise a dict
        with an ``"error"`` message; when the 200 body is not valid JSON
        the raw text is also included under ``"content"``.
    """
    reader_base = 'https://r.jina.ai/'
    request_headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3945.130 Safari/537.36'
        ),
        'X-Respond-With': 'markdown',
        'x-target-selector': 'p,h1,h2,h3,h4,h5,li,div,title,meta',
        'Accept': 'application/json',
        'X-With-Generated-Alt': 'true',
        'X-With-Links-Summary': 'true',
        'X-With-Images-Summary': 'true',
        'X-Timeout': '200',
    }

    # The outer handler deliberately spans the JSON decoding as well, so any
    # requests-level exception raised there is reported as an error dict.
    try:
        reply = requests.get(
            reader_base + url, headers=request_headers, timeout=60
        )
        if reply.status_code != 200:
            return {
                "error": f"Error fetching data from {url}, Status code: {reply.status_code}"
            }
        try:
            return reply.json()
        except json.JSONDecodeError:
            return {
                "error": "Response is not in JSON format",
                "content": reply.text,
            }
    except requests.exceptions.RequestException as exc:
        return {"error": str(exc)}

def extract_content(info):
    """Split a reader response into ``(title, content_body)``.

    Args:
        info: The value returned by ``fetch_url_info`` -- normally a dict
            holding either ``data.content`` or an ``error`` message.

    Returns:
        A ``(title, content_body)`` tuple:

        * text before / after the first ``===============`` divider line
          when ``data.content`` contains that divider;
        * ``(None, content)`` when the divider is absent;
        * ``(None, <error message>)`` when there is no usable content but
          an ``"error"`` key is present;
        * ``(None, "No content found")`` otherwise.
    """
    separator = "\n===============\n\n"

    # A JSON body is not guaranteed to be an object, and "data" may be null;
    # guard the lookups so malformed payloads fall through to the error /
    # "No content found" branches instead of raising TypeError.
    payload = info.get('data') if isinstance(info, dict) else None
    content = payload.get('content') if isinstance(payload, dict) else None

    if isinstance(content, str):
        title, divider, content_body = content.partition(separator)
        if divider:
            return title, content_body
        return None, content

    if isinstance(info, dict) and 'error' in info:
        return None, info['error']
    return None, "No content found"

def fetch_and_retry(url, max_retries=3):
    """Fetch *url*, retrying until real page content is obtained.

    Args:
        url: Page address handed to ``fetch_url_info``.
        max_retries: Maximum number of fetch attempts. With a value of 0 or
            less no request is made.

    Returns:
        The ``(title, content)`` tuple produced by ``extract_content`` for
        the last attempt, or ``(None, None)`` when no attempt was made. If
        every attempt failed, ``content`` carries the failure message of
        the final attempt.
    """
    title, content = None, None

    for _ in range(max_retries):
        info = fetch_url_info(url)
        title, content = extract_content(info)

        # extract_content reports failures as a non-empty message in
        # `content`, so truthiness alone cannot tell success from failure
        # (it would stop after the first failed attempt). Only stop when
        # the response really carried a `data.content` payload.
        payload = info.get("data") if isinstance(info, dict) else None
        if content and isinstance(payload, dict) and "content" in payload:
            break

    return title, content

def text_to_json_content(url, content):
    """Bundle a URL and its extracted text into a JSON-ready dict.

    Returns:
        ``{"url": url, "content": content}`` with the keys in that order.
    """
    return dict(url=url, content=content)

# Field label -> compiled pattern used to scrape that field from the page
# text. Compiled once at import time rather than once per URL; the dict order
# is also the column order of every result row.
_CAR_FIELD_PATTERNS = {
    "全国最低价": re.compile(r"全国最低价：\s*\[?\*\*([0-9.]+-[0-9.]+万)\*\*\]?"),
    "厂商指导价": re.compile(r"厂商指导价：\s*\[?([0-9.]+-[0-9.]+万)\]?"),
    "品牌": re.compile(r"品牌：\s*\[?(\w+)\]?"),
    "级别": re.compile(r"级别：\s*\[?(\w+)\]?"),
    "产地": re.compile(r"产地：\s*\[?(\w+)\]?"),
    "发动机": re.compile(r"发动机：\s*\[?(\w+)\]?"),
    "变速箱": re.compile(r"变速箱：\s*\[?(\w+)\]?"),
    "车身结构": re.compile(r"车身结构：\s*\[?(\w+)\]?"),
    "油耗": re.compile(r"油耗：\s*\[?(\w+)\]?"),
    "口碑": re.compile(r"口碑：\s*\[?(\w+)\]?"),
}


def _extract_car_fields(content):
    """Return ``{label: captured text}`` for every known field ("暂无" if absent)."""
    fields = {}
    for label, pattern in _CAR_FIELD_PATTERNS.items():
        found = pattern.search(content)
        fields[label] = found.group(1) if found else "暂无"
    return fields


def process_urls(url_input):
    """Fetch every URL in *url_input* and collect the scraped car fields.

    Args:
        url_input: Text with one URL per line. Surrounding whitespace is
            ignored and blank lines are skipped.

    Returns:
        ``(data_list, excel_file_path)`` where ``data_list`` holds one dict
        per URL that yielded content (key ``"URL"`` followed by the scraped
        fields) and ``excel_file_path`` is the path of the spreadsheet the
        same rows were written to.
    """
    data_list = []

    for raw_line in url_input.splitlines():
        url = raw_line.strip()
        if not url:
            # A blank line would otherwise be fetched as an empty URL,
            # i.e. the bare reader endpoint, and yield a junk row.
            continue

        _title, content = fetch_and_retry(url)
        if not content:
            continue

        # Store the stripped URL so rows carry no stray whitespace.
        data_list.append({"URL": url, **_extract_car_fields(content)})

    df = pd.DataFrame(data_list)
    excel_file_path = "/tmp/car_info.xlsx"  # Vercel functions use /tmp for temporary files
    df.to_excel(excel_file_path, index=False)

    return data_list, excel_file_path

def gradio_app(request):
    """Handle a request: scrape the submitted URLs and summarise the rows.

    Reads ``"url_input"`` (default ``""``) from ``request.json``, passes it
    to ``process_urls`` and renders every resulting row as a URL line
    followed by a comma-separated ``label: value`` line.

    Returns:
        A dict with ``"formatted_output"`` (the rendered text) and
        ``"excel_file_url"`` (the path returned by ``process_urls``).
    """
    body = request.json
    rows, workbook_path = process_urls(body.get("url_input", ""))

    # Labels in the order they appear on the summary line.
    field_order = (
        "全国最低价", "厂商指导价", "品牌", "级别", "产地",
        "发动机", "变速箱", "车身结构", "油耗", "口碑",
    )

    rendered_rows = []
    for row in rows:
        summary = ", ".join(f"{label}: {row[label]}" for label in field_order)
        rendered_rows.append(f"{row['URL']}\n{summary}\n")

    # The result is returned as a JSON-style mapping including the file path.
    result = {}
    result["formatted_output"] = "\n".join(rendered_rows)
    result["excel_file_url"] = workbook_path
    return result



  from .autonotebook import tqdm as notebook_tqdm
2024-11-09 17:47:35.274 
  command:

    streamlit run C:\Users\jsy13\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]


Running on local URL:  http://127.0.0.1:7860


--------


Running on public URL: https://aa731b52d6e9db27ae.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


