# MAIN PROJECT BODY

In [15]:
from transformers import pipeline
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from langdetect import detect

def translate_with_mymemory(text, source_lang='auto', target_lang='en'):
    # Split text into chunks of 500 characters
    chunks = [text[i:i+500] for i in range(0, len(text), 500)]
    translated_text = ''

    for chunk in chunks:
        url = "https://api.mymemory.translated.net/get"
        params = {
            "q": chunk,
            "langpair": f"{source_lang}|{target_lang}"
        }
        response = requests.get(url, params=params)
        json_data = response.json()
        translated_text += json_data['responseData']['translatedText'] + ' '
    
    return translated_text

def translate_article_with_selenium(url):
    # Setup WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('headless')  # Run in background
    driver = webdriver.Chrome(options=options)

    # Fetch the webpage
    driver.get(url)
    content = driver.find_element(By.TAG_NAME, 'body').text

    driver.quit()

    # Detect the language
    lang = detect(content)
    print(f"Detected language: {lang}")
    return content
    # Translate to English if not already in English
    # if lang != 'en':
    #     translated_text = translate_with_mymemory(content, lang, 'en')
    # else:
    #     translated_text = content

    # return translated_text

# Example usage
url = "https://ndtv.in/india/mumbai-start-up-founder-expresses-happiness-over-nvidias-support-5269980"
translated_text = translate_article_with_selenium(url)
print(translated_text)





Detected language: hi
NDTV
WORLD EDITION
PROFIT
हिंदी
IPL 2024
MOVIES
FOOD
LIFESTYLE
HEALTH
SWASTH
TECH
INFLUENCERS
GAMES
BIG BONUS
SHOPPING
LIVE टीवी
ताज़ातरीन
देश
चुनाव
वीडियो
MP-छत्तीसगढ़
राजस्थान
क्रिकेट
वेबस्टोरीज़
बॉलीवुड
होम
देश 
"मैं उत्साहित हूं..." : मुंबई स्टार्ट-अप के संस्थापक ने Nvidia के समर्थन पर जताई खुशी
"मैं उत्साहित हूं..." : मुंबई स्टार्ट-अप के संस्थापक ने Nvidia के समर्थन पर जताई खुशी
स्फेरिकल इनसाइट्स एंड कंसल्टिंग की एक रिपोर्ट के अनुसार, वैश्विक एआई बाजार 2022 में 168.5 बिलियन डॉलर से बढ़कर 2032 तक 2 ट्रिलियन डॉलर से अधिक होने का अनुमान है.
Edited by चंदन वत्स, Updated: 19 मार्च, 2024 7:24 PMhttps://ndtv.in/topic/chandan-batsya
मुंबई: एनवीडिया (NVIDIA) आर्टफिशियल इंटेलीजेंस (Artificial Intelligence) कंपनी के तौर पर काम करने के साथ ही ग्राफिक्स प्रोसेसिंग यूनिट्स के क्षेत्र में भी सफलतापूर्वक आगे बढ़ रही है. योट्टा डेटा सर्विसेज के एनवीडिया से ऑर्डर किए गए 4,000 से ज्यादा H100 सेमीकंडक्टर चिप्स सोमवार रात मुंबई के उपनगरीय इलाके में डिलेवर हुए. सभी कर्मचारी रात 10

In [22]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pre-trained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-hi-en"  # Hindi to English translation model
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)
trans=''
# Define your input text in Hindi
#input_text = "आपका स्वागत है। कृपया अपना नाम बताएं।"
chunks = [translated_text[i:i+200] for i in range(1000, len(translated_text)-1000, 200)]
for chunk in chunks:
    # Tokenize the input text
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

    # Perform translation
    outputs = model.generate(**inputs)

    # Decode the translated text
    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    trans= trans +' '+translated
    # Print the translated text
    print("Translated text:", translated)


Translated text: He was so excited about the arrival that the truck arrived for the post office had turned to the coconuts. He was welcomed with flowers and said that it was like a dream to be fulfilled.
Translated text: In building an developed India's way to start up, start up and drones: The number of · Global Voices
Translated text: Two associate members of the Democratic Republic of the Congo, without wasting time, were on the street, some of the people got off the ground, some of them talking about the days of an Invidor from an African Corporation. The company's products are very good and that's the development of AI.
Translated text: There's a lot of help for me, according to the Blueburg report, the AI technology has caused the world's industry to stir up.
Translated text: That's where Yota is putting the biggest difficulty in India at AI. The chief executive officer and co-computer, Helea secreta, has built the country with the Santribution of Envision, which is due to the co

In [26]:
summarizer = pipeline("summarization", model="BeenaSamuel/t5_cnn_daily_mail_abstractive_summarizer_v2")
response = summarizer(trans, num_beams=2, do_sample=True, max_length=1000)
print(response[0]["summary_text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Token indices sequence length is longer than the specified maximum sequence length for this model (1170 > 512). Running this sequence through the model will result in indexing errors


The U.S. computing service is determined to reach the lowest price in the world at the minimum price of an EnVdia AI chip . The most advanced Internet chips in the market are expected to reach nearly 20,000 . There has been a difficult start in the technologyntic acidics .


In [24]:
len(trans)

4728

## Chinese to English

In [2]:
'''
from transformers import MarianMTModel, MarianTokenizer
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from langdetect import detect

def translate_with_transformers(text, model_name="Helsinki-NLP/opus-mt-zh-en"):
    # Load the model and tokenizer
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    translated_text = ''
    
    # Split text into chunks of 200 characters, ensuring not to cut off sentences
    chunks = [text[i:i+200] for i in range(0, len(text), 200)]
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs)
        translated_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_text += translated_chunk + ' '

    return translated_text

def translate_article_with_selenium(url):
    # Setup WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('headless')  # Run in background
    driver = webdriver.Chrome(options=options)

    # Fetch the webpage
    driver.get(url)
    content = driver.find_element(By.TAG_NAME, 'body').text

    driver.quit()

    # Detect the language
    lang = detect(content)
    print(f"Detected language: {lang}")

    # Translate to English if not already in English
    if lang.startswith('zh'):
        translated_text = translate_with_transformers(content)
    else:
        translated_text = content

    return translated_text

# Example usage
url = "https://tech.ifeng.com/c/8WybGOq7JCe"
translated_text = translate_article_with_selenium(url)
print(translated_text)

'''


Detected language: zh-cn
First page, information, video, live broadcast, Phoenix Watch, Finance, Recreation, Sports, Fashion, Automobile, Technology, Reading, Culture, History, Military, Tourism, Buddhism, More Registers, Login, Technology > Artificial Intelligence > Body Station, Nightmare of High-end Beating Workers? Using a self-developed artificial intelligence system called ChipNeMo to help companies improve the efficiency of their manufacturing chips, designed to speed up the production of GPUs, it is stated that the design of GPUs is a labour-intensive exercise, that a chip usually requires close to a thousand people to construct, that everyone needs to have a full understanding of the design details and to work together, and that Chip NeMo is able to respond quickly to work and to queries related to chip design, such as the creation of a GPU structure and chip design code. According to the Wall Street Journal, Yvatar launched C last year (2023) in October. After hipnemo, a lot 

newspaper3k

In [1]:
from newspaper import Article

def fetch_article_content(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text

# Example usage
url = "https://tech.ifeng.com/c/8WybGOq7JCe"
article_text = fetch_article_content(url)
print("Extracted Article Text:")
print(article_text)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\10195\AppData\Local\Temp\jieba.cache
Loading model cost 0.353 seconds.
Prefix dict has been built successfully.


Extracted Article Text:
英伟达

凤凰网科技讯 《AI前哨》（作者/尹明顺）2月7日，据外媒消息称，英伟达正在使用自己开发的名为ChipNeMo的人工智能系统帮助公司提升制造芯片效率，旨在加快GPU的生产速度。

据称设计GPU是一项劳动密集型工作，一个芯片通常需要近千人来建构，每个人都需要对设计细节具有充分的了解并进行协同工作，而ChipNeMo则能够快速对工作进行给予响应，并且回复与芯片设计相关的查询，例如GPU架构与芯片设计代码的生成问题。

据《华尔街日报》报道，英伟达自去年（2023年）10月推出ChipNeMo后，为图设计团队提供了不少帮助，对于英伟达而言建构人工智能产品的确也是个好兆头，在周一时这家芯片巨头的股价就上涨了4%，创下历史新高，而高盛的分析师预计，该涨幅或将可以持续到2025年上半年。

当然英伟达也并不是唯一一家使用人工智能技术加速半导体设计的公司，在去年7月谷歌DeepMind也制造了一个人工智能系统，谷歌公司还表示可以运用该技术提高设计效率，几个月后软件巨头新思科技也推出了一款提高芯片工程师生产效率的人工智能工具。

据悉，作为为人工智能领域“无可争议的领导者”，最新的数据显示，英伟达在全球人工智能芯片市场的份额预计最高可能已经达到了90%，创下新高纪录。

今日黄仁勋还入选了美国工程院院士，美国国家工程院（National Academy of Engineering, NAE）给出的理由是：“用高性能图形处理单元，推动了人工智能革命。”

更多一手新闻，欢迎下载凤凰新闻客户端订阅凤凰网科技。想看深度报道，请微信搜索“凤凰网科技”。


BeautifulSoup

In [2]:
from bs4 import BeautifulSoup
import requests

def fetch_page_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Heuristic: find the largest block of text
    largest_block = None
    max_length = 0
    for paragraph in soup.find_all(['p', 'div'], recursive=True):
        text = paragraph.get_text(strip=True)
        if len(text) > max_length:
            max_length = len(text)
            largest_block = text

    return largest_block

# Example usage
url = "https://tech.ifeng.com/c/8WybGOq7JCe"
cleaned_text = fetch_page_content(url)
print("Extracted Content:")
print(cleaned_text)


Extracted Content:
首页资讯视频直播凤凰卫视财经娱乐体育时尚汽车房产科技读书文化历史军事旅游佛教更多国学数码健康家居公益教育酒业美食科技>人工智能>正文高端打工人的噩梦？ 英伟达正利用人工智能制造芯片 | AI前哨凤凰网科技下载客户端独家抢先看2024年02月07日 10:23:26来自北京市英伟达凤凰网科技讯 《AI前哨》（作者/尹明顺）2月7日，据外媒消息称，英伟达正在使用自己开发的名为ChipNeMo的人工智能系统帮助公司提升制造芯片效率，旨在加快GPU的生产速度。据称设计GPU是一项劳动密集型工作，一个芯片通常需要近千人来建构，每个人都需要对设计细节具有充分的了解并进行协同工作，而ChipNeMo则能够快速对工作进行给予响应，并且回复与芯片设计相关的查询，例如GPU架构与芯片设计代码的生成问题。据《华尔街日报》报道，英伟达自去年（2023年）10月推出ChipNeMo后，为图设计团队提供了不少帮助，对于英伟达而言建构人工智能产品的确也是个好兆头，在周一时这家芯片巨头的股价就上涨了4%，创下历史新高，而高盛的分析师预计，该涨幅或将可以持续到2025年上半年。当然英伟达也并不是唯一一家使用人工智能技术加速半导体设计的公司，在去年7月谷歌DeepMind也制造了一个人工智能系统，谷歌公司还表示可以运用该技术提高设计效率，几个月后软件巨头新思科技也推出了一款提高芯片工程师生产效率的人工智能工具。据悉，作为为人工智能领域“无可争议的领导者”，最新的数据显示，英伟达在全球人工智能芯片市场的份额预计最高可能已经达到了90%，创下新高纪录。今日黄仁勋还入选了美国工程院院士，美国国家工程院（National Academy of Engineering, NAE）给出的理由是：“用高性能图形处理单元，推动了人工智能革命。”更多一手新闻，欢迎下载凤凰新闻客户端订阅凤凰网科技。想看深度报道，请微信搜索“凤凰网科技”。“特别声明：以上作品内容(包括在内的视频、图片或音频)为凤凰网旗下自媒体平台“大风号”用户上传并发布，本平台仅提供信息存储空间服务。Notice: The content above (including the videos, pictures and audios if any) is uploaded and posted by the use

Chunk text by punctuation

langdetect used for detecting language

newspaper3k used to download and parse the main article content directly

In [12]:
from transformers import MarianMTModel, MarianTokenizer
import re
from newspaper import Article
from langdetect import detect

def chunk_text_by_punctuation(text, max_length=200):
    # Split text at punctuation marks
    sentences = re.split(r'(?<=[.!?。！？])\s+', text)
    return sentences

def translate_with_transformers(text, model_name="Helsinki-NLP/opus-mt-zh-en"):
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    translated_text = ''
    
    sentences = chunk_text_by_punctuation(text)
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        outputs = model.generate(**inputs)
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_text += translated_sentence + ' '

    return translated_text.strip()

def fetch_article_content(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text

def translate_article(url):
    original_text = fetch_article_content(url)
    lang = detect(original_text)
    print(f"Detected language: {lang}\n")

    print("Original Text:")
    original_sentences = chunk_text_by_punctuation(original_text)
    for sentence in original_sentences:
        print(sentence)

    if lang.startswith('en'):
        print("\nNo translation needed.")
        return original_text  # Return the English text directly
    else:
        print("\nTranslated Text:")
        translated_text = translate_with_transformers(original_text)
        translated_sentences = chunk_text_by_punctuation(translated_text)
        for sentence in translated_sentences:
            print(sentence)
        return translated_text  # Return the translated text

# Example usage
url1 = "https://tech.ifeng.com/c/8WybGOq7JCe"
translated_text1 = translate_article(url1)


Detected language: zh-cn

Original Text:
英伟达

凤凰网科技讯 《AI前哨》（作者/尹明顺）2月7日，据外媒消息称，英伟达正在使用自己开发的名为ChipNeMo的人工智能系统帮助公司提升制造芯片效率，旨在加快GPU的生产速度。
据称设计GPU是一项劳动密集型工作，一个芯片通常需要近千人来建构，每个人都需要对设计细节具有充分的了解并进行协同工作，而ChipNeMo则能够快速对工作进行给予响应，并且回复与芯片设计相关的查询，例如GPU架构与芯片设计代码的生成问题。
据《华尔街日报》报道，英伟达自去年（2023年）10月推出ChipNeMo后，为图设计团队提供了不少帮助，对于英伟达而言建构人工智能产品的确也是个好兆头，在周一时这家芯片巨头的股价就上涨了4%，创下历史新高，而高盛的分析师预计，该涨幅或将可以持续到2025年上半年。
当然英伟达也并不是唯一一家使用人工智能技术加速半导体设计的公司，在去年7月谷歌DeepMind也制造了一个人工智能系统，谷歌公司还表示可以运用该技术提高设计效率，几个月后软件巨头新思科技也推出了一款提高芯片工程师生产效率的人工智能工具。
据悉，作为为人工智能领域“无可争议的领导者”，最新的数据显示，英伟达在全球人工智能芯片市场的份额预计最高可能已经达到了90%，创下新高纪录。
今日黄仁勋还入选了美国工程院院士，美国国家工程院（National Academy of Engineering, NAE）给出的理由是：“用高性能图形处理单元，推动了人工智能革命。”

更多一手新闻，欢迎下载凤凰新闻客户端订阅凤凰网科技。想看深度报道，请微信搜索“凤凰网科技”。

Translated Text:
According to foreign media sources, British Weida is using its own artificial intelligence system, ChipNeMo, to help companies improve their chip-making efficiency, with the aim of accelerating the production of GPU.
The design of the GPU is said to be a labour-i

In [13]:
url2 = "https://tech.ifeng.com/c/8WpDSVQ5DIz"
translated_text2 = translate_article(url2)

Detected language: zh-cn

Original Text:
凤凰网科技讯 《AI前哨》（作者/蒋浇）2月1日近日，蚂蚁集团AI创新研发部门NextEvo全面开源AI Infra技术，可帮助大模型千卡训练有效时间占比超过95%，能实现训练时“自动驾驶”，这推动了AI研发效率。
该技术框架名为DLRover，目标在于大规模分布式训练的智能化。目前很多企业的训练作业都是跑在混合部署的集群中，运行环境复杂多变，不管多么“崎岖的地形”，DLRover都可以“轻松行驶”。
2023年大模型技术的发展，带来了工程实践的爆发，如何管理数据，提高训练和推理效率，最大化利用现有算力，成了关键一环。
完成一个千亿参数级别的大模型，如GPT-3，用一张卡训练一次要耗时32年，那么训练时的算力利用尤为重要。方法之一是把能用的算力用得更好，比如进一步压榨已购买GPU的性能；二是把以前利用不了的算力用起来，比如CPU、内存等，这就需要通过异构计算平台来解决。
据悉，最新集成进DLRover的是Flash Checkpoint（FCP）方案。模型训练时，一般要打Checkpoint（检查点），以便中断时能恢复到最近状态，目前常规的做法，存在着耗时长、高频打点易降低训练可用时间、低频打点恢复时丢失过多等缺点。新方案FCP应用在千卡千亿参数模型训练后，Checkpoint 导致的训练浪费时间降低约5倍，其中持久化时间降低约70倍，有效训练时间从90%提升至95%。

Translated Text:
Recently, on 1 February, the ants Group's Innovative Research and Development Unit, NextEvo, opened up a full range of AI Infra technologies, which can help large model kilocalories with more than 95 per cent of their training time and “self-drive” when they are trained, contributed to the efficiency of AI's research and development.
The techni

In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "facebook/bart-large-cnn"  # BART model fine-tuned for summarization
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def summarize(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
print(summarize(translated_text1))

print("\n------------------------\n")
print(summarize(translated_text2))


British Weida is using its own artificial intelligence system, ChipNeMo, to help companies improve their chip-making efficiency. On Monday, the stock price of this chip giant rose by 4 per cent, to an all-time high, while Goldman Sachs analysts predicted that the increase might last until first half of 2025.

------------------------

 AI Infra technologies can help large model kilocalories with more than 95 per cent of their training time. The technical framework, known as DLRover, aims at the intellectualization of large-scale distributed training.
