In [1]:
!pip install requests beautifulsoup4 transformers streamlit torch sentencepiece



In [4]:
!pip install torch torchvision torchaudio



In [2]:
import sys
print(sys.executable)


C:\Users\ruhan\anaconda3\envs\tf_env\python.exe


In [18]:
import requests
from bs4 import BeautifulSoup
from transformers import BartForConditionalGeneration, BartTokenizer, MarianMTModel, MarianTokenizer
import torch

In [1]:
def scrape_news_articles(url):
    """
    Scrapes the title, date, and main content from a Hindi news article.
    Adjust the tags/classes as per the site structure.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Adjust selectors for the specific site structure
    title = soup.find('h1')
    date = soup.find('span')
    content_div = soup.find('div')

    title = title.text.strip() if title else "Title not found"
    date = date.text.strip() if date else "Date not found"
    content = content_div.get_text(separator=' ', strip=True) if content_div else "Content not found"

    return {'title': title, 'date': date, 'content': content}


In [2]:
import streamlit as st

@st.cache_resource
def load_translation_model():
    tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-hi-en")
    model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-hi-en")
    return tokenizer, model

def translate_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)


In [3]:
@st.cache_resource
def load_summarization_model():
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    return tokenizer, model

def summarize_text(text, tokenizer, model):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [4]:
import pandas as pd
from datasets import Dataset

# Load and clean dataset
df = pd.read_csv('scrapped_clean.csv')  # ✅ use read_csv for CSV files
df.columns = ["hindi", "english"]
df.dropna(inplace=True)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print(dataset[0])

{'hindi': 'पहलगाम अटैक पर विवादित पोस्ट, 7 राज्यों में 26 गिरफ्तार:इनमें विधायक, पत्रकार, वकील और स्टूडेंट शामिल; देश विरोधी टिप्पणी की थी', 'english': 'Disputed posts on Pahalgam attack, 26 arrested in 7 states: these include MLAs, journalists, lawyers and students; Was made anti -national comments'}


In [6]:
# Format dataset to match Hugging Face translation format
def format_translation(example):
    return {
        "translation": {
            "hi": example["hindi"],
            "en": example["english"]
        }
    }

formatted_dataset = dataset.map(format_translation)

Map: 100%|██████████| 239/239 [00:00<00:00, 6196.04 examples/s]


In [7]:
from transformers import MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)

def tokenize(batch):
    hi_texts = [item["hi"] for item in batch["translation"]]
    en_texts = [item["en"] for item in batch["translation"]]

    inputs = tokenizer(hi_texts, padding="max_length", truncation=True, max_length=64)
    targets = tokenizer(en_texts, padding="max_length", truncation=True, max_length=64)

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]
    }

tokenized_dataset = formatted_dataset.map(tokenize, batched=True)


Map: 100%|██████████| 239/239 [00:00<00:00, 922.83 examples/s]


In [8]:
print(tokenized_dataset[0])

{'hindi': 'पहलगाम अटैक पर विवादित पोस्ट, 7 राज्यों में 26 गिरफ्तार:इनमें विधायक, पत्रकार, वकील और स्टूडेंट शामिल; देश विरोधी टिप्पणी की थी', 'english': 'Disputed posts on Pahalgam attack, 26 arrested in 7 states: these include MLAs, journalists, lawyers and students; Was made anti -national comments', 'translation': {'en': 'Disputed posts on Pahalgam attack, 26 arrested in 7 states: these include MLAs, journalists, lawyers and students; Was made anti -national comments', 'hi': 'पहलगाम अटैक पर विवादित पोस्ट, 7 राज्यों में 26 गिरफ्तार:इनमें विधायक, पत्रकार, वकील और स्टूडेंट शामिल; देश विरोधी टिप्पणी की थी'}, 'input_ids': [6567, 4017, 975, 1067, 23881, 395, 33, 7419, 602, 11948, 2, 952, 11747, 12, 1998, 5876, 24, 41741, 40228, 28143, 2, 20322, 2, 12931, 7, 60843, 1204, 41, 512, 6373, 6128, 15, 167, 0, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 61126, 6

In [9]:
print(tokenized_dataset.column_names)

['hindi', 'english', 'translation', 'input_ids', 'attention_mask', 'labels']


In [10]:
print(len(tokenized_dataset[0]['input_ids']))  # Should be 64 if you used max_length=64

64


In [11]:
from transformers import MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_name = "Helsinki-NLP/opus-mt-hi-en"
model = MarianMTModel.from_pretrained(model_name)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [12]:
from evaluate import load
from tqdm import tqdm  # For progress bar
import time
import torch

# Improved prediction function that accepts a model
def get_predictions_with_model(df, model, batch_size=8):
    print("Generating predictions in batches...")
    preds = []
    hindi_texts = df["hindi"].tolist()

    for i in tqdm(range(0, len(hindi_texts), batch_size)):
        batch_texts = hindi_texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
        with torch.no_grad():
            translated = model.generate(**inputs)
        batch_preds = tokenizer.batch_decode(translated, skip_special_tokens=True)
        preds.extend(batch_preds)
    return preds

In [31]:
from evaluate import load
metric = load("sacrebleu")

# Load original base model again for pre-fine-tuning eval
base_model = MarianMTModel.from_pretrained(model_name)

print("🧪 Evaluating BLEU before fine-tuning...")
baseline_preds = get_predictions_with_model(df, base_model)
bleu_before = metric.compute(predictions=baseline_preds, references=[[ref] for ref in df["english"]])
print("✅ BLEU score before fine-tuning:", bleu_before)


🧪 Evaluating BLEU before fine-tuning...
Generating predictions in batches...


100%|██████████████████████████████████████| 30/30 [07:11<00:00, 14.39s/it]

✅ BLEU score before fine-tuning: {'score': 12.013937343727523, 'counts': [2550, 967, 426, 199], 'totals': [5946, 5707, 5468, 5229], 'precisions': [42.88597376387487, 16.94410373225863, 7.790782735918069, 3.805698986421878], 'bp': 0.9916262424042355, 'sys_len': 5946, 'ref_len': 5996}





In [32]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,4.177,3.375665
2,3.4239,2.67428
3,2.9421,2.278084
4,2.3969,2.050273
5,2.5042,1.977165


TrainOutput(global_step=300, training_loss=3.213486576080322, metrics={'train_runtime': 601.3336, 'train_samples_per_second': 1.987, 'train_steps_per_second': 0.499, 'total_flos': 20254273044480.0, 'train_loss': 3.213486576080322, 'epoch': 5.0})

In [33]:
print("\n🧪 Evaluating BLEU after fine-tuning...")
fine_tuned_preds = get_predictions_with_model(df, model)  # 'model' is fine-tuned now
bleu_after = metric.compute(predictions=fine_tuned_preds, references=[[ref] for ref in df["english"]])
print("✅ BLEU score after fine-tuning:", bleu_after)



🧪 Evaluating BLEU after fine-tuning...
Generating predictions in batches...


100%|██████████████████████████████████████| 30/30 [06:10<00:00, 12.34s/it]

✅ BLEU score after fine-tuning: {'score': 26.06905981426099, 'counts': [3206, 1801, 1108, 724], 'totals': [5932, 5693, 5454, 5215], 'precisions': [54.045853000674306, 31.63534164763745, 20.315364869820314, 13.883029721955896], 'bp': 0.9892690505480524, 'sys_len': 5932, 'ref_len': 5996}





In [13]:
def display_news_summary(article_data, translated_title, translated_text, summary):
    """
    Display results on the Streamlit dashboard.
    """
    st.title("📰 Hindi News Summarizer & Translator")

    st.subheader("📝 Title (Hindi):")
    st.write(article_data['title'])

    st.subheader("🌐 Title (English):")
    st.write(translated_title)

    st.subheader("📅 Published Date:")
    st.write(article_data['date'])

    st.subheader("🗞️ Full Article (Hindi):")
    st.write(article_data['content'])

    st.subheader("🌍 Full Article (English):")
    st.write(translated_text)

    st.subheader("🔍 Summary (English):")
    st.markdown(summary)


In [3]:
import streamlit as st                          
def main():
    st.sidebar.title("🔗 Hindi News URL Input")
    url = st.sidebar.text_input("Paste a Hindi news article URL:")

    if url:
        try:
            article_data = scrape_news_articles(url)

            if article_data['content'] != "Content not found":
                # Load models
                trans_tokenizer, trans_model = load_translation_model()
                sum_tokenizer, sum_model = load_summarization_model()

                # Translate
                translated_title = translate_text(article_data['title'], trans_tokenizer, trans_model)
                translated_text = translate_text(article_data['content'], trans_tokenizer, trans_model)

                # Summarize
                summary = summarize_text(translated_text, sum_tokenizer, sum_model)

                # Display
                display_news_summary(article_data, translated_title, translated_text, summary)
            else:
                st.error("❌ Could not extract article content.")
        except Exception as e:
            st.error(f"⚠️ Error: {e}")

if __name__ == "__main__":
    main()


2025-06-05 23:26:45.636 
  command:

    streamlit run C:\Users\ruhan\anaconda3\envs\tf_env\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-06-05 23:26:45.660 Session state does not function when running a script without `streamlit run`
