In [1]:
import requests
from bs4 import BeautifulSoup

In [15]:
#URL of the webpage to scrape
url= "https://economictimes.indiatimes.com/news/india/india-house-at-paris-olympics-2024-what-is-it-see-pictures-of-whats-inside/nita-ambani-inaugurates-india-house-at-paris-olympics/slideshow/112083241.cms"
#Send a GET request to the webpage
response = requests.get(url)
# Check if the request was successful
if response.status_code == 200:
  #Parse the webpage content
  soup = BeautifulSoup(response.content, "html.parser")
  # Extract all paragraph elements
  paragraphs = soup.find_all('p')

  # Extract text from each paragraph and save as sentences
  sentences = [para.get_text().strip() for para in paragraphs]

  # Save the sentences to a file
  with open("scraped_sentences.txt", "w", encoding="utf-8") as file:
    for sentence in sentences:
      if sentence:
        file.write(sentence + "\n")
  print("Text data has been scraped and saved as sentences.")
else:
  print("Failed to retrieve the webpage. Status code:", response.status_code)

Text data has been scraped and saved as sentences.


In [16]:
import pandas as pd
df=pd.read_csv('/content/scraped_sentences.txt', delimiter='\t')
df.head()

Unnamed: 0,"Fresh off hosting the year's most extravagant wedding, Nita Ambani inaugurated the India House for the Paris Olympics on Saturday. This is the first time India has set up such a facility at the Olympic Games, with the aim of promoting Indian culture and expressing the country's aspiration to host the 2036 Games. The India House initiative was launched in collaboration with the Indian Olympic Association and the Reliance Foundation, which Ambani founded and chairs."
0,"During the inauguration, Nita Ambani, an Inter..."
1,Visitors to India House can enjoy a range of c...
2,India House will host numerous performances an...
3,"Nita Ambani was accompanied by her husband, Mu..."
4,Trending Now


In [17]:
df.shape

(16, 1)

In [18]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

df = pd.read_csv('/content/scraped_sentences.txt', delimiter='\t', header=None, names=['text'])

def solve(df):
    # Load stopwords
    stopwords1 = set(stopwords.words("english"))

    # The 'text' column now exists
    summaries = []

    for index, row in df.iterrows():
        data = row['text']  # Access the text data for the current row

        # Tokenize the words
        words = word_tokenize(data)
        freqTable = {}
        for word in words:
            word = word.lower()
            if word in stopwords1:
                continue
            if word in freqTable:
                freqTable[word] += 1
            else:
                freqTable[word] = 1

        # Tokenize the sentences
        sentences = sent_tokenize(data)
        sentenceValue = {}
        for sentence in sentences:
            for word, freq in freqTable.items():
                if word in sentence.lower():
                    if sentence in sentenceValue:
                        sentenceValue[sentence] += freq
                    else:
                        sentenceValue[sentence] = freq

        # Calculate the average score for the sentences
        sumValues = sum(sentenceValue.values())
        average = int(sumValues / len(sentenceValue))

        # Generate the summary
        summary = ''
        for sentence in sentences:
            if sentence in sentenceValue and sentenceValue[sentence] > (1.2 * average):
                summary += " " + sentence

        summaries.append(summary.strip())

    # Add summaries to the DataFrame and save to a new CSV file
    df['summary'] = summaries
    df.to_csv('summarized_output.csv', index=False)
    print("Summarization completed and saved to 'summarized_output.csv'.")

# Example usage
solve(df)

Summarization completed and saved to 'summarized_output.csv'.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
df['summary']

Unnamed: 0,summary
0,
1,India House is among several hospitality house...
2,The house also features a lounge for athletes ...
3,"This comes as men's and women's cricket, in th..."
4,"With 112 athletes competing in 16 sports, Indi..."
5,
6,
7,
8,
9,


In [20]:
column_name = 'summary'  # Replace with your column name

# Iterate through the column and print each value
for value in df[column_name]:
    print(value)


India House is among several hospitality houses set up by participating countries or brands, showcasing Indian architecture, artistic motifs, and culture.
The house also features a lounge for athletes and a variety of Indian foods, including biryani, mutton curry, curd rice, and desserts.
This comes as men's and women's cricket, in the Twenty20 format, is set to be included in the 2028 Olympics in Los Angeles.
With 112 athletes competing in 16 sports, India's presence at the Paris Olympics is well-represented, and India House stands as a vibrant testament to the nation's spirit and ambitions.














In [21]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=bfe8d0dcbd68a795b2b94206d134df36b74d61918b973bae587c8dd71d734078
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [22]:
import pandas as pd
from rouge_score import rouge_scorer

#calculating rouge score
def calculate_rouge_scores(reference, summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, summary)


df['rouge_scores'] = df.apply(lambda row: calculate_rouge_scores(row['text'], row['summary']), axis=1)


for idx, row in df.iterrows():
    rouge_scores = row['rouge_scores']
    print(f"Summary {idx + 1} ROUGE scores:")
    print(f"ROUGE-1: {rouge_scores['rouge1']}")
    print(f"ROUGE-2: {rouge_scores['rouge2']}")
    print(f"ROUGE-L: {rouge_scores['rougeL']}")
    print("\n")


Summary 1 ROUGE scores:
ROUGE-1: Score(precision=0.0, recall=0.0, fmeasure=0.0)
ROUGE-2: Score(precision=0.0, recall=0.0, fmeasure=0.0)
ROUGE-L: Score(precision=0, recall=0, fmeasure=0)


Summary 2 ROUGE scores:
ROUGE-1: Score(precision=1.0, recall=0.25301204819277107, fmeasure=0.4038461538461538)
ROUGE-2: Score(precision=1.0, recall=0.24390243902439024, fmeasure=0.39215686274509803)
ROUGE-L: Score(precision=1.0, recall=0.25301204819277107, fmeasure=0.4038461538461538)


Summary 3 ROUGE scores:
ROUGE-1: Score(precision=1.0, recall=0.38596491228070173, fmeasure=0.5569620253164557)
ROUGE-2: Score(precision=1.0, recall=0.375, fmeasure=0.5454545454545454)
ROUGE-L: Score(precision=1.0, recall=0.38596491228070173, fmeasure=0.5569620253164557)


Summary 4 ROUGE scores:
ROUGE-1: Score(precision=1.0, recall=0.43103448275862066, fmeasure=0.6024096385542169)
ROUGE-2: Score(precision=1.0, recall=0.42105263157894735, fmeasure=0.5925925925925926)
ROUGE-L: Score(precision=1.0, recall=0.43103448275862

Abstractive summarization

In [23]:
import pandas as pd
from transformers import pipeline

# Load the data
data = pd.read_csv('/content/scraped_sentences.txt', delimiter='\t')


def summarize_text(text, summarizer, max_length=150):
    return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']


def solve(data, model='t5-base'):
    if 'text' not in data.columns:
        print("The dataset must contain a 'text' column.")
        return

    summarizer = pipeline("summarization", model=model)
    data['summary'] = data['text'].apply(lambda x: summarize_text(x, summarizer, max_length=150))

    output_path = 'summarized_output.csv'
    data.to_csv(output_path, index=False)
    print(f"Summarization completed and saved to '{output_path}'.")

solve(data)


The dataset must contain a 'text' column.


In [24]:
data1=pd.read_csv('/content/summarized_output.csv')
data1.head()
column_name = 'summary'  # Replace with your column name

# Iterate through the column and print each value
for value in df[column_name]:
    print(value)


India House is among several hospitality houses set up by participating countries or brands, showcasing Indian architecture, artistic motifs, and culture.
The house also features a lounge for athletes and a variety of Indian foods, including biryani, mutton curry, curd rice, and desserts.
This comes as men's and women's cricket, in the Twenty20 format, is set to be included in the 2028 Olympics in Los Angeles.
With 112 athletes competing in 16 sports, India's presence at the Paris Olympics is well-represented, and India House stands as a vibrant testament to the nation's spirit and ambitions.












