# Data Extraction

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
df = pd.read_excel('Input.xlsx')

urls = df['URL'].tolist()
url_ids = df['URL_ID'].tolist()

In [3]:
def extract_article_text(url):
    
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'lxml')
    
    title = soup.find('h1').text if soup.find('h1') else "Title not found"
    
    article_div = soup.find('div', class_="td-post-content")
    text = article_div.text if article_div else "Text not found"
    
    return title, text

result_df = pd.DataFrame(columns=["URL_ID", "Title", "Text"])

for url, url_id in zip(urls,url_ids):
    title, text = extract_article_text(url)
    row_df = pd.DataFrame({"Title": [title], "Text": [text], "URL_ID": [url_id]})
    result_df = pd.concat([result_df, row_df], ignore_index=True)

In [4]:
result_df['Text'] = result_df['Text'].str.replace('\n', '')

In [5]:
result_df.head()

Unnamed: 0,URL_ID,Title,Text
0,blackassign0001,Rising IT cities and its impact on the economy...,We have seen a huge development and dependence...
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,"Throughout history, from the industrial revolu..."
2,blackassign0003,"Internet Demand’s Evolution, Communication Imp...","IntroductionIn the span of just a few decades,..."
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,"The way we live, work, and communicate has unq..."
4,blackassign0005,OTT platform and its impact on the entertainme...,The year 2040 is poised to witness a continued...


In [6]:
result_df.to_excel('extracteddata.xlsx', index=False)

In [7]:
def save_to_file(url_id, title, text):
    
    filename = f"{url_id}.txt"
   
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n\nText: {text}")


for url, url_id in zip(urls, url_ids):
    title, text = extract_article_text(url)
    save_to_file(url_id, title, text)