# Web Scraping articles using BeautifulSoup and Scrapy

In [39]:
# Dependencies

import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
from retrying import retry

In [40]:
import scrapy
from scrapy.selector import Selector

## Scraping articles from TOI

#### Scrape urls

In [26]:
# Fetch urls from TOI page

base_url = "https://timesofindia.indiatimes.com/topic/business-and-finance/"
num_pages = 74
url_list = []

for page_number in range(1, num_pages + 1):
    url = base_url + str(page_number)
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        elements_with_class = soup.find_all(class_="uwU81")
        for element in elements_with_class:
            link = element.find('a')['href']
            url_list.append(link)
    else:
        print(f"Failed to fetch URL: {url}")

data = {'URL': url_list}
df = pd.DataFrame(data)

df.drop_duplicates(subset='URL', keep='first', inplace=True)
df.to_csv('df_businessAndFinance.csv', index = False)

Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/228
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/406
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/425
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/445
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/453
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/460
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/498
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/551
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/554
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/566
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/568
Failed to fetch URL: https://tim

Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/954
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/963
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/964
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/967
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/968
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/969
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/973
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/975
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/976
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/977
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/981
Failed to fetch URL: https://tim

Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1335
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1344
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1352
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1355
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1356
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1362
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1365
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1375
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1379
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1381
Failed to fetch URL: https://timesofindia.indiatimes.com/topic/business-and-finance/1383
Failed to fetch URL: 

In [27]:
df

Unnamed: 0,URL
0,https://timesofindia.indiatimes.com/business/i...
1,https://timesofindia.indiatimes.com/business/i...
2,https://timesofindia.indiatimes.com/city/chenn...
3,https://timesofindia.indiatimes.com/education/...
4,https://timesofindia.indiatimes.com/life-style...
...,...
25515,https://timesofindia.indiatimes.com/city/mumba...
25516,https://timesofindia.indiatimes.com/india/bank...
25517,https://timesofindia.indiatimes.com/india/cong...
25518,https://timesofindia.indiatimes.com/india/bure...


In [22]:
df.iloc[100].URL

'https://timesofindia.indiatimes.com/business/india-business/hdfc-to-sell-education-finance-arm-credila/articleshow/99946474.cms'

In [28]:
print(df.duplicated().sum())

414


In [29]:
df.drop_duplicates(subset='URL', keep='first', inplace=True)

In [31]:
df.shape

(25106, 1)

In [32]:
df.to_csv('df_BusinessAndFinance.csv', index = False)

#### Scrape text from scraped urls

In [33]:
url = "https://timesofindia.indiatimes.com/business/india-business/markets-settle-with-gains-after-two-days-of-fall/articleshow/100704868.cms"
page  = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

content = soup.find('div', class_ = '_s30J clearfix')
text = content.text

MUMBAI: Benchmark stock indices Sensex and Nifty closed higher on Friday after two days of fall, helped by buying in metal, telecom and auto stocks amid a firm trend in global markets.Automakers led by Maruti Suzuki India, Hyundai, Mahindra & Mahindra reporting robust wholesales of passenger vehicles and GST collections crossing Rs 1.50 lakh crore for the third straight month in May added to the optimism.The 30-share BSE Sensex climbed 118.57 points or 0.19 per cent to settle at 62,547.11. During the day, it jumped 291.3 points or 0.46 per cent to 62,719.84.The NSE Nifty advanced 46.35 points or 0.25 per cent to finish at 18,534.10.Tata Steel was the biggest gainer in the Sensex pack, rising nearly 2 per cent, followed by Maruti, Mahindra & Mahindra, Sun Pharma, Larsen & Toubro, Titan, Bharti Airtel, Power Grid, ITC, State Bank of India and Nestle.In contrast, Infosys, Wipro, HCL Technologies, Tata Consultancy Services, IndusInd Bank, Tech Mahindra, Reliance Industries and Bajaj Financ

In [8]:
url = "https://timesofindia.indiatimes.com/city/kolhapur/need-political-will-to-extend-kolhapur-city-limits-real-estate-developers-say/articleshow/32110141.cms"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
content = soup.find('div', class_='Normal')
print(content.get_text())


AttributeError: 'NoneType' object has no attribute 'get_text'

In [152]:
url_df = pd.read_csv('df_PersonalFinance.csv')
c = 0

@retry(wait_fixed=2000, stop_max_attempt_number=3)
def scrape_url_with_retry(url):
    global c
    try:
        page = requests.get(url, verify=True)
    except requests.exceptions.SSLError as e:
        page = requests.get(url, verify=False)
        
    try:
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        
        content_with_class = soup.find('div', class_='_s30J clearfix')
        if content_with_class:
            c += 1
            print(f"Scraped article {c}")
            return content_with_class.text
        
        content_with_normal = soup.find('div', class_='Normal')
        if content_with_normal:
            c += 1
            print(f"Scraped article {c}")
            return content_with_normal.text
    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL: {url}. Error: {e}")
    
    return ''

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(scrape_url_with_retry, url_df['URL']))

url_df['text'] = results
url_df['target'] = 'personal finance'

result_df = url_df[['target', 'text']]


Scraped article 1
Scraped article 2
Scraped article 3
Scraped article 4
Scraped article 5
Scraped article 6
Scraped article 7
Scraped article 8
Scraped article 9
Scraped article 10
Scraped article 11
Scraped article 12
Scraped article 13
Scraped article 14
Scraped article 15Scraped article 16

Scraped article 17
Scraped article 18
Scraped article 19
Scraped article 20
Scraped article 21
Scraped article 22
Scraped article 23
Scraped article 24
Scraped article 25
Scraped article 26
Scraped article 27
Scraped article 28
Scraped article 29
Scraped article 30
Scraped article 31
Scraped article 32
Scraped article 33
Scraped article 34
Scraped article 35
Scraped article 36
Scraped article 37
Scraped article 38
Scraped article 39Scraped article 40

Scraped article 41
Scraped article 42
Scraped article 43
Scraped article 44
Scraped article 45
Scraped article 46Scraped article 47

Scraped article 48
Scraped article 49
Scraped article 50
Scraped article 51
Scraped article 52
Scraped article 53
Sc

Scraped article 418
Scraped article 419
Scraped article 420
Scraped article 421
Scraped article 422
Scraped article 423
Scraped article 424Scraped article 425Scraped article 426


Scraped article 427
Scraped article 428
Scraped article 429Scraped article 430

Scraped article 431
Scraped article 432
Scraped article 433
Scraped article 434
Scraped article 435
Scraped article 436
Scraped article 437Scraped article 438

Scraped article 439
Scraped article 440
Scraped article 441
Scraped article 442
Scraped article 443
Scraped article 444
Scraped article 445
Scraped article 446
Scraped article 447
Scraped article 448
Scraped article 449
Scraped article 450
Scraped article 451
Scraped article 452
Scraped article 453
Scraped article 454Scraped article 455

Scraped article 456
Scraped article 457
Scraped article 458
Scraped article 459Scraped article 460

Scraped article 461
Scraped article 462
Scraped article 463
Scraped article 464
Scraped article 465
Scraped article 466
Scraped article 467


Scraped article 828
Scraped article 829
Scraped article 830
Scraped article 831
Scraped article 832
Scraped article 833
Scraped article 834
Scraped article 835Scraped article 836Scraped article 837


Scraped article 838
Scraped article 839Scraped article 840

Scraped article 841
Scraped article 842
Scraped article 843
Scraped article 844
Scraped article 845
Scraped article 846
Scraped article 847
Scraped article 848
Scraped article 849
Scraped article 850
Scraped article 851
Scraped article 852
Scraped article 853
Scraped article 854
Scraped article 855
Scraped article 856
Scraped article 857Scraped article 858
Scraped article 859

Scraped article 860
Scraped article 861
Scraped article 862
Scraped article 863
Scraped article 864
Scraped article 865
Scraped article 866
Scraped article 867
Scraped article 868
Scraped article 869Scraped article 870
Scraped article 871

Scraped article 872
Scraped article 873
Scraped article 874
Scraped article 875
Scraped article 876
Scraped article 877


ConnectionError: HTTPConnectionPool(host='author.toiblogs.com', port=80): Max retries exceeded with url: /right-and-wrong/entry/why-spare-our-rich-farmers (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000018444578D60>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [153]:
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Load the DataFrame
url_df = pd.read_csv('df_PersonalFinance.csv')

# Define a function to scrape the content from a URL
def scrape_url(url):
    try:
        # Make a request to the URL
        response = requests.get(url, verify=True)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the article content based on the specific classes
        content_with_class = soup.find('div', class_='_s30J clearfix')
        if content_with_class:
            return content_with_class.text

        content_with_normal = soup.find('div', class_='Normal')
        if content_with_normal:
            return content_with_normal.text

    except requests.exceptions.RequestException as e:
        logging.error(f"Error accessing URL: {url}. Error: {e}")

    return ''  # Return an empty string if the content extraction fails or there's a connection error

# Use ThreadPoolExecutor for concurrent scraping
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(scrape_url, url_df['URL']))

# Add the scraped content and target to the DataFrame
url_df['text'] = results
url_df['target'] = 'personal finance'

# Save the resulting DataFrame
result_df = url_df[['target', 'text']]
result_df.to_csv('result_personal_finance.csv', index=False)


2023-08-13 02:48:03 [root] ERROR: Error accessing URL: https://timesofindia.indiatimes.com/home/sunday-times/all-that-matters/why-spare-our-rich-farmers/articleshow/18887658.cms. Error: HTTPConnectionPool(host='author.toiblogs.com', port=80): Max retries exceeded with url: /right-and-wrong/entry/why-spare-our-rich-farmers (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001844631BB80>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
2023-08-13 02:54:35 [root] ERROR: Error accessing URL: https://timesofindia.indiatimes.com/entertainment/hindi/bollywood/news/if-bush-is-neuman-whos-sonia/articleshow/1168683.cms. Error: HTTPConnectionPool(host='cmsstg1', port=8117): Max retries exceeded with url: /xfiles/1168611.cms (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000018446AF7BE0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


UnicodeEncodeError: 'utf-8' codec can't encode characters in position 1436-1439: surrogates not allowed

In [156]:
result_df = result_df[result_df.text != '']
result_df

Unnamed: 0,target,text
4,personal finance,The rate hikes by the Reserve Bank of India (R...
7,personal finance,NEW DELHI: Saving and investing is critical to...
8,personal finance,MUMBAI: A Mercedes-Benz C300d will surely give...
9,personal finance,NEW DELHI: This year mutual funds have emerged...
11,personal finance,The National Highway Authority of India (NHAI)...
...,...,...
5432,personal finance,"<div class=""section1""><div class=""Normal""><scr..."
5433,personal finance,doweshowbellyad=0; Edgar Schein (TOI Photo)Cul...
5434,personal finance,I want to do MBA. Can you tell me how much doe...
5435,personal finance,mumbai: when the silhouettes of high-rises sta...


In [159]:
result_df.to_csv('result_personal_finance.csv', index=False, encoding='utf-8-sig', errors='replace')

In [146]:
result_df.iloc[4].text

'NEW DELHI: A BJP Lok Sabha member has asked the government in Parliament whether it intends to recognise \'gaumata\' (cow) as a "national animal" that is an integral part of the Indian culture. Ajmer MP Bhagirath Choudhary asked this question in the lower House on Monday, to which no direct response was given.Union minister of culture G Kishan Reddy, in his response, said, "As informed by the department of animal husbandry, under the distribution of legislative powers between the Union and the states, (Article 246(3) of the Constitution) preservation of animal is a matter on which the state legislature has exclusive powers to legislate". The government had notified tiger and peacock as \'National Animal\' and \'National Bird\' respectively, the minister said.'

In [147]:
result_df.to_csv('d29.csv', index = False)

## Scraping articles from Economic Times

In [35]:
url = 'https://economictimes.indiatimes.com/markets/expert-view/investment-cycle-reviving-after-10-years-instead-of-sectors-be-stock-specific-rahul-singh/articleshow/99749211.cms'

try:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page = soup.find('div', class_='artText')
    
    if page:
        print(page.get_text())
    else:
        print("")
except requests.exceptions.RequestException as e:
    print("Error:", e)

Rahul Singh, CIO – Equities, Tata Mutual Fund, says they follow growth at a reasonable price as a strategy, which is a strategy that works across the interest rate cycles across the economic cycles. At some point of time, when the interest rates were low, quality at any price grew at any price. Those kinds of strategies started to do much better. But now we feel that valuation discipline has come back into the market. So that is point number one. Also the economy is moving from just being consumption to consumption plus investment. The investment cycle revival is happening after 10 years. We think there is some distance to go.Please help us understand the current market breadth looking at the global uncertainties. Even though we are strong on our own grounds, do you think the market is range bound and not as upbeat as it should be? What could be the reason behind that?The range-bound movement is a result of the tussle which is going on between high interest rates and generally speaking

In [57]:
url="https://economictimes.indiatimes.com/markets/expert-view/investment-cycle-reviving-after-10-years-instead-of-sectors-be-stock-specific-rahul-singh/articleshow/99749211.cms"

response = requests.get(url)
sel = Selector(response)
page = sel.css('div.artText')
if page:
    print(' '.join(page.css('::text').getall()).strip())
else:
    print("Nothing to see here !!")

Rahul Singh , CIO – Equities,  Tata Mutual Fund,  says they follow growth at a reasonable price as a strategy, which is a strategy that works across the interest rate cycles across the economic cycles. At some point of time, when the interest rates were low, quality at any price grew at any price. Those kinds of strategies started to do much better. But now we feel that valuation discipline has come back into the market. So that is point number one. Also the economy is moving from just being consumption to consumption plus investment. The  investment cycle  revival is happening after 10 years. We think there is some distance to go. Please help us understand the current market breadth looking at the global uncertainties. Even though we are strong on our own grounds, do you think the market is range bound and not as upbeat as it should be? What could be the reason behind that? The range-bound movement is a result of the tussle which is going on between high interest rates and generally s

In [63]:
f = pd.read_csv('personal_finance_url.csv')
c = 0

def scrape_content(url):
    global c
    
    try:
        response = requests.get(url, verify=True)
    except requests.exceptions.SSLError as e:
        response = requests.get(url, verify=False)
    
    try:
        sel = Selector(response)
        page = sel.css('div.artText')
        if page:
            c += 1
            print(f"Scraped article {c}")
            return ' '.join(page.css('::text').getall()).strip()
        
    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL: {url}. Error: {e}")
        
    return ''

df['text'] = df['URL'].apply(scrape_content)
df['target'] = 'business and finance'

result_df = url_df[['target', 'text']]

Scraped article 1
Scraped article 2
Scraped article 3
Scraped article 4
Scraped article 5
Scraped article 6
Scraped article 7
Scraped article 8
Scraped article 9
Scraped article 10
Scraped article 11
Scraped article 12
Scraped article 13
Scraped article 14
Scraped article 15
Scraped article 16
Scraped article 17
Scraped article 18
Scraped article 19
Scraped article 20
Scraped article 21
Scraped article 22
Scraped article 23
Scraped article 24
Scraped article 25
Scraped article 26
Scraped article 27
Scraped article 28
Scraped article 29
Scraped article 30
Scraped article 31
Scraped article 32
Scraped article 33
Scraped article 34
Scraped article 35
Scraped article 36
Scraped article 37
Scraped article 38
Scraped article 39
Scraped article 40
Scraped article 41
Scraped article 42
Scraped article 43
Scraped article 44
Scraped article 45
Scraped article 46
Scraped article 47
Scraped article 48
Scraped article 49
Scraped article 50
Scraped article 51
Scraped article 52
Scraped article 53


KeyboardInterrupt: 

In [64]:
class MySpider(scrapy.Spider):
    name = 'my_spider'
    start_urls = []

    def __init__(self, df_path):
        super(MySpider, self).__init__()
        self.df = pd.read_csv(df_path)
        self.c = 0

    def start_requests(self):
        for url in self.df['URL']:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        try:
            page = response.css('div.artText')
            if page:
                self.c += 1
                print(f"Scraped article {self.c}")
                text = ' '.join(page.css('::text').getall()).strip()
                yield {
                    'target': 'business and finance',
                    'text': text,
                }
        except Exception as e:
            self.logger.error(f"Error scraping URL: {response.url}. Error: {e}")

if __name__ == '__main__':
    import os
    from scrapy.crawler import CrawlerProcess

    df_path = 'personal_finance_url.csv'
    result_df_path = 'scraped_data_personalFinance.csv'

    process = CrawlerProcess(settings={
        'LOG_LEVEL': 'ERROR',
    })

    process.crawl(MySpider, df_path='personal_finance_url.csv')
    process.start()

Scraped article 1
Scraped article 2
Scraped article 3
Scraped article 4
Scraped article 5
Scraped article 6
Scraped article 7
Scraped article 8
Scraped article 9
Scraped article 10
Scraped article 11
Scraped article 12
Scraped article 13
Scraped article 14
Scraped article 15
Scraped article 16
Scraped article 17
Scraped article 18
Scraped article 19
Scraped article 20
Scraped article 21
Scraped article 22
Scraped article 23
Scraped article 24
Scraped article 25
Scraped article 26
Scraped article 27
Scraped article 28
Scraped article 29
Scraped article 30
Scraped article 31
Scraped article 32
Scraped article 33
Scraped article 34
Scraped article 35
Scraped article 36
Scraped article 37
Scraped article 38
Scraped article 39
Scraped article 40
Scraped article 41
Scraped article 42
Scraped article 43
Scraped article 44
Scraped article 45
Scraped article 46
Scraped article 47
Scraped article 48
Scraped article 49
Scraped article 50
Scraped article 51
Scraped article 52
Scraped article 53
Sc

#### Scrapy Output: 
        Scrapy scraped only 400 out of 14,000 articles

In [70]:
url_df = pd.read_csv('personal_finance_url.csv')
c = 0

@retry(wait_fixed=2000, stop_max_attempt_number=3)
def scrape_url_with_retry(url):
    global c
    try:
        page = requests.get(url, verify=True)
    except requests.exceptions.SSLError as e:
        page = requests.get(url, verify=False)
        
    try:
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        
        content_with_class = soup.find('div', class_='artText')
        if content_with_class:
            c += 1
            print(f"Scraped article {c}")
            return content_with_class.text

    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL: {url}. Error: {e}")
    
    return ''

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(scrape_url_with_retry, url_df['URL']))

url_df['text'] = results
url_df['target'] = 'personal finance'

result_df = url_df[['target', 'text']]

Scraped article 1
Scraped article 2
Scraped article 3
Scraped article 4
Scraped article 5
Scraped article 6Scraped article 7
Scraped article 8
Scraped article 9

Scraped article 10
Scraped article 11
Scraped article 12
Scraped article 13
Scraped article 14
Scraped article 15
Scraped article 16Scraped article 17

Scraped article 18
Scraped article 19
Scraped article 20
Scraped article 21
Scraped article 22
Scraped article 23
Scraped article 24
Scraped article 25
Scraped article 26
Scraped article 27
Scraped article 28
Scraped article 29
Scraped article 30Scraped article 31
Scraped article 32Scraped article 33

Scraped article 34

Scraped article 35
Scraped article 36
Scraped article 37
Scraped article 38
Scraped article 39
Scraped article 40
Scraped article 41
Scraped article 42Scraped article 43

Scraped article 44Scraped article 45

Scraped article 46Scraped article 47
Scraped article 48

Scraped article 49
Scraped article 50
Scraped article 51
Scraped article 52
Scraped article 53
Sc

Scraped article 417
Scraped article 418Scraped article 419

Scraped article 420
Scraped article 421
Scraped article 422Scraped article 423

Scraped article 424Scraped article 425

Scraped article 426
Scraped article 427Scraped article 428
Scraped article 429

Scraped article 430
Scraped article 431
Scraped article 432
Scraped article 433
Scraped article 434Scraped article 435Scraped article 436


Scraped article 437
Scraped article 438
Scraped article 439
Scraped article 440
Scraped article 441
Scraped article 442
Scraped article 443
Scraped article 444
Scraped article 445
Scraped article 446
Scraped article 447
Scraped article 448
Scraped article 449
Scraped article 450
Scraped article 451Scraped article 452

Scraped article 453
Scraped article 454
Scraped article 455Scraped article 456

Scraped article 457
Scraped article 458Scraped article 459
Scraped article 460
Scraped article 461

Scraped article 462Scraped article 463

Scraped article 464
Scraped article 465
Scraped article 466


Scraped article 1226
Scraped article 1227Scraped article 1228

Scraped article 1229
Scraped article 1230
Scraped article 1231
Scraped article 1232
Scraped article 1233
Scraped article 1234Scraped article 1235

Scraped article 1236
Scraped article 1237Scraped article 1238

Scraped article 1239
Scraped article 1240
Scraped article 1241
Scraped article 1242
Scraped article 1243
Scraped article 1244
Scraped article 1245
Scraped article 1246
Scraped article 1247Scraped article 1248

Scraped article 1249
Scraped article 1250Scraped article 1251

Scraped article 1252
Scraped article 1253Scraped article 1254Scraped article 1255Scraped article 1256



Scraped article 1257
Scraped article 1258
Scraped article 1259
Scraped article 1260Scraped article 1261

Scraped article 1262
Scraped article 1263
Scraped article 1264
Scraped article 1265
Scraped article 1266
Scraped article 1267
Scraped article 1268Scraped article 1269

Scraped article 1270
Scraped article 1271
Scraped article 1272
Scraped artic

Scraped article 1617
Scraped article 1618
Scraped article 1619
Scraped article 1620
Scraped article 1621
Scraped article 1622Scraped article 1623

Scraped article 1624
Scraped article 1625Scraped article 1626

Scraped article 1627
Scraped article 1628
Scraped article 1629Scraped article 1630

Scraped article 1631
Scraped article 1632Scraped article 1633Scraped article 1634Scraped article 1635



Scraped article 1636
Scraped article 1637Scraped article 1638

Scraped article 1639
Scraped article 1640
Scraped article 1641
Scraped article 1642Scraped article 1643

Scraped article 1644
Scraped article 1645
Scraped article 1646
Scraped article 1647
Scraped article 1648
Scraped article 1649
Scraped article 1650
Scraped article 1651
Scraped article 1652
Scraped article 1653
Scraped article 1654
Scraped article 1655
Scraped article 1656
Scraped article 1657
Scraped article 1658
Scraped article 1659Scraped article 1660

Scraped article 1661Scraped article 1662
Scraped article 1663
Scraped articl

Scraped article 2008
Scraped article 2009
Scraped article 2010
Scraped article 2011
Scraped article 2012
Scraped article 2013
Scraped article 2014
Scraped article 2015
Scraped article 2016
Scraped article 2017
Scraped article 2018
Scraped article 2019
Scraped article 2020Scraped article 2021

Scraped article 2022
Scraped article 2023
Scraped article 2024
Scraped article 2025
Scraped article 2026Scraped article 2027

Scraped article 2028
Scraped article 2029
Scraped article 2030
Scraped article 2031
Scraped article 2032
Scraped article 2033
Scraped article 2034
Scraped article 2035
Scraped article 2036
Scraped article 2037
Scraped article 2038
Scraped article 2039Scraped article 2040

Scraped article 2041
Scraped article 2042
Scraped article 2043
Scraped article 2044
Scraped article 2045
Scraped article 2046
Scraped article 2047
Scraped article 2048
Scraped article 2049
Scraped article 2050
Scraped article 2051
Scraped article 2052
Scraped article 2053
Scraped article 2054
Scraped artic

Scraped article 2402
Scraped article 2403
Scraped article 2404
Scraped article 2405
Scraped article 2406
Scraped article 2407
Scraped article 2408
Scraped article 2409
Scraped article 2410
Scraped article 2411
Scraped article 2412
Scraped article 2413
Scraped article 2414Scraped article 2415

Scraped article 2416
Scraped article 2417Scraped article 2418

Scraped article 2419
Scraped article 2420
Scraped article 2421
Scraped article 2422Scraped article 2423Scraped article 2424


Scraped article 2425
Scraped article 2426
Scraped article 2427
Scraped article 2428
Scraped article 2429
Scraped article 2430
Scraped article 2431
Scraped article 2432
Scraped article 2433
Scraped article 2434
Scraped article 2435
Scraped article 2436
Scraped article 2437
Scraped article 2438Scraped article 2439

Scraped article 2440
Scraped article 2441
Scraped article 2442
Scraped article 2443
Scraped article 2444
Scraped article 2445
Scraped article 2446
Scraped article 2447
Scraped article 2448
Scraped artic

Scraped article 2794Scraped article 2795
Scraped article 2796Scraped article 2797

Scraped article 2798

Scraped article 2799
Scraped article 2800
Scraped article 2801
Scraped article 2802Scraped article 2803
Scraped article 2804

Scraped article 2805
Scraped article 2806
Scraped article 2807
Scraped article 2808
Scraped article 2809
Scraped article 2810
Scraped article 2811
Scraped article 2812
Scraped article 2813
Scraped article 2814Scraped article 2815

Scraped article 2816
Scraped article 2817
Scraped article 2818
Scraped article 2819
Scraped article 2820Scraped article 2821Scraped article 2822


Scraped article 2823
Scraped article 2824Scraped article 2825Scraped article 2826


Scraped article 2827
Scraped article 2828
Scraped article 2829
Scraped article 2830Scraped article 2831

Scraped article 2832
Scraped article 2833
Scraped article 2834Scraped article 2835

Scraped article 2836
Scraped article 2837
Scraped article 2838
Scraped article 2839Scraped article 2840

Scraped artic

Scraped article 3186Scraped article 3187

Scraped article 3188
Scraped article 3189
Scraped article 3190
Scraped article 3191Scraped article 3192
Scraped article 3193
Scraped article 3194

Scraped article 3195
Scraped article 3196
Scraped article 3197
Scraped article 3198
Scraped article 3199
Scraped article 3200
Scraped article 3201
Scraped article 3202Scraped article 3203

Scraped article 3204
Scraped article 3205
Scraped article 3206
Scraped article 3207
Scraped article 3208
Scraped article 3209
Scraped article 3210
Scraped article 3211
Scraped article 3212
Scraped article 3213
Scraped article 3214
Scraped article 3215
Scraped article 3216Scraped article 3217
Scraped article 3218

Scraped article 3219
Scraped article 3220
Scraped article 3221
Scraped article 3222
Scraped article 3223
Scraped article 3224
Scraped article 3225
Scraped article 3226
Scraped article 3227Scraped article 3228

Scraped article 3229
Scraped article 3230
Scraped article 3231
Scraped article 3232
Scraped artic

Scraped article 3581
Scraped article 3582
Scraped article 3583
Scraped article 3584
Scraped article 3585
Scraped article 3586
Scraped article 3587
Scraped article 3588
Scraped article 3589
Scraped article 3590
Scraped article 3591
Scraped article 3592
Scraped article 3593
Scraped article 3594
Scraped article 3595
Scraped article 3596
Scraped article 3597Scraped article 3598

Scraped article 3599Scraped article 3600

Scraped article 3601
Scraped article 3602
Scraped article 3603
Scraped article 3604
Scraped article 3605
Scraped article 3606
Scraped article 3607
Scraped article 3608
Scraped article 3609
Scraped article 3610
Scraped article 3611
Scraped article 3612
Scraped article 3613
Scraped article 3614
Scraped article 3615
Scraped article 3616
Scraped article 3617
Scraped article 3618
Scraped article 3619
Scraped article 3620
Scraped article 3621
Scraped article 3622Scraped article 3623
Scraped article 3624Scraped article 3625


Scraped article 3626
Scraped article 3627
Scraped artic

Scraped article 3977
Scraped article 3978
Scraped article 3979
Scraped article 3980Scraped article 3981Scraped article 3982


Scraped article 3983
Scraped article 3984
Scraped article 3985
Scraped article 3986
Scraped article 3987
Scraped article 3988
Scraped article 3989
Scraped article 3990
Scraped article 3991
Scraped article 3992Scraped article 3993
Scraped article 3994

Scraped article 3995Scraped article 3996

Scraped article 3997
Scraped article 3998
Scraped article 3999Scraped article 4000

Scraped article 4001
Scraped article 4002
Scraped article 4003
Scraped article 4004
Scraped article 4005
Scraped article 4006
Scraped article 4007
Scraped article 4008
Scraped article 4009
Scraped article 4010
Scraped article 4011
Scraped article 4012
Scraped article 4013
Scraped article 4014
Scraped article 4015Scraped article 4016

Scraped article 4017
Scraped article 4018Scraped article 4019

Scraped article 4020
Scraped article 4021Scraped article 4022
Scraped article 4023

Scraped artic

Scraped article 4368
Scraped article 4369
Scraped article 4370
Scraped article 4371
Scraped article 4372
Scraped article 4373
Scraped article 4374
Scraped article 4375
Scraped article 4376
Scraped article 4377Scraped article 4378

Scraped article 4379
Scraped article 4380Scraped article 4381

Scraped article 4382
Scraped article 4383
Scraped article 4384Scraped article 4385

Scraped article 4386
Scraped article 4387Scraped article 4388

Scraped article 4389
Scraped article 4390Scraped article 4391

Scraped article 4392Scraped article 4393Scraped article 4394


Scraped article 4395
Scraped article 4396Scraped article 4397

Scraped article 4398
Scraped article 4399
Scraped article 4400
Scraped article 4401
Scraped article 4402
Scraped article 4403
Scraped article 4404
Scraped article 4405
Scraped article 4406
Scraped article 4407
Scraped article 4408
Scraped article 4409
Scraped article 4410
Scraped article 4411
Scraped article 4412
Scraped article 4413Scraped article 4414

Scraped artic

Scraped article 4759Scraped article 4760

Scraped article 4761
Scraped article 4762
Scraped article 4763Scraped article 4764

Scraped article 4765
Scraped article 4766Scraped article 4767

Scraped article 4768
Scraped article 4769
Scraped article 4770Scraped article 4771

Scraped article 4772
Scraped article 4773
Scraped article 4774Scraped article 4775

Scraped article 4776
Scraped article 4777
Scraped article 4778
Scraped article 4779
Scraped article 4780
Scraped article 4781
Scraped article 4782
Scraped article 4783
Scraped article 4784
Scraped article 4785
Scraped article 4786
Scraped article 4787
Scraped article 4788
Scraped article 4789
Scraped article 4790
Scraped article 4791
Scraped article 4792Scraped article 4793
Scraped article 4794

Scraped article 4795
Scraped article 4796
Scraped article 4797
Scraped article 4798
Scraped article 4799
Scraped article 4800
Scraped article 4801
Scraped article 4802
Scraped article 4803
Scraped article 4804
Scraped article 4805Scraped articl

Scraped article 5151Scraped article 5152

Scraped article 5153
Scraped article 5154Scraped article 5155

Scraped article 5156
Scraped article 5157
Scraped article 5158
Scraped article 5159
Scraped article 5160
Scraped article 5161
Scraped article 5162
Scraped article 5163Scraped article 5164

Scraped article 5165
Scraped article 5166
Scraped article 5167
Scraped article 5168
Scraped article 5169
Scraped article 5170
Scraped article 5171
Scraped article 5172
Scraped article 5173
Scraped article 5174
Scraped article 5175Scraped article 5176
Scraped article 5177

Scraped article 5178
Scraped article 5179
Scraped article 5180
Scraped article 5181
Scraped article 5182
Scraped article 5183
Scraped article 5184
Scraped article 5185
Scraped article 5186
Scraped article 5187
Scraped article 5188
Scraped article 5189
Scraped article 5190Scraped article 5191

Scraped article 5192
Scraped article 5193
Scraped article 5194
Scraped article 5195Scraped article 5196

Scraped article 5197
Scraped artic

Scraped article 5545
Scraped article 5546
Scraped article 5547
Scraped article 5548
Scraped article 5549
Scraped article 5550
Scraped article 5551
Scraped article 5552
Scraped article 5553
Scraped article 5554
Scraped article 5555Scraped article 5556

Scraped article 5557Scraped article 5558

Scraped article 5559
Scraped article 5560
Scraped article 5561
Scraped article 5562
Scraped article 5563
Scraped article 5564
Scraped article 5565
Scraped article 5566
Scraped article 5567
Scraped article 5568
Scraped article 5569
Scraped article 5570
Scraped article 5571
Scraped article 5572
Scraped article 5573
Scraped article 5574
Scraped article 5575
Scraped article 5576
Scraped article 5577
Scraped article 5578
Scraped article 5579
Scraped article 5580
Scraped article 5581
Scraped article 5582
Scraped article 5583
Scraped article 5584
Scraped article 5585
Scraped article 5586
Scraped article 5587
Scraped article 5588
Scraped article 5589Scraped article 5590Scraped article 5591


Scraped artic

Scraped article 5940
Scraped article 5941
Scraped article 5942
Scraped article 5943Scraped article 5944

Scraped article 5945
Scraped article 5946Scraped article 5947

Scraped article 5948
Scraped article 5949
Scraped article 5950
Scraped article 5951
Scraped article 5952
Scraped article 5953
Scraped article 5954
Scraped article 5955
Scraped article 5956
Scraped article 5957
Scraped article 5958
Scraped article 5959
Scraped article 5960
Scraped article 5961Scraped article 5962

Scraped article 5963
Scraped article 5964
Scraped article 5965
Scraped article 5966
Scraped article 5967
Scraped article 5968
Scraped article 5969
Scraped article 5970
Scraped article 5971
Scraped article 5972
Scraped article 5973Scraped article 5974
Scraped article 5975

Scraped article 5976
Scraped article 5977Scraped article 5978

Scraped article 5979
Scraped article 5980
Scraped article 5981
Scraped article 5982
Scraped article 5983Scraped article 5984
Scraped article 5985

Scraped article 5986Scraped articl

Scraped article 6333
Scraped article 6334
Scraped article 6335Scraped article 6336
Scraped article 6337

Scraped article 6338
Scraped article 6339
Scraped article 6340
Scraped article 6341
Scraped article 6342
Scraped article 6343
Scraped article 6344
Scraped article 6345
Scraped article 6346
Scraped article 6347
Scraped article 6348
Scraped article 6349
Scraped article 6350
Scraped article 6351
Scraped article 6352
Scraped article 6353
Scraped article 6354
Scraped article 6355
Scraped article 6356
Scraped article 6357
Scraped article 6358
Scraped article 6359
Scraped article 6360
Scraped article 6361
Scraped article 6362
Scraped article 6363Scraped article 6364

Scraped article 6365
Scraped article 6366
Scraped article 6367
Scraped article 6368
Scraped article 6369
Scraped article 6370
Scraped article 6371
Scraped article 6372
Scraped article 6373
Scraped article 6374
Scraped article 6375Scraped article 6376

Scraped article 6377
Scraped article 6378
Scraped article 6379
Scraped artic

Scraped article 6724
Scraped article 6725
Scraped article 6726
Scraped article 6727Scraped article 6728

Scraped article 6729
Scraped article 6730
Scraped article 6731
Scraped article 6732
Scraped article 6733
Scraped article 6734
Scraped article 6735
Scraped article 6736
Scraped article 6737
Scraped article 6738
Scraped article 6739
Scraped article 6740
Scraped article 6741
Scraped article 6742
Scraped article 6743Scraped article 6744

Scraped article 6745
Scraped article 6746
Scraped article 6747
Scraped article 6748
Scraped article 6749
Scraped article 6750
Scraped article 6751
Scraped article 6752
Scraped article 6753
Scraped article 6754
Scraped article 6755
Scraped article 6756
Scraped article 6757
Scraped article 6758
Scraped article 6759
Scraped article 6760
Scraped article 6761
Scraped article 6762
Scraped article 6763
Scraped article 6764
Scraped article 6765
Scraped article 6766
Scraped article 6767
Scraped article 6768Scraped article 6769

Scraped article 6770
Scraped artic

Scraped article 7116Scraped article 7117

Scraped article 7118
Scraped article 7119
Scraped article 7120
Scraped article 7121
Scraped article 7122
Scraped article 7123
Scraped article 7124
Scraped article 7125
Scraped article 7126
Scraped article 7127
Scraped article 7128
Scraped article 7129
Scraped article 7130
Scraped article 7131Scraped article 7132

Scraped article 7133
Scraped article 7134
Scraped article 7135
Scraped article 7136
Scraped article 7137
Scraped article 7138Scraped article 7139

Scraped article 7140
Scraped article 7141
Scraped article 7142
Scraped article 7143
Scraped article 7144
Scraped article 7145
Scraped article 7146
Scraped article 7147
Scraped article 7148
Scraped article 7149
Scraped article 7150
Scraped article 7151
Scraped article 7152
Scraped article 7153
Scraped article 7154
Scraped article 7155
Scraped article 7156
Scraped article 7157
Scraped article 7158Scraped article 7159
Scraped article 7160

Scraped article 7161
Scraped article 7162
Scraped artic

Scraped article 7508
Scraped article 7509
Scraped article 7510
Scraped article 7511
Scraped article 7512
Scraped article 7513
Scraped article 7514
Scraped article 7515
Scraped article 7516Scraped article 7517

Scraped article 7518
Scraped article 7519
Scraped article 7520
Scraped article 7521
Scraped article 7522Scraped article 7523

Scraped article 7524
Scraped article 7525
Scraped article 7526
Scraped article 7527
Scraped article 7528
Scraped article 7529
Scraped article 7530
Scraped article 7531
Scraped article 7532
Scraped article 7533
Scraped article 7534
Scraped article 7535Scraped article 7536

Scraped article 7537
Scraped article 7538
Scraped article 7539
Scraped article 7540
Scraped article 7541
Scraped article 7542
Scraped article 7543
Scraped article 7544
Scraped article 7545
Scraped article 7546
Scraped article 7547Scraped article 7548
Scraped article 7549

Scraped article 7550
Scraped article 7551
Scraped article 7552
Scraped article 7553
Scraped article 7554
Scraped artic

Scraped article 7904
Scraped article 7905
Scraped article 7906Scraped article 7907

Scraped article 7908
Scraped article 7909
Scraped article 7910
Scraped article 7911
Scraped article 7912
Scraped article 7913Scraped article 7914Scraped article 7915


Scraped article 7916
Scraped article 7917
Scraped article 7918
Scraped article 7919Scraped article 7920

Scraped article 7921Scraped article 7922
Scraped article 7923
Scraped article 7924

Scraped article 7925
Scraped article 7926Scraped article 7927

Scraped article 7928
Scraped article 7929
Scraped article 7930
Scraped article 7931
Scraped article 7932Scraped article 7933

Scraped article 7934
Scraped article 7935
Scraped article 7936
Scraped article 7937
Scraped article 7938
Scraped article 7939
Scraped article 7940
Scraped article 7941
Scraped article 7942
Scraped article 7943
Scraped article 7944Scraped article 7945

Scraped article 7946Scraped article 7947Scraped article 7948


Scraped article 7949
Scraped article 7950
Scraped artic

Scraped article 8295
Scraped article 8296
Scraped article 8297
Scraped article 8298
Scraped article 8299
Scraped article 8300
Scraped article 8301
Scraped article 8302
Scraped article 8303
Scraped article 8304Scraped article 8305

Scraped article 8306Scraped article 8307

Scraped article 8308
Scraped article 8309
Scraped article 8310
Scraped article 8311
Scraped article 8312
Scraped article 8313Scraped article 8314

Scraped article 8315
Scraped article 8316
Scraped article 8317
Scraped article 8318Scraped article 8319

Scraped article 8320
Scraped article 8321
Scraped article 8322
Scraped article 8323
Scraped article 8324
Scraped article 8325
Scraped article 8326
Scraped article 8327
Scraped article 8328
Scraped article 8329
Scraped article 8330
Scraped article 8331
Scraped article 8332
Scraped article 8333
Scraped article 8334
Scraped article 8335
Scraped article 8336
Scraped article 8337
Scraped article 8338
Scraped article 8339
Scraped article 8340
Scraped article 8341
Scraped artic

Scraped article 8689
Scraped article 8690
Scraped article 8691
Scraped article 8692
Scraped article 8693
Scraped article 8694
Scraped article 8695
Scraped article 8696
Scraped article 8697
Scraped article 8698
Scraped article 8699Scraped article 8700
Scraped article 8701

Scraped article 8702
Scraped article 8703
Scraped article 8704
Scraped article 8705
Scraped article 8706Scraped article 8707

Scraped article 8708
Scraped article 8709
Scraped article 8710
Scraped article 8711
Scraped article 8712
Scraped article 8713
Scraped article 8714
Scraped article 8715
Scraped article 8716
Scraped article 8717
Scraped article 8718
Scraped article 8719
Scraped article 8720
Scraped article 8721
Scraped article 8722
Scraped article 8723
Scraped article 8724
Scraped article 8725
Scraped article 8726
Scraped article 8727
Scraped article 8728
Scraped article 8729Scraped article 8730

Scraped article 8731
Scraped article 8732
Scraped article 8733
Scraped article 8734
Scraped article 8735
Scraped artic

Scraped article 9080
Scraped article 9081Scraped article 9082

Scraped article 9083
Scraped article 9084
Scraped article 9085
Scraped article 9086
Scraped article 9087Scraped article 9088
Scraped article 9089

Scraped article 9090
Scraped article 9091
Scraped article 9092
Scraped article 9093
Scraped article 9094
Scraped article 9095
Scraped article 9096Scraped article 9097

Scraped article 9098
Scraped article 9099
Scraped article 9100
Scraped article 9101
Scraped article 9102Scraped article 9103

Scraped article 9104
Scraped article 9105
Scraped article 9106
Scraped article 9107
Scraped article 9108
Scraped article 9109
Scraped article 9110
Scraped article 9111
Scraped article 9112
Scraped article 9113
Scraped article 9114
Scraped article 9115
Scraped article 9116
Scraped article 9117
Scraped article 9118
Scraped article 9119
Scraped article 9120
Scraped article 9121
Scraped article 9122
Scraped article 9123
Scraped article 9124
Scraped article 9125
Scraped article 9126
Scraped artic

Scraped article 9472Scraped article 9473

Scraped article 9474
Scraped article 9475Scraped article 9476

Scraped article 9477
Scraped article 9478
Scraped article 9479
Scraped article 9480
Scraped article 9481
Scraped article 9482
Scraped article 9483
Scraped article 9484
Scraped article 9485
Scraped article 9486
Scraped article 9487
Scraped article 9488
Scraped article 9489
Scraped article 9490
Scraped article 9491
Scraped article 9492
Scraped article 9493
Scraped article 9494
Scraped article 9495Scraped article 9496
Scraped article 9497

Scraped article 9498
Scraped article 9499Scraped article 9500

Scraped article 9501
Scraped article 9502
Scraped article 9503
Scraped article 9504
Scraped article 9505
Scraped article 9506
Scraped article 9507
Scraped article 9508Scraped article 9509
Scraped article 9510
Scraped article 9511

Scraped article 9512
Scraped article 9513
Scraped article 9514
Scraped article 9515
Scraped article 9516
Scraped article 9517
Scraped article 9518
Scraped artic

Scraped article 9865
Scraped article 9866
Scraped article 9867
Scraped article 9868
Scraped article 9869
Scraped article 9870Scraped article 9871

Scraped article 9872Scraped article 9873

Scraped article 9874
Scraped article 9875
Scraped article 9876Scraped article 9877

Scraped article 9878
Scraped article 9879
Scraped article 9880
Scraped article 9881
Scraped article 9882
Scraped article 9883
Scraped article 9884
Scraped article 9885Scraped article 9886

Scraped article 9887Scraped article 9888

Scraped article 9889
Scraped article 9890
Scraped article 9891Scraped article 9892

Scraped article 9893
Scraped article 9894Scraped article 9895

Scraped article 9896
Scraped article 9897
Scraped article 9898
Scraped article 9899
Scraped article 9900
Scraped article 9901
Scraped article 9902
Scraped article 9903
Scraped article 9904
Scraped article 9905
Scraped article 9906
Scraped article 9907
Scraped article 9908
Scraped article 9909Scraped article 9910

Scraped article 9911
Scraped artic

Scraped article 10245Scraped article 10246
Scraped article 10247Scraped article 10248


Scraped article 10249
Scraped article 10250
Scraped article 10251
Scraped article 10252
Scraped article 10253
Scraped article 10254
Scraped article 10255
Scraped article 10256Scraped article 10257

Scraped article 10258
Scraped article 10259Scraped article 10260

Scraped article 10261
Scraped article 10262
Scraped article 10263
Scraped article 10264
Scraped article 10265
Scraped article 10266Scraped article 10267

Scraped article 10268
Scraped article 10269
Scraped article 10270
Scraped article 10271
Scraped article 10272
Scraped article 10273
Scraped article 10274
Scraped article 10275
Scraped article 10276
Scraped article 10277
Scraped article 10278
Scraped article 10279
Scraped article 10280
Scraped article 10281
Scraped article 10282
Scraped article 10283
Scraped article 10284
Scraped article 10285
Scraped article 10286
Scraped article 10287
Scraped article 10288
Scraped article 10289
Scraped ar

Scraped article 10620
Scraped article 10621
Scraped article 10622
Scraped article 10623
Scraped article 10624
Scraped article 10625
Scraped article 10626
Scraped article 10627
Scraped article 10628
Scraped article 10629
Scraped article 10630
Scraped article 10631
Scraped article 10632
Scraped article 10633
Scraped article 10634
Scraped article 10635
Scraped article 10636
Scraped article 10637
Scraped article 10638
Scraped article 10639
Scraped article 10640
Scraped article 10641
Scraped article 10642Scraped article 10643

Scraped article 10644
Scraped article 10645Scraped article 10646
Scraped article 10647

Scraped article 10648
Scraped article 10649
Scraped article 10650
Scraped article 10651
Scraped article 10652
Scraped article 10653
Scraped article 10654
Scraped article 10655Scraped article 10656

Scraped article 10657
Scraped article 10658
Scraped article 10659
Scraped article 10660
Scraped article 10661
Scraped article 10662
Scraped article 10663
Scraped article 10664
Scraped ar

Scraped article 10994
Scraped article 10995
Scraped article 10996
Scraped article 10997
Scraped article 10998Scraped article 10999

Scraped article 11000
Scraped article 11001
Scraped article 11002
Scraped article 11003Scraped article 11004Scraped article 11005

Scraped article 11006
Scraped article 11007
Scraped article 11008

Scraped article 11009
Scraped article 11010
Scraped article 11011
Scraped article 11012
Scraped article 11013
Scraped article 11014
Scraped article 11015
Scraped article 11016
Scraped article 11017
Scraped article 11018
Scraped article 11019
Scraped article 11020
Scraped article 11021
Scraped article 11022Scraped article 11023

Scraped article 11024
Scraped article 11025
Scraped article 11026
Scraped article 11027
Scraped article 11028
Scraped article 11029
Scraped article 11030Scraped article 11031

Scraped article 11032
Scraped article 11033
Scraped article 11034
Scraped article 11035
Scraped article 11036
Scraped article 11037
Scraped article 11038
Scraped ar

Scraped article 11367
Scraped article 11368
Scraped article 11369
Scraped article 11370Scraped article 11371

Scraped article 11372
Scraped article 11373
Scraped article 11374
Scraped article 11375
Scraped article 11376Scraped article 11377

Scraped article 11378
Scraped article 11379
Scraped article 11380
Scraped article 11381
Scraped article 11382
Scraped article 11383
Scraped article 11384
Scraped article 11385
Scraped article 11386
Scraped article 11387
Scraped article 11388
Scraped article 11389
Scraped article 11390
Scraped article 11391
Scraped article 11392
Scraped article 11393
Scraped article 11394
Scraped article 11395Scraped article 11396

Scraped article 11397
Scraped article 11398Scraped article 11399

Scraped article 11400
Scraped article 11401
Scraped article 11402
Scraped article 11403
Scraped article 11404
Scraped article 11405
Scraped article 11406
Scraped article 11407
Scraped article 11408Scraped article 11409

Scraped article 11410
Scraped article 11411
Scraped ar

Scraped article 11742
Scraped article 11743
Scraped article 11744Scraped article 11745

Scraped article 11746
Scraped article 11747
Scraped article 11748
Scraped article 11749
Scraped article 11750
Scraped article 11751
Scraped article 11752
Scraped article 11753
Scraped article 11754
Scraped article 11755
Scraped article 11756
Scraped article 11757
Scraped article 11758Scraped article 11759

Scraped article 11760
Scraped article 11761
Scraped article 11762Scraped article 11763Scraped article 11764

Scraped article 11765

Scraped article 11766
Scraped article 11767
Scraped article 11768Scraped article 11769

Scraped article 11770
Scraped article 11771
Scraped article 11772
Scraped article 11773
Scraped article 11774
Scraped article 11775Scraped article 11776

Scraped article 11777
Scraped article 11778
Scraped article 11779
Scraped article 11780
Scraped article 11781
Scraped article 11782Scraped article 11783

Scraped article 11784Scraped article 11785

Scraped article 11786
Scraped ar

Scraped article 12116
Scraped article 12117
Scraped article 12118
Scraped article 12119
Scraped article 12120Scraped article 12121Scraped article 12122


Scraped article 12123
Scraped article 12124
Scraped article 12125
Scraped article 12126Scraped article 12127

Scraped article 12128
Scraped article 12129
Scraped article 12130
Scraped article 12131Scraped article 12132

Scraped article 12133
Scraped article 12134
Scraped article 12135
Scraped article 12136
Scraped article 12137
Scraped article 12138
Scraped article 12139
Scraped article 12140
Scraped article 12141
Scraped article 12142
Scraped article 12143
Scraped article 12144
Scraped article 12145
Scraped article 12146
Scraped article 12147
Scraped article 12148
Scraped article 12149
Scraped article 12150
Scraped article 12151Scraped article 12152

Scraped article 12153
Scraped article 12154Scraped article 12155

Scraped article 12156
Scraped article 12157
Scraped article 12158
Scraped article 12159Scraped article 12160

Scraped ar

Scraped article 12490
Scraped article 12491
Scraped article 12492
Scraped article 12493
Scraped article 12494
Scraped article 12495
Scraped article 12496
Scraped article 12497
Scraped article 12498
Scraped article 12499
Scraped article 12500
Scraped article 12501
Scraped article 12502
Scraped article 12503
Scraped article 12504
Scraped article 12505
Scraped article 12506
Scraped article 12507
Scraped article 12508
Scraped article 12509Scraped article 12510

Scraped article 12511
Scraped article 12512
Scraped article 12513
Scraped article 12514
Scraped article 12515
Scraped article 12516
Scraped article 12517
Scraped article 12518
Scraped article 12519
Scraped article 12520
Scraped article 12521Scraped article 12522

Scraped article 12523
Scraped article 12524
Scraped article 12525
Scraped article 12526
Scraped article 12527Scraped article 12528Scraped article 12529


Scraped article 12530
Scraped article 12531
Scraped article 12532
Scraped article 12533
Scraped article 12534
Scraped ar

Scraped article 12865Scraped article 12866

Scraped article 12867
Scraped article 12868
Scraped article 12869
Scraped article 12870
Scraped article 12871
Scraped article 12872
Scraped article 12873
Scraped article 12874Scraped article 12875

Scraped article 12876
Scraped article 12877
Scraped article 12878
Scraped article 12879
Scraped article 12880
Scraped article 12881
Scraped article 12882
Scraped article 12883
Scraped article 12884
Scraped article 12885
Scraped article 12886
Scraped article 12887
Scraped article 12889Scraped article 12890Scraped article 12891Scraped article 12888



Scraped article 12892
Scraped article 12893
Scraped article 12894
Scraped article 12895
Scraped article 12896
Scraped article 12897
Scraped article 12898Scraped article 12899

Scraped article 12900
Scraped article 12901Scraped article 12902Scraped article 12903


Scraped article 12904
Scraped article 12905


In [111]:
result_df

Unnamed: 0,target,text
0,personal finance,When the Bank of England raised the interest r...
1,personal finance,More than 5 crore income tax returns (ITRs) ha...
2,personal finance,Fintech company Cred has acqui-hired the found...
3,personal finance,My old acquaintance Amit called me yesterday. ...
4,personal finance,“An AI can play a pretty important role in thr...
...,...,...
14369,personal finance,America faces supply chain disruption and shor...
14370,personal finance,NEW DELHI: About 79.3 per cent of investors su...
14371,personal finance,Our panel of experts will answer questions rel...
14372,personal finance,We routinely meet people who make financial de...


In [112]:
df = result_df[result_df.text != '']
df

Unnamed: 0,target,text
0,personal finance,When the Bank of England raised the interest r...
1,personal finance,More than 5 crore income tax returns (ITRs) ha...
2,personal finance,Fintech company Cred has acqui-hired the found...
3,personal finance,My old acquaintance Amit called me yesterday. ...
4,personal finance,“An AI can play a pretty important role in thr...
...,...,...
14368,personal finance,This is the season to give. To light up anothe...
14369,personal finance,America faces supply chain disruption and shor...
14370,personal finance,NEW DELHI: About 79.3 per cent of investors su...
14371,personal finance,Our panel of experts will answer questions rel...


In [150]:
df.to_csv('personal_finance_half.csv', index = False)

## Scraping articles from GamesRadar

In [4]:
url_df = pd.read_csv('video_gaming_url.csv')
c = 0

@retry(wait_fixed=2000, stop_max_attempt_number=3)
def scrape_url_with_retry(url):
    global c
    try:
        page = requests.get(url, verify=True)
    except requests.exceptions.SSLError as e:
        page = requests.get(url, verify=False)
        
    try:
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        
        content_with_class = soup.find('div', class_='text-copy bodyCopy auto')
        if content_with_class:
            c += 1
            print(f"Scraped article {c}")
            return content_with_class.text

    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL: {url}. Error: {e}")
    
    return ''

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(scrape_url_with_retry, url_df['URL']))

url_df['text'] = results
url_df['target'] = 'video gaming'

result_df = url_df[['target', 'text']]

Scraped article 1
Scraped article 2
Scraped article 3
Scraped article 4
Scraped article 5Scraped article 6
Scraped article 7
Scraped article 8

Scraped article 9
Scraped article 10
Scraped article 11
Scraped article 12
Scraped article 13
Scraped article 14
Scraped article 15
Scraped article 16
Scraped article 17Scraped article 18Scraped article 19
Scraped article 20


Scraped article 21
Scraped article 22Scraped article 23

Scraped article 24
Scraped article 25
Scraped article 26
Scraped article 27
Scraped article 28
Scraped article 29
Scraped article 30
Scraped article 31
Scraped article 32
Scraped article 33
Scraped article 34Scraped article 35

Scraped article 36Scraped article 37

Scraped article 38
Scraped article 39Scraped article 40

Scraped article 41
Scraped article 42
Scraped article 43
Scraped article 44
Scraped article 45
Scraped article 46
Scraped article 47Scraped article 48

Scraped article 49
Scraped article 50
Scraped article 51
Scraped article 52
Scraped article 53
Sc

Scraped article 419
Scraped article 420
Scraped article 421
Scraped article 422
Scraped article 423
Scraped article 424
Scraped article 425
Scraped article 426
Scraped article 427
Scraped article 428
Scraped article 429
Scraped article 430
Scraped article 431
Scraped article 432
Scraped article 433
Scraped article 434
Scraped article 435
Scraped article 436Scraped article 437

Scraped article 438
Scraped article 439
Scraped article 440Scraped article 441

Scraped article 442Scraped article 443

Scraped article 444
Scraped article 445
Scraped article 446
Scraped article 447
Scraped article 448
Scraped article 449
Scraped article 450
Scraped article 451
Scraped article 452
Scraped article 453
Scraped article 454
Scraped article 455
Scraped article 456
Scraped article 457
Scraped article 458
Scraped article 459
Scraped article 460
Scraped article 461
Scraped article 462
Scraped article 463
Scraped article 464
Scraped article 465
Scraped article 466
Scraped article 467
Scraped article 468


Scraped article 829
Scraped article 830Scraped article 831

Scraped article 832
Scraped article 833
Scraped article 834
Scraped article 835
Scraped article 836
Scraped article 837Scraped article 838

Scraped article 839
Scraped article 840
Scraped article 841
Scraped article 842
Scraped article 843
Scraped article 844
Scraped article 845
Scraped article 846
Scraped article 847
Scraped article 848
Scraped article 849
Scraped article 850
Scraped article 851
Scraped article 852
Scraped article 853
Scraped article 854
Scraped article 855
Scraped article 856
Scraped article 857Scraped article 858

Scraped article 859
Scraped article 860
Scraped article 861
Scraped article 862
Scraped article 863
Scraped article 864
Scraped article 865
Scraped article 866
Scraped article 867
Scraped article 868
Scraped article 869
Scraped article 870
Scraped article 871
Scraped article 872
Scraped article 873Scraped article 874

Scraped article 875
Scraped article 876
Scraped article 877Scraped article 878



Scraped article 1228
Scraped article 1229
Scraped article 1230
Scraped article 1231
Scraped article 1232
Scraped article 1233
Scraped article 1234
Scraped article 1235
Scraped article 1236
Scraped article 1237
Scraped article 1238
Scraped article 1239
Scraped article 1240
Scraped article 1241
Scraped article 1242
Scraped article 1243
Scraped article 1244
Scraped article 1245
Scraped article 1246
Scraped article 1247
Scraped article 1248
Scraped article 1249
Scraped article 1250Scraped article 1251

Scraped article 1252
Scraped article 1253
Scraped article 1254
Scraped article 1255
Scraped article 1256
Scraped article 1257
Scraped article 1258
Scraped article 1259
Scraped article 1260
Scraped article 1261Scraped article 1262

Scraped article 1263
Scraped article 1264
Scraped article 1265
Scraped article 1266
Scraped article 1267
Scraped article 1268
Scraped article 1269
Scraped article 1270
Scraped article 1271
Scraped article 1272
Scraped article 1273
Scraped article 1274
Scraped artic

Scraped article 1620
Scraped article 1621
Scraped article 1622
Scraped article 1623
Scraped article 1624
Scraped article 1625
Scraped article 1626
Scraped article 1627
Scraped article 1628
Scraped article 1629
Scraped article 1630
Scraped article 1631
Scraped article 1632
Scraped article 1633
Scraped article 1634
Scraped article 1635
Scraped article 1636
Scraped article 1637
Scraped article 1638
Scraped article 1639
Scraped article 1640
Scraped article 1641
Scraped article 1642
Scraped article 1643
Scraped article 1644
Scraped article 1645
Scraped article 1646Scraped article 1647

Scraped article 1648
Scraped article 1649
Scraped article 1650
Scraped article 1651
Scraped article 1652
Scraped article 1653
Scraped article 1654
Scraped article 1655
Scraped article 1656
Scraped article 1657
Scraped article 1658
Scraped article 1659
Scraped article 1660
Scraped article 1661Scraped article 1662

Scraped article 1663
Scraped article 1664
Scraped article 1665
Scraped article 1666
Scraped artic

Scraped article 2011Scraped article 2012

Scraped article 2013
Scraped article 2014
Scraped article 2015
Scraped article 2016
Scraped article 2017
Scraped article 2018
Scraped article 2019
Scraped article 2020
Scraped article 2021
Scraped article 2022
Scraped article 2023
Scraped article 2024
Scraped article 2025
Scraped article 2026
Scraped article 2027
Scraped article 2028
Scraped article 2029
Scraped article 2030
Scraped article 2031
Scraped article 2032
Scraped article 2033
Scraped article 2034
Scraped article 2035
Scraped article 2036
Scraped article 2037
Scraped article 2038
Scraped article 2039
Scraped article 2040
Scraped article 2041
Scraped article 2042
Scraped article 2043
Scraped article 2044
Scraped article 2045Scraped article 2046

Scraped article 2047
Scraped article 2048
Scraped article 2049
Scraped article 2050
Scraped article 2051Scraped article 2052

Scraped article 2053
Scraped article 2054
Scraped article 2055
Scraped article 2056
Scraped article 2057
Scraped artic

Scraped article 2404
Scraped article 2405
Scraped article 2406
Scraped article 2407
Scraped article 2408
Scraped article 2409
Scraped article 2410
Scraped article 2411
Scraped article 2412
Scraped article 2413
Scraped article 2414
Scraped article 2415
Scraped article 2416
Scraped article 2417
Scraped article 2418
Scraped article 2419
Scraped article 2420
Scraped article 2421
Scraped article 2422
Scraped article 2423
Scraped article 2424
Scraped article 2425
Scraped article 2426
Scraped article 2427
Scraped article 2428
Scraped article 2429
Scraped article 2430
Scraped article 2431
Scraped article 2432
Scraped article 2433
Scraped article 2434
Scraped article 2435Scraped article 2436

Scraped article 2437
Scraped article 2438
Scraped article 2439
Scraped article 2440
Scraped article 2441
Scraped article 2442
Scraped article 2443
Scraped article 2444
Scraped article 2445
Scraped article 2446
Scraped article 2447
Scraped article 2448
Scraped article 2449
Scraped article 2450
Scraped artic

Scraped article 2796
Scraped article 2797
Scraped article 2798
Scraped article 2799Scraped article 2800Scraped article 2801


Scraped article 2802
Scraped article 2803
Scraped article 2804
Scraped article 2805
Scraped article 2806
Scraped article 2807
Scraped article 2808
Scraped article 2809
Scraped article 2810
Scraped article 2811
Scraped article 2812
Scraped article 2813
Scraped article 2814
Scraped article 2815
Scraped article 2816
Scraped article 2817
Scraped article 2818
Scraped article 2819
Scraped article 2820
Scraped article 2821
Scraped article 2822
Scraped article 2823
Scraped article 2824
Scraped article 2825
Scraped article 2826
Scraped article 2827
Scraped article 2828
Scraped article 2829
Scraped article 2830
Scraped article 2831
Scraped article 2832
Scraped article 2833
Scraped article 2834
Scraped article 2835
Scraped article 2836
Scraped article 2837
Scraped article 2838
Scraped article 2839
Scraped article 2840
Scraped article 2841Scraped article 2842

Scraped artic

Scraped article 3188
Scraped article 3189
Scraped article 3190
Scraped article 3191
Scraped article 3192
Scraped article 3193
Scraped article 3194
Scraped article 3195
Scraped article 3196
Scraped article 3197
Scraped article 3198
Scraped article 3199Scraped article 3200

Scraped article 3201
Scraped article 3202
Scraped article 3203
Scraped article 3204
Scraped article 3205
Scraped article 3206
Scraped article 3207
Scraped article 3208
Scraped article 3209Scraped article 3210

Scraped article 3211
Scraped article 3212
Scraped article 3213
Scraped article 3214
Scraped article 3215
Scraped article 3216
Scraped article 3217
Scraped article 3218
Scraped article 3219
Scraped article 3220
Scraped article 3221
Scraped article 3222
Scraped article 3223Scraped article 3224
Scraped article 3225

Scraped article 3226
Scraped article 3227
Scraped article 3228
Scraped article 3229
Scraped article 3230
Scraped article 3231
Scraped article 3232
Scraped article 3233
Scraped article 3234
Scraped artic

Scraped article 3579
Scraped article 3580
Scraped article 3581
Scraped article 3582
Scraped article 3583Scraped article 3584
Scraped article 3585

Scraped article 3586
Scraped article 3587
Scraped article 3588
Scraped article 3589
Scraped article 3590
Scraped article 3591
Scraped article 3592
Scraped article 3593
Scraped article 3594
Scraped article 3595
Scraped article 3596
Scraped article 3597
Scraped article 3598
Scraped article 3599
Scraped article 3600
Scraped article 3601
Scraped article 3602
Scraped article 3603
Scraped article 3604
Scraped article 3605
Scraped article 3606
Scraped article 3607
Scraped article 3608
Scraped article 3609
Scraped article 3610
Scraped article 3611
Scraped article 3612
Scraped article 3613
Scraped article 3614Scraped article 3615

Scraped article 3616
Scraped article 3617
Scraped article 3618
Scraped article 3619
Scraped article 3620
Scraped article 3621
Scraped article 3622
Scraped article 3623
Scraped article 3624
Scraped article 3625
Scraped artic



Scraped article 3875
Scraped article 3876
Scraped article 3877Scraped article 3878

Scraped article 3879
Scraped article 3880
Scraped article 3881
Scraped article 3882
Scraped article 3883
Scraped article 3884
Scraped article 3885
Scraped article 3886
Scraped article 3887
Scraped article 3888
Scraped article 3889
Scraped article 3890
Scraped article 3891
Scraped article 3892
Scraped article 3893
Scraped article 3894
Scraped article 3895
Scraped article 3896
Scraped article 3897
Scraped article 3898
Scraped article 3899
Scraped article 3900
Scraped article 3901
Scraped article 3902
Scraped article 3903
Scraped article 3904
Scraped article 3905
Scraped article 3906
Scraped article 3907
Scraped article 3908
Scraped article 3909
Scraped article 3910
Scraped article 3911
Scraped article 3912
Scraped article 3913
Scraped article 3914
Scraped article 3915
Scraped article 3916
Scraped article 3917
Scraped article 3918
Scraped article 3919
Scraped article 3920
Scraped article 3921Scraped articl

Scraped article 4266
Scraped article 4267
Scraped article 4268
Scraped article 4269
Scraped article 4270
Scraped article 4271
Scraped article 4272
Scraped article 4273
Scraped article 4274
Scraped article 4275
Scraped article 4276
Scraped article 4277
Scraped article 4278
Scraped article 4279Scraped article 4280

Scraped article 4281
Scraped article 4282
Scraped article 4283
Scraped article 4284
Scraped article 4285
Scraped article 4286
Scraped article 4287
Scraped article 4288
Scraped article 4289Scraped article 4290
Scraped article 4291

Scraped article 4292
Scraped article 4293
Scraped article 4294
Scraped article 4295
Scraped article 4296
Scraped article 4297
Scraped article 4298
Scraped article 4299
Scraped article 4300
Scraped article 4301
Scraped article 4302
Scraped article 4303
Scraped article 4304
Scraped article 4305
Scraped article 4306
Scraped article 4307
Scraped article 4308
Scraped article 4309
Scraped article 4310
Scraped article 4311
Scraped article 4312
Scraped artic

Scraped article 5047
Scraped article 5048
Scraped article 5049
Scraped article 5050
Scraped article 5051
Scraped article 5052
Scraped article 5053
Scraped article 5054
Scraped article 5055
Scraped article 5056
Scraped article 5057
Scraped article 5058
Scraped article 5059
Scraped article 5060
Scraped article 5061
Scraped article 5062
Scraped article 5063
Scraped article 5064
Scraped article 5065
Scraped article 5066
Scraped article 5067
Scraped article 5068
Scraped article 5069
Scraped article 5070
Scraped article 5071
Scraped article 5072
Scraped article 5073
Scraped article 5074
Scraped article 5075
Scraped article 5076
Scraped article 5077
Scraped article 5078
Scraped article 5079
Scraped article 5080
Scraped article 5081
Scraped article 5082
Scraped article 5083
Scraped article 5084
Scraped article 5085
Scraped article 5086
Scraped article 5087
Scraped article 5088
Scraped article 5089
Scraped article 5090
Scraped article 5091
Scraped article 5092
Scraped article 5093
Scraped artic

Scraped article 5439Scraped article 5440

Scraped article 5441
Scraped article 5442
Scraped article 5443
Scraped article 5444
Scraped article 5445
Scraped article 5446Scraped article 5447

Scraped article 5448
Scraped article 5449
Scraped article 5450
Scraped article 5451
Scraped article 5452
Scraped article 5453
Scraped article 5454
Scraped article 5455
Scraped article 5456
Scraped article 5457
Scraped article 5458
Scraped article 5459
Scraped article 5460
Scraped article 5461
Scraped article 5462
Scraped article 5463
Scraped article 5464
Scraped article 5465Scraped article 5466

Scraped article 5467
Scraped article 5468
Scraped article 5469
Scraped article 5470
Scraped article 5471
Scraped article 5472
Scraped article 5473
Scraped article 5474
Scraped article 5475Scraped article 5476

Scraped article 5477
Scraped article 5478
Scraped article 5479
Scraped article 5480
Scraped article 5481
Scraped article 5482
Scraped article 5483
Scraped article 5484
Scraped article 5485
Scraped artic

Scraped article 5831
Scraped article 5832
Scraped article 5833
Scraped article 5834
Scraped article 5835
Scraped article 5836Scraped article 5837

Scraped article 5838
Scraped article 5839
Scraped article 5840Scraped article 5841

Scraped article 5842
Scraped article 5843
Scraped article 5844
Scraped article 5845
Scraped article 5846
Scraped article 5847
Scraped article 5848
Scraped article 5849
Scraped article 5850
Scraped article 5851
Scraped article 5852Scraped article 5853

Scraped article 5854
Scraped article 5855
Scraped article 5856
Scraped article 5857
Scraped article 5858
Scraped article 5859
Scraped article 5860
Scraped article 5861
Scraped article 5862Scraped article 5863
Scraped article 5864

Scraped article 5865
Scraped article 5866
Scraped article 5867
Scraped article 5868
Scraped article 5869
Scraped article 5870
Scraped article 5871
Scraped article 5872
Scraped article 5873
Scraped article 5874
Scraped article 5875
Scraped article 5876
Scraped article 5877
Scraped artic

Scraped article 6222
Scraped article 6223
Scraped article 6224
Scraped article 6225
Scraped article 6226
Scraped article 6227
Scraped article 6228
Scraped article 6229
Scraped article 6230
Scraped article 6231
Scraped article 6232
Scraped article 6233
Scraped article 6234
Scraped article 6235
Scraped article 6236
Scraped article 6237
Scraped article 6238
Scraped article 6239
Scraped article 6240Scraped article 6241

Scraped article 6242
Scraped article 6243
Scraped article 6244
Scraped article 6245
Scraped article 6246
Scraped article 6247
Scraped article 6248
Scraped article 6249
Scraped article 6250
Scraped article 6251
Scraped article 6252
Scraped article 6253Scraped article 6254

Scraped article 6255
Scraped article 6256
Scraped article 6257
Scraped article 6258
Scraped article 6259
Scraped article 6260
Scraped article 6261
Scraped article 6262
Scraped article 6263
Scraped article 6264
Scraped article 6265
Scraped article 6266
Scraped article 6267
Scraped article 6268
Scraped artic

Scraped article 6615
Scraped article 6616
Scraped article 6617Scraped article 6618

Scraped article 6619
Scraped article 6620
Scraped article 6621
Scraped article 6622
Scraped article 6623
Scraped article 6624
Scraped article 6625
Scraped article 6626
Scraped article 6627
Scraped article 6628
Scraped article 6629
Scraped article 6630
Scraped article 6631
Scraped article 6632
Scraped article 6633
Scraped article 6634
Scraped article 6635
Scraped article 6636
Scraped article 6637
Scraped article 6638
Scraped article 6639
Scraped article 6640
Scraped article 6641
Scraped article 6642
Scraped article 6643
Scraped article 6644
Scraped article 6645
Scraped article 6646
Scraped article 6647
Scraped article 6648
Scraped article 6649
Scraped article 6650
Scraped article 6651
Scraped article 6652
Scraped article 6653
Scraped article 6654Scraped article 6655

Scraped article 6656
Scraped article 6657Scraped article 6658
Scraped article 6659

Scraped article 6660
Scraped article 6661
Scraped artic

Scraped article 7007
Scraped article 7008
Scraped article 7009
Scraped article 7010
Scraped article 7011Scraped article 7012

Scraped article 7013
Scraped article 7014
Scraped article 7015
Scraped article 7016
Scraped article 7017
Scraped article 7018
Scraped article 7019Scraped article 7020

Scraped article 7021
Scraped article 7022
Scraped article 7023
Scraped article 7024
Scraped article 7025
Scraped article 7026
Scraped article 7027
Scraped article 7028
Scraped article 7029
Scraped article 7030
Scraped article 7031
Scraped article 7032Scraped article 7033

Scraped article 7034
Scraped article 7035Scraped article 7036

Scraped article 7037
Scraped article 7038
Scraped article 7039
Scraped article 7040
Scraped article 7041
Scraped article 7042
Scraped article 7043
Scraped article 7044
Scraped article 7045
Scraped article 7046
Scraped article 7047
Scraped article 7048
Scraped article 7049
Scraped article 7050
Scraped article 7051
Scraped article 7052
Scraped article 7053
Scraped artic

Scraped article 7398
Scraped article 7399
Scraped article 7400
Scraped article 7401
Scraped article 7402
Scraped article 7403
Scraped article 7404
Scraped article 7405
Scraped article 7406
Scraped article 7407Scraped article 7408

Scraped article 7409
Scraped article 7410Scraped article 7411

Scraped article 7412
Scraped article 7413
Scraped article 7414
Scraped article 7415
Scraped article 7416
Scraped article 7417
Scraped article 7418
Scraped article 7419
Scraped article 7420
Scraped article 7421
Scraped article 7422
Scraped article 7423
Scraped article 7424
Scraped article 7425
Scraped article 7426
Scraped article 7427
Scraped article 7428
Scraped article 7429
Scraped article 7430
Scraped article 7431
Scraped article 7432
Scraped article 7433
Scraped article 7434
Scraped article 7435
Scraped article 7436
Scraped article 7437
Scraped article 7438Scraped article 7439

Scraped article 7440
Scraped article 7441
Scraped article 7442
Scraped article 7443Scraped article 7444

Scraped artic

In [11]:
result_df = url_df[['target', 'text']]

In [12]:
result_df.target = result_df.target.replace({'personal finance':'video gaming'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df.target = result_df.target.replace({'personal finance':'video gaming'})


In [13]:
result_df

Unnamed: 0,target,text
0,video gaming,\nDevelopment on Apex Legends Mobile and Battl...
1,video gaming,\nAn all-new King of the Hill series from crea...
2,video gaming,\nStar Wars Jedi: Survivor has been delayed si...
3,video gaming,\nWarzone 2 has finally pulled back the curtai...
4,video gaming,\n Warning! This article contains spoilers for...
...,...,...
7540,video gaming,\nJames Cameron’s long-awaited sequel Avatar: ...
7541,video gaming,\nBethesda’s Todd Howard says he expects every...
7542,video gaming,\nPokemon Scarlet and Violet just got their fi...
7543,video gaming,\nQuantic Dream founder David Cage says the st...


In [14]:
result_df.iloc[10].text

'\nEarlier today, a report suggested that Sony, Microsoft, and Nintendo would all be skipping a physical presence at E3 this year, and now a follow-up suggests that the Switch publisher is dropping out because it simply doesn\'t have enough big games to show.That\'s according to VGC, whose sources indicate that Nintendo is skipping E3 2023 "because it feels it has fewer major releases than usual ready to show and which would justify significant event space." As with that previous report, it\'s worth noting that Xbox, PlayStation, and Nintendo will likely still have big summer showcase events streamed online around E3 time - just don\'t expect them to actually be on the show floor if you attend E3 yourself.But the suggestion that Nintendo\'s skipping E3 because it doesn\'t have enough major releases to show offers more concerning implications for the publisher\'s 2023 slate. Nintendo\'s biggest game this year is certainly The Legend of Zelda: Tears of the Kingdom, but its May 12 release

In [15]:
result_df.to_csv('d30.csv', index = False)

# Scraping articles from Business-Standards

In [28]:
base_url = "https://www.business-standard.com/finance/personal-finance/page-"
num_pages = 680

url_list = []
c = 0
headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
 }

for page_number in range(1, num_pages + 1):
    url = base_url + str(page_number)
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        div_tag = soup.find('div', class_='article-listing')

        if div_tag:
            a_tags = div_tag.find_all('a', class_='smallcard-title')
            for a in a_tags:
                url_list.append(a['href'])
        c += 1        
        print(c)
            
    except requests.exceptions.RequestException as e:
        print("Failed to fetch the webpage:", e)
            
data = {'URL': url_list}
df = pd.DataFrame(data)

df.drop_duplicates(subset='URL', keep='first', inplace=True)
df.to_csv('df_PersonalFinance3.csv', index = False)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [29]:
df

Unnamed: 0,URL
0,https://www.business-standard.com/finance/pers...
1,https://www.business-standard.com/finance/pers...
2,https://www.business-standard.com/finance/pers...
3,https://www.business-standard.com/finance/pers...
4,https://www.business-standard.com/finance/pers...
...,...
13595,https://www.business-standard.com/article/pf/t...
13596,https://www.business-standard.com/article/pf/n...
13597,https://www.business-standard.com/article/pf/b...
13598,https://www.business-standard.com/article/pf/l...


In [30]:
df.drop_duplicates()

Unnamed: 0,URL
0,https://www.business-standard.com/finance/pers...
1,https://www.business-standard.com/finance/pers...
2,https://www.business-standard.com/finance/pers...
3,https://www.business-standard.com/finance/pers...
4,https://www.business-standard.com/finance/pers...
...,...
13595,https://www.business-standard.com/article/pf/t...
13596,https://www.business-standard.com/article/pf/n...
13597,https://www.business-standard.com/article/pf/b...
13598,https://www.business-standard.com/article/pf/l...


In [31]:
df.iloc[50].URL

'https://www.business-standard.com/finance/personal-finance/last-day-to-file-returns-what-happens-if-you-miss-your-itr-deadline-today-123073100151_1.html'

In [37]:
url_df = df
c = 0

headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
 }

@retry(wait_fixed=2000, stop_max_attempt_number=3)

def scrape_url_with_retry(url):
    global c
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        content_with_class = soup.find('div', class_='story-detail')
        if content_with_class:
            c += 1
            print(f"Scraped article {c}")
            return content_with_class.text

    except requests.exceptions.RequestException as e:
        print(f"Error accessing URL: {url}. Error: {e}")
    
    return ''

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(scrape_url_with_retry, url_df['URL']))

url_df['text'] = results
url_df['target'] = 'personal finance'

result_df = url_df[['target', 'text']]

  retry_strategy = Retry(


Error accessing URL: https://www.business-standard.com/finance/personal-finance/what-are-multi-allocation-funds-why-have-they-become-so-popular-123081100294_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/what-are-multi-allocation-funds-why-have-they-become-so-popular-123081100294_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/switch-if-you-can-find-50-bps-cheaper-home-loan-lock-into-current-fd-rates-123081100696_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/switch-if-you-can-find-50-bps-cheaper-home-loan-lock-into-current-fd-rates-123081100696_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/rank-loans-save-money-seek-advice-strategies-to-get-out-of-debt-123081000714_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/rank-l

Error accessing URL: https://www.business-standard.com/finance/personal-finance/backstop-fund-facility-to-shield-debt-mutual-fund-investors-from-fire-sale-123080300493_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/backstop-fund-facility-to-shield-debt-mutual-fund-investors-from-fire-sale-123080300493_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/critical-illness-policy-must-cover-ailments-relevant-to-your-family-history-123080400632_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/critical-illness-policy-must-cover-ailments-relevant-to-your-family-history-123080400632_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/style-statement-that-goes-with-you-everywhere-shopping-for-luxury-handbags-123080300236_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.

Error accessing URL: https://www.business-standard.com/finance/personal-finance/how-to-reduce-your-emi-burden-if-you-have-taken-an-education-loan-123072700140_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/how-to-reduce-your-emi-burden-if-you-have-taken-an-education-loan-123072700140_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/filing-tax-returns-for-the-first-time-here-s-the-itr-form-meant-for-you-123072600405_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/filing-tax-returns-for-the-first-time-here-s-the-itr-form-meant-for-you-123072600405_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/sales-of-flats-above-rs-10-cr-up-50-in-mumbai-gurgaon-s-magnolia-flat-sells-for-rs-50-cr-123072401033_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/

Error accessing URL: https://www.business-standard.com/finance/personal-finance/consequences-of-failure-to-file-deceased-s-itr-penalties-interest-jail-123071900776_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/consequences-of-failure-to-file-deceased-s-itr-penalties-interest-jail-123071900776_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/filing-itr-don-t-forget-to-declare-income-from-other-sources-123071800962_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/filing-itr-don-t-forget-to-declare-income-from-other-sources-123071800962_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/nris-can-still-file-income-tax-return-even-if-the-pan-becomes-inoperative-123071900268_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/nris-can-s

Error accessing URL: https://www.business-standard.com/finance/personal-finance/choose-luxury-flooring-that-will-lift-up-your-home-s-aesthetics-123071300284_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/choose-luxury-flooring-that-will-lift-up-your-home-s-aesthetics-123071300284_1.html
Error accessing URL: https://www.business-standard.com/markets/news/590-bn-rally-since-march-faces-earning-reports-retail-pull-back-tests-123071300156_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/markets/news/590-bn-rally-since-march-faces-earning-reports-retail-pull-back-tests-123071300156_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/verify-pre-filled-data-carefully-before-filing-tax-keep-documents-handy-123071100516_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/verify-pre-filled-data-carefully

Error accessing URL: https://www.business-standard.com/finance/personal-finance/flipkart-offers-upto-rs-5-lakh-in-lending-to-customers-with-axis-tie-up-123070500532_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/flipkart-offers-upto-rs-5-lakh-in-lending-to-customers-with-axis-tie-up-123070500532_1.htmlError accessing URL: https://www.business-standard.com/finance/personal-finance/gst-council-likely-to-define-muvs-and-xuvs-for-levy-of-22-cess-report-123070501048_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/gst-council-likely-to-define-muvs-and-xuvs-for-levy-of-22-cess-report-123070501048_1.htmlError accessing URL: https://www.business-standard.com/finance/personal-finance/tax-harvesting-can-help-you-save-tax-on-your-mutual-fund-investments-123070500134_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finan

Error accessing URL: https://www.business-standard.com/finance/personal-finance/filing-your-returns-know-key-changes-in-itr-for-assessment-year-2023-24-123063000286_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/filing-your-returns-know-key-changes-in-itr-for-assessment-year-2023-24-123063000286_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/corporate-india-health-insurance-safety-net-doesn-t-cover-everyone-123063000255_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/corporate-india-health-insurance-safety-net-doesn-t-cover-everyone-123063000255_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/pan-aadhaar-link-deadline-today-check-fees-status-and-how-to-do-it-here-123063000214_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-financ

Error accessing URL: https://www.business-standard.com/finance/personal-finance/digit-life-enters-insurance-space-with-maiden-product-group-term-plan-123062600546_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/digit-life-enters-insurance-space-with-maiden-product-group-term-plan-123062600546_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/i-t-dept-tweaks-exemption-norms-for-charitable-institutions-seeks-details-123062600553_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/i-t-dept-tweaks-exemption-norms-for-charitable-institutions-seeks-details-123062600553_1.html
Error accessing URL: https://www.business-standard.com/markets/news/wealth-management-companies-expand-footprints-beyond-tier-i-cities-123062600448_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/markets/news/wealth-management

Error accessing URL: https://www.business-standard.com/finance/personal-finance/is-upi-the-new-rs-2000-data-suggests-so-says-sbi-123062000396_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/is-upi-the-new-rs-2000-data-suggests-so-says-sbi-123062000396_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/want-to-hedge-against-risk-aversion-invest-in-current-sgb-tranche-123061900790_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/want-to-hedge-against-risk-aversion-invest-in-current-sgb-tranche-123061900790_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/when-is-money-gifted-not-taxable-all-your-tax-queries-on-gifts-answered-123061900221_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/when-is-money-gifted-not-taxable-all-your-tax-

Error accessing URL: https://www.business-standard.com/finance/personal-finance/home-prices-costlier-by-8-across-india-delhi-ncr-sees-largest-hike-of-16-123061400330_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/home-prices-costlier-by-8-across-india-delhi-ncr-sees-largest-hike-of-16-123061400330_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/prioritise-retirement-security-withdraw-from-nps-only-in-emergency-123061300751_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/prioritise-retirement-security-withdraw-from-nps-only-in-emergency-123061300751_1.htmlError accessing URL: https://www.business-standard.com/finance/personal-finance/emirates-flying-premium-economy-to-india-on-a380-what-it-means-for-flyers-123061400434_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-fin

Error accessing URL: https://www.business-standard.com/finance/personal-finance/fine-dining-japanese-style-how-to-develop-a-taste-for-the-cuisine-123060800452_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/fine-dining-japanese-style-how-to-develop-a-taste-for-the-cuisine-123060800452_1.htmlError accessing URL: https://www.business-standard.com/finance/personal-finance/rbi-policy-today-book-your-fds-before-rates-dip-full-list-of-banks-here-123060800186_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/rbi-policy-today-book-your-fds-before-rates-dip-full-list-of-banks-here-123060800186_1.html




Error accessing URL: https://www.business-standard.com/finance/personal-finance/explained-worst-over-for-home-loan-borrowers-as-repo-rate-unchanged-at-6-123060800285_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-fina

Error accessing URL: https://www.business-standard.com/economy/news/aadhaar-to-identity-proof-all-questions-answered-about-rs-2-000-notes-123060100569_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/economy/news/aadhaar-to-identity-proof-all-questions-answered-about-rs-2-000-notes-123060100569_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/switched-jobs-consolidate-multiple-form-16s-accurately-123053000865_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/switched-jobs-consolidate-multiple-form-16s-accurately-123053000865_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/utilise-enhanced-scss-limit-but-diversify-your-retirement-corpus-123052900749_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/utilise-enhanced-scss-limit-but-diversify-your-retirement

Error accessing URL: https://www.business-standard.com/finance/personal-finance/fund-pick-hdfc-taxsaver-fund-123051900415_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/fund-pick-hdfc-taxsaver-fund-123051900415_1.htmlError accessing URL: https://www.business-standard.com/markets/mutual-fund/select-small-cap-fund-manager-with-proven-expertise-in-this-segment-123051700959_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/markets/mutual-fund/select-small-cap-fund-manager-with-proven-expertise-in-this-segment-123051700959_1.html

Error accessing URL: https://www.business-standard.com/finance/personal-finance/avoid-concentration-risk-by-buying-annuities-from-multiple-insurers-123051600931_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/avoid-concentration-risk-by-buying-annuities-from-multiple-insurers-123051600931_1.htmlError acc

Error accessing URL: https://www.business-standard.com/india-news/metal-allocation-growth-in-green-technologies-to-drive-demand-for-silver-123050100742_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/india-news/metal-allocation-growth-in-green-technologies-to-drive-demand-for-silver-123050100742_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/need-quick-credit-for-short-tenure-consider-loan-against-fixed-deposit-123042800896_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/need-quick-credit-for-short-tenure-consider-loan-against-fixed-deposit-123042800896_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/i-t-dept-releases-offline-forms-for-itr-1-4-for-fiscal-year-2022-2023-123042601020_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/i-t-dept-releases-

Error accessing URL: https://www.business-standard.com/finance/personal-finance/small-savings-scheme-pay-heed-to-return-taxation-and-lock-in-123040700632_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/small-savings-scheme-pay-heed-to-return-taxation-and-lock-in-123040700632_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/fund-pick-dsp-midcap-fund-123040700554_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/fund-pick-dsp-midcap-fund-123040700554_1.html
Error accessing URL: https://www.business-standard.com/finance/personal-finance/skilling-the-smart-way-selecting-the-right-online-course-for-your-career-123040700291_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/skilling-the-smart-way-selecting-the-right-online-course-for-your-career-123040700291_1.htmlError acc

Error accessing URL: https://www.business-standard.com/finance/personal-finance/invest-in-debt-mutual-funds-bank-fds-and-bonds-based-on-merit-123032600318_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/finance/personal-finance/invest-in-debt-mutual-funds-bank-fds-and-bonds-based-on-merit-123032600318_1.html
Error accessing URL: https://www.business-standard.com/article/pf/avoid-excess-exposure-to-risky-assets-amid-high-volatility-in-markets-123032301007_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/pf/avoid-excess-exposure-to-risky-assets-amid-high-volatility-in-markets-123032301007_1.html
Error accessing URL: https://www.business-standard.com/article/free-newsletter/a-snapshot-of-car-loan-interest-rates-offered-by-various-firms-with-details-123032400658_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/free-newsletter/a-snapshot-of-car-loan-interest-rates-

Error accessing URL: https://www.business-standard.com/article/pf/optimise-80c-tax-saving-benefit-by-investing-in-scss-in-tranches-123031000846_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/pf/optimise-80c-tax-saving-benefit-by-investing-in-scss-in-tranches-123031000846_1.htmlError accessing URL: https://www.business-standard.com/article/pf/climate-change-insurance-poor-reach-reliance-on-fossil-fuel-key-issues-123031200648_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/pf/climate-change-insurance-poor-reach-reliance-on-fossil-fuel-key-issues-123031200648_1.html

Error accessing URL: https://www.business-standard.com/article/free-newsletter/a-snapshot-of-car-loan-interest-rates-offered-by-various-firms-with-details-123031000672_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/free-newsletter/a-snapshot-of-car-loan-interest-rates-offered-by-various-f

Error accessing URL: https://www.business-standard.com/article/free-newsletter/a-snapshot-of-car-loan-interest-rates-offered-by-various-firms-with-details-123022401081_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/free-newsletter/a-snapshot-of-car-loan-interest-rates-offered-by-various-firms-with-details-123022401081_1.html
Error accessing URL: https://www.business-standard.com/article/companies/allianz-dunzo-tie-up-to-provide-insurance-cover-to-delivery-partners-123022200946_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/companies/allianz-dunzo-tie-up-to-provide-insurance-cover-to-delivery-partners-123022200946_1.htmlError accessing URL: https://www.business-standard.com/article/free-newsletter/fund-pick-axis-bluechip-fund-123022401129_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/free-newsletter/fund-pick-axis-bluechip-fund-123022401129_1.html

Error accessing URL: https://www.business-standard.com/article/pf/realty-check-for-luxury-housing-after-crucial-change-in-tax-structure-123021001644_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/pf/realty-check-for-luxury-housing-after-crucial-change-in-tax-structure-123021001644_1.html
Error accessing URL: https://www.business-standard.com/article/pf/opt-for-presumptive-tax-scheme-only-if-you-can-adhere-to-its-rules-123021301240_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/pf/opt-for-presumptive-tax-scheme-only-if-you-can-adhere-to-its-rules-123021301240_1.htmlError accessing URL: https://www.business-standard.com/article/pf/kotak-edelweiss-mfs-stop-fresh-investments-in-international-schemes-123021001471_1.html. Error: 403 Client Error: Forbidden for url: https://www.business-standard.com/article/pf/kotak-edelweiss-mfs-stop-fresh-investments-in-international-schemes-123021001471_1.htmlErro

KeyboardInterrupt: 

#### Note: We were not able to scrape content from Business-Standards articles.

# Concatenating all scraped data

In [41]:
df14 = pd.read_csv('d14.csv')
df14.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [42]:
df15 = pd.read_csv('d15.csv')
df15.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [43]:
df16 = pd.read_csv('d16.csv')
df16.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [44]:
df17 = pd.read_csv('d17.csv')
df17.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [45]:
df18 = pd.read_csv('d18.csv')
df18.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [46]:
df19 = pd.read_csv('d19.csv')
df19.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [47]:
df20 = pd.read_csv('d20.csv')
df20.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [48]:
df21 = pd.read_csv('d21.csv')
df21.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [49]:
df22 = pd.read_csv('d22.csv')
df22.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [50]:
df23 = pd.read_csv('d23.csv')
df23.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [51]:
df24 = pd.read_csv('d24.csv')
df24.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [52]:
df25 = pd.read_csv('d25.csv')
df25.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [53]:
df26 = pd.read_csv('d26.csv')
df26.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [54]:
df27 = pd.read_csv('d27.csv')
df27.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [55]:
df28 = pd.read_csv('d28.csv')
df28.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [56]:
df29 = pd.read_csv('d29.csv')
df29.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [57]:
df30 = pd.read_csv('d30.csv')
df30.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [85]:
df31 = pd.read_csv('d31.csv')
df31.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [96]:
df32 = pd.read_csv('d32.csv')
df32.rename(columns={'category': 'target', 'categories':'target', 'Category':'target','article':'text', 'Text':'text'}, inplace=True)

In [97]:
df_scraped = pd.concat([df14, df15, df16, df17, df18, df19, df20, df21, df22, df23, df24, df25, df26, df27, df28, df29, df30, df31, df32], axis = 0, ignore_index = True)

In [98]:
df_scraped

Unnamed: 0,target,text
0,pets,"Ludhiana: The pet registration programme, laun..."
1,pets,"LUDHIANA: The pet registration programme, laun..."
2,pets,NEW DELHI: Uttar Pradesh Subordinate Services ...
3,pets,NEW DELHI: Andhra Pradesh State Level Police R...
4,pets,PUNE: Veterinarians are noticing a rise in tic...
...,...,...
373026,hobbies and interest,Pollywood diva Monica Gill is currently baskin...
373027,hobbies and interest,Father of two<br /><br />Family car: Alto <br ...
373028,hobbies and interest,Sad part is that you suffered for that defeat....
373029,hobbies and interest,"Today, Sharma has a very serious assignment - ..."


In [99]:
df_scraped.iloc[295912].text

' NEW DELHI: Scholars pursuing PhD at Delhi University can now appear for their viva through Skype or other modes of video conferencing.  The varsity has also made it mandatory that the thesis submitted by the PhD scholars pass a "plagiarism check" and procured specialised softwares for it.  "Earlier the students had to appear in person for their viva for PhD programmes. So, the ones who had completed their thesis and were offered any opportunities abroad, they had to travel back for the process. Now this need has been done away with," a senior varsity official said.  "The students willing to appear for viva through skype or other modes of videoconferencing need to inform their respective departments in advance. Similarly, if certain expert on interview panel is unable to come in person, then the viva will be arranged through similar modes," he added.  Following directives from the University Grants Commission (UGC), to discourage plagiarism in PhD thesis, DU has made it mandatory that

In [100]:
df_scraped.dropna(subset=['text', 'target'], inplace=True)

In [112]:
df_scraped.drop_duplicates(subset='text', keep='first', inplace=True)

In [102]:
df_scraped.reset_index(drop=True, inplace=True)

In [110]:
df_scraped.target.value_counts()

academic interests       66200
pets                     25900
real estate              25146
arts and culture         24736
business and finance     23259
video gaming             21441
sports                   20458
personal finance         19384
television               18468
shopping                 18243
travel                   17535
food and drink           16460
hobbies and interests    15121
style and fashion        14008
home and garden          11184
music and audio           6753
Name: target, dtype: int64

In [109]:
df_scraped.target = df_scraped.target.replace({'hobbies and interest':'hobbies and interests', 'style and faishion':'style and fashion'})

In [113]:
df_scraped.shape

(344296, 2)

In [114]:
df_scraped

Unnamed: 0,target,text
0,pets,"Ludhiana: The pet registration programme, laun..."
1,pets,"LUDHIANA: The pet registration programme, laun..."
2,pets,NEW DELHI: Uttar Pradesh Subordinate Services ...
3,pets,NEW DELHI: Andhra Pradesh State Level Police R...
4,pets,PUNE: Veterinarians are noticing a rise in tic...
...,...,...
344291,hobbies and interests,Pollywood diva Monica Gill is currently baskin...
344292,hobbies and interests,Father of two<br /><br />Family car: Alto <br ...
344293,hobbies and interests,Sad part is that you suffered for that defeat....
344294,hobbies and interests,"Today, Sharma has a very serious assignment - ..."


In [115]:
df_scraped.to_csv('scraped_dataset.csv', index = False)

In [1]:
import pandas as pd
data  = pd.read_csv('scraped_dataset.csv')

In [2]:
data

Unnamed: 0,target,text
0,pets,"Ludhiana: The pet registration programme, laun..."
1,pets,"LUDHIANA: The pet registration programme, laun..."
2,pets,NEW DELHI: Uttar Pradesh Subordinate Services ...
3,pets,NEW DELHI: Andhra Pradesh State Level Police R...
4,pets,PUNE: Veterinarians are noticing a rise in tic...
...,...,...
344291,hobbies and interests,Pollywood diva Monica Gill is currently baskin...
344292,hobbies and interests,Father of two<br /><br />Family car: Alto <br ...
344293,hobbies and interests,Sad part is that you suffered for that defeat....
344294,hobbies and interests,"Today, Sharma has a very serious assignment - ..."


In [13]:
dt = data[data.target == 'music and audio']
dt

Unnamed: 0,target,text
126083,music and audio,"Lights out and sound on, a hall packed with mu..."
126084,music and audio,It was in 2021 when Apple first introduced sup...
126085,music and audio,Alphabet's Google has been asked to pay patent...
126086,music and audio,The upcoming Tamil action film 'Jailer' is the...
126087,music and audio,"Ram Charan's next venture, titled 'Game Change..."
...,...,...
132831,music and audio,"<div class=""section1""><div class=""Normal""><scr..."
132832,music and audio,"It’s the summer of tech, we believe! With the ..."
132833,music and audio,"ajay devgan hasn't given up on action, though ..."
132834,music and audio,1. Acceptance of Terms of Use Welcome to “TOI ...


In [21]:
temp = dt.iloc[0].text.split()
len(temp)
# temp

518

In [8]:
data1 = data[data.text.str.len() > 1000]
data1

Unnamed: 0,target,text
0,pets,"Ludhiana: The pet registration programme, laun..."
1,pets,"LUDHIANA: The pet registration programme, laun..."
2,pets,NEW DELHI: Uttar Pradesh Subordinate Services ...
3,pets,NEW DELHI: Andhra Pradesh State Level Police R...
4,pets,PUNE: Veterinarians are noticing a rise in tic...
...,...,...
344290,hobbies and interests,"Besides interacting with students and farmers,..."
344292,hobbies and interests,Father of two<br /><br />Family car: Alto <br ...
344293,hobbies and interests,Sad part is that you suffered for that defeat....
344294,hobbies and interests,"Today, Sharma has a very serious assignment - ..."


In [10]:
data1.target.value_counts()

academic interests       59927
real estate              23560
pets                     23213
arts and culture         23162
business and finance     22622
video gaming             20795
personal finance         17920
sports                   17525
food and drink           15604
travel                   15238
television               14757
shopping                 14454
hobbies and interests    14193
style and fashion        12092
home and garden          10718
music and audio           4269
Name: target, dtype: int64

In [11]:
data.target.value_counts()

academic interests       66200
pets                     25900
real estate              25146
arts and culture         24736
business and finance     23259
video gaming             21441
sports                   20458
personal finance         19384
television               18468
shopping                 18243
travel                   17535
food and drink           16460
hobbies and interests    15121
style and fashion        14008
home and garden          11184
music and audio           6753
Name: target, dtype: int64

In [22]:
import pandas as pd

def split_data(input_data, chunk_size=500):
    input_data = input_data.split()
    data_length = len(input_data)
    chunks = [input_data[i:i + chunk_size] for i in range(0, data_length, chunk_size)]
    return chunks

original_df = data
long_text_df = original_df[original_df['Word Count'] > 500]
long_text_df['text_chunks'] = long_text_df['text'].apply(split_data)

df_rows = []
for _, row in long_text_df.iterrows():
    target = row['target']
    text = row['text']
    for chunk in row['text_chunks']:
        df_rows.append({'target': target, 'text': ' '.join(chunk)})

new_df = pd.DataFrame(df_rows)

new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  long_text_df['text_chunks'] = long_text_df['text'].apply(split_data)


Unnamed: 0,target,text
0,A,one two three four five one two three four fiv...
1,A,one two three four five one two three four fiv...
2,A,one two three four five one two three four fiv...
3,C,eleven twelve thirteen fourteen fifteen sixtee...
4,C,thirteen fourteen fifteen sixteen eleven twelv...
5,C,fifteen sixteen eleven twelve thirteen fourtee...
6,C,eleven twelve thirteen fourteen fifteen sixtee...
7,C,thirteen fourteen fifteen sixteen eleven twelv...


In [23]:
def count(text):
    text = str(text)
    return len(text.split())

new_df['Word Count'] = new_df['text'].apply(count)
new_df

Unnamed: 0,target,text,Word Count
0,A,one two three four five one two three four fiv...,500
1,A,one two three four five one two three four fiv...,500
2,A,one two three four five one two three four fiv...,500
3,C,eleven twelve thirteen fourteen fifteen sixtee...,500
4,C,thirteen fourteen fifteen sixteen eleven twelv...,500
5,C,fifteen sixteen eleven twelve thirteen fourtee...,500
6,C,eleven twelve thirteen fourteen fifteen sixtee...,500
7,C,thirteen fourteen fifteen sixteen eleven twelv...,76
