In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# WebScrapping News Headlines

This script is designed to scrape news headlines from the archive section of the Economic Times website. It iterates over a range of dates starting from January 1, 2022, and collects up to 300 valid headlines per day. The scraped data is stored in a Pandas DataFrame and exported as a CSV file.

In [2]:
!pip install beautifulsoup4



# Import Libraries

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import re

An empty DataFrame df_empty is created with columns news and date. start_date is set to January 1, 2022, and current_date will iterate over each day.

In [4]:
df_empty = pd.DataFrame(columns=['news', 'date'])
start_date = datetime(2022, 1, 1)

current_date = start_date

# Scraping Headlines

In [5]:
current_date = start_date
url1 = 'https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-'
no='44562'
x='.cms'
pattern = r"\s{3}"
for i in range(820):
    if i!=0:
        current_date += timedelta(days=1)
    m=str(int(no)+i)
    url=url1+m+x
    response = requests.get(url)
    soup=BeautifulSoup(response.content,'html.parser')
    print('Status code: ', response.status_code)
    print('Text: ', response.text[:50])
    print(url,current_date)
    headlines=soup.find_all('a')
    c=0
    for h in headlines:
        if c<300:
            if len(h.text)>25 and not(re.search(pattern,h.text)):
                data_to_append = {
                    'news': [h.text],
                    'date': [current_date]
                }
                df_to_append = pd.DataFrame(data_to_append)
                df_empty = pd.concat([df_empty, df_to_append], ignore_index=True)
                c+=1
        else:
            break
display(df_empty)


Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-44562.cms 2022-01-01 00:00:00


  df_empty = pd.concat([df_empty, df_to_append], ignore_index=True)


Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-44563.cms 2022-01-02 00:00:00
Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-44564.cms 2022-01-03 00:00:00
Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-44565.cms 2022-01-04 00:00:00
Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-44566.cms 2022-01-05 00:00:00
Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://economictimes.indiatimes.com/archivelist/year-2022,month-1,starttime-44567.cms 2022-01-06 00:00:00
Status code:  200
Text:  <!DOCTYPE html><!DOCTYPE HTML><html xmlns:err="htt
https://e

Unnamed: 0,news,date
0,assam targeting to become six lakh crore gdp s...,2022-01-01
1,in manipur bjp will form a government on its o...,2022-01-01
2,there will be rationalization of the armed for...,2022-01-01
3,media companies will have to create content fo...,2022-01-01
4,stadium walls will come down,2022-01-01
...,...,...
38750,HR asks candidate her marriage plans during in...,2024-03-30
38751,Maharashtra election 2024: Kuch toh gadbad hai...,2024-03-30
38752,"As threat of Trump tariffs looms, Europe's lea...",2024-03-30
38753,A lesson from Poland: Reversing populist polic...,2024-03-30


In [None]:
display(df_empty)

Unnamed: 0,news,date
0,assam targeting to become six lakh crore gdp s...,2022-01-01
1,in manipur bjp will form a government on its o...,2022-01-01
2,there will be rationalization of the armed for...,2022-01-01
3,media companies will have to create content fo...,2022-01-01
4,stadium walls will come down,2022-01-01
...,...,...
37427,Safe-haven gold on track for weekly gain,2024-03-30
37428,Waqf land issue: BJP leaders hold protest in H...,2024-03-30
37429,"Wall Street closes higher as Dow, S&P hit one-...",2024-03-30
37430,Denying tickets to sitting MLAs shows lack of ...,2024-03-30


# Exporting the Data

In [8]:
df_empty.to_csv("/content/drive/MyDrive/scrapped_data.csv", index=False)

In [9]:
x = datetime(2023, 4, 1)

In [10]:
print(x)

2023-04-01 00:00:00


# Merging News Data and Stock Data

In [12]:
import pandas as pd

df2 = pd.read_csv('/content/drive/MyDrive/scrapped_data.csv')
df1= pd.read_csv('/content/drive/MyDrive/NIFTY50.csv')

# Converting date Column to Datetime Format

In [13]:
# Attempt to convert to datetime, coercing invalid formats to NaT
df1['date'] = pd.to_datetime(df1['date'], errors='coerce')

# Find rows where the conversion failed
invalid_dates = df1[df1['date'].isna()]
print("Invalid dates in df1:")
print(invalid_dates)

Invalid dates in df1:
    date     Open      High       Low     Close   Shares Traded   \
8    NaT  18257.00  18272.25  18163.80  18257.80     303700545.0   
9    NaT  18185.00  18286.95  18119.65  18255.75     229451608.0   
10   NaT  18235.65  18321.55  18228.75  18308.10     266702919.0   
11   NaT  18337.20  18350.95  18085.90  18113.05     227507319.0   
12   NaT  18129.20  18129.20  17884.90  17938.40     276662654.0   
..   ...       ...       ...       ...       ...             ...   
551  NaT  21989.90  22080.95  21941.30  22011.95     353172434.0   
552  NaT  21932.20  22180.70  21883.30  22096.75     388656439.0   
553  NaT  21947.90  22073.20  21947.55  22004.70     328403719.0   
554  NaT  22053.95  22193.60  22052.85  22123.65     409136866.0   
555  NaT  22163.60  22516.00  22163.60  22326.90     407422815.0   

     Turnover (₹ Cr)  
8           28670.93  
9           21024.51  
10          23859.38  
11          20852.67  
12          25310.19  
..               ...  


In [14]:
df2['date'] = pd.to_datetime(df2['date'], errors='coerce')
invalid_dates = df2[df2['date'].isna()]
print("Invalid dates in df2:")
print(invalid_dates)

Invalid dates in df2:
Empty DataFrame
Columns: [news, date]
Index: []


# Handling Invalid Dates

In [15]:
df1 = df1.dropna(subset=['date'])
df2 = df2.dropna(subset=['date'])

# Standardizing date Format

In [16]:
df1['date'] = df1['date'].astype(str)
df2['date'] = df2['date'].astype(str)

# Merging Data

In [17]:
merged_df = pd.merge(df1, df2, on='date', how='left')
merged_df = merged_df.fillna(-1)
display(merged_df)

Unnamed: 0,date,Open,High,Low,Close,Shares Traded,Turnover (₹ Cr),news
0,2022-03-01,17387.15,17646.65,17383.30,17625.70,200456430.0,16181.36,Greatest bubble in human history about to burs...
1,2022-03-01,17387.15,17646.65,17383.30,17625.70,200456430.0,16181.36,Gautam Adani is facing the biggest trial of hi...
2,2022-03-01,17387.15,17646.65,17383.30,17625.70,200456430.0,16181.36,Virat Kohli's post gives fans a scare; specula...
3,2022-03-01,17387.15,17646.65,17383.30,17625.70,200456430.0,16181.36,Kalpataru Advay—Borivali gets an Illustrious r...
4,2022-03-01,17387.15,17646.65,17383.30,17625.70,200456430.0,16181.36,Education qualifications of Keerthy Suresh's h...
...,...,...,...,...,...,...,...,...
9584,2024-05-03,22371.25,22416.90,22269.15,22356.30,296153036.0,29165.37,-1
9585,2024-06-03,22327.50,22497.20,22224.35,22474.05,312262560.0,31029.01,-1
9586,2024-07-03,22505.30,22525.65,22430.00,22493.55,379865142.0,33558.46,-1
9587,2024-11-03,22517.50,22526.60,22307.25,22332.65,277897373.0,27222.37,-1


In [18]:
merged_df.to_csv('/content/drive/MyDrive/merged_data.csv', index=False)