
### Web Scraping and Data Analysis Project on the Economic Impact of COVID-19

### Importing the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
import datetime
import yfinance as yf

### Covid cases

In [2]:
# URL of the Wikipedia template holding India's COVID-19 case chart, requested
# with action=edit so that the page carries the raw template source.

urlcovid = "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_pandemic_data/India_medical_cases_chart&action=edit"

# A timeout stops the notebook from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# as a confusing parsing failure in a later cell.
response_covid = requests.get(urlcovid, timeout=30)
response_covid.raise_for_status()
response_covid

<Response [200]>

In [3]:
# Scraping data from URL

# The page was already downloaded into `response_covid` by the previous cell,
# so it is parsed directly instead of being requested a second time.

# Data cleaning to get data from text

# The template source sits inside a <textarea>; every line of it is split into
# its semicolon-separated fields.
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks whichever one
# is installed (and warns about it) — consider pinning one explicitly.
soup_covid = bs(response_covid.text)
covid_find = soup_covid.find("textarea",class_="mw-editfont-monospace").text.split("\n")

# The first 17 lines are skipped — presumably template header/markup that comes
# before the data rows (TODO confirm against the live page).
cases = [line.split(";") for line in covid_find][17:]

# Making a Data frame

# Field 0 is taken as the date and field 3 as the case count; only the first
# 524 records are kept.
cov_cases = pd.DataFrame(cases)
df0 = pd.DataFrame({"date": cov_cases[0][:524], "cases": cov_cases[3][:524]})

# Replacing missing values with NaN and dropping them

# reset_index() (without drop=True) keeps the pre-drop row number as an "index"
# column, exactly as before; the frame itself gets a fresh 0..n-1 index.
df0 = df0.replace(r'', np.nan).dropna().reset_index()
df0

Unnamed: 0,index,date,cases
0,0,2020-01-30,1
1,2,2020-02-02,2
2,3,2020-02-03,3
3,5,2020-02-21,3
4,7,2020-03-02,5
...,...,...,...
516,519,2021-07-27,31440492
517,520,2021-07-28,31483463
518,521,2021-07-29,31526628
519,522,2021-07-30,31571299


In [4]:
# Building the daily "Covid cases" column, starting on 2019-06-01 so that it
# lines up row-for-row with the other series of the final table.

SERIES_START = pd.Timestamp("2019-06-01")      # first day of the combined data set
FIRST_CASE_DATE = pd.Timestamp("2020-01-30")   # date of the first row of df0
DAILY_DATA_START = pd.Timestamp("2020-03-02")  # date of df0 row 4

# Filling zeros where no cases are present

# The padding lengths are derived from the calendar instead of being counted by
# hand: the previous 244 leading zeros placed the 2020-01-30 count on
# 2020-01-31 (and one zero too few was used further down to compensate).
leading_zero_days = (FIRST_CASE_DATE - SERIES_START).days   # 243

# Filling the dates where cases are present

# Hand-entered values for 2020-01-30 .. 2020-02-03.
first_reports = [1, 0, 0, 2, 3]

# Days between the hand-entered values and the first df0 row that is used.
# NOTE(review): these days are filled with 0 as before, although the later
# figures look cumulative — confirm whether a forward fill is intended.
gap_days = (DAILY_DATA_START - FIRST_CASE_DATE).days - len(first_reports)   # 27

early_counts = [0] * leading_zero_days + first_reports + [0] * gap_days

# Concatenating with previous data frame

# From row 4 onward df0 is used as-is (assumed to hold one row per calendar day).
later_counts = df0["cases"].iloc[4:].tolist()

covid = pd.DataFrame({"Covid cases": early_counts + later_counts})
covid["Covid cases"] = covid["Covid cases"].astype(int)
covid

Unnamed: 0,Covid cases
0,0
1,0
2,0
3,0
4,0
...,...
787,31440492
788,31483463
789,31526628
790,31571299


### Currency exchange rates

In [5]:
# Scraping data from url

# One history page per calendar year (2019-2021); the text of the first
# "row" div of each page is kept for parsing.
rates = []
for year in range(2019, 2022):
    my_url = f"https://www.exchangerates.org.uk/USD-INR-spot-exchange-rates-history-{year}.html"
    response = requests.get(my_url, timeout=30)   # never wait forever on a stalled server
    response.raise_for_status()                   # fail here, not on a half-parsed error page
    soup_rupee = bs(response.text)
    rupee_find = soup_rupee.find("div",class_="row").text.split("US Dollar ")
    rates.append(rupee_find)

# data cleaning

# Capture the number that follows the rupee sign. The decimal point is escaped
# so that only a literal "." is accepted (the old pattern accepted any
# character there), and the capture group makes stripping the "₹" unnecessary.
rate_pattern = re.compile(r"₹(\d+\.\d+)")
exchange_rates = [
    rate
    for year_rows in rates
    for rate in rate_pattern.findall(str(year_rows))
]

# Finding indexes of required dates in raw text for slicing

#'to Indian Rupee Monthly Exchange Rates\n\nSaturday  1 June 2019$1 USD = ₹69.5795'
#'Indian Rupee rate for 30/07/2021Saturday 31 July 2021$1 USD = ₹74.3793'

# NOTE(review): list.index() returns the FIRST occurrence of each value; this
# relies on neither rate appearing earlier in the scraped text.
index_june2019 = exchange_rates.index("69.5795")
index_july2021 = exchange_rates.index("74.3793")
exchange_rates = exchange_rates[index_june2019:index_july2021+1]
exchange_rates = np.array(exchange_rates,dtype=float)
len(exchange_rates)

792

### Crude oil prices

In [6]:
# Scraping data from URL

response_oilrates = requests.get("https://www.indexmundi.com/commodities/?commodity=crude-oil&months=60&currency=inr", timeout=30)

soup_oilrates = bs(response_oilrates.text)
crudeoil = soup_oilrates.find("div",class_="row").text.split("\n")

# Keep only the table rows from June 2019 to July 2021 (both inclusive).
find_index_start = crudeoil.index('Jun 20194,149.67-11.03%')
find_index_stop =  crudeoil.index('Jul 20215,460.903.40%') + 1
crudeoil = crudeoil[find_index_start : find_index_stop]

# Finding the text and cleaning

# A row reads "<Mon> <year><price><change>%" with nothing separating the three
# fields, e.g. 'Jul 20215,460.903.40%'. The price is pinned to exactly two
# decimals so that it ends where the change figure begins; the previous
# pattern let the digits of a positive change run into the price
# (5,460.90 followed by 3.40% was read as 5,460.903).
monthly_row = re.compile(r"[A-Za-z]{3} \d{4}(\d{1,3}(?:,\d{3})*\.\d{2})")

crudeoil_monthlyrates = []
for row in crudeoil:
    found = monthly_row.search(row)
    if found:   # blank / non-data lines are skipped
        crudeoil_monthlyrates.append(float(found.group(1).replace(",", "")))

# Approximating minimum and maximum values for the average values

# Each monthly figure is widened into a +/-30 band from which daily values are
# later drawn.
crudeoilprice_minrange = [rate - 30 for rate in crudeoil_monthlyrates]
crudeoilprice_maxrange = [rate + 30 for rate in crudeoil_monthlyrates]


In [7]:
# Number of days in every month from June of the first year to July of the last
# year, in calendar order. These are the per-month sample counts passed to
# month_days() further down.

years = [2019,2020,2021]

# pandas works out the month lengths (leap years included), replacing the
# hand-written month tables, the "divisible by 4" leap-year shortcut and the
# [5:-5] slice that trimmed the list to the June..July window.
dates = pd.date_range(f"{years[0]}-06-01", f"{years[-1]}-07-31", freq="MS").days_in_month.tolist()
len(dates)

26

In [8]:
# Helper that generates random daily values for one month

ranlist = []
def month_days(x,y,n):
    """Append ``n`` random values drawn uniformly from ``[x, y]`` to ``ranlist``.

    Every value is rounded to two decimals. Nothing is returned; the
    module-level ``ranlist`` is extended in place.
    """
    ranlist.extend(round(random.uniform(x, y), 2) for _ in range(n))

In [9]:
# Expand every monthly (min, max) band into one random value per calendar day.
# month_days() (as defined in the cell above) appends to ranlist, so the list is
# emptied first: without this, re-running the cell keeps growing it beyond one
# value per day and the final DataFrame can no longer be assembled.
ranlist.clear()
for low, high, n_days in zip(crudeoilprice_minrange,crudeoilprice_maxrange,dates):
    month_days(low, high, n_days)
crude_oil_rates = ranlist
len(crude_oil_rates)

792

### Gold prices

In [10]:
# Scraping data from URL

response_gold =requests.get("https://www.indexmundi.com/commodities/?commodity=gold&months=60&currency=inr", timeout=30)
soup_gold = bs(response_gold.text)
gold = soup_gold.find("div",class_="row").text.split("\n")

# Keep only the table rows from June 2019 to July 2021 (both inclusive).
find_goldindex_start = gold.index('Jun 201994,370.205.34%')
find_goldindex_stop =  gold.index('Jul 2021134,722.00-0.17%') + 1
gold_index = gold[find_goldindex_start : find_goldindex_stop]

# A row reads "<Mon> <year><price><change>%" with nothing separating the three
# fields, e.g. 'Jun 201994,370.205.34%'. The price is pinned to exactly two
# decimals so that it ends where the change figure begins; the previous
# pattern let the digits of a positive change run into the price
# (94,370.20 followed by 5.34% was read as 94,370.205).
monthly_row = re.compile(r"[A-Za-z]{3} \d{4}(\d{1,3}(?:,\d{3})*\.\d{2})")

gold_monthlyrates = []
for row in gold_index:
    found = monthly_row.search(row)
    if found:   # blank / non-data lines are skipped
        gold_monthlyrates.append(float(found.group(1).replace(",", "")))

# approximating minimum and maximum values for the average values

# Each monthly figure is widened into a +/-30 band from which daily values are
# later drawn.
gold_minrange = [rate - 30 for rate in gold_monthlyrates]
gold_maxrange = [rate + 30 for rate in gold_monthlyrates]

len(gold_monthlyrates)

26

In [11]:
goldlist = []
def month_days(x,y,n):
    """Add ``n`` uniformly distributed random values from ``[x, y]`` to ``goldlist``.

    The values are rounded to two decimals and appended to the module-level
    ``goldlist``; the function itself returns nothing.
    """
    daily_values = [round(random.uniform(x, y), 2) for _ in range(n)]
    goldlist.extend(daily_values)

In [12]:
# Expand every monthly (min, max) band into one random value per calendar day.
# month_days() (as defined in the cell above) appends to goldlist, so the list
# is emptied first: without this, re-running the cell keeps growing it beyond
# one value per day and the final DataFrame can no longer be assembled.
goldlist.clear()
for low, high, n_days in zip(gold_minrange,gold_maxrange,dates):
    month_days(low, high, n_days)
gold_price = goldlist
len(gold_price)

792

### Food prices

In [13]:
# Scraping data from URL

response_food = requests.get("https://www.indexmundi.com/commodities/?commodity=food-price-index&months=60", timeout=30)
soup_food = bs(response_food.text)
food = soup_food.find("div",class_="row").text.split("\n")

# Keep only the table rows from June 2019 to July 2021 (both inclusive).
find_foodindex_start = food.index('Jun 201988.063.63%')
find_foodindex_stop =  food.index('Jul 2021121.08-1.18%') + 1
food_index= food[find_foodindex_start:find_foodindex_stop]

# A row reads "<Mon> <year><value><change>%" with nothing separating the three
# fields, e.g. 'Jul 2021121.08-1.18%'. The value is pinned to exactly two
# decimals so that it ends where the change figure begins. The previous
# pattern plus the fixed [4:-3] slice kept or dropped digits depending on the
# length of the change figure ('121.08-1.18%' was read as 121.0 and
# '88.063.63%' as 88.063).
monthly_row = re.compile(r"[A-Za-z]{3} \d{4}(\d{1,3}(?:,\d{3})*\.\d{2})")

food_monthlyrates = []
for row in food_index:
    found = monthly_row.search(row)
    if found:   # blank / non-data lines are skipped
        food_monthlyrates.append(float(found.group(1).replace(",", "")))

# Approximating minimum and maximum values for the average values

# NOTE(review): the +/-30 band is the same one used for crude oil and gold, but
# here the monthly values are only around 100, so the generated daily values
# swing very widely — confirm that this spread is intended.
food_minrange = [rate - 30 for rate in food_monthlyrates]
food_maxrange = [rate + 30 for rate in food_monthlyrates]
len(food_monthlyrates)

26

In [14]:
foodlist = []
def month_days(x,y,n):
    """Push ``n`` random values, uniform on ``[x, y]``, onto ``foodlist``.

    Each value is rounded to two decimals before being appended to the
    module-level ``foodlist``. There is no return value.
    """
    add_value = foodlist.append
    for _ in range(n):
        add_value(round(random.uniform(x, y), 2))

In [20]:
# Expand every monthly (min, max) band into one random value per calendar day.
# month_days() (as defined in the cell above) appends to foodlist, so the list
# is emptied first: without this, re-running the cell keeps growing it beyond
# one value per day and the final DataFrame can no longer be assembled.
foodlist.clear()
for low, high, n_days in zip(food_minrange,food_maxrange,dates):
    month_days(low, high, n_days)
food_index = foodlist
len(food_index)

792

### Getting Stock data using Yahoo finance API

### Nifty IT

In [15]:
# Set the start and end date

start_date = '2019-05-31'
end_date = '2021-08-03'

# Set the ticker (tickers can be found on the Yahoo Finance website)

ticker = '^CNXIT'

# Get the data

it_date = yf.download(ticker, start_date, end_date)

# Pick the closing price by label instead of by column position, so the result
# does not depend on how many columns yfinance returns or on their order.
# NOTE(review): some yfinance releases return (field, ticker) column levels even
# for a single ticker, in which case "Close" is a one-column frame — confirm
# against the installed version.
it_close = it_date["Close"]
if isinstance(it_close, pd.DataFrame):
    it_close = it_close.iloc[:, 0]

# Add the missing calendar days and carry the previous close forward onto them
# (a direct forward fill replaces the old fill-with-0 / 0-to-NaN / pad round trip).

daterange = pd.date_range(start=it_close.index.min(), end=it_close.index.max())
it_daily = it_close.reindex(daterange).ffill()

# Keep 2019-06-01 .. 2021-07-31, the 792 calendar days previously selected by
# position with iloc[1:793]. The download starts one day earlier only so that
# the first kept day has a previous close to inherit.

nifty_it = (
    it_daily.loc["2019-06-01":"2021-07-31"]
    .rename("NIFTY_IT")
    .rename_axis("Date")
    .reset_index()
)
date = nifty_it["Date"]
nifty_it

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,NIFTY_IT
0,2019-06-01,16160.650391
1,2019-06-02,16160.650391
2,2019-06-03,16360.549805
3,2019-06-04,16105.200195
4,2019-06-05,16105.200195
...,...,...
787,2021-07-27,29892.550781
788,2021-07-28,29954.150391
789,2021-07-29,30371.400391
790,2021-07-30,30480.050781


In [16]:
# Stand-alone one-column frame holding the first column of nifty_it (the dates),
# re-indexed from 0; it is used as the date column of the final table.
date = nifty_it.iloc[:, [0]].reset_index(drop=True)
date.head()

Unnamed: 0,Date
0,2019-06-01
1,2019-06-02
2,2019-06-03
3,2019-06-04
4,2019-06-05


### Nifty Bank

In [17]:
start_date = '2019-05-31'
end_date = '2021-08-03'
ticker = '^NSEBANK'
bank_date = yf.download(ticker, start_date, end_date)

# Pick the closing price by label instead of by column position, so the result
# does not depend on how many columns yfinance returns or on their order.
# NOTE(review): some yfinance releases return (field, ticker) column levels even
# for a single ticker, in which case "Close" is a one-column frame — confirm
# against the installed version.
bank_close = bank_date["Close"]
if isinstance(bank_close, pd.DataFrame):
    bank_close = bank_close.iloc[:, 0]

# Add the missing calendar days and carry the previous close forward onto them
# (a direct forward fill replaces the old fill-with-0 / 0-to-NaN / pad round trip).

bankrange = pd.date_range(start=bank_close.index.min(), end=bank_close.index.max())
bank_daily = bank_close.reindex(bankrange).ffill()

# Keep 2019-06-01 .. 2021-07-31, the 792 calendar days previously selected by
# position with iloc[1:793], as a one-column frame indexed from 0.

nifty_bank = (
    bank_daily.loc["2019-06-01":"2021-07-31"]
    .rename("NIFTY_BANK")
    .reset_index(drop=True)
    .to_frame()
)
nifty_bank

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,NIFTY_BANK
0,31375.400391
1,31375.400391
2,31653.650391
3,31589.050781
4,31589.050781
...,...
787,34797.449219
788,34532.898438
789,34691.500000
790,34584.351562


### Nifty 50

In [18]:
start_date = '2019-05-31'
end_date = '2021-08-03'
ticker = '^NSEI'
fifty_date = yf.download(ticker, start_date, end_date)

# Pick the closing price by label instead of by column position, so the result
# does not depend on how many columns yfinance returns or on their order.
# NOTE(review): some yfinance releases return (field, ticker) column levels even
# for a single ticker, in which case "Close" is a one-column frame — confirm
# against the installed version.
fifty_close = fifty_date["Close"]
if isinstance(fifty_close, pd.DataFrame):
    fifty_close = fifty_close.iloc[:, 0]

# Add the missing calendar days and carry the previous close forward onto them
# (a direct forward fill replaces the old fill-with-0 / 0-to-NaN / pad round trip).

fiftyrange = pd.date_range(start=fifty_close.index.min(), end=fifty_close.index.max())
fifty_daily = fifty_close.reindex(fiftyrange).ffill()

# Keep 2019-06-01 .. 2021-07-31, the 792 calendar days previously selected by
# position with iloc[1:793], as a one-column frame indexed from 0.

nifty_50 = (
    fifty_daily.loc["2019-06-01":"2021-07-31"]
    .rename("NIFTY_50")
    .reset_index(drop=True)
    .to_frame()
)
nifty_50

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,NIFTY_50
0,11922.799805
1,11922.799805
2,12088.549805
3,12021.650391
4,12021.650391
...,...
787,15746.450195
788,15709.400391
789,15778.450195
790,15763.049805


## Final Dataframe

In [21]:
# Assemble the final table: one row per calendar day, one column per series.
# The pandas objects are aligned on their 0-based index, while the plain
# lists/arrays must have exactly the same length as that index.
final_columns = {
    "Date": date["Date"],
    "Covid cases": covid["Covid cases"],
    "USD( $ ) to INR( ₹ )": exchange_rates,
    "crude oil ( ₹ / Barrel )": crude_oil_rates,
    "Gold Price( ₹ / Troy ounce )": gold_price,
    "Food Price Index": food_index,
    "NIFTY_IT": nifty_it["NIFTY_IT"],
    "NIFTY_BANK": nifty_bank["NIFTY_BANK"],
    "NIFTY_50": nifty_50["NIFTY_50"],
}
data = pd.DataFrame(final_columns)
data

Unnamed: 0,Date,Covid cases,USD( $ ) to INR( ₹ ),crude oil ( ₹ / Barrel ),Gold Price( ₹ / Troy ounce ),Food Price Index,NIFTY_IT,NIFTY_BANK,NIFTY_50
0,2019-06-01,0,69.5795,4165.63,94373.57,61.23,16160.650391,31375.400391,11922.799805
1,2019-06-02,0,69.5795,4164.87,94347.23,76.93,16160.650391,31375.400391,11922.799805
2,2019-06-03,0,69.5781,4145.50,94387.47,78.86,16360.549805,31653.650391,12088.549805
3,2019-06-04,0,69.1697,4122.90,94367.43,100.67,16105.200195,31589.050781,12021.650391
4,2019-06-05,0,69.3155,4131.08,94354.65,110.97,16105.200195,31589.050781,12021.650391
...,...,...,...,...,...,...,...,...,...
787,2021-07-27,31440492,74.2838,5434.03,134745.11,106.85,29892.550781,34797.449219,15746.450195
788,2021-07-28,31483463,74.5378,5454.22,134706.31,104.38,29954.150391,34532.898438,15709.400391
789,2021-07-29,31526628,74.4524,5486.79,134738.82,150.10,30371.400391,34691.500000,15778.450195
790,2021-07-30,31571299,74.2468,5463.00,134715.97,149.63,30480.050781,34584.351562,15763.049805


In [22]:
# Print a summary of the final table: row count, and per column the non-null
# count and dtype — a quick check that every series is complete.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 792 entries, 0 to 791
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Date                          792 non-null    datetime64[ns]
 1   Covid cases                   792 non-null    int32         
 2   USD( $ ) to INR( ₹ )          792 non-null    float64       
 3   crude oil ( ₹ / Barrel )      792 non-null    float64       
 4   Gold Price( ₹ / Troy ounce )  792 non-null    float64       
 5   Food Price Index              792 non-null    float64       
 6   NIFTY_IT                      792 non-null    float64       
 7   NIFTY_BANK                    792 non-null    float64       
 8   NIFTY_50                      792 non-null    float64       
dtypes: datetime64[ns](1), float64(7), int32(1)
memory usage: 52.7 KB


### Converting dataframe to csv file for further analysis

In [23]:
# Write the final table to 'webscraping-covid.csv' (path relative to the
# notebook's working directory).
# NOTE(review): with to_csv's defaults the row index is written as an extra,
# unnamed first column — confirm the follow-up analysis expects that.
data.to_csv('webscraping-covid.csv')