# TEXT MINING

Web Scraping Wikipedia Table into a Dataframe

Website used: COVID-19 pandemic in Kerala

Website url: https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Kerala

In [1]:
import pandas as pd             # library for data analysis
import requests                 # library to handle requests
from bs4 import BeautifulSoup   # library to parse HTML documents

In [2]:
# Get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Kerala"
table_class="wikitable"
response=requests.get(wikiurl)
print(response.status_code)

200


In [3]:
# Parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find_all('table',{'class':"wikitable"})

In [4]:
indiatable

[<table class="wikitable plainrowheaders" style="float:right; clear:right;">
 <caption>
 COVID-19 Dashboard
 </caption>
 <tbody><tr>
 <th>Total samples tested<sup class="reference" id="cite_ref-18"><a href="#cite_note-18">[17]</a></sup>
 </th>
 <td style="text-align:right;">14,081,632
 </td></tr>
 <tr>
 <th>Total positive cases
 </th>
 <td style="text-align:right;">1,207,332
 </td></tr>
 <tr>
 <th>Total active cases
 </th>
 <td style="text-align:right;">69,869
 </td></tr>
 <tr>
 <th>Total deaths
 </th>
 <td style="text-align:right;">4,877
 </td></tr>
 <tr>
 <th>Total recovered cases
 </th>
 <td style="text-align:right;">1,132,267
 </td></tr>
 <tr>
 <th>People vaccinated 1st dose<sup class="reference" id="cite_ref-:11_19-0"><a href="#cite_note-:11-19">[18]</a></sup>
 </th>
 <td style="text-align:right;">4,897,155
 </td></tr>
 <tr>
 <th>People vaccinated 2nd dose
 </th>
 <td style="text-align:right;">593,285
 </td></tr>
 <tr>
 <td colspan="2" style="text-align:center;">As of April 15, 20

### Convert Wikipedia Table into a Python Dataframe

In [5]:
df=pd.read_html(str(indiatable))
# convert list to dataframe
#Table 1
df1=pd.DataFrame(df[0])
df1

Unnamed: 0,0,1
0,Total samples tested[17],14081632
1,Total positive cases,1207332
2,Total active cases,69869
3,Total deaths,4877
4,Total recovered cases,1132267
5,People vaccinated 1st dose[18],4897155
6,People vaccinated 2nd dose,593285
7,"As of April 15, 2021[19]","As of April 15, 2021[19]"


In [6]:
#Table 2
df2=pd.DataFrame(df[1])
df2

Unnamed: 0,Travel,Total passengers,Home quarantine,Institutional quarantine,Passengers in isolation
0,International Airport,591902,460129,35256,3396
1,Seaport,1884,514,1101,6
2,Check post,375584,367102,8329,153
3,Railway,100656,97988,1912,156
4,Total,1070026,1070026,1070026,1070026


In [7]:
#Table 3
df3=pd.DataFrame(df[2])
df3

Unnamed: 0_level_0,Date (2020),District(s),Origin,Cases,Cases,Type of transmission,Source(s)
Unnamed: 0_level_1,Date (2020),District(s),Origin,New,Total,Type of transmission,Source(s)
0,Jan-30,Thrissur,Wuhan,1,1,T,[71]
1,Feb-2,Alappuzha,Wuhan,1,2,T,
2,Feb-3,Kasargod,Wuhan,1,3,T,
3,Mar-9,Pathanamthitta,Italy,3,8,T,[72][21]
4,Mar-9,Pathanamthitta,Italy,2,8,PTP,[72][21]
...,...,...,...,...,...,...,...
905,November-16,"Alappuzha (226), Ernakulam (279), Idukki (83),...",Import,-,527708,T,
906,November-19,"Alappuzha (527), Ernakulam (423), Idukki (276)...",,4904,545641,PTP,
907,November-19,"Alappuzha (527), Ernakulam (423), Idukki (276)...",Import,-,545641,T,
908,November-20,"Alappuzha (395), Ernakulam (554), Idukki (85),...",,5213,551669,PTP,


### Covert DataFrame to csv file

In [8]:
#Covert Table1 to csv file
df1.to_csv('Table1.csv', index=False, encoding='utf-8')

In [9]:
#Covert Table2 to csv file
df2.to_csv('Table2.csv', index=False, encoding='utf-8')

In [10]:
#Covert Table3 to csv file
df3.to_csv('Table3.csv', index=False, encoding='utf-8')

### Reading CSV File

In [11]:
#Reading Table1 CSV File
dff1 = pd.read_csv("Table1.csv")
dff1

Unnamed: 0,0,1
0,Total samples tested[17],14081632
1,Total positive cases,1207332
2,Total active cases,69869
3,Total deaths,4877
4,Total recovered cases,1132267
5,People vaccinated 1st dose[18],4897155
6,People vaccinated 2nd dose,593285
7,"As of April 15, 2021[19]","As of April 15, 2021[19]"


In [12]:
#Reading Table2 CSV File
dff2 = pd.read_csv("Table2.csv")
dff2

Unnamed: 0,Travel,Total passengers,Home quarantine,Institutional quarantine,Passengers in isolation
0,International Airport,591902,460129,35256,3396
1,Seaport,1884,514,1101,6
2,Check post,375584,367102,8329,153
3,Railway,100656,97988,1912,156
4,Total,1070026,1070026,1070026,1070026


In [13]:
#Reading Table3 CSV File
dff3 = pd.read_csv("Table3.csv")
dff3

Unnamed: 0,Date (2020),District(s),Origin,Cases,Cases.1,Type of transmission,Source(s)
0,Date (2020),District(s),Origin,New,Total,Type of transmission,Source(s)
1,Jan-30,Thrissur,Wuhan,1,1,T,[71]
2,Feb-2,Alappuzha,Wuhan,1,2,T,
3,Feb-3,Kasargod,Wuhan,1,3,T,
4,Mar-9,Pathanamthitta,Italy,3,8,T,[72][21]
...,...,...,...,...,...,...,...
906,November-16,"Alappuzha (226), Ernakulam (279), Idukki (83),...",Import,-,527708,T,
907,November-19,"Alappuzha (527), Ernakulam (423), Idukki (276)...",,4904,545641,PTP,
908,November-19,"Alappuzha (527), Ernakulam (423), Idukki (276)...",Import,-,545641,T,
909,November-20,"Alappuzha (395), Ernakulam (554), Idukki (85),...",,5213,551669,PTP,
