A moderate web-scraping attempt to collect lightning data from https://lightning.nsstc.nasa.gov. The challenge was to unify the different links and tables that the source provides for each year.

Source Link: https://lightning.nsstc.nasa.gov/nlisib/lissearch.pl?origin=ST&lat=23.5&lon=90.5&alat=7&alon=5&donob=both

In [None]:
import requests, bs4, re
from urllib.parse import urljoin
import pandas as pd
from tqdm import tnrange, tqdm_notebook
from time import sleep

In [None]:
def get_links(url):
    """Return the absolute URLs of the per-orbit pages linked from ``url``.

    Downloads ``url``, selects every ``<a>`` tag whose ``href`` starts with
    ``lis1orbit`` and resolves each href against ``url``.

    Args:
        url: Address of the page to scan for orbit links.

    Returns:
        A list of absolute URL strings, one per matching anchor.
    """
    soup = bs4.BeautifulSoup(requests.get(url).text, "lxml")
    a_tags = soup.find_all('a', href=re.compile(r"^lis1orbit"))
    # Resolve against the page that was actually fetched. The previous code
    # used the global ``start_url``, which only worked when the caller had
    # defined that global with the same value as ``url``.
    return [urljoin(url, a['href']) for a in a_tags]

In [None]:
# Scrape every per-orbit flash table for each year and merge them into the
# single DataFrame ``final``.
years = range(1998, 2016)
columns = ["Flash time (UTC)", "Date", "Latitude", "Longitude", "Radiance", "Milliseconds", "Groups", "Events"]
base_url = 'https://lightning.nsstc.nasa.gov/nlisib/lisfound.exe?origin=ST&lat=23.5&lon=90.5&alat=7&alon=5&donob=both'

# Collect one DataFrame per table and concatenate once at the end:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
frames = []
num = 0  # tables successfully scraped, summed over all years

for year in tqdm_notebook(years, desc="Total Progress"):
    # Only months 1-4 are requested for 2015; every other year uses 1-12.
    last_month = 4 if year == 2015 else 12
    month_flags = '&'.join(f"{year}.{month}=on" for month in range(1, last_month + 1))
    # Kept as a module-level name because get_links may read it as a global.
    start_url = base_url + '&' + month_flags
    links = get_links(start_url)
    for link in tqdm_notebook(links, desc="Year " + str(year)):
        try:
            response = requests.get(link, timeout=60)
        except requests.RequestException as err:
            # Skip this page instead of silently reusing the previous
            # iteration's rows, which is what the bare ``except: pass`` did.
            print("Skipping", link, "-", err)
            continue
        table = bs4.BeautifulSoup(response.text, "lxml").find("table", {"cellspacing": "0"})
        if table is None:
            print("Skipping", link, "- no data table found")
            continue
        data = []
        # The first <tr> is skipped (presumably the header row).
        for row in table.find_all('tr')[1:]:
            cells = [cell.text.strip() for cell in row.find_all('td')]
            # Drop empty cells so the remaining values line up with ``columns``.
            data.append([cell for cell in cells if cell])
        frames.append(pd.DataFrame(data, columns=columns))
        num += 1
        sleep(0.01)  # brief pause between requests

final = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
print("Finished Scrapping!! Total", num, "tables from", len(years), "years.")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, description='Total Progress', max=18.0, style=ProgressStyle(descriptio…

  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, description='Year 1998', max=257.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 1999', max=256.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2000', max=247.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2001', max=263.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2002', max=264.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2003', max=314.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2004', max=272.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2005', max=290.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2006', max=308.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2007', max=283.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2008', max=301.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2009', max=269.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2010', max=316.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2011', max=292.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2012', max=285.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2013', max=309.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2014', max=247.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Year 2015', max=32.0, style=ProgressStyle(description_wid…



Finished Scrapping!! Total 1024 tables from 18 years.


In [None]:
# Preview the first five rows of the combined table.
final.head(n=5)

Unnamed: 0,Flash time (UTC),Date,Latitude,Longitude,Radiance,Milliseconds,Groups,Events
0,1998-056T03:34:31.34Z,[Feb 25],-17.854,-66.972,470995,320,11,62
1,1998-056T03:35:15.14Z,[Feb 25],-17.813,-66.949,35766,326,3,10
2,1998-056T03:38:23.53Z,[Feb 25],-6.865,-58.965,44204,146,4,11
3,1998-056T03:39:45.22Z,[Feb 25],-4.261,-51.224,313291,316,6,29
4,1998-056T03:39:56.93Z,[Feb 25],-3.631,-50.843,102125,349,4,22


In [None]:
# Preview the last five rows of the combined table.
final.tail(n=5)

Unnamed: 0,Flash time (UTC),Date,Latitude,Longitude,Radiance,Milliseconds,Groups,Events
1459093,2015-098T11:53:31.47Z,[Apr 08],15.484,103.06,20628,0,1,5
1459094,2015-098T11:53:33.21Z,[Apr 08],15.4,103.07,248458,227,6,24
1459095,2015-098T11:53:36.04Z,[Apr 08],15.501,103.071,180579,325,6,38
1459096,2015-098T11:57:09.12Z,[Apr 08],5.334,116.03,2368203,389,28,199
1459097,2015-098T11:57:09.25Z,[Apr 08],5.42,116.019,248382,231,24,49


In [None]:
# Write the combined table to final.csv (the row index is written as well).
final.to_csv(path_or_buf="final.csv")