In [7]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
% matplotlib inline
from matplotlib import pyplot as plt
import requests
from bs4 import BeautifulSoup
import re

In [8]:
# Listing page to scrape (the URL path names gyms in Andheri West, Mumbai).
url = 'https://www.justdial.com/Mumbai/Gyms-in-Andheri-West/nct-11575244'
# Browser-like User-Agent header; later cells reuse this dict for their own requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
}
# Request the listing page; the bare expression below displays the response object.
response = requests.get(url, headers=headers)
response

<Response [200]>

In [9]:
# Create a BeautifulSoup parse tree to scrape the required information
soup = BeautifulSoup(response.content, features='lxml')
# Collect the <a> tags with class 'nlogo lazy srtbyPic' (one per listed store).
stores = soup.find_all(name='a', class_='nlogo lazy srtbyPic')

In [10]:
# Create function stores_df to extract url and name of the gym
def stores_df(stores):
    """Build a DataFrame holding the link and title of every store.

    Parameters
    ----------
    stores : iterable
        Items that support ``item['href']`` and ``item['title']`` lookups
        (the notebook passes the anchor tags found on the listing page).

    Returns
    -------
    pandas.DataFrame
        One row per item, with the columns ``url`` and ``title``.
    """
    records = [{'url': tag['href'], 'title': tag['title']} for tag in stores]
    return pd.DataFrame(records, columns=['url', 'title'])

# create function locations to extract the location from the dataframe created by the stores_df function
def locations(dataframe):
    """Extract the locality from each store title.

    Titles are expected to look like ``"<name> in <locality>, <city>"``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``title`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Single ``location`` column, one row per title, in input order.
        A title without the word " in " yields its text up to the first
        comma.
    """
    locations_store = []
    for title in dataframe['title']:
        # Split on the standalone word " in " (last occurrence), not on the
        # bare substring "in": otherwise a locality such as "Marine Lines"
        # is cut apart at the "in" inside "Marine".
        after_in = title.rsplit(' in ', 1)[-1]
        # Drop the ", <city>" suffix and any surrounding whitespace.
        locations_store.append({'location': after_in.split(',')[0].strip()})
    # Explicit columns keep the `location` column present for empty input.
    return pd.DataFrame(locations_store, columns=['location'])

# create function votes_stores to scrape votes of gyms
def votes_stores(bsoup):
    """Scrape the vote counts shown on the listing page.

    Parameters
    ----------
    bsoup : BeautifulSoup
        Parsed listing page; every ``span`` with class
        ``rt_count lng_vote`` is read.

    Returns
    -------
    list of dict
        ``{'votes': <digits>}`` records. The digits of each span's text are
        concatenated into one string (so ``"1,147 Votes"`` gives ``"1147"``);
        a span without digits gives an empty string.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    digits = re.compile(r'\d+')
    votes_list = [
        {'votes': ''.join(digits.findall(vote.text))}
        for vote in bsoup.find_all('span', class_='rt_count lng_vote')
    ]
    # Keep every second record among the first 20 matches (at most 10 rows).
    # NOTE(review): presumably each store's count appears twice in the markup
    # and the page lists 10 stores -- confirm against the live page.
    return votes_list[0:20:2]


# create function to scrape opening and closing time of a Gym
def time(df):
    """Scrape the opening and closing time from each store's detail page.

    Parameters
    ----------
    df : iterable of str
        Detail-page URLs (the notebook passes the ``url`` column).

    Returns
    -------
    list of dict
        One ``{'opening_time': ..., 'closing_time': ...}`` record per URL,
        in input order. Both values are ``None`` when the page has no
        ``span`` with class ``mreinflispn2``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    df_list = []
    for url in df:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        # Look the timing element up once; find() returns None when absent.
        timing = soup.find('span', class_='mreinflispn2')
        if timing is None:
            # Keep a placeholder row so the result stays aligned with the
            # input URLs instead of aborting the whole scrape.
            df_list.append({'opening_time': None,
                            'closing_time': None})
            continue
        # The text is split on '-': first piece is the opening time, last
        # piece the closing time.
        parts = timing.text.replace('\t', '').strip().split('-')
        df_list.append({'opening_time': parts[0].strip(),
                        'closing_time': parts[-1].strip()})
    return df_list


# create function est_year to scrape established year of gym
def est_year(df):
    """Scrape the established year from each store's detail page.

    Parameters
    ----------
    df : iterable of str
        Detail-page URLs (the notebook passes the ``url`` column).

    Returns
    -------
    tuple (list of dict, dict)
        ``established_year`` holds one ``{0: year}`` record for every page
        whose year was parsed; ``errors`` maps the position of every page
        that could not be parsed to its URL. Failed pages are NOT
        represented in ``established_year``, so the caller must re-insert
        them to keep rows aligned.
    """
    # Same User-Agent the sibling scraper `time` sends; defined locally so the
    # function does not depend on a notebook-level variable.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    established_year = []
    # position -> url of every page whose year could not be parsed
    errors = {}
    for index, url in enumerate(df):
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            # The year is read from the last 'ul.alstdul' element on the page.
            year = int(soup.find_all('ul', class_='alstdul')[-1].text.strip())
        except (IndexError, ValueError):
            # IndexError: no such element; ValueError: its text is not a number.
            errors[index] = url
        else:
            established_year.append({0: year})
    return established_year, errors
        


In [11]:
# Listing-page data: store links/titles, localities parsed from the titles, vote counts.
df = stores_df(stores)
location = locations(df)
votes_df = votes_stores(soup)

# Detail-page data: both scrapers visit every store URL.
store_urls = df['url']
time_stores = time(store_urls)
estab_year, error = est_year(store_urls)

In [21]:
# Re-scrape each page est_year could not parse, reading the second-to-last
# 'ul.alstdul' element instead of the last one.
# `pos` and `y` keep only the values from the final iteration.
for item, value in error.items():
    print(f"Index {item}")
    pos = item
    r = requests.get(value, headers=headers)
    s = BeautifulSoup(r.content, 'lxml')
    year_candidates = s.find_all('ul', class_='alstdul')
    y = int(year_candidates[-2].text.strip())

Index 4


- As we can see, an error occurred at index 4, so we handled that listing separately and extracted its established year.

In [22]:
# Splice the separately scraped year back in at position `pos` (index 4 in this run),
# keeping every other row in its original order.
year_parts = [
    pd.DataFrame(estab_year[:pos]),
    pd.DataFrame([y]),
    pd.DataFrame(estab_year[pos:]),
]
year = pd.concat(year_parts, ignore_index=True)
year.head()

Unnamed: 0,0
0,2013
1,2008
2,2014
3,2017
4,2006


In [23]:
# Combine all scraped pieces column-wise into one table (rows are matched by position).
# The timings frame is named `time_df` so the scraper function `time` is not
# overwritten and the scraping cell stays re-runnable.
time_df = pd.DataFrame(time_stores, columns=['opening_time', 'closing_time'])
votes = pd.DataFrame(votes_df)
location = pd.DataFrame(location)
stores_justdial = pd.concat([df, time_df, votes, location, year], axis=1)
print(f"Number of observations: {stores_justdial.shape}")
stores_justdial.head()

Number of observations: (10, 7)


Unnamed: 0,url,title,opening_time,closing_time,votes,location,0
0,https://www.justdial.com/Mumbai/48-Fitness-(Cr...,48 Fitness (Crystal Point Mall) in Andheri Wes...,06:00 am,11:30 pm,324,Andheri West,2013
1,https://www.justdial.com/Mumbai/Fitness-First-...,Fitness First India Pvt Ltd (Mega Mall) in Osh...,10:00 am,07:00 pm,969,Oshiwara,2008
2,https://www.justdial.com/Mumbai/Endurance-Fitn...,Endurance Fitness in Lokhandwala Complex-Andhe...,08:00 am,08:00 pm,26,Lokhandwala Complex-Andheri West,2014
3,https://www.justdial.com/Mumbai/Forever-Fitnes...,"Forever Fitness in Andheri West, Mumbai",11:00 am,07:30 pm,15,Andheri West,2017
4,https://www.justdial.com/Mumbai/Golds-Gym-Oppo...,"Golds Gym in Andheri West, Mumbai",06:00 am,11:30 pm,1147,Andheri West,2006


In [65]:
# Relabel the columns; only the last one changes (from 0 to 'established_year').
stores_justdial.columns = [
    'url',
    'title',
    'opening_time',
    'closing_time',
    'votes',
    'location',
    'established_year',
]

In [66]:
# Preview the first five rows of the relabelled table.
stores_justdial.head(n=5)

Unnamed: 0,url,title,opening_time,closing_time,votes,location,established_year
0,https://www.justdial.com/Mumbai/48-Fitness-(Cr...,48 Fitness (Crystal Point Mall) in Andheri Wes...,06:00 am,11:30 pm,324,Andheri West,2013
1,https://www.justdial.com/Mumbai/Fitness-First-...,Fitness First India Pvt Ltd (Mega Mall) in Osh...,10:00 am,07:00 pm,969,Oshiwara,2008
2,https://www.justdial.com/Mumbai/Endurance-Fitn...,Endurance Fitness in Lokhandwala Complex-Andhe...,08:00 am,08:00 pm,26,Lokhandwala Complex-Andheri West,2014
3,https://www.justdial.com/Mumbai/Forever-Fitnes...,"Forever Fitness in Andheri West, Mumbai",11:00 am,07:30 pm,15,Andheri West,2017
4,https://www.justdial.com/Mumbai/Golds-Gym-Oppo...,"Golds Gym in Andheri West, Mumbai",06:00 am,11:30 pm,1147,Andheri West,2006
