<a href="https://colab.research.google.com/github/ramyakrishna02/Projects/blob/main/WebScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import time
from IPython.display import clear_output
# Listing-search URL for Hyderabad properties on makaan.com. It deliberately
# ends with "page=" so that a page number can be appended to it.
base_url = 'https://www.makaan.com/hyderabad-residential-property/buy-property-in-hyderabad-city?page='

In [2]:
def seller_inf(soup):
  ''' Collect the seller type shown for each listing on a page
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with the text of every <span class="seller-type">,
            in the order find_all returns them
  '''
  seller_types = []
  for span in soup.find_all('span',attrs={'class':'seller-type'}):
    seller_types.append(span.text)
  return seller_types

def bhk_no_inf(soup):
  ''' Collect the label held in the <span> of each listing link
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with one entry per <a data-type="listing-link">: the
            whitespace-stripped text of the anchor's span, or the string
            'None' when the anchor has no span
  '''
  anchors = soup.find_all('a',attrs={'data-type':"listing-link"})
  labels = []
  for anchor in anchors:
    try:
      labels.append(anchor.span.text.strip())
    except AttributeError:
      # No <span> inside this anchor: keep a placeholder (same convention as
      # bathrooms_inf) so one odd listing neither aborts the scrape nor
      # shortens this list relative to the other fields.
      labels.append('None')
  return labels

def price_inf(soup):
  ''' Collect the price text of each listing on a page
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with the text of every <div data-type="price-link">
  '''
  return [div.text for div in soup.find_all('div',attrs={'data-type':"price-link"})]

def address_inf(soup):
  ''' Collect the locality name of each listing on a page
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with the text of every <span itemprop="addressLocality">
  '''
  localities = []
  for span in soup.find_all('span',attrs={'itemprop':"addressLocality"}):
    localities.append(span.text)
  return localities

def area_inf(soup):
  ''' Collect the size value of each listing on a page
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with the text of every <td class="size">
  '''
  size_cells = soup.find_all('td',attrs={'class':"size"})
  sizes = []
  for cell in size_cells:
    sizes.append(cell.text)
  return sizes

def area_sqft_inf(soup):
  ''' Collect the rate text of each listing on a page
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with the text of every <td> selected with class 'lbl rate'
  '''
  return [cell.text for cell in soup.find_all('td',attrs={'class':'lbl rate'})]

def status_inf(soup):
  ''' Collect the status text of each listing on a page
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with the text of every <td class="val">
  '''
  statuses = []
  for cell in soup.find_all('td',attrs={'class':"val"}):
    statuses.append(cell.text)
  return statuses

def bathrooms_inf(soup):
  ''' Collect the leading token of each listing's details list
  Arguments : soup - parsed page (BeautifulSoup object); only find_all is used
  Returns : list with one entry per <ul class="listing-details">: the part of
            its first <li>'s text before the first space, or the string
            'None' when that lookup raises AttributeError (e.g. no <li>)
  NOTE(review): presumably this token is the bathroom count, but the sample
  output also shows values such as 'Possession' - verify the selector.
  '''
  def leading_token(details):
    # Same guarded attribute chain as before: a missing <li> becomes 'None'.
    try:
      return details.li.text.split(' ')[0]
    except AttributeError:
      return 'None'

  detail_lists = soup.find_all('ul',attrs={'class':'listing-details'})
  return [leading_token(details) for details in detail_lists]

In [3]:
def scrapedata(url,num_pages):
  ''' Scrape the listing fields from consecutive result pages
  Arguments : url - base listing URL; the page number is appended to it
              num_pages - exclusive upper bound of the page numbers, i.e.
                          pages 1 .. num_pages-1 are fetched
  Returns : tuple of eight lists
            (seller, bhk, price, locality, area, area_sqft, status, bathrms)
            Each list is filled independently of the others, so they line up
            row for row only if every selector matches the same number of
            elements on each page.
  '''
  seller = []
  bhk = []
  price = []
  locality = []
  area = []
  area_sqft = []
  status = []
  bathrms = []

  for i in range(1,num_pages):
    # Pause before every request so the site is not hit in a tight loop.
    time.sleep(3)
    print(f'Page No : {i}')
    clear_output(wait=True)

    # Build the page address from the caller-supplied base URL; previously the
    # argument was overwritten and the global base_url was used instead.
    page_url = url + str(i)
    response = requests.get(page_url)
    # NOTE(review): 'html' names a parser feature rather than a specific
    # parser, so which parser bs4 picks may depend on what is installed -
    # consider pinning one explicitly.
    soup = BeautifulSoup(response.content,'html')

    seller.extend(seller_inf(soup))
    bhk.extend(bhk_no_inf(soup))
    price.extend(price_inf(soup))
    locality.extend(address_inf(soup))
    area.extend(area_inf(soup))
    area_sqft.extend(area_sqft_inf(soup))
    status.extend(status_inf(soup))
    bathrms.extend(bathrooms_inf(soup))

  return seller,bhk,price,locality,area,area_sqft,status,bathrms

In [4]:
# Run the scraper. scrapedata iterates range(1, num_pages), so passing 501
# covers pages 1-500; each page is preceded by a 3 s sleep, so this cell is slow.
seller,bhk,price,locality,area,area_sqft,status,bathrms = scrapedata(base_url,501)

Page No : 500


In [5]:
# Print how many values were collected for each field, one count per line,
# in the same order as the lists returned by scrapedata.
for collected in (seller, bhk, price, locality, area, area_sqft, status, bathrms):
  print(len(collected))

10000
10000
10000
10000
10000
10000
10000
10000


In [6]:
import pandas as pd
# Assemble the scraped lists into one table, one column per field.
data = pd.DataFrame({
  'Seller_info': seller,
  'No_BHK': bhk,
  'Price': price,
  'Locality': locality,
  'Area': area,
  'Area_sqft': area_sqft,
  'Construction_status': status,
  'Bathrooms': bathrms,
})
data

Unnamed: 0,Seller_info,No_BHK,Price,Locality,Area,Area_sqft,Construction_status,Bathrooms
0,BUILDER,3,1.69 Cr,Yapral,2950,"5,756 / sq ft",Under Construction,Possession
1,BUILDER,Residential Plot,32.29 L,Maheshwaram,1938,"1,667 / sq ft",New,Possession
2,BUILDER,3,88.03 L,Kompally,1834,"4,800 / sq ft",Under Construction,Possession
3,AGENT,Residential Plot,22 L,Sangareddy,1800,"1,222 / sq ft",New,1
4,BUILDER,2,61.57 L,Kompally,1310,"4,700 / sq ft",Under Construction,Possession
...,...,...,...,...,...,...,...,...
9995,AGENT,Residential Plot,25.05 L,Shankarpally Road,1503,"1,666 / sq ft",New,3
9996,AGENT,Residential Plot,31.08 L,Maheshwaram,222,"14,000 / sq ft",New,121
9997,AGENT,Residential Plot,29 L,Keesara,1800,"1,611 / sq ft",New,121
9998,AGENT,Residential Plot,36.25 L,Keesara,2250,"1,611 / sq ft",New,121


In [7]:
# Save the scraped table as scraped_data.csv (relative path, so it lands in
# the current working directory). No index argument is passed, so pandas'
# default applies and the row index is written as an unnamed first column.
data.to_csv('scraped_data.csv')