### DS-GA 1001 Final Project
### Web Scraping Project for second-hand housing market in Shanghai City 

In [14]:
import requests
import time
import ast
import os
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [2]:
# List of URLs of website's homepages 
url1 = 'https://sh.lianjia.com/'
url2 = 'https://sh.58.com/ershoufang/'
url_list = [url1, url2]

In [3]:
# List of Districts in Shanghai City
district1 = '黄浦'
district2 = '卢湾'
district3 = '徐汇'
district4 = '静安'
district5 = '长宁'
district6 = '普陀'
district7 = '闸北'
district8 = '虹口'
district9 = '杨浦'
district10 = '浦东'
district11 = '闵行'
district12 = '宝山'
district13 = '嘉定'
district14 = '青浦'
district15 = '松江'
district16 = '金山'
district17 = '奉贤'
district18 = '崇明'
district_list = [district1, district2, district3, district4,
                district5, district6, district7, district8,
                district9, district10, district11, district12,
                district13, district14, district15, district16,
                district17, district18]

In [17]:
# List of districts in Shanghai city in English format 
district1_eng = 'huangpu'
district2_eng = 'luwan'
district3_eng = 'xuhui'
district4_eng = 'jingan'
district5_eng = 'changning'
district6_eng = 'putuo'
district7_eng = 'zhabei'
district8_eng = 'hongkou'
district9_eng = 'yangpu'
district10_eng = 'pudongxinqu'
district11_eng = 'minxing'
district12_eng = 'baoshan'
district13_eng = 'jiading'
district14_eng = 'qingpu'
district15_eng = 'songjiang'
district16_eng = 'jinshan'
district17_eng = 'fengxiansh'
district18_eng = 'chongming'
district_eng_list = [district1_eng, district2_eng, district3_eng, district4_eng,
                district5_eng, district6_eng, district7_eng, district8_eng,
                district9_eng, district10_eng, district11_eng, district12_eng,
                district13_eng, district14_eng, district15_eng, district16_eng,
                district17_eng, district18_eng]

#### For lianjia.com

In [5]:
# Create a fake user agent and display the fake mask for Chrome browser
ua = UserAgent()
ua.Chrome

'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'

In [6]:
# Load html source code to a beautifulsoup object
header = {'User-Agent':str(ua.chrome)}
html_content = requests.get(url1, headers = header)
content_soup = BeautifulSoup(html_content.text, 'html.parser')

In [6]:
# Output the html contents to a file for backup purpose 
fout = open('lianjia_homepage.html','w')
fout.write(content_soup.prettify())
fout.close()

In [13]:
# Extract info for each house
def find_total_pages(district_name):
    # Set up driver and open the home page
    driver = webdriver.Chrome('/Users/haonantian/chromedriver') # Make sure you put chromedriver into your PATH
    driver.get(url1)

    # locate input slot and input district to search
    input_element = driver.find_element_by_name('keyword')
    input_element.send_keys(district_name)
    time.sleep(3)
    input_element.send_keys(Keys.ENTER)

    # Get the url for district front page and open load the website to the program
    district_firstpage_url = driver.current_url
    time.sleep(3)
    driver.close()
    district_content = requests.get(district_firstpage_url, headers=header)
    district_soup = BeautifulSoup(district_content.text, 'html.parser')
    
    # Find the total houses available for the district
    total = district_soup.find('h2',{'class':'total fl'}).span.text.strip().lstrip()
    
    # Find the total page number for a district
    total_page = district_soup.find('div',{'class':'page-box house-lst-page-box'})['page-data']
    total_page = ast.literal_eval(total_page)['totalPage']
    return total, total_page

In [8]:
def make_up_lianjia_district(district_name, total_pages): # Make up a list of district front page urls
    district_url = []
    temp_url = 'https://sh.lianjia.com/ershoufang/'
    for i in range(total_pages):
        if i == 0:
            district_url.append(temp_url + 'rs' + district_name + '/')
        else:
            district_url.append(temp_url + 'pg' + str(i+1) + 'rs' + district_name + '/')
    return district_url

In [9]:
def extract_single_house(house_url): # Function to extract the information for a certain house
    result = []
    
    
    # Find the unit price
    content = requests.get(house_url, headers=header)
    soup = BeautifulSoup(content.text, 'html.parser')
    unit_price = soup.find('span',{'class':'unitPriceValue'}).text.strip().lstrip()
    
    # Find the information about community
    community = []
    communities = soup.find('div',{'class':'aroundInfo'})
    community_name = communities.find('a',{'class':'info'}).text.strip().lstrip()
    community.append(community_name)
    area = communities.find('div',{'class':'areaName'})
    area_contents = area.find('span',{'class':'info'})
    a_s = area_contents.find_all('a')
    for item in a_s:
        community.append(item.text.strip().lstrip())
    try:
        community.append(communities.find('a',{'class':'supplement'}).text.strip().lstrip())
    except:
        pass
    
    # Find the information about basic features
    introContent = soup.find('div',{'class':'introContent'})
    content = introContent.find('div',{'class':'base'})
    lis = content.find_all('li')
    basic = []
    for item in lis:
        basic.append(item.text.strip().lstrip())
        
    # Find the information about transaction features
    transactions = introContent.find('div',{'class':'transaction'})
    lis2 = transactions.find_all('li')
    transaction = []
    for item in lis2:
        spans = item.find_all('span')
        transaction.append(spans[1].text.strip().lstrip())
        
    # Make up the results
    result.append(unit_price)
    result.append(community)
    result.append(basic)
    result.append(transaction)
    return result

In [10]:
def parse_page_lianjia(page_url): # Function to extract information for a certain dictrict page
    result = []
    temp_content = requests.get(page_url, headers=header)
    page_soup = BeautifulSoup(temp_content.text, 'html.parser')
    house_info_div = page_soup.find('div',{'class':'bigImgList'})
    
    # Identify the content of a list of house info
    house_list = page_soup.find_all('div',{'class':'item'})
    for house in house_list:
        total_price = house.find('div',{'class':'price'}).span.text.strip().lstrip()
        house_url = house.find('a',{'class':'img'})['href']
        
        # Extract information fr each house
        info_list = extract_single_house(house_url)
        info_list = [total_price] + info_list
        
        result.append(info_list)
    return result

In [14]:
# Main function to utilize the above function to parge and record all second-hand house info for Shanghai on lianjia.com
total_found = 0
for district in district_list:
    print('Start District {} !!!'.format(district))
    temp_result = []
    total_houses, total_pages = find_total_pages(district)
    total_found += int(total_houses)
    district_url = make_up_lianjia_district(district, total_pages)
    counter = 1
    for url in district_url:
        temp_result += parse_page_lianjia(url)
        print('Finished District {} Page {}'.format(district, counter))
        counter += 1
    #result_dict[district] = temp_result
    fout = open(district + '_result.txt','w')
    fout.write(str(temp_result))
    fout.close()
    print('\nFinished District {} !!!\n'.format(district))
print('\nTotal Houses Found {}'.format(str(total_foundound)))
        

Start District 黄浦 !!!
Finished District 黄浦 Page 1
Finished District 黄浦 Page 2
Finished District 黄浦 Page 3
Finished District 黄浦 Page 4
Finished District 黄浦 Page 5
Finished District 黄浦 Page 6
Finished District 黄浦 Page 7

Finished District 黄浦 !!!

Start District 卢湾 !!!
Finished District 卢湾 Page 1
Finished District 卢湾 Page 2
Finished District 卢湾 Page 3

Finished District 卢湾 !!!

Start District 徐汇 !!!
Finished District 徐汇 Page 1
Finished District 徐汇 Page 2
Finished District 徐汇 Page 3
Finished District 徐汇 Page 4
Finished District 徐汇 Page 5
Finished District 徐汇 Page 6
Finished District 徐汇 Page 7
Finished District 徐汇 Page 8
Finished District 徐汇 Page 9
Finished District 徐汇 Page 10
Finished District 徐汇 Page 11
Finished District 徐汇 Page 12
Finished District 徐汇 Page 13
Finished District 徐汇 Page 14
Finished District 徐汇 Page 15

Finished District 徐汇 !!!

Start District 青浦 !!!
Finished District 青浦 Page 1
Finished District 青浦 Page 2
Finished District 青浦 Page 3
Finished District 青浦 Page 4
Finished Dist

AttributeError: 'NoneType' object has no attribute 'span'

In [22]:
def calculate_total(input_path): # Function to calculate the total number of houses extracted from lianjia.com
    total = 0
    for district in district_list:
        input_file = open(input_path+'/' + district + '_result.txt','r')
        content = input_file.read()
        content_list = ast.literal_eval(content)
        total += len(content_list)
        input_file.close()
    return total

In [23]:
# Find the amount of houses found on lianjia.com
import os
current_path = os.getcwd()
total = calculate_total(current_path)
print(total)

3565


#### For 58.com

In [107]:
# The url for second-hand house shanghai front page
url_shanghai = 'https://sh.58.com/ershoufang/?PGTID=0d30000c-0057-806a-76a1-3cdec2e105da&ClickID=1'

In [122]:
# Module to record the page information on second-hand house shanghai front page
content_58 = requests.get('https://sh.58.com/ershoufang/36083087591556x.shtml?from=1-list-0', headers=header)
content_58_soup = BeautifulSoup(content_58.text, 'html.parser')
fout = open('huangpu_sample_58.html', 'w')
fout.write(content_58_soup.prettify())
fout.close()

In [7]:
# Function to make up a list of url for every page for a certain district in shanghai
def make_url_58(district_name_eng):
    url_list = []
    front_page_url = 'https://sh.58.com/' + district_name_eng + '/ershoufang/'
    url_list.append(front_page_url)
    front_content = requests.get(front_page_url, headers=header)
    front_soup = BeautifulSoup(front_content.text, 'html.parser')
    page_block = front_soup.find('div',{'class':'pager'})
    total_page = int(page_block.find_all('span')[-2].text.strip().lstrip())
    counter = 2
    while counter <= total_page:
        url_list.append('https://sh.58.com/' + district_name_eng + '/ershoufang/' + 'pn' + str(counter) + '/')
        counter += 1
    return url_list

In [8]:
def parse_page(page_url): # Function to parse pages for certain houses
    result = []
    house_content = requests.get(page_url, headers=header)
    house_soup = BeautifulSoup(house_content.text, 'html.parser')
    
    try:
        # locate the parse basic information
        basic_info = []
        general_info = house_soup.find('div',{'id':'generalSituation'})
        col_left = general_info.find('ul',{'class':'general-item-left'})
        info_list_left = col_left.find_all('li')
        for item in info_list_left[1:]:
            basic_info.append(item.text.strip().lstrip())
        col_right = general_info.find('ul',{'class':'general-item-right'})
        info_list_right = col_right.find_all('li')
        for item in info_list_right:
            basic_info.append(item.text.strip().lstrip())
        result.append(basic_info)
        
        # locate and parse community information
        community_info = []
        info_block = house_soup.find('ul',{'class':'xiaoqu-desc'})
        community_list = info_block.find_all('li')
        for item in community_list:
            community_info.append(item.text.strip().lstrip())
        result.append(community_info)
    except:
        pass
    
    return result

In [9]:
def parse_page_58(page_url): # Function to parse distict front page in shanghai 
    page_result = []
    counter = 0
    # load the page for certain district to the program
    page_content = requests.get(page_url, headers=header)
    page_soup = BeautifulSoup(page_content.text, 'html.parser')
    try:
        # locate the house block and append all the info of house into a list 
        house_block = page_soup.find('ul',{'class':'house-list-wrap'})
        house_list = house_block.find_all('li')
        for house in house_list:
            temp_result = []
            try: 
                # load house page's url to the program
                house_url = house.find('h2',{'class':'title'}).a['href']
                
                # generate results for each house
                temp_result.append(parse_page(house_url))
                
                # locate and parse the price info for a house
                price_block = house.find('div',{'class':'price'})
                ps = price_block.find_all('p')
                temp_result.append(ps[0].text.strip().lstrip())
                temp_result.append(ps[1].text.strip().lstrip())
                page_result.append(temp_result)
                counter += 1
            except:
                pass
    except:
        pass
    return page_result, counter

In [11]:
# Main module to utilize the above functions to generate results for all second-hand houses in Shanghai
total_found = 0
for district in district_eng_list:
    result = []
    district_found = 0
    page_urls = make_url_58(district)
    print('Start District {} !!!'.format(district))
    page_counter = 0
    while page_counter < len(page_urls):
        temp_result, house_num = parse_page_58(page_urls[page_counter])
        result += temp_result
        district_found += house_num
        page_counter += 1
        print('Finished District {} Page {} House Found {}'.format(district, str(page_counter), str(house_num)))
    fout = open(district + '_58.txt','w')
    fout.write(str(result))
    fout.close()
    print('\nFinished District {} House Found {} !!!\n'.format(district, str(district_found)))
    total_found += district_found
print('\nTotal Houses Found {}'.format(str(total_found)))

Start District fengxiansh !!!
Finished District fengxiansh Page 1 House Found 55
Finished District fengxiansh Page 2 House Found 50
Finished District fengxiansh Page 3 House Found 50
Finished District fengxiansh Page 4 House Found 47
Finished District fengxiansh Page 5 House Found 48
Finished District fengxiansh Page 6 House Found 44
Finished District fengxiansh Page 7 House Found 44
Finished District fengxiansh Page 8 House Found 44
Finished District fengxiansh Page 9 House Found 44
Finished District fengxiansh Page 10 House Found 44
Finished District fengxiansh Page 11 House Found 43
Finished District fengxiansh Page 12 House Found 47
Finished District fengxiansh Page 13 House Found 49
Finished District fengxiansh Page 14 House Found 49
Finished District fengxiansh Page 15 House Found 48
Finished District fengxiansh Page 16 House Found 50
Finished District fengxiansh Page 17 House Found 48
Finished District fengxiansh Page 18 House Found 49
Finished District fengxiansh Page 19 House 

In [18]:
# Function to find total item extracted from 58.com
def find_total(path):
    total = 0
    for name in district_eng_list:
        fin = open(path + name + '_58.txt','r')
        content = fin.read()
        total += len(ast.literal_eval(content))
        fin.close()
    return total

In [19]:
# Calculate total number of second-hand houses scraped from 58.com
current_path = os.getcwd()
total_house = find_total(current_path+'/58/')
print(total_house)

49170
