In [7]:
from bs4 import BeautifulSoup
import requests
import pickle
import urllib.request
import os

# Search-result listings to scrape, one entry per car model: [ Fabia, Octavia, SuperB ].
# Each entry is a (model name, listing URL prefix) pair. The URL deliberately ends in
# "/page" because the page number is appended directly to it when a page is requested.
car_types = [
    (
        "Fabia",
        "https://www.hasznaltauto.hu/talalatilista/auto/"
        "0P5WA5O3L3YQM583L5U61285URLHHYM3OJDPO6T3URLQJGU28LLURQ95RWEEETSOF6O9IEW9QEWEQQG1G7RC69IOIWHK3F0SWYROQ9MROPR7S11H3U8T9U3PG4152L08PEWH9T35ROF6TY6K671Y8YFOWEOMTG9GL3C2REKG/page",
    ),
    (
        "Octavia",
        "https://www.hasznaltauto.hu/talalatilista/auto/"
        "KL85WLTWU1YTY6Y6IEQ6K36E49MD9L6TALYIGPDKP4K85FY3E2COO9YOSF0TK3ZT6CJHE9GUJQ3ZU66HUEH1YH35QUYFP3ODQ3J9SUW1WEGPPGHLZ726OJR0UI7W84LTT44Q7RDMEIQYQTFKGI8CE7CUEMCUCMIO5JLSY/page",
    ),
    (
        "SuperB",
        "https://www.hasznaltauto.hu/talalatilista/auto/"
        "YHUQ5HD5JESDSGSG237GYWG3PR0KRHGDFHS2Z6KY6PYUQCSW3O944RS4LCIDYW8DG9AM3RZJA7W8JGGMJ3MESMWQ7JSC6W4K7WARLEKS3Z365ZIEUARUG65W10L3PGKMGMAYMC0Y3OT7RLYA2G63QM799MT9ES235L6F0S/page",
    ),
]

# This class is responsible for downloading our database.
class data_miner:
    """Scrape car listings and store each car's images and text data on disk.

    Usage: set ``car`` (model name, used as the sub-folder of ``data``) and
    ``website`` (listing URL prefix to which the page number is appended),
    then call ``download_database()``.

    For every car found, the images and a pickled list of the car's text
    data are written to ``data/<car>/<id>/``, where ``<id>`` is the last
    8 characters of the car's link.
    """

    # ------------------------------------------------ Variables ------------------------------------------------ #
    # Variables for reaching the website
    website = ""   # Listing URL prefix; the page number is appended to it
    car = ""       # Car model name; becomes the folder name under "data"
    cookies = dict( BCPermissionLevel = 'PERSONAL' ) # With this you will not get cookie warning page
    headers = { "User-Agent" : "Mozilla/5.0" }
    # Seconds to wait for the server before giving up on a request.
    # Without a timeout a single stalled connection blocks the scrape forever.
    # Set to None to restore the wait-forever behaviour.
    timeout = 30

    # Variables for web
    page_count = -1   # -1 means "unknown / last-page link not found"

    def __init__(self):
        # Per-instance lists. As class attributes these were shared (and
        # mutated) by every instance of the class.
        self.car_links = []         # Every car link visited so far
        self.car_image_links = []   # Every image link downloaded so far (used to skip duplicates)

    def download_database(self):
        """Download every car of the configured listing.

        Delegates to ``get_page_count``, which in turn starts the crawl.
        """
        self.get_page_count()

    # ------------------------------------------------ Functions ------------------------------------------------ #
    # Downloads a page and parses it; shared by all scraping steps
    def _get_soup(self, _url):
        """Fetch ``_url`` with the configured headers/cookies and return the parsed page."""
        page = requests.get( _url, headers = self.headers, cookies = self.cookies, timeout = self.timeout )
        return BeautifulSoup( page.content, 'lxml' )

    # This function get the number of car pages
    def get_page_count(self):
        """Read the number of listing pages, then crawl them.

        The count is taken from the link titled "Utolsó oldal" (Hungarian for
        "Last page") on page 1 and stored in ``self.page_count``. If no such
        link exists, ``page_count`` stays -1 and the crawl visits no pages.
        """
        # Create 'Data' folder
        os.makedirs("data", exist_ok=True)

        self.page_count = -1
        soup = self._get_soup(self.website + "1")
        for link in soup.find_all("a", {"title" : "Utolsó oldal"}):
            self.page_count = int(link.getText())
        print("%s \n<- links's page count is: %s" % (self.website, self.page_count))
        # assert self.page_count > -1      # If the page does not contain "Utolsó oldal" element, the page_count's value equals -1
        self.get_car_links()

    # This function get the car links on a page
    def get_car_links(self):
        """Visit listing pages 1..``page_count`` and process every car linked on them.

        Each car link is recorded in ``self.car_links`` and handed to
        ``get_car_information``.
        """
        for page_number in range(1, self.page_count + 1):
            soup = self._get_soup(self.website + str(page_number))

            # The running number (starting at 1) is only used for the progress print
            cars_on_page = soup.select(".talalati_lista .talalati_lista_head h2 a")
            for car_count_per_page, link in enumerate(cars_on_page, 1):
                car_link = link.get("href")
                print("Page %d's %s. car: %s" % (page_number, car_count_per_page, car_link))
                self.car_links.append(car_link)
                self.get_car_information(car_link)

        print("Saving car data from the link (%s) is Done." % (self.website))

    # This function save the image and car data
    def get_car_information(self, _car_link):
        """Download the images and text data of the car at ``_car_link``.

        Images not seen before are saved as ``<id>_<n>.jpg`` and the text
        data is pickled to ``<id>.pkl``, both inside ``data/<car>/<id>/``.
        """
        soup = self._get_soup(_car_link)

        # The folder name equals the car's unique 8digit key (the link's tail).
        # Computed and created up front: previously these names were bound only
        # when a new image was found, so a car without (new) images raised
        # UnboundLocalError below and its data was never saved.
        image_name = _car_link[-8:]
        folder_name = "data/" + self.car + "/" + image_name
        os.makedirs(folder_name, exist_ok=True)

        # Get the image links
        image_counter = 1 # Numbers the saved image files of this car
        for image in soup.select(".img_page a img"):
            image_link = image.get("src")
            # Skip <img> tags without a source and images that were already downloaded
            if not image_link or image_link in self.car_image_links:
                continue
            self.car_image_links.append(image_link)

            # Download the image
            urllib.request.urlretrieve(image_link, folder_name + "/" + image_name + "_" + str(image_counter) + ".jpg")
            image_counter += 1

        # Get car's data, in this fixed order:
        # Ár Évjárat ... (price, year, ...)
        data = [node.getText() for node in soup.select(".adatlap-adatok strong")]
        # Extrák Multimédia ... (extras, multimedia, ...)
        data.extend(node.getText() for node in soup.select(".adatlap-adatok li"))
        # Leírás (description)
        data.extend(node.getText() for node in soup.find_all("span", {"property" : "p:description"}))

        self.save_car_data(data, folder_name + "/" + image_name)

    # Saves all of the car info in one list
    def save_car_data(self, _obj, _file_name):
        """Pickle ``_obj`` into the file ``_file_name`` + '.pkl'."""
        with open(_file_name + '.pkl', 'wb') as output:
            pickle.dump(_obj, output, pickle.HIGHEST_PROTOCOL)

    # Loads all of the car info in one list
    def load_car_data(self, _file_name):
        """Load, print and return the object pickled in ``_file_name`` + '.pkl'.

        NOTE: unpickling can execute arbitrary code; only load files that
        were written by ``save_car_data``.
        """
        with open(_file_name + '.pkl', 'rb') as source:
            car_data = pickle.load(source)
        print( car_data )
        return car_data
        
def main():
    """Download the database for every car type listed in ``car_types``."""
    # A single miner is reused for all car types; only its target
    # (model name and listing URL) is switched between runs.
    miner = data_miner()
    for model_name, listing_url in car_types:
        miner.car = model_name
        miner.website = listing_url
        miner.download_database()

    # TODO
    # To read the already downloaded data.

if __name__ == "__main__":
    main()

https://www.hasznaltauto.hu/talalatilista/auto/0P5WA5O3L3YQM583L5U61285URLHHYM3OJDPO6T3URLQJGU28LLURQ95RWEEETSOF6O9IEW9QEWEQQG1G7RC69IOIWHK3F0SWYROQ9MROPR7S11H3U8T9U3PG4152L08PEWH9T35ROF6TY6K671Y8YFOWEOMTG9GL3C2REKG/page 
<- links's page count is: 82
Page 1's 1. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_1.4_classic-12047804
Page 1's 2. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_1.2._mo-i._szervo-12024089
Page 1's 3. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_1.2._mo-i._szervizkonyv-12024171
Page 1's 4. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_1.4_classic_magyarorszagifriss_muszaki-12018750
Page 1's 5. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_1.2_6v_choice_szervo-12043595
Page 1's 6. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_combi_1.2_12v_joy_igazan_megkimelt.szervos-12054847
Page 1's 7. car: https://www.hasznaltauto.hu/auto/skoda/fabia/skoda_fabia_1.2._mo-i._szervo-12

KeyboardInterrupt: 

In [24]:

2. image ( https://hasznaltauto.medija.hu/1119/11877293_2t.jpg ) is saved!


SyntaxError: invalid syntax (<ipython-input-24-53c334e6764f>, line 2)