# import packages

In [3]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from datetime import datetime

# start url & scraping date

In [4]:
start_url = 'https://asgoodasnew.de/Handys/Apple/iPhone-14/'
scrap_date = datetime.now().strftime('%Y-%m-%d')

# function definition

In [5]:
def scrap_url(url):

    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'html.parser')

    return soup

In [6]:
def count_pages(soup):

    n_sites = np.int(np.trunc(np.int(soup.find('div', class_='listlocator-articlecount pull-left').text.strip().replace(' Artikel', '')) / 12) + 1)
    return n_sites

In [7]:
def create_agan_main_url(url, add):
    url = f'{url}?pgNr={add}'
    return url

In [8]:
def read_model_name(soup):
    models = soup.find_all('div', class_='listitem-title')

    models_mod = []
    
    for model in models:
        model = model.text.strip()

        models_mod.append(model)

    return models_mod


In [9]:
def create_agan_model_url(model_name):

    #model_add = model_name.replace(' ', '-')

    model_url = f"{start_url}{model_name.replace(' ','-')}.html"

    return model_url

In [10]:
def read_model_conditions(soup):
    conditions = soup.find_all('div', class_='btn-radio')

    conditions_mod = []
    status = []

    for condition in conditions:
        condition = condition.label.text.strip().split('\n')

        if len(condition) == 2:
            conditions_mod.append(condition[0])
            status.append(condition[1].strip())
        else:
            conditions_mod.append(condition[0])
            status.append('verfügbar')
        
    return conditions_mod, status

In [11]:
def read_model_prices(soup):
    prices = soup.find_all('div', class_='conditions-price')

    old_prices = []
    new_prices = []

    for price in prices:
        old_price = np.int(price.text.strip()[:4].replace(',', ''))
        new_price = price.find('div', class_='conditions--newprice').text.strip()[:-3]

        if new_price == '':
            new_price = 0
        else: new_price = np.int(new_price)

        old_prices.append(old_price)
        new_prices.append(new_price)
    
    return old_prices, new_prices

In [12]:
def read_model_specs(soup):
    attr_vals = soup.find('td', attrs={'class':"datasheet_cell td_value td_1er td_02 td-6",
                                    'datag':"grp_23",
                                    'datak':"kat_469"}).text.strip().split('x')

    attr_val_2 = []

    for attr_val in attr_vals:    

        attr_val = float(attr_val.replace('mm', '').strip().replace(',', '.'))
        attr_val_2.append(attr_val)

    return attr_val_2[0], attr_val_2[1], attr_val_2[2]

In [13]:
def create_model_list():

    soup = scrap_url(start_url)
    n_sites = count_pages(soup)

    models = []

    for i in range(n_sites):

        url = create_agan_main_url(start_url, i)

        soup = scrap_url(url)

        models.append(read_model_name(soup))

        #if soup.find('li', class_='next disabled') != None:
            #break

    models = [model for model_list in models for model in model_list]

    return models

In [14]:
def create_spec_model_url(model_name):

    models_adj = model_name.split()[:-2]

    new_model_name = []
    sizes = ['GB', 'TB']

    for m in models_adj:
        
        if any([size in m for size in sizes]):
            m = ''
        
        new_model_name.append(m.lower())

    new_model = '-'.join(new_model_name)

    model_url = f'https://www.inside-digital.de/handys/{new_model}'

    return model_url


In [15]:
def df_mod(df):

    df['Datum'] = scrap_date
    df['Datum'] = pd.to_datetime(df['Datum'])

    df['Rabatt'] = df.apply(lambda x: x['aktueller Preis'] - x['Preis'] 
                                            if x['aktueller Preis'] > 0 else 0, axis=1)

    df = df[['Datum',
            'Modell',
            'Zustand',
            'Status',
            'Preis',
            'aktueller Preis', 
            'Rabatt',
            'Höhe',
            'Breite',
            'Tiefe'
            ]]
    df = df.reset_index(drop=True)

    return df

In [16]:
def create_scraped_data(model_list):

    full_data = pd.DataFrame()

    for model in model_list:

        # scrape model prices
        model_url_agan = create_agan_model_url(model)
        soup = scrap_url(model_url_agan)

        con, stat = read_model_conditions(soup)
        old, new = read_model_prices(soup)

        # scrape models specifications
        model_url_spec = create_spec_model_url(model)

        soup = scrap_url(model_url_spec)

        height, width, depth = read_model_specs(soup)

        # create DataFrame
        data = pd.DataFrame(list(zip(con,stat, old, new)), 
                            columns =['Zustand', 'Status', 'Preis', 'aktueller Preis'])
        data['Modell'] = model
        data['Höhe'] = height
        data['Breite'] = width
        data['Tiefe'] = depth

        full_data = pd.concat([full_data, data])

    full_data = df_mod(full_data)

    return full_data


In [17]:
def export_data(full_data):
    export = input(f'Do you want do download the data to Excel? Press y or n or ESC: ')

    while export.lower() not in ['y', 'n', '']:
        export = input(f'Please enter a valid value? Press y or n or ESC: ')

    if export.lower() == 'y':
            
            file = 'iphone asgoodasnew preise scraping.xlsx'
            df = pd.read_excel(file)

            last_date = df['Datum'].max().strftime('%Y-%m-%d')

            new_date = full_data['Datum'].max().strftime('%Y-%m-%d')

            if new_date != last_date:

                df_new = pd.concat([df, full_data])
                df_new.to_excel(file, index=False)

                print('Neue Daten exportiert')
            
            else:
                print('Daten bereits vorhanden. Daten werden nicht exportiert.')
    else:
        print('Daten werden nicht exportiert')

# run script

In [18]:
model_list = create_model_list()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
full_data = create_scraped_data(model_list)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  del sys.path[0]


In [20]:
export_data(full_data)

Daten werden nicht exportiert
