# import packages

In [140]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# start URL & scraping date

In [141]:
# Listing page used as the scraping entry point; the paging URLs and the
# per-model URLs are both built by appending to this address.
start_url = 'https://asgoodasnew.de/Handys/Apple/iPhone-14/'
# Day of this scraping run as a 'YYYY-MM-DD' string (stamped on every row).
scrap_date = datetime.now().date().isoformat()

# function definitions

In [142]:
def scrap_url(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the whole
        scraping run indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is deliberately not checked here: error pages are parsed
    # like any other page, exactly as before, and the read_* helpers fail on
    # them when the expected elements are missing.
    return BeautifulSoup(response.text, 'html.parser')

In [143]:
def count_pages(soup, per_page=12):
    """Return how many listing pages are needed to show all articles.

    The article total is read from the ``listlocator-articlecount`` element,
    whose text is expected to look like ``"57 Artikel"``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed listing page.
    per_page : int, optional
        Number of articles shown per listing page (12 by default).

    Returns
    -------
    int
        Number of pages, always at least 1.

    Raises
    ------
    ValueError
        If the article counter is missing or does not contain a number.
    """
    counter = soup.find('div', class_='listlocator-articlecount pull-left')
    if counter is None:
        raise ValueError('article counter not found on the listing page')

    n_articles = int(counter.text.strip().replace(' Artikel', ''))

    # Ceiling division. The previous "truncate and add one" formula reported
    # one page too many whenever the total was an exact multiple of per_page.
    n_sites = -(-n_articles // per_page)

    # An empty listing still consists of one (empty) page.
    return max(n_sites, 1)

In [144]:
def create_agan_main_url(url, add):
    """Return ``url`` with the page number ``add`` appended as ``?pgNr=<add>``."""
    return '{}?pgNr={}'.format(url, add)

In [145]:
def read_model_name(soup):
    """Return the stripped text of every ``div.listitem-title`` on the page.

    The result is a list of strings in document order; it is empty when the
    page contains no such element.
    """
    title_nodes = soup.find_all('div', class_='listitem-title')
    return [node.text.strip() for node in title_nodes]


In [146]:
def create_model_list():
    """Collect the model titles from every listing page under ``start_url``.

    The start page is downloaded once to learn the page count; afterwards each
    page number from 0 to ``n_sites - 1`` is requested and its titles are read.

    Returns
    -------
    list of str
        All titles in page order, as one flat list.
    """
    first_page = scrap_url(start_url)
    page_count = count_pages(first_page)

    names = []
    for page_no in range(page_count):
        page_soup = scrap_url(create_agan_main_url(start_url, page_no))
        # extend() keeps the list flat, so no flattening pass is needed later.
        names.extend(read_model_name(page_soup))

    return names

In [147]:
def create_agan_model_url(model_name):
    """Build the product page URL for a model title.

    Every space in ``model_name`` becomes a hyphen; the result is appended to
    ``start_url`` and suffixed with ``.html``.
    """
    slug = '-'.join(model_name.split(' '))
    return f"{start_url}{slug}.html"

In [148]:
def read_model_conditions(soup):
    """Read the condition options and their availability from a product page.

    Each ``div.btn-radio`` label is split on newlines. The first line is the
    condition name. When the label has exactly two lines the second one is the
    status; in every other case the status defaults to ``'verfügbar'``
    (German for "available").

    Returns
    -------
    tuple of (list of str, list of str)
        Condition names and statuses, index-aligned.
    """
    names, availability = [], []

    for button in soup.find_all('div', class_='btn-radio'):
        lines = button.label.text.strip().split('\n')
        names.append(lines[0])
        availability.append(lines[1].strip() if len(lines) == 2 else 'verfügbar')

    return names, availability

In [149]:
# Whole-number part of a price: either dot-grouped thousands ("1.099") or a
# plain run of digits ("1099"). A decimal part such as ",00" is not matched.
_PRICE_NUMBER = re.compile(r'\d{1,3}(?:\.\d{3})+|\d+')


def _leading_euro_amount(text):
    """Return the whole-number part of the first number in ``text``, or None."""
    match = _PRICE_NUMBER.search(text)
    if match is None:
        return None
    return int(match.group().replace('.', ''))


def read_model_prices(soup):
    """Read the regular and the reduced price of every condition option.

    For each ``div.conditions-price`` the first number in the element's text
    is taken as the regular price, and the first number inside its
    ``div.conditions--newprice`` child as the reduced price. Only the
    whole-number part is kept.

    Prices are parsed as numbers rather than by cutting a fixed number of
    characters, so 2-digit, 5-digit and dot-grouped amounts ("1.099,00") are
    handled as well.

    Returns
    -------
    tuple of (list of int, list of int)
        Regular prices and reduced prices, index-aligned. The reduced price
        is 0 when the child element is missing or holds no number.

    Raises
    ------
    ValueError
        If a price element contains no number at all.
    """
    old_prices = []
    new_prices = []

    for price in soup.find_all('div', class_='conditions-price'):
        old_price = _leading_euro_amount(price.text)
        if old_price is None:
            raise ValueError(f'no price found in {price.text.strip()!r}')

        new_node = price.find('div', class_='conditions--newprice')
        new_price = None if new_node is None else _leading_euro_amount(new_node.text)

        old_prices.append(old_price)
        # 0 is the marker for "no reduced price" that df_mod relies on.
        new_prices.append(0 if new_price is None else new_price)

    return old_prices, new_prices

In [150]:
def create_model_attr(model):
    """Split a model title into its attributes.

    The title is expected to look like
    ``"<brand> <product> <series> [<spec> ...] <size>GB|TB <color ...>"``,
    e.g. ``"Apple iPhone 14 Pro Max 128GB Dunkellila"``.

    Parameters
    ----------
    model : str
        Model title as shown on the listing page.

    Returns
    -------
    tuple
        ``(brand, product, series, spec, size_gb, color)``. ``size_gb`` is
        always an ``int`` number of gigabytes (1 TB counts as 1024 GB);
        ``spec`` and ``color`` are space-joined strings and may be empty.

    Raises
    ------
    ValueError
        If the title has fewer than three words or no storage-size word.
    """
    tokens = model.split()
    if len(tokens) < 3:
        raise ValueError(f'cannot split {model!r} into brand, product and series')
    brand, product, series = tokens[:3]

    # Find the storage-size word. As before, the last matching word wins.
    size_index = None
    for number, item in enumerate(tokens):
        if item.endswith(('GB', 'TB')) and item[:-2].isdecimal():
            size_index = number
    if size_index is None:
        raise ValueError(f'no storage size (e.g. 128GB or 1TB) found in {model!r}')

    size_str = tokens[size_index]
    amount = int(size_str[:-2])
    # Return an int for both units; previously GB sizes were left as strings
    # while TB sizes were ints, giving a mixed-type column downstream.
    size_gb = amount * 1024 if size_str.endswith('TB') else amount

    spec = ' '.join(tokens[3:size_index])
    color = ' '.join(tokens[size_index + 1:])

    return brand, product, series, spec, size_gb, color

In [151]:
def read_model_specs(soup):
    """Read three numeric values from the dimensions cell of a spec page.

    The cell text is split on ``'x'``; every part has ``'mm'`` removed and its
    decimal comma replaced by a dot before conversion to ``float``. The caller
    unpacks the result as height, width and depth.

    Returns
    -------
    tuple of (float, float, float)
        The first three values of the cell.
    """
    cell = soup.find('td', attrs={'class': "datasheet_cell td_value td_1er td_02 td-6",
                                  'datag': "grp_23",
                                  'datak': "kat_469"})
    raw_parts = cell.text.strip().split('x')

    dims = [float(part.replace('mm', '').strip().replace(',', '.'))
            for part in raw_parts]

    return dims[0], dims[1], dims[2]

In [152]:
def create_spec_model_url(model_name):
    """Build the inside-digital.de spec page URL for a model title.

    The URL slug is made of the lower-cased words that precede the storage
    size (e.g. ``128GB``), joined with hyphens. Cutting at the size word,
    instead of assuming the colour is exactly one word, avoids the empty slug
    parts (stray ``-``) that multi-word colours used to produce.

    Parameters
    ----------
    model_name : str
        Model title, e.g. ``"Apple iPhone 14 Pro 128GB Space Schwarz"``.

    Returns
    -------
    str
        ``https://www.inside-digital.de/handys/<slug>``.
    """
    tokens = model_name.split()

    size_index = next(
        (i for i, tok in enumerate(tokens)
         if tok.endswith(('GB', 'TB')) and tok[:-2].isdecimal()),
        None,
    )

    if size_index is None:
        # No storage size in the title: fall back to the former rule of
        # dropping the last two words.
        base_tokens = tokens[:-2]
    else:
        base_tokens = tokens[:size_index]

    new_model = '-'.join(tok.lower() for tok in base_tokens)

    return f'https://www.inside-digital.de/handys/{new_model}'


In [153]:
def df_mod(df, date=None):
    """Add the date and discount columns and put the columns in export order.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped rows. Must contain ``price`` and ``cur_price`` plus all the
        descriptive columns listed in the selection below.
    date : str or datetime-like, optional
        Scraping date to stamp on every row. Defaults to the notebook-level
        ``scrap_date``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input frame is left
        untouched.
    """
    if date is None:
        date = scrap_date

    # reset_index returns a new frame, so the caller's data is not mutated
    # and duplicate index labels left over from concatenation are gone
    # before any column arithmetic.
    out = df.reset_index(drop=True)

    out['date'] = pd.to_datetime(date)

    # discount = reduced price minus regular price, or 0 where no reduced
    # price exists (cur_price == 0). Vectorised, so it also works on frames
    # without rows.
    price_diff = out['cur_price'] - out['price']
    out['discount'] = price_diff.where(out['cur_price'] > 0, 0)

    # 'heigth' is spelled this way in the existing data and export file;
    # keep it so old and new rows share the same columns.
    return out[['date',
                'full_model_name',
                'brand',
                'product',
                'series',
                'specification',
                'size_gb',
                'color',
                'condition',
                'status',
                'price',
                'cur_price',
                'discount',
                'heigth',
                'width',
                'depth'
                ]]

In [154]:
def create_scraped_data(model_list):
    """Scrape prices, conditions and dimensions for every model.

    For each model title the product page is downloaded for its condition
    options and prices, and the spec page for its dimensions. One row is
    produced per condition option.

    Parameters
    ----------
    model_list : iterable of str
        Model titles as returned by ``create_model_list``.

    Returns
    -------
    pandas.DataFrame
        All rows, post-processed by ``df_mod``.
    """
    frames = []
    # Many titles (colour and size variants) map to the same spec page, so
    # each spec URL is downloaded and parsed only once.
    spec_cache = {}

    for model in model_list:

        # read model attributes
        brand, product, series, spec, size_gb, color = create_model_attr(model)

        # scrape model prices
        offer_soup = scrap_url(create_agan_model_url(model))
        con, stat = read_model_conditions(offer_soup)
        old, new = read_model_prices(offer_soup)

        # scrape model specifications
        model_url_spec = create_spec_model_url(model)
        if model_url_spec not in spec_cache:
            spec_cache[model_url_spec] = read_model_specs(scrap_url(model_url_spec))
        height, width, depth = spec_cache[model_url_spec]

        # create DataFrame: one row per condition option
        data = pd.DataFrame(list(zip(con, stat, old, new)),
                            columns=['condition', 'status', 'price', 'cur_price'])

        data['full_model_name'] = model
        data['brand'] = brand
        data['product'] = product
        data['series'] = series
        data['specification'] = spec
        data['size_gb'] = size_gb
        data['color'] = color

        data['heigth'] = height
        data['width'] = width
        data['depth'] = depth

        frames.append(data)

    # Concatenate once instead of growing a frame inside the loop (which
    # copies all earlier rows on every pass).
    full_data = pd.concat(frames) if frames else pd.DataFrame()

    return df_mod(full_data)


In [155]:
def export_data(full_data, file='iphone asgoodasnew price scraping.xlsx'):
    """Ask the user whether to save the scraped data and append it to Excel.

    If the file already exists, the new rows are appended only when their
    latest date differs from the latest date already stored; otherwise the
    file is left unchanged. If the file does not exist, it is created.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Scraped rows containing a datetime ``date`` column.
    file : str, optional
        Path of the Excel workbook to create or extend.

    Returns
    -------
    None
    """
    export = input('Do you want do download the data to Excel? Press y or n or ESC: ')

    # An empty answer is accepted as well and treated like "n".
    while export.lower() not in ['y', 'n', '']:
        export = input('Please enter a valid value? Press y or n or ESC: ')

    if export.lower() != 'y':
        print('No records extracted.')
        return

    try:
        df = pd.read_excel(file)
    except FileNotFoundError:
        # Only a missing file may trigger creation. Any other error (locked
        # or corrupt workbook, missing Excel engine, ...) now propagates
        # instead of silently overwriting the existing history.
        full_data.to_excel(file, index=False)
        print('New records extracted and Excel file created.')
        return

    last_date = df['date'].max().strftime('%Y-%m-%d')
    new_date = full_data['date'].max().strftime('%Y-%m-%d')

    if new_date != last_date:
        pd.concat([df, full_data]).to_excel(file, index=False)
        print('New records added to existing Excel file.')
    else:
        print('Records already existing in the file.')

# run script

In [156]:
# Collect the model titles from all listing pages under start_url.
model_list = create_model_list()

In [159]:
# Scrape conditions, prices and dimensions for every model into one DataFrame.
full_data = create_scraped_data(model_list)

In [162]:
# Interactive step: asks on stdin whether to write the result to the Excel file.
export_data(full_data)

Records already existing in the file.
