# import packages

In [23]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# start URLs & scraping date

In [24]:
# Listing page that is handed to create_model_list() as the start of the price scrape.
price_url = 'https://asgoodasnew.de/Handys/Apple/iPhone-14/'
# Overview page that is handed to create_model_list_specs() as the start of the specs scrape.
specs_url = 'https://phonesdata.com/de/smartphones/apple/'
# Date of this run as a 'YYYY-MM-DD' string; df_mod() writes it into the 'date' column.
scrap_date = datetime.now().strftime('%Y-%m-%d')

# function definitions

## general functions

In [25]:
def scrap_url(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Without a timeout a stalled connection would block
        the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

In [26]:
def merge_dfs(df_price, df_specs):
    """Left-join the specs table onto the price table.

    No join keys are given, so pandas joins on the columns both frames have
    in common. Every row of ``df_price`` is kept; rows without a match in
    ``df_specs`` get missing values in the specs columns.
    """
    return pd.merge(df_price, df_specs, how='left')

In [47]:
def export_data():
    """Ask the user whether to save the scraped data to Excel and do so.

    Reads the notebook-level variables ``df_price`` and ``df_specs``, merges
    them with ``merge_dfs`` and writes the result to
    'iphone asgoodasnew price scraping.xlsx' in the working directory:

    * if the workbook does not exist yet, it is created;
    * if it exists and its newest date differs from the newest date of the
      new data, the new rows are appended;
    * if both newest dates are equal, nothing is written.

    Any answer other than 'y' (that is 'n' or an empty line) leaves the
    file untouched.

    Raises
    ------
    Exception
        Errors while reading or writing an *existing* workbook are no longer
        swallowed: previously any failure was treated as "file missing" and
        the workbook was overwritten, destroying the stored history.
    """
    export = input('Do you want do download the data to Excel? Press y or n or ESC: ')

    while export.lower() not in ['y', 'n', '']:
        export = input('Please enter a valid value? Press y or n or ESC: ')

    if export.lower() != 'y':
        print('No records extracted.')
        return

    # Merge only when an export was actually requested.
    full_data = merge_dfs(df_price, df_specs)

    file = 'iphone asgoodasnew price scraping.xlsx'
    new_date = full_data['date'].max().strftime('%Y-%m-%d')

    try:
        df = pd.read_excel(file)
    except FileNotFoundError:
        # First run: there is no history yet, so start a new workbook.
        full_data.to_excel(file, index=False)
        print(f'New records of {new_date} extracted and Excel file created.')
        return

    # to_datetime keeps the comparison working even if Excel handed the
    # column back as text instead of datetimes.
    last_date = pd.to_datetime(df['date']).max().strftime('%Y-%m-%d')

    if new_date != last_date:
        df = pd.concat([df, full_data])
        df.to_excel(file, index=False)

        print(f'New records of {new_date} added to existing Excel file.')
    else:
        print(f'Records of {new_date} already existing in the file.')

## price scraping functions

In [28]:
def create_urls(soup):
    """Return the page URLs listed in the pagination bar of a listing page.

    Looks up the ``div`` with class "listlocator-paging", collects its ``a``
    elements and returns their ``href`` values, leaving out the last anchor
    (presumably a "next page" link - TODO confirm against the page markup).
    """
    paging_bar = soup.find('div', class_="listlocator-paging")
    page_links = paging_bar.find_all('a')

    return [link['href'] for link in page_links[:-1]]

In [29]:
def create_model_list(url):
    """Collect name and detail-page URL of every model on the listing pages.

    The page at ``url`` is fetched once to read its pagination links (via
    ``create_urls``); each linked page is then fetched and every product
    tile (``div`` with class "col-xs-6 col-sm-4") that contains a link is
    recorded.

    Parameters
    ----------
    url : str
        Address of the first listing page.

    Returns
    -------
    list of dict
        One ``{'name': <link title>, 'url': <link href>}`` entry per tile.
    """
    model_list = []

    listing_soup = scrap_url(url)

    # A separate loop name keeps the ``url`` argument from being overwritten.
    for page_url in create_urls(listing_soup):

        page_soup = scrap_url(page_url)

        for tile in page_soup.find_all('div', class_="col-xs-6 col-sm-4"):
            link = tile.a

            # Tiles without an anchor carry no product and are skipped.
            if link is None:
                continue

            model_list.append({'name': link['title'], 'url': link['href']})

    return model_list

In [30]:
def read_model_conditions(soup):
    """Read the condition options and their availability from a model page.

    Every ``div`` with class 'btn-radio' contributes one dict with the keys
    'condition' and 'status'. The label text is split into lines: the first
    line is the condition; when there are exactly two lines the second one
    is the status, otherwise the status is set to 'verfügbar'.
    """

    def _parse(option):
        lines = option.label.text.strip().split('\n')
        status = lines[1].strip() if len(lines) == 2 else 'verfügbar'
        return {'condition': lines[0], 'status': status}

    return [_parse(option) for option in soup.find_all('div', class_='btn-radio')]

In [31]:
def read_model_prices(soup):
    prices = soup.find_all('div', class_='conditions-price')

    #old_prices = []
    #new_prices = []

    prices_list = []

    for price in prices:

        prices_dict = {}

        old_price = int(price.text.strip()[:4].replace(',', ''))
        new_price = price.find('div', class_='conditions--newprice').text.strip()[:-3]

        if new_price == '':
            new_price = 0
        else: new_price = int(new_price)

        #old_prices.append(old_price)
        #new_prices.append(new_price)

        prices_dict['price'] = old_price
        prices_dict['cur_price'] = new_price

        prices_list.append(prices_dict)
    
    #return old_prices, new_prices
    return prices_list

In [32]:
def create_model_attr(model):
    """Split a full model name into its attributes.

    The name is split on whitespace and read as
    ``<brand> <product> <series> [<spec> ...] <size>GB|TB [<color> ...]``,
    e.g. 'Apple iPhone 14 Pro 512GB dunkellila'.

    Parameters
    ----------
    model : str
        Full model name.

    Returns
    -------
    tuple
        ``(brand, product, series, spec, size_gb, color)`` where ``series``
        and ``size_gb`` are ints (1 TB counts as 1024 GB) and ``spec`` /
        ``color`` are the space-joined words before / after the size token
        (empty string when there are none).

    Raises
    ------
    ValueError
        If the name has fewer than three words, the third word is not an
        integer, or no word ends in 'GB' or 'TB'.
    """
    tokens = model.split()

    if len(tokens) < 3:
        raise ValueError(f'cannot read brand, product and series from {model!r}')

    brand, product = tokens[0], tokens[1]
    series = int(tokens[2])

    # If several words end in GB/TB the last one is used, as before.
    size_index = None
    for number, item in enumerate(tokens):
        if item.endswith(('GB', 'TB')):
            size_index = number

    if size_index is None:
        raise ValueError(f'no storage size (GB/TB) found in {model!r}')

    size_str = tokens[size_index]
    amount = int(size_str[:-2])
    size_gb = amount * 1024 if size_str.endswith('TB') else amount

    spec = ' '.join(tokens[3:size_index])
    color = ' '.join(tokens[size_index + 1:])

    return brand, product, series, spec, size_gb, color

In [33]:
def create_scraped_data(model_list):
    """Scrape conditions and prices for every model and build the price table.

    For each entry of ``model_list`` the model name is split into attributes
    (``create_model_attr``), the model page is fetched (``scrap_url``) and
    its conditions and prices are read (``read_model_conditions`` /
    ``read_model_prices``). The per-model frames are concatenated once at
    the end and passed through ``df_mod``.

    Parameters
    ----------
    model_list : list of dict
        Entries with the keys 'name' and 'url', as produced by
        ``create_model_list``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``df_mod``.
    """
    frames = []

    for model in model_list:

        # read model attributes
        brand, product, series, spec, size_gb, color = create_model_attr(model['name'])

        # scrape model prices
        soup = scrap_url(model['url'])

        conditions = read_model_conditions(soup)
        prices = read_model_prices(soup)

        # conditions and prices are placed side by side, row i with row i
        data = pd.concat([pd.DataFrame(conditions), pd.DataFrame(prices)], axis=1)

        data['full_model_name'] = model['name']
        data['model_name'] = f'{product} {series} {spec}'.strip()
        data['brand'] = brand
        data['product'] = product
        data['series'] = series
        data['specification'] = spec
        data['size_gb'] = size_gb
        data['color'] = color

        frames.append(data)

    # One concat at the end instead of growing a frame inside the loop;
    # pd.concat rejects an empty list, so fall back to an empty frame.
    full_data = pd.concat(frames) if frames else pd.DataFrame()

    return df_mod(full_data)


In [34]:
def df_mod(df, date=None):
    """Add date and discount columns and put the columns in their final order.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped rows; must contain the columns 'price' and 'cur_price' plus
        the model columns listed in the output order below. The frame is
        not modified.
    date : str or datetime-like, optional
        Scraping date to stamp on every row. Defaults to the notebook-level
        ``scrap_date``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, a datetime 'date' column and a
        'discount' column (``cur_price - price`` where ``cur_price`` is
        positive, otherwise 0).
    """
    if date is None:
        date = scrap_date

    # reset_index returns a new frame, so the caller's data stays untouched.
    result = df.reset_index(drop=True)

    result['date'] = date
    result['date'] = pd.to_datetime(result['date'])

    cur_price = result['cur_price']
    # A cur_price of 0 means "no current price", so no discount is reported.
    result['discount'] = (cur_price - result['price']).where(cur_price > 0, 0)

    return result[['date',
                   'full_model_name',
                   'model_name',
                   'brand',
                   'product',
                   'series',
                   'specification',
                   'size_gb',
                   'color',
                   'condition',
                   'status',
                   'price',
                   'cur_price',
                   'discount',
                   ]]

## specs functions

In [35]:
def create_model_list_specs(url):
    """Collect name and detail-page URL of every model on the specs overview.

    Fetches ``url`` and returns one ``{'name': ..., 'url': ...}`` dict per
    ``div`` with class 'col-md-2 col-sm-3 col-xs-3': the name is the text of
    the tile's ``span`` and the URL is the ``href`` of its anchor.
    """
    overview = scrap_url(url)
    tiles = overview.find_all('div', class_='col-md-2 col-sm-3 col-xs-3')

    return [{'name': tile.span.text, 'url': tile.a['href']} for tile in tiles]

In [36]:
def read_model_details(model_list):
    """Scrape dimensions and weight for every model into a DataFrame.

    For each entry of ``model_list`` the model page is fetched and the table
    with id "commontec" is read. Feature names (``td`` cells with class
    "datasheet-features-type") are paired in order with the table's ``p``
    elements; pairs whose name contains 'Abmessung' or 'Gewicht' are kept.

    Parameters
    ----------
    model_list : list of dict
        Entries with the keys 'name' and 'url', as produced by
        ``create_model_list_specs``.

    Returns
    -------
    pandas.DataFrame
        One row per model with 'model_name', the matched feature columns and
        'model_url'. 'model_name' is now set for every row; before, a model
        without a matching feature had no name and therefore no join key.
    """
    rel_feature_names = ('Abmessung', 'Gewicht')

    feature_list = []

    for model in model_list:
        model_url = model['url']

        soup = scrap_url(model_url)

        # Look the table up once and reuse it for both queries.
        table = soup.find('table', id="commontec")
        feature_names = table.find_all('td', class_="datasheet-features-type")
        feature_values = table.find_all('p')

        feature_dict = {'model_name': model['name']}

        # NOTE(review): pairing by position assumes one <p> per feature cell,
        # in the same order - confirm against the page markup.
        for name_cell, value_cell in zip(feature_names, feature_values):
            feature_name = name_cell.text

            if any(f in feature_name for f in rel_feature_names):
                feature_dict[feature_name] = value_cell.text

        feature_dict['model_url'] = model_url

        feature_list.append(feature_dict)

    return pd.DataFrame(feature_list)

# run script

In [37]:
# Collect name and URL of every model on the listing pages starting at price_url.
model_list = create_model_list(price_url)

In [38]:
# Scrape conditions and prices for each collected model into one DataFrame.
df_price = create_scraped_data(model_list)

In [39]:
# Preview the first rows of the price table.
df_price.head()

Unnamed: 0,date,full_model_name,model_name,brand,product,series,specification,size_gb,color,condition,status,price,cur_price,discount
0,2023-09-27,Apple iPhone 14 256GB violett,iPhone 14,Apple,iPhone,14,,256,violett,neu,verfügbar,929,889,-40
1,2023-09-27,Apple iPhone 14 256GB violett,iPhone 14,Apple,iPhone,14,,256,violett,wie neu,ausverkauft,909,0,0
2,2023-09-27,Apple iPhone 14 256GB violett,iPhone 14,Apple,iPhone,14,,256,violett,sehr gut,ausverkauft,889,0,0
3,2023-09-27,Apple iPhone 14 256GB violett,iPhone 14,Apple,iPhone,14,,256,violett,gut,ausverkauft,849,0,0
4,2023-09-27,Apple iPhone 14 Pro 512GB dunkellila,iPhone 14 Pro,Apple,iPhone,14,Pro,512,dunkellila,neu,verfügbar,1419,0,0


In [40]:
# Collect the models listed at specs_url, scrape their details, and preview the result.
df_specs = read_model_details(create_model_list_specs(specs_url))
df_specs.head()

Unnamed: 0,model_name,Abmessungen (HxBxT),Gewicht,model_url
0,iPhone 15,147.6 Х 71.6 Х 7.8 mm,171 g,https://phonesdata.com/de/smartphones/apple/ip...
1,iPhone 15 Plus,160.9 Х 77.8 Х 7.8 mm,201 g,https://phonesdata.com/de/smartphones/apple/ip...
2,iPhone 15 Pro,146.6 Х 70.6 Х 8.3 mm,187 g,https://phonesdata.com/de/smartphones/apple/ip...
3,iPhone 15 Pro Max,159.9 Х 76.7 Х 8.3 mm,221 g,https://phonesdata.com/de/smartphones/apple/ip...
4,iPhone 14,146.7 Х 71.5 Х 7.8 mm,172 g,https://phonesdata.com/de/smartphones/apple/ip...


In [48]:
# Interactive step: asks for confirmation, then merges df_price with df_specs and writes the Excel file.
export_data()

New records of 2023-09-27 extracted and Excel file created.
