In [5]:
import pandas as pd
import re
from datetime import datetime

# Load and concat

In [6]:
# Stage-1 exports of the scraper, one CSV per scraping run (all stored as UTF-16)
stage1_files = [
    '../data/scraped_phones_2023-11-24_stage1.csv',
    '../data/scraped_phones_2023-11-27_stage1.csv',
    '../data/scraped_phones_2023-12-04_stage1.csv',
]
df1, df2, df3 = (pd.read_csv(csv_path, encoding='utf-16') for csv_path in stage1_files)

# Stack the three runs into one frame (the per-file row index is kept as is)
df = pd.concat([df1, df2, df3])

In [7]:
# Preview the first five rows of the combined data
df.head(n=5)

Unnamed: 0,nr,brand,name,ratings,specs,price,delivery_information,scraped_at
0,1,Samsung,Galaxy A54 5G,311 Bewertungen 4.4 von 5 Sternen,"256 GB, Awesome Graphite, 6.40"", Hybrid Dual S...",449.–,morgen geliefert,2023-11-24 11:45:20.220938
1,2,Samsung,Galaxy S23 Ultra,301 Bewertungen 4.6 von 5 Sternen,"512 GB, Phantom Black, 6.80"", SIM + eSIM, 200 ...",1229.–,morgen geliefert,2023-11-24 11:45:21.652983
2,3,Samsung,Galaxy A53 5G Enterprise Edition,58 Bewertungen 4.5 von 5 Sternen,"128 GB, Awesome Black, 6.50"", Dual SIM, 64 Mpx...",310.–,morgen geliefert,2023-11-24 11:45:23.086290
3,4,Google,Pixel 7a,146 Bewertungen 4.2 von 5 Sternen,"128 GB, Sea, 6.10"", SIM + eSIM, 64 Mpx, 5G",349.–,morgen geliefert,2023-11-24 11:45:24.533107
4,5,Google,Pixel 8 Pro,143 Bewertungen 4.2 von 5 Sternen,"256 GB, Obsidian, 6.70"", SIM + eSIM, 50 Mpx, 5G",1049.–,morgen geliefert,2023-11-24 11:45:25.968087


# fix data types (price, scraped_at)

In [8]:
#remove the price suffix
# regex=False: '.–' is a literal suffix. Without it, pandas < 2.0 interprets the
# pattern as a regular expression in which '.' matches any character.
df['price'] = df['price'].str.replace('.–', '', regex=False)
#convert to float
df['price'] = df['price'].astype(float)

#convert scraped_at to datetime
df['scraped_at'] = pd.to_datetime(df['scraped_at'])

# fix ratings column

In [9]:
# Pull the numeric score out of texts such as "311 Bewertungen 4.4 von 5 Sternen",
# keep it as an object column and label a score of '0.0' as 'not rated'
rating_values = (
    df['ratings']
    .str.extract(r'(\d+\.\d+) von 5 Sternen', expand=False)
    .astype(object)
    .replace('0.0', 'not rated')
)

# Swap the raw ratings text for the extracted score
df = df.drop(columns='ratings').assign(extracted_rating=rating_values)

# missing values (brand and specs)

In [10]:
# Rows without a brand are removed.
# Rows without specs are removed as well, because they cannot be used later on:
# the specs show no recognizable structure, so each entry would have to be fixed manually.
df = df.dropna(subset=['brand', 'specs']).copy()

# extract color and storage

In [11]:
specs_text = df['specs']

# Storage: the first number directly followed by "GB", converted to a numeric column
# (rows whose specs contain no "<number> GB" get NaN)
df['extracted_gb'] = pd.to_numeric(specs_text.str.extract(r'(\d+)\s*GB', expand=False))

# Color: the comma-separated field that follows the first "GB," in the specs
df["phone_color"] = specs_text.str.extract(r'GB,\s*([^,]+)', expand=False)

# remove phones missing color/GB

In [12]:
# Keep only phones for which both the storage size and the color could be extracted
df = df.dropna(subset=['extracted_gb', 'phone_color']).copy()

# replace missing values in the delivery information
Impute with the mode for each brand, because the hover-over window sometimes did not work during scraping.

In [13]:
#helper function to impute missing values with the mode
def impute_with_mode(group):
    """Fill missing 'delivery_information' values with the group's most frequent value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a 'delivery_information' column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing 'delivery_information' entries are replaced
        by the mode of that column. If the column holds no non-missing value
        there is no mode, and ``group`` is returned unchanged.
    """
    modes = group['delivery_information'].mode()
    if modes.empty:
        # every value is missing -> nothing to impute with
        return group
    # Build a new frame instead of calling fillna(..., inplace=True) on the selected
    # column: that chained in-place update silently modifies the caller's frame and
    # has no effect at all when pandas Copy-on-Write is active.
    filled = group['delivery_information'].fillna(modes.iloc[0])
    return group.assign(delivery_information=filled)

# Apply the function to each group.
# The groups are iterated explicitly instead of calling groupby('brand').apply(...):
# apply() operating on the grouping column is deprecated since pandas 2.2, and whether
# it adds the group key to the result index depends on the pandas version.
# Iterating always yields the complete group (including 'brand'), ordered by brand.
imputed_groups = [impute_with_mode(group) for _, group in df.groupby('brand')]
if imputed_groups:  # pd.concat raises on an empty list (no brands left)
    df = pd.concat(imputed_groups)

#drop the remaining nulls
df = df[df['delivery_information'].notnull()].copy()

#reset the index
df = df.reset_index(drop=True)

#get the proper date (remove time)
df["date"] = df["scraped_at"].dt.date

#only select relevant cols
cleaned_df = df[["brand", "name", "price", "delivery_information", "extracted_rating", "extracted_gb",
                 "phone_color", "date"]].copy()


## replace delivery information with days

In [14]:
def calculate_delivery_time(row):
    """Translate the scraped delivery text of one row into a number of days.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'delivery_information' (the scraped delivery text) and
        'date' (the ``datetime.date`` on which the offer was scraped).

    Returns
    -------
    int or str
        1 for 'morgen geliefert', 2 for 'übermorgen geliefert'; otherwise the
        number of days between ``row['date']`` and the first 'd.m.' date found
        in the text. The string 'Lieferung nicht möglich' is returned when the
        text contains no date.

    Raises
    ------
    ValueError
        If the first 'd.m.' pattern in the text is not a valid calendar date.
    """
    info = row['delivery_information']
    if info == 'morgen geliefert':
        return 1
    if info == 'übermorgen geliefert':
        return 2

    # Extract the first date ("d.m.") from the string
    match = re.search(r'\d{1,2}\.\d{1,2}\.', info)
    if not match:
        return "Lieferung nicht möglich"  #if the phone can't be delivered

    day, month = (int(part) for part in match.group().rstrip('.').split('.'))
    order_date = row["date"]

    # The text carries no year. A delivery cannot lie before the order, so use the
    # order's year and fall back to the following year when that date is already
    # past (e.g. scraped in December, delivered in January) or does not exist.
    for year in (order_date.year, order_date.year + 1):
        try:
            start_date = datetime(year, month, day).date()
        except ValueError:
            continue  # e.g. 29.2. in a non-leap year
        if start_date >= order_date:
            # Calculate the difference in days
            return (start_date - order_date).days

    raise ValueError(f"invalid delivery date {match.group()!r} in {info!r}")


# Row-wise conversion of the delivery text into days (or the "not deliverable" marker)
delivery_days = cleaned_df.apply(calculate_delivery_time, axis='columns')
cleaned_df['delivery_time_days'] = delivery_days

## check for duplicates

In [15]:
# Number of rows that repeat an earlier row in every column
cleaned_df.duplicated(keep='first').sum()

306

In [16]:
# Exact duplicate rows are irrelevant for the analysis: keep the first occurrence only
cleaned_df = cleaned_df.drop_duplicates(subset=None, keep='first')

## save cleaned data

In [130]:
# Write the cleaned stage-2 data without the row index.
# NOTE(review): the inputs are read from '../data/', while this path points to 'data/'
# relative to the working directory - confirm that this is the intended folder.
output_path = 'data/robin_portmann_stage2.csv'
cleaned_df.to_csv(output_path, index=False)