In [87]:
import pandas as pd
from scipy.stats import mode
import re

In [56]:
# Load both stage-2 scrape exports (one CSV per contributor) in a single pass.
df_robin, df_oli = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/robin_portmann_stage2.csv",
        "data/oliver_heisel_stage2.csv",
    )
)

In [57]:
# Preview the first rows of Robin's scrape to see which columns are available.
df_robin.head()

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date,delivery_time_days
0,ASUS,ROG Phone 6,599.0,morgen geliefert,4.4,512.0,Phantom Black,2023-11-24,1
1,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24,5
2,ASUS,Zenfone 10,649.0,morgen geliefert,3.6,256.0,Midnight Black,2023-11-24,1
3,ASUS,ROG Phone 7,1040.0,morgen geliefert,4.5,512.0,Storm White,2023-11-24,1
4,ASUS,Zenfone 10,828.0,morgen geliefert,3.9,512.0,Midnight Black,2023-11-24,1


In [58]:
# Preview the first rows of Oli's scrape; its column names differ from Robin's.
df_oli.head()

Unnamed: 0,scrapeday,brand,model,storage,color,refurbished,rating,deliverydays,price_CHF
0,bf,XIAOMI,Redmi Note 12,128,Onyx Gray,0,4.125,0,192.09
1,bf,XIAOMI,12 T 5G,256,Black,0,4.5263,1,346.54
2,bf,XIAOMI,Redmi 12,128,Midnight Black,0,4.4872,1,153.48
3,bf,XIAOMI,Redmi A2,32,Black,0,4.0,1,75.28
4,bf,XIAOMI,Redmi A2,32,Light Green,0,4.0,1,75.28


### First, we need to match the `date` and `scrapeday` columns

In [59]:
# Count rows per scrape date; these dates must be translated into scrapeday codes.
df_robin["date"].value_counts()

2023-11-24    1955
2023-11-27    1785
2023-12-04    1783
Name: date, dtype: int64

In [60]:
# Count rows per scrapeday code in Oli's data, for comparison with Robin's dates.
df_oli["scrapeday"].value_counts()

af    2051
bf    1850
cm    1800
Name: scrapeday, dtype: int64

In [61]:
# Translate every scrape date into the scrapeday code used in Oli's dataset.
# Dates that are not listed here are passed through unchanged by `replace`.
date_replacements = {
    "2023-11-24": "bf",
    "2023-11-27": "cm",
    "2023-12-04": "af",
}

df_robin = df_robin.assign(
    scrapeday=df_robin["date"].replace(to_replace=date_replacements)
)

In [62]:
# Sanity check: the derived codes should only contain the three scrapeday values.
df_robin["scrapeday"].value_counts()

bf    1955
cm    1785
af    1783
Name: scrapeday, dtype: int64

In [63]:
# List the distinct model names to get a feel for how phones are named.
df_robin["name"].unique()

array(['ROG Phone 6', 'ROG Phone 6 Diablo Immortal Edition', 'Zenfone 10',
       ..., '8i', '9 Pro+', 'GT Neo 3'], dtype=object)

In [64]:
# Display Robin's frame to confirm that the scrapeday column was added.
df_robin

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date,delivery_time_days,scrapeday
0,ASUS,ROG Phone 6,599.00,morgen geliefert,4.4,512.0,Phantom Black,2023-11-24,1,bf
1,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24,5,bf
2,ASUS,Zenfone 10,649.00,morgen geliefert,3.6,256.0,Midnight Black,2023-11-24,1,bf
3,ASUS,ROG Phone 7,1040.00,morgen geliefert,4.5,512.0,Storm White,2023-11-24,1,bf
4,ASUS,Zenfone 10,828.00,morgen geliefert,3.9,512.0,Midnight Black,2023-11-24,1,bf
...,...,...,...,...,...,...,...,...,...,...
5518,realme,C35,149.00,Zwischen Sa. 9.12. und Fr. 15.12. geliefert,5.0,64.0,Glowing green,2023-12-04,5,af
5519,realme,9i,114.00,Zwischen Mi. 13.12. und Fr. 15.12. geliefert,5.0,64.0,Prism Blue,2023-12-04,9,af
5520,realme,C33 4/64GB Dual SIM Night Sea,115.00,Zwischen Fr. 15.12. und Di. 19.12. geliefert,not rated,64.0,Black,2023-12-04,11,af
5521,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,211.00,Zwischen Do. 7.12. und Mo. 11.12. geliefert,not rated,4.0,Schwarz,2023-12-04,3,af


In [65]:
# Display Oli's frame to compare its layout with Robin's.
df_oli

Unnamed: 0,scrapeday,brand,model,storage,color,refurbished,rating,deliverydays,price_CHF
0,bf,XIAOMI,Redmi Note 12,128,Onyx Gray,0,4.1250,0,192.09
1,bf,XIAOMI,12 T 5G,256,Black,0,4.5263,1,346.54
2,bf,XIAOMI,Redmi 12,128,Midnight Black,0,4.4872,1,153.48
3,bf,XIAOMI,Redmi A2,32,Black,0,4.0000,1,75.28
4,bf,XIAOMI,Redmi A2,32,Light Green,0,4.0000,1,75.28
...,...,...,...,...,...,...,...,...,...
5696,af,VIVANCO,36300,0,Weiß,0,0.0000,1,11.95
5697,af,VIVANCO,37973,1,Schwarz,0,0.0000,4,33.28
5698,af,WIKO,Y82,32,Dark Blue,0,5.0000,0,94.54
5699,af,WIKO,Y52,16,Deep Blue,0,2.0000,0,66.18


In [67]:
# Keep only the rows of Oli's dataset that are not flagged as refurbished.
df_oli = df_oli.query("refurbished == 0").copy()

df_oli

Unnamed: 0,scrapeday,brand,model,storage,color,refurbished,rating,deliverydays,price_CHF
0,bf,XIAOMI,Redmi Note 12,128,Onyx Gray,0,4.1250,0,192.09
1,bf,XIAOMI,12 T 5G,256,Black,0,4.5263,1,346.54
2,bf,XIAOMI,Redmi 12,128,Midnight Black,0,4.4872,1,153.48
3,bf,XIAOMI,Redmi A2,32,Black,0,4.0000,1,75.28
4,bf,XIAOMI,Redmi A2,32,Light Green,0,4.0000,1,75.28
...,...,...,...,...,...,...,...,...,...
5696,af,VIVANCO,36300,0,Weiß,0,0.0000,1,11.95
5697,af,VIVANCO,37973,1,Schwarz,0,0.0000,4,33.28
5698,af,WIKO,Y82,32,Dark Blue,0,5.0000,0,94.54
5699,af,WIKO,Y52,16,Deep Blue,0,2.0000,0,66.18


# Merging the data

In [17]:
# First attempt: exact inner join on brand, model name, colour, storage and scrape day.
df_robin.merge(
    df_oli,
    how="inner",
    left_on=["brand", "name", "phone_color", "extracted_gb", "scrapeday"],
    right_on=["brand", "model", "color", "storage", "scrapeday"],
    suffixes=("_digitec", "_mediamarkt"),
)

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date,delivery_time_days,scrapeday,model,storage,color,refurbished,rating,deliverydays,price_CHF
0,ASUS,ROG Phone 7,1040.0,morgen geliefert,4.5,512.0,Storm White,2023-11-24,1,bf,ROG Phone 7,512,Storm White,0,5.0,3,1157.39
1,ASUS,ROG Phone 7 Ultimate,1349.0,morgen geliefert,3.9,512.0,Storm White,2023-11-24,1,bf,ROG Phone 7 Ultimate,512,Storm White,0,4.5238,3,1350.45
2,ASUS,ROG Phone 7,959.0,Zwischen Sa. 25.11. und Do. 30.11. geliefert,4.5,512.0,Phantom Black,2023-11-24,1,bf,ROG Phone 7,512,Phantom Black,0,4.64,5,1156.43
3,ASUS,ROG Phone 7,878.0,morgen geliefert,3.4,256.0,Phantom Black,2023-11-24,1,bf,ROG Phone 7,256,Phantom Black,0,4.5,5,1051.0
4,ASUS,ROG Phone 7,940.0,morgen geliefert,3.4,256.0,Storm White,2023-11-24,1,bf,ROG Phone 7,256,Storm White,0,4.5,5,974.31
5,ASUS,Zenfone 9,593.0,morgen geliefert,4.6,128.0,Midnight Black,2023-11-24,1,bf,Zenfone 9,128,Midnight Black,0,4.7143,1,602.32
6,ASUS,Zenfone 9,746.0,morgen geliefert,4.6,256.0,Midnight Black,2023-11-24,1,bf,Zenfone 9,256,Midnight Black,0,4.4,7,1453.37
7,ASUS,Zenfone 9,537.33,Zwischen Do. 30.11. und Mo. 4.12. geliefert,4.6,128.0,Moonlight White,2023-11-24,6,bf,Zenfone 9,128,Moonlight White,0,5.0,1,546.48
8,ASUS,ROG Phone 7 Ultimate,1299.0,morgen geliefert,3.9,512.0,Storm White,2023-11-27,1,cm,ROG Phone 7 Ultimate,512,Storm White,0,4.5238,2,1348.78
9,ASUS,ROG Phone 7,1041.0,morgen geliefert,4.5,512.0,Storm White,2023-11-27,1,cm,ROG Phone 7,512,Storm White,0,5.0,2,1155.96


Unfortunately, the data does not match well; this is most likely because the two datasets name the phone colours differently. We will therefore first group the data by brand, name, storage and scrape day and take the average price.

In [27]:
#helper function to aggregate by the mode
def mode_agg(series):
    """Return the most frequent value of ``series``.

    Implemented with ``pandas.Series.mode`` rather than
    ``scipy.stats.mode(series).mode[0]``: recent SciPy releases return a
    scalar mode for 1-D input (so indexing with ``[0]`` fails) and reject
    non-numeric data such as the string ratings in this notebook.

    Parameters
    ----------
    series : pandas.Series or array-like
        Values of one group; may be numeric or strings.

    Returns
    -------
    scalar
        The most common value. When several values tie, the smallest one is
        returned (pandas sorts the modes when the values are comparable).
        ``NaN`` is returned for empty input.
    """
    # dropna=False counts NaN like any other value, mirroring SciPy's default
    # nan_policy='propagate'.
    modes = pd.Series(series).mode(dropna=False)
    if modes.empty:
        return float("nan")
    return modes.iloc[0]

# Collapse colour variants: one row per brand / model name / storage / scrape day,
# with the mean price and the most frequent rating and delivery time.
df_robin_grouped = (
    df_robin
    .groupby(["brand", "name", "extracted_gb", "scrapeday"], as_index=False)
    .agg({"price": "mean", "extracted_rating": mode_agg, "delivery_time_days": mode_agg})
)

In [70]:
# Inspect the aggregated version of Robin's data.
df_robin_grouped

Unnamed: 0,brand,name,extracted_gb,scrapeday,price,extracted_rating,delivery_time_days
0,ASUS,ROG Phone 5 (12GB),256.0,af,782.005,4.6,1
1,ASUS,ROG Phone 5 (12GB),256.0,bf,782.005,4.6,1
2,ASUS,ROG Phone 5 (12GB),256.0,cm,782.005,4.6,1
3,ASUS,ROG Phone 5 (16GB),256.0,af,804.000,4.7,1
4,ASUS,ROG Phone 5 (16GB),256.0,bf,804.000,4.7,1
...,...,...,...,...,...,...,...
3402,realme,Narzo 50A,64.0,cm,179.000,not rated,38
3403,realme,Narzo 50A Prime,4.0,bf,197.000,not rated,34
3404,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,4.0,af,211.000,not rated,3
3405,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,4.0,bf,196.000,not rated,4


In [68]:
# Aggregate Oli's data to the same brand / model / storage / scrape-day level:
# mean price, mean rating and median delivery time.
df_oli_grouped = (
    df_oli
    .groupby(["brand", "model", "storage", "scrapeday"], as_index=False)
    .agg({"price_CHF": "mean", "rating": "mean", "deliverydays": "median"})
)

In [69]:
# Inspect the aggregated version of Oli's data.
df_oli_grouped

Unnamed: 0,brand,model,storage,scrapeday,price_CHF,rating,deliverydays
0,ALCATEL,1B 2022 5031G,32,af,92.70,5.00000,1.0
1,ALCATEL,1B 2022 5031G,32,bf,94.64,5.00000,1.0
2,ALCATEL,1B 2022 5031G,32,cm,94.52,5.00000,1.0
3,ALCATEL,5033FR,16,af,64.57,0.00000,3.0
4,ALCATEL,5033FR,16,bf,72.30,0.00000,5.0
...,...,...,...,...,...,...,...
3465,ZTE,V40 Pro,128,bf,241.32,2.00000,1.0
3466,ZTE,V40 Pro,128,cm,241.02,2.00000,2.0
3467,ZTE,V40 Vita,128,af,132.36,3.91665,1.0
3468,ZTE,V40 Vita,128,bf,135.13,3.91665,1.5


In [73]:
# Spot-check one brand in Robin's aggregated data.
# NOTE(review): this filter uses the mixed-case spelling 'Apple'; once the brand
# column has been upper-cased it would have to be 'APPLE' to return rows.
df_robin_grouped.query("brand == 'Apple'")

Unnamed: 0,brand,name,extracted_gb,scrapeday,price,extracted_rating,delivery_time_days
105,Apple,iPhone 11,64.0,af,443.500000,4.7,1
106,Apple,iPhone 11,64.0,bf,459.000000,4.7,1
107,Apple,iPhone 11,64.0,cm,443.500000,4.7,1
108,Apple,iPhone 11,128.0,af,455.666667,4.7,1
109,Apple,iPhone 11,128.0,bf,499.000000,4.7,1
...,...,...,...,...,...,...,...
355,Apple,iPhone 14 Plus,256.0,bf,869.000000,4.6,1
356,Apple,iPhone 14 Plus,256.0,cm,869.000000,4.7,1
357,Apple,iPhone 14 Plus,512.0,af,900.000000,4.7,5
358,Apple,iPhone 14 Plus,512.0,bf,993.000000,4.6,7


In [74]:
# Same spot-check on Oli's aggregated data; no rows come back because the brand
# is spelled 'APPLE' there (see the list of unique brands).
df_oli_grouped.query("brand == 'Apple'")

Unnamed: 0,brand,model,storage,scrapeday,price_CHF,rating,deliverydays


In [77]:
# List the brand spellings used in Oli's data.
df_oli_grouped["brand"].unique()

array(['ALCATEL', 'ALLVIEW', 'APPLE', 'ASUS', 'BEAFON', 'BLACKBERRY',
       'BLACKVIEW', 'CARBON', 'CAT', 'CROSSCALL', 'CUBOT', 'DOOGEE',
       'DORO', 'EMPORIA', 'ENERGIZER', 'FAIRPHONE', 'FOSSIBOT', 'FUNKER',
       'FYSIC', 'GIGASET', 'GOOGLE', 'HAMMER', 'HONOR', 'HUAWEI',
       'IIIF150', 'INFINIX', 'LENOVO', 'LEXIBOOK', 'LG', 'MAXCOM',
       'MOTOROLA', 'MP', 'MYPHONE', 'Myphone', 'NOKIA', 'NOTHING',
       'OLYMPIA', 'OTROS', 'OUKITEL', 'PEAQ', 'POCO', 'POCOPHONE', 'QUBO',
       'REALME', 'REPHONE', 'RUGGEAR', 'SAMSUNG', 'SONY', 'SPC',
       'SWISSVOICE', 'TCL', 'TECNO', 'TELECOM', 'ULEFONE', 'UMIDIGI',
       'VIVANCO', 'WIKO', 'XIAOMI', 'ZTE'], dtype=object)

In [80]:
# List the brand spellings used in Robin's data for comparison.
df_robin_grouped["brand"].unique()

array(['ASUS', 'ACER', 'ALCATEL', 'ALLVIEW', 'APPLE', 'BEA-FON',
       'BLABLOO', 'BLACKBERRY', 'BLACKVIEW', 'BRONDI', 'CAT', 'CROSSCALL',
       'CUBOT', 'CUSTOM', 'CYRUS', 'DENSO', 'DOOGEE', 'DORO', 'EMPORIA',
       'ENERGIZER', 'FAIRPHONE', 'GIGASET', 'GOOGLE', 'HTC', 'HAMMER',
       'HONOR', 'HUAWEI', 'I.SAFE MOBILE', 'IIIF150', 'INAPA', 'INFINIX',
       'INOI', 'KAZAM', 'KRÜGER&MATZ', 'LG', 'MAXCOM', 'MICROSOFT',
       'MOTOROLA', 'MYPHONE', 'NGM', 'NONAME', 'NOKIA', 'NOTHING',
       'NUBIA', 'OPPO', 'OLYMPIA', 'ONEPLUS', 'ORDISSIMO', 'OUKITEL',
       'POCO', 'REPHONE', 'RUG GEAR', 'SAMSUNG', 'SONY', 'SYCO', 'TCL',
       'TE CONNECTIVITY', 'TIM', 'TECNO', 'TELEKOM', 'ULEFONE', 'UMI',
       'UMIDIGI', 'VIVO', 'VODAFONE', 'VOLLA', 'XIAOMI', 'ZTE', 'IGET',
       'REALME'], dtype=object)

The brand names in Oli's df are (almost) all upper case, whereas most of the brand names in Robin's df only capitalise the first letter. We will therefore convert Robin's brand names to upper case as well.

In [79]:
# Upper-case Robin's brand names so they follow the spelling used in Oli's data.
df_robin_grouped = df_robin_grouped.assign(
    brand=df_robin_grouped["brand"].str.upper()
)

In [82]:
# Inner-join the two aggregated tables on brand, model name, storage and scrape day.
df_merged = df_robin_grouped.merge(
    df_oli_grouped,
    how="inner",
    left_on=["brand", "name", "extracted_gb", "scrapeday"],
    right_on=["brand", "model", "storage", "scrapeday"],
    suffixes=("_digitec", "_mediamarkt"),
)

df_merged

Unnamed: 0,brand,name,extracted_gb,scrapeday,price,extracted_rating,delivery_time_days,model,storage,price_CHF,rating,deliverydays
0,ASUS,ROG Phone 7,256.0,af,921.0,3.4,1,ROG Phone 7,256,1015.500,4.5000,1.0
1,ASUS,ROG Phone 7,256.0,bf,909.0,3.4,1,ROG Phone 7,256,1012.655,4.5000,5.0
2,ASUS,ROG Phone 7,256.0,cm,999.0,3.4,1,ROG Phone 7,256,1048.460,4.5000,4.0
3,ASUS,ROG Phone 7,512.0,af,1079.0,4.5,1,ROG Phone 7,512,1133.650,4.8269,2.0
4,ASUS,ROG Phone 7,512.0,bf,999.5,4.5,1,ROG Phone 7,512,1156.910,4.8200,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
537,REALME,C30,32.0,cm,84.4,3.7,5,C30,32,79.190,2.5000,1.0
538,REALME,C31,64.0,cm,142.5,5.0,11,C31,64,129.530,0.0000,1.0
539,REALME,C35,128.0,af,191.0,5.0,1,C35,128,222.635,2.5000,2.5
540,REALME,C35,128.0,bf,191.0,5.0,1,C35,128,204.085,0.0000,4.5


In [84]:
# Model names that were matched in both datasets.
df_merged["name"].unique()

array(['ROG Phone 7', 'ROG Phone 7 Ultimate', 'Zenfone 9', 'iPhone 11',
       'iPhone 12', 'iPhone 12 mini', 'iPhone 13', 'iPhone 13 Pro',
       'iPhone 13 mini', 'iPhone 14', 'iPhone 14 Plus', 'iPhone 14 Pro',
       'iPhone 14 Pro Max', 'iPhone 15', 'iPhone 15 Pro', 'iPhone SE',
       'BL5000', 'BL8800 Pro', 'BV4900', 'BV4900 Pro', 'BV6300 Pro',
       'BV8800', 'S42 H+', 'S62 Pro', 'S75', '8200', 'SMART.5 mini', '5',
       'GS4', 'GS5 LITE', 'Pixel 2', 'Pixel 6', 'Pixel 6 Pro', 'Pixel 6a',
       'Pixel 7', 'Pixel 7 Pro', 'Pixel 7a', 'Pixel 8', 'Pixel 8 Pro',
       'Iron 4', '50', '70', '90', '90 Lite', 'Magic 4 Lite 5G',
       'Magic5 Pro', 'X8', 'Mate 50 Pro', 'Mate X3', 'Nova 10',
       'Nova 10 SE', 'P30', 'P30 Pro', 'P40 Lite', 'P40 Lite 5G',
       'P40 Pro', 'P50 Pro', 'P60 Pro', 'Y5 (2019)', 'Defy',
       'Edge 30 Neo', 'Edge 40 Pro', 'Moto E22', 'Moto E6i', 'Moto G22',
       'Moto G31', 'Moto G32', 'Moto G52', 'Moto G72', 'Moto G9 Play',
       'Razr 40', 'C12', 'C

In [85]:
# All model names on Robin's side, to look for naming patterns that block matches.
df_robin_grouped["name"].unique()

array(['ROG Phone 5 (12GB)', 'ROG Phone 5 (16GB)', 'ROG Phone 5s (12GB)',
       ..., 'Narzo 50A', 'Narzo 50A Prime',
       'Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ") IPS LCD screen, Android 11, 50MP triple camera'],
      dtype=object)

In [86]:
# All model names on Oli's side, to look for naming patterns that block matches.
df_oli_grouped["model"].unique()

array(['1B 2022 5031G', '5033FR', 'PIXI 4-5 (3G) 5010D BLACK', ...,
       'P606F01-GREY', 'V40 Pro', 'V40 Vita'], dtype=object)

Looking at the phone names, the storage and/or the colour is sometimes included in the name, which makes matching the phones even more difficult. We will try to remove them as well as possible using regular expressions.

In [92]:
# Function to clean phone names
def clean_phone_name(name):
    """Normalise a phone model name so that both datasets can be joined on it.

    The following parts are removed, in this order:

    1. storage sizes such as ``64GB``, ``64 gb`` or ``1TB``, including
       RAM/storage pairs like ``4/64GB`` or ``8GB/128GB``;
    2. a fixed list of colour words (case-insensitive, whole words only);
    3. anything in parentheses, and every hyphen;
    4. redundant whitespace.

    Parameters
    ----------
    name : str
        Raw model name. Non-string values (e.g. NaN or None) are returned
        unchanged instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned name (or the untouched input if it was not a string).
    """
    # Missing values cannot be cleaned; pass them through untouched.
    if not isinstance(name, str):
        return name
    # Remove storage capacity patterns. Matching is case-insensitive, allows
    # whitespace before the unit, accepts TB, and swallows a leading RAM part
    # ("4/" in "4/64GB") so no dangling fragment stays behind.
    name = re.sub(
        r'(?:\d+\s*(?:GB|TB)?\s*/\s*)?\d+\s*(?:GB|TB)',
        '',
        name,
        flags=re.IGNORECASE,
    )
    # Remove common color names and other patterns
    name = re.sub(r'\b(schwarz|grün|black|white|blue|red|gold|silver)\b', '', name, flags=re.IGNORECASE)
    # Remove additional unwanted patterns (e.g., '(NEU)', '-')
    name = re.sub(r'\(.*?\)|-', '', name)
    # Collapse the whitespace left behind by the removals above
    name = re.sub(r'\s+', ' ', name).strip()
    return name

# Add a normalised model-name column to both aggregated tables.
df_robin_grouped['cleaned_phone_name'] = df_robin_grouped['name'].map(clean_phone_name)
df_oli_grouped['cleaned_phone_name'] = df_oli_grouped['model'].map(clean_phone_name)

In [93]:
# Repeat the join, this time matching on the cleaned model names.
df_robin_grouped.merge(
    df_oli_grouped,
    how="inner",
    left_on=["brand", "cleaned_phone_name", "extracted_gb", "scrapeday"],
    right_on=["brand", "cleaned_phone_name", "storage", "scrapeday"],
    suffixes=("_digitec", "_mediamarkt"),
)

Unnamed: 0,brand,name,extracted_gb,scrapeday,price,extracted_rating,delivery_time_days,cleaned_phone_name,model,storage,price_CHF,rating,deliverydays
0,ASUS,ROG Phone 7,256.0,af,921.0,3.4,1,ROG Phone 7,ROG Phone 7,256,1015.500,4.5000,1.0
1,ASUS,ROG Phone 7,256.0,bf,909.0,3.4,1,ROG Phone 7,ROG Phone 7,256,1012.655,4.5000,5.0
2,ASUS,ROG Phone 7,256.0,cm,999.0,3.4,1,ROG Phone 7,ROG Phone 7,256,1048.460,4.5000,4.0
3,ASUS,ROG Phone 7,512.0,af,1079.0,4.5,1,ROG Phone 7,ROG Phone 7,512,1133.650,4.8269,2.0
4,ASUS,ROG Phone 7,512.0,bf,999.5,4.5,1,ROG Phone 7,ROG Phone 7,512,1156.910,4.8200,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
613,REALME,C30,32.0,cm,84.4,3.7,5,C30,C30,32,79.190,2.5000,1.0
614,REALME,C31,64.0,cm,142.5,5.0,11,C31,C31,64,129.530,0.0000,1.0
615,REALME,C35,128.0,af,191.0,5.0,1,C35,C35,128,222.635,2.5000,2.5
616,REALME,C35,128.0,bf,191.0,5.0,1,C35,C35,128,204.085,0.0000,4.5
