In [92]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
pd.set_option('display.max_rows', 100)

In [93]:
#load scraped data
df1 = pd.read_csv('data/scraped_phones_2023-11-24_stage1.csv', encoding='utf-16')
df2 = pd.read_csv('data/scraped_phones_2023-11-27_stage1.csv', encoding='utf-16')
df3 = pd.read_csv('data/scraped_phones_2023-12-04_stage1.csv', encoding='utf-16')

In [94]:
#concat the dataframes
df = pd.concat([df1, df2, df3])

df

Unnamed: 0,nr,brand,name,ratings,specs,price,delivery_information,scraped_at
0,1,Samsung,Galaxy A54 5G,311 Bewertungen 4.4 von 5 Sternen,"256 GB, Awesome Graphite, 6.40"", Hybrid Dual S...",449.–,morgen geliefert,2023-11-24 11:45:20.220938
1,2,Samsung,Galaxy S23 Ultra,301 Bewertungen 4.6 von 5 Sternen,"512 GB, Phantom Black, 6.80"", SIM + eSIM, 200 ...",1229.–,morgen geliefert,2023-11-24 11:45:21.652983
2,3,Samsung,Galaxy A53 5G Enterprise Edition,58 Bewertungen 4.5 von 5 Sternen,"128 GB, Awesome Black, 6.50"", Dual SIM, 64 Mpx...",310.–,morgen geliefert,2023-11-24 11:45:23.086290
3,4,Google,Pixel 7a,146 Bewertungen 4.2 von 5 Sternen,"128 GB, Sea, 6.10"", SIM + eSIM, 64 Mpx, 5G",349.–,morgen geliefert,2023-11-24 11:45:24.533107
4,5,Google,Pixel 8 Pro,143 Bewertungen 4.2 von 5 Sternen,"256 GB, Obsidian, 6.70"", SIM + eSIM, 50 Mpx, 5G",1049.–,morgen geliefert,2023-11-24 11:45:25.968087
...,...,...,...,...,...,...,...,...
2052,2053,OnePlus,9 Pro (12GB),2 Bewertungen 5.0 von 5 Sternen,"256 GB, Morning Mist, 6.70"", Dual SIM, 48 Mpx, 5G",534.17,Zwischen Do. 7.12. und Mi. 13.12. geliefert,2023-12-04 09:29:23.810675
2053,2054,OPPO,Reno 7,2 Bewertungen 4.5 von 5 Sternen,"128 GB, Orange, Dual SIM, 4G",216.89,Zwischen Do. 7.12. und Mi. 13.12. geliefert,2023-12-04 09:29:25.301772
2054,2055,Apple,iPhone 15 Plus,20 Bewertungen 4.5 von 5 Sternen,"512 GB, Green, 6.70"", SIM + eSIM, 48 Mpx, 5G",1388.05,Zwischen Di. 5.12. und Do. 7.12. geliefert,2023-12-04 09:29:26.784213
2055,2056,Apple,iPhone 12 Pro,44 Bewertungen 4.8 von 5 Sternen,"256 GB, Gold, 6.10"", SIM + eSIM, 12 Mpx, 5G",1095.95,Zwischen Mi. 6.12. und Di. 12.12. geliefert,2023-12-04 09:29:28.294434


# checking the data

Checklist of data impurities:
- missing data
- wrong data types (price, delivery information etc.)
- incorrect data (ratings showing 0, but actually not rated)
- inconsistent data (specs/phone names not consistent)
- duplicates



In [95]:
#have a look at the data
df.head()

Unnamed: 0,nr,brand,name,ratings,specs,price,delivery_information,scraped_at
0,1,Samsung,Galaxy A54 5G,311 Bewertungen 4.4 von 5 Sternen,"256 GB, Awesome Graphite, 6.40"", Hybrid Dual S...",449.–,morgen geliefert,2023-11-24 11:45:20.220938
1,2,Samsung,Galaxy S23 Ultra,301 Bewertungen 4.6 von 5 Sternen,"512 GB, Phantom Black, 6.80"", SIM + eSIM, 200 ...",1229.–,morgen geliefert,2023-11-24 11:45:21.652983
2,3,Samsung,Galaxy A53 5G Enterprise Edition,58 Bewertungen 4.5 von 5 Sternen,"128 GB, Awesome Black, 6.50"", Dual SIM, 64 Mpx...",310.–,morgen geliefert,2023-11-24 11:45:23.086290
3,4,Google,Pixel 7a,146 Bewertungen 4.2 von 5 Sternen,"128 GB, Sea, 6.10"", SIM + eSIM, 64 Mpx, 5G",349.–,morgen geliefert,2023-11-24 11:45:24.533107
4,5,Google,Pixel 8 Pro,143 Bewertungen 4.2 von 5 Sternen,"256 GB, Obsidian, 6.70"", SIM + eSIM, 50 Mpx, 5G",1049.–,morgen geliefert,2023-11-24 11:45:25.968087


In [96]:
#check the data types
df.dtypes

nr                       int64
brand                   object
name                    object
ratings                 object
specs                   object
price                   object
delivery_information    object
scraped_at              object
dtype: object

# fix price data type

In [97]:
#remove the currency symbol
df['price'] = df['price'].str.replace('.–', '')
#convert to decimal
df['price'] = df['price'].astype(float)

df.dtypes

nr                        int64
brand                    object
name                     object
ratings                  object
specs                    object
price                   float64
delivery_information     object
scraped_at               object
dtype: object

In [98]:
#convert scraped_at to date
df['scraped_at'] = pd.to_datetime(df['scraped_at'])
df.dtypes

nr                               int64
brand                           object
name                            object
ratings                         object
specs                           object
price                          float64
delivery_information            object
scraped_at              datetime64[ns]
dtype: object

# fix ratings column

In [99]:
df["ratings"].value_counts()
#we need to replace all the 0 star ratings with not rated (because these did not have any ratings yet)

ratings
0.0 von 5 Sternen                    2124
1 Bewertung 5.0 von 5 Sternen         168
1 Bewertung 4.0 von 5 Sternen          71
1 Bewertung 1.0 von 5 Sternen          54
2 Bewertungen 5.0 von 5 Sternen        53
                                     ... 
64 Bewertungen 4.5 von 5 Sternen        1
59 Bewertungen 4.5 von 5 Sternen        1
111 Bewertungen 4.5 von 5 Sternen       1
36 Bewertungen 3.4 von 5 Sternen        1
46 Bewertungen 4.6 von 5 Sternen        1
Name: count, Length: 628, dtype: int64

In [100]:
#regular expression to extract the rating
df['extracted_rating'] = df['ratings'].str.extract(r'(\d+\.\d+) von 5 Sternen')

In [101]:
#convert extracted ratings to object
df['extracted_rating'] = df['extracted_rating'].astype(object)

In [102]:
#drop original ratings column
df = df.drop('ratings', axis=1)

df["extracted_rating"].value_counts()

extracted_rating
0.0    2124
4.5     574
4.7     454
4.6     440
4.4     424
4.3     389
4.8     332
5.0     307
4.0     251
4.2     196
4.1     159
3.9      95
3.5      67
3.8      59
1.0      57
3.0      56
3.6      50
3.7      49
2.0      44
4.9      36
3.4      29
3.3      27
3.1      15
2.3      14
3.2      14
2.5      13
2.7       8
2.6       6
1.7       3
2.9       3
1.3       3
2.4       3
Name: count, dtype: int64

In [103]:
#replace 0.0 zero rating with not rated
df['extracted_rating'] = df['extracted_rating'].replace('0.0', 'not rated')

df["extracted_rating"].value_counts()

extracted_rating
not rated    2124
4.5           574
4.7           454
4.6           440
4.4           424
4.3           389
4.8           332
5.0           307
4.0           251
4.2           196
4.1           159
3.9            95
3.5            67
3.8            59
1.0            57
3.0            56
3.6            50
3.7            49
2.0            44
4.9            36
3.4            29
3.3            27
3.1            15
2.3            14
3.2            14
2.5            13
2.7             8
2.6             6
1.7             3
2.9             3
1.3             3
2.4             3
Name: count, dtype: int64

# missing values

In [104]:
#check number of missing values
df.isnull().sum()
#there seem to be quite a few missing values, especially for specs and delivery information

nr                        0
brand                     3
name                      1
specs                   238
price                     1
delivery_information    179
scraped_at                0
extracted_rating          1
dtype: int64

In [105]:
#check where the specs are null
df[df['specs'].isnull()]

Unnamed: 0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating
751,752,Motorola,ThinkPhone - 5G smartphone - dual-SIM - RAM 8 ...,,656.0,morgen geliefert,2023-11-24 12:04:11.181377,not rated
953,954,,52004 - Schwarz - Mobiltelefon - Schwarz,,23.1,morgen geliefert,2023-11-24 12:09:33.204296,not rated
1106,1107,Ulefone,NOTE 16 PRO 8+128GB BLACK OEM,,222.0,Zwischen Fr. 1.12. und Di. 5.12. geliefert,2023-11-24 12:13:36.406830,not rated
1162,1163,Samsung,"Galaxy S22 SM-S901B 15,5 cm (6.1"" ) Dual-SIM 5...",,699.0,morgen geliefert,2023-11-24 12:15:10.492780,not rated
1186,1187,ZTE,BLADE V40 VITA 4+128GB DS 4G PINE GREEN OEM,,157.0,Zwischen Mi. 6.12. und Di. 12.12. geliefert,2023-11-24 12:15:53.117374,not rated
...,...,...,...,...,...,...,...,...
1967,1968,TCL,30 5G DEAMY BLUE 128GO (sans HS),,308.0,Zwischen 11.1.2024 und 20.1.2024 geliefert,2023-12-04 09:27:02.073108,not rated
1968,1969,TCL,30 5G TECH BLACK 64GO (sans HS),,281.0,Zwischen 11.1.2024 und 20.1.2024 geliefert,2023-12-04 09:27:05.687827,not rated
2019,2020,PhoneLook,Hülle Silikon schwarz Purple Pink Pineapple,,29.9,Zwischen Mi. 6.12. und Do. 7.12. geliefert,2023-12-04 09:28:29.183486,not rated
2020,2021,PhoneLook,Hülle Silikon schwarz Skulls and flowers,,29.9,Zwischen Mi. 6.12. und Do. 7.12. geliefert,2023-12-04 09:28:32.786150,not rated


In [106]:
#there is 1 phone with an unknown/missing brand, we will remove this
df = df[df['brand'].notnull()].copy()

df[df['brand'].isnull()]

Unnamed: 0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating


In [107]:
#we will also remove the phones with missing specs, as we will not be able to use them
#explain in docu why we do this (there is no real structre recognizable in the specs -> we would need to fix this manually for each entry)
df = df[df['specs'].notnull()].copy()

In [108]:
#check missing values again
df.isnull().sum() #only delivery information is missing now (we will deal with this later)

nr                        0
brand                     0
name                      0
specs                     0
price                     0
delivery_information    175
scraped_at                0
extracted_rating          0
dtype: int64

# get phone storage

In [109]:
# Regular expression to extract the GB value
df['extracted_gb'] = df['specs'].str.extract(r'(\d+)\s*GB')

# Convert extracted GB values to numeric
df['extracted_gb'] = pd.to_numeric(df['extracted_gb'])

df["extracted_gb"].value_counts()

extracted_gb
128.0     2173
256.0     1562
64.0       926
32.0       523
512.0      410
16.0        82
1000.0      77
3.0         55
4.0         20
8.0         20
0.0         10
2.0          5
13.0         4
10.0         2
Name: count, dtype: int64

there seem to be some phones with 0 GB storage, let us check them:

In [110]:
df.query("extracted_gb == 0")
#this does not seem to be an error, so we will leave this as it is

Unnamed: 0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating,extracted_gb
1619,1620,Energizer,E282SC,"0.00 GB, Black, 2.80"", Dual SIM, 2 Mpx, 4G",60.2,Zwischen Di. 12.12. und Mi. 20.12. geliefert,2023-11-24 12:28:58.768098,not rated,0.0
1873,1874,Nokia,150 GSM phone black,"0.00 GB, Black, Dual SIM",61.9,Zwischen Do. 14.12. und Fr. 22.12. geliefert,2023-11-24 12:37:02.854869,not rated,0.0
2121,2122,TCL,4021 Dual SIM szary,"0.00 GB, Grey, Dual SIM",34.9,Zwischen Do. 7.12. und Sa. 9.12. geliefert,2023-11-24 12:45:16.536756,not rated,0.0
2155,2156,Nokia,105 2019 KING TA-1174 DS FR BLACK,"0.00 GB, Black, Dual SIM",34.0,Zwischen Do. 28.12. und Di. 9.1. geliefert,2023-11-24 12:46:25.340580,not rated,0.0
1143,1144,Energizer,E282SC,"0.00 GB, Black, 2.80"", Dual SIM, 2 Mpx, 4G",60.2,Zwischen Di. 12.12. und Mi. 20.12. geliefert,2023-11-27 09:46:02.484559,not rated,0.0
1758,1759,TCL,4021 Dual SIM szary,"0.00 GB, Grey, Dual SIM",34.9,Zwischen Mi. 6.12. und Sa. 9.12. geliefert,2023-11-27 10:02:58.476163,not rated,0.0
1958,1959,Nokia,150 GSM phone black,"0.00 GB, Black, Dual SIM",64.9,Zwischen Do. 14.12. und Fr. 22.12. geliefert,2023-11-27 10:09:22.569909,not rated,0.0
1030,1031,Energizer,E282SC,"0.00 GB, Black, 2.80"", Dual SIM, 2 Mpx, 4G",59.8,Zwischen Di. 19.12. und Sa. 23.12. geliefert,2023-12-04 09:00:21.092253,not rated,0.0
1763,1764,TCL,4021 Dual SIM szary,"0.00 GB, Grey, Dual SIM",34.9,Zwischen Fr. 15.12. und Di. 19.12. geliefert,2023-12-04 09:20:43.476886,not rated,0.0
1930,1931,Nokia,150 GSM phone black,"0.00 GB, Black, Dual SIM",64.9,Zwischen Di. 19.12. und Do. 28.12. geliefert,2023-12-04 09:25:56.804640,not rated,0.0


# get phone color

In [111]:
df["phone_color"] = df['specs'].str.extract(r'GB,\s*([^,]+)')

df[df["phone_color"].isnull()]

Unnamed: 0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating,extracted_gb,phone_color
555,556,Xiaomi,Redmi Note 12s,Grün,199.00,morgen geliefert,2023-11-24 11:59:08.843548,not rated,,
827,828,ZTE,BLADE L9 1+32GB BLUE,32 GB,41.00,Zwischen Mi. 6.12. und Di. 12.12. geliefert,2023-11-24 12:06:10.958520,not rated,32.0,
860,861,POCO,M4 4+64GB DS 5G POCO YELLOW OEM,64 GB,119.00,morgen geliefert,2023-11-24 12:07:01.524810,not rated,64.0,
1111,1112,Onda,F22 CLS101 BLACK,"Black, Dual SIM",27.90,Zwischen Sa. 25.11. und Do. 30.11. geliefert,2023-11-24 12:13:44.321822,not rated,,
1149,1150,Emporia,CO2 Messgerät,Schwarz,74.90,,2023-11-24 12:14:47.212335,not rated,,
...,...,...,...,...,...,...,...,...,...,...
2000,2001,PhoneLook,Hülle Gummi Gel Bumper mit extra Schutz für Ec...,Transparent,15.90,Zwischen Mi. 6.12. und Do. 7.12. geliefert,2023-12-04 09:27:56.193411,not rated,,
2018,2019,Hammer Fitness,Iron 3 4G LTE + 9H Glas-Folie: Starter Pack,"Orange, 4G",206.00,Zwischen Mi. 6.12. und Mo. 11.12. geliefert,2023-12-04 09:28:25.564938,not rated,,
2022,2023,Hammer Fitness,Energy 2 Eco 4G LTE + 9H Glas-Folie : Starter ...,"Schwarz, 4G",216.00,Zwischen Mi. 6.12. und Mo. 11.12. geliefert,2023-12-04 09:28:37.917539,not rated,,
2023,2024,Hammer Fitness,Explorer Plus Eco 4G + Powerbank : Starter Pack,"Orange, 4G",322.00,Zwischen Mi. 6.12. und Mo. 11.12. geliefert,2023-12-04 09:28:39.415280,not rated,,


In [112]:
df.isnull().sum()
#there are still phones missing the GB or phone color, we will remove them since we need this information to later on merge the data with the other data sources, additionally these phones are not very popular (do not have any ratings)

nr                        0
brand                     0
name                      0
specs                     0
price                     0
delivery_information    175
scraped_at                0
extracted_rating          0
extracted_gb            195
phone_color             232
dtype: int64

In [113]:
#remove missing GB phones
df = df[df['extracted_gb'].notnull()].copy()
#remove missing phone color phones
df = df[df['phone_color'].notnull()].copy()

In [114]:
df.isnull().sum()

nr                        0
brand                     0
name                      0
specs                     0
price                     0
delivery_information    169
scraped_at                0
extracted_rating          0
extracted_gb              0
phone_color               0
dtype: int64

# replace missing values in the delivery information
impute with mode for each brand (because sometimes the hover over window did not work during scraping)

In [115]:
df[df["delivery_information"].isnull()]

Unnamed: 0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating,extracted_gb,phone_color
118,119,Samsung,Galaxy Z Flip5,"512 GB, Cream, 6.70"", Dual SIM, 12 Mpx, 5G",899.00,,2023-11-24 11:48:12.837864,3.6,512.0,Cream
127,128,Motorola,Razr 40 Ultra,"256 GB, Infinite Black, 6.90"", SIM + eSIM, 12 ...",793.00,,2023-11-24 11:48:25.811904,3.7,256.0,Infinite Black
211,212,Apple,iPhone 13,"128 GB, Green, 6.10"", SIM + eSIM, 12 Mpx, 5G",599.00,,2023-11-24 11:50:28.724630,4.7,128.0,Green
257,258,Apple,iPhone SE (3rd Gen),"128 GB, (PRODUCT)RED, 4.70"", SIM + eSIM, 12 Mp...",479.00,,2023-11-24 11:51:36.889756,4.6,128.0,(PRODUCT)RED
274,275,Apple,iPhone 12,"128 GB, White, 6.10"", SIM + eSIM, 12 Mpx, 5G",529.00,,2023-11-24 11:52:01.710344,4.8,128.0,White
...,...,...,...,...,...,...,...,...,...,...
1974,1975,Xiaomi,XIA DS REDMI A1 2+32 WIND BLK,"32 GB, Black, Dual SIM",79.80,,2023-12-04 09:27:15.784386,not rated,32.0,Black
1975,1976,Gigaset,GX4 PRO,"128 GB, Black, 6.10"", Dual SIM, 48 Mpx, 4G",465.00,,2023-12-04 09:27:17.252321,not rated,128.0,Black
1976,1977,Apple,iPhone XR,"128 GB, Black, 6.10"", SIM + eSIM, 12 Mpx, 4G",949.00,,2023-12-04 09:27:18.710339,4.7,128.0,Black
1978,1979,Motorola,Moto G32 - 4G Smartphone - Dual-SIM - RAM 4 GB...,"128 GB, Mineral Grey, Dual SIM",169.00,,2023-12-04 09:27:21.947167,not rated,128.0,Mineral Grey


In [116]:
#helper function to impute missing values with the mode
def impute_with_mode(group):
    try:
        mode = group['delivery_information'].mode()[0]
    except KeyError:
        return group
    group['delivery_information'].fillna(mode, inplace=True)
    return group

# Apply the function to each group
df = df.groupby('brand').apply(impute_with_mode)

In [117]:
df[df["delivery_information"].isnull()]
#there is still a phone with missing delivery information, we will remove this

Unnamed: 0_level_0,Unnamed: 1_level_0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating,extracted_gb,phone_color
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Carbon Mobile,2119,2120,Carbon Mobile,Carbon 1 MK II,"256 GB, Original Carbon Fiber, 6.01"", Dual SIM...",799.0,,2023-11-24 12:45:11.481626,2.4,256.0,Original Carbon Fiber
Carbon Mobile,2010,2011,Carbon Mobile,Carbon 1 MK II,"256 GB, Original Carbon Fiber, 6.01"", Dual SIM...",799.0,,2023-11-27 10:10:51.759464,2.4,256.0,Original Carbon Fiber
Carbon Mobile,1740,1741,Carbon Mobile,Carbon 1 MK II,"256 GB, Original Carbon Fiber, 6.01"", Dual SIM...",799.0,,2023-12-04 09:19:58.733304,2.4,256.0,Original Carbon Fiber


In [118]:
df = df[df['delivery_information'].notnull()].copy()

In [119]:
#reset the index
df = df.reset_index(drop=True)
df.columns

Index(['nr', 'brand', 'name', 'specs', 'price', 'delivery_information',
       'scraped_at', 'extracted_rating', 'extracted_gb', 'phone_color'],
      dtype='object')

In [120]:
df["date"] = df["scraped_at"].dt.date
df

Unnamed: 0,nr,brand,name,specs,price,delivery_information,scraped_at,extracted_rating,extracted_gb,phone_color,date
0,38,ASUS,ROG Phone 6,"512 GB, Phantom Black, 6.78"", Dual SIM, 50 Mpx...",599.00,morgen geliefert,2023-11-24 11:46:14.669771,4.4,512.0,Phantom Black,2023-11-24
1,108,ASUS,ROG Phone 6 Diablo Immortal Edition,"512 GB, Diablo Immortal Edition, 6.78"", Dual S...",913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,2023-11-24 11:47:57.025040,3.5,512.0,Diablo Immortal Edition,2023-11-24
2,137,ASUS,Zenfone 10,"256 GB, Midnight Black, 5.90"", Dual SIM, 50 Mp...",649.00,morgen geliefert,2023-11-24 11:48:38.808052,3.6,256.0,Midnight Black,2023-11-24
3,202,ASUS,ROG Phone 6 Diablo Immortal Edition,"512 GB, Diablo Immortal Edition, 6.78"", Dual S...",913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,2023-11-24 11:50:14.230374,3.5,512.0,Diablo Immortal Edition,2023-11-24
4,208,ASUS,ROG Phone 7,"512 GB, Storm White, 6.78"", Dual SIM, 50 Mpx, 5G",1040.00,morgen geliefert,2023-11-24 11:50:22.980236,4.5,512.0,Storm White,2023-11-24
...,...,...,...,...,...,...,...,...,...,...,...
5824,1698,realme,C35,"64 GB, Glowing green, 6.60"", Hybrid Dual SIM, ...",149.00,Zwischen Sa. 9.12. und Fr. 15.12. geliefert,2023-12-04 09:18:36.329941,5.0,64.0,Glowing green,2023-12-04
5825,1717,realme,9i,"64 GB, Prism Blue, 6.60"", Dual SIM + SD, 50 Mp...",114.00,Zwischen Mi. 13.12. und Fr. 15.12. geliefert,2023-12-04 09:19:11.318949,5.0,64.0,Prism Blue,2023-12-04
5826,1809,realme,C33 4/64GB Dual SIM Night Sea,"64 GB, Black, 6.50"", Dual SIM, 50 Mpx, 4G",115.00,Zwischen Fr. 15.12. und Di. 19.12. geliefert,2023-12-04 09:22:05.725023,not rated,64.0,Black,2023-12-04
5827,1999,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,"4 GB, Schwarz, Hybrid Dual SIM",211.00,Zwischen Do. 7.12. und Mo. 11.12. geliefert,2023-12-04 09:27:53.159054,not rated,4.0,Schwarz,2023-12-04


In [121]:
cleaned_df = df[["brand", "name", "price", "delivery_information", "extracted_rating", "extracted_gb",
                 "phone_color", "date"]].copy()

cleaned_df

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date
0,ASUS,ROG Phone 6,599.00,morgen geliefert,4.4,512.0,Phantom Black,2023-11-24
1,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24
2,ASUS,Zenfone 10,649.00,morgen geliefert,3.6,256.0,Midnight Black,2023-11-24
3,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24
4,ASUS,ROG Phone 7,1040.00,morgen geliefert,4.5,512.0,Storm White,2023-11-24
...,...,...,...,...,...,...,...,...
5824,realme,C35,149.00,Zwischen Sa. 9.12. und Fr. 15.12. geliefert,5.0,64.0,Glowing green,2023-12-04
5825,realme,9i,114.00,Zwischen Mi. 13.12. und Fr. 15.12. geliefert,5.0,64.0,Prism Blue,2023-12-04
5826,realme,C33 4/64GB Dual SIM Night Sea,115.00,Zwischen Fr. 15.12. und Di. 19.12. geliefert,not rated,64.0,Black,2023-12-04
5827,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,211.00,Zwischen Do. 7.12. und Mo. 11.12. geliefert,not rated,4.0,Schwarz,2023-12-04


In [122]:
cleaned_df.dtypes

brand                    object
name                     object
price                   float64
delivery_information     object
extracted_rating         object
extracted_gb            float64
phone_color              object
date                     object
dtype: object

## replace delivery information with days

In [123]:
cleaned_df

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date
0,ASUS,ROG Phone 6,599.00,morgen geliefert,4.4,512.0,Phantom Black,2023-11-24
1,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24
2,ASUS,Zenfone 10,649.00,morgen geliefert,3.6,256.0,Midnight Black,2023-11-24
3,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24
4,ASUS,ROG Phone 7,1040.00,morgen geliefert,4.5,512.0,Storm White,2023-11-24
...,...,...,...,...,...,...,...,...
5824,realme,C35,149.00,Zwischen Sa. 9.12. und Fr. 15.12. geliefert,5.0,64.0,Glowing green,2023-12-04
5825,realme,9i,114.00,Zwischen Mi. 13.12. und Fr. 15.12. geliefert,5.0,64.0,Prism Blue,2023-12-04
5826,realme,C33 4/64GB Dual SIM Night Sea,115.00,Zwischen Fr. 15.12. und Di. 19.12. geliefert,not rated,64.0,Black,2023-12-04
5827,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,211.00,Zwischen Do. 7.12. und Mo. 11.12. geliefert,not rated,4.0,Schwarz,2023-12-04


In [124]:
cleaned_df["delivery_information"].value_counts()

delivery_information
morgen geliefert                                2454
Zwischen Mi. 6.12. und Sa. 9.12. geliefert       224
Zwischen Mi. 6.12. und Di. 12.12. geliefert      200
Zwischen Do. 14.12. und Fr. 22.12. geliefert     176
Zwischen Di. 12.12. und Mi. 20.12. geliefert     172
                                                ... 
Zwischen 20.2.2024 und 13.3.2024 geliefert         1
Zwischen 22.2.2024 und 15.3.2024 geliefert         1
Zwischen Fr. 29.12. und Mo. 8.1. geliefert         1
Zwischen Sa. 30.12. und Fr. 26.1. geliefert        1
Zwischen 10.1.2024 und 1.2.2024 geliefert          1
Name: count, Length: 166, dtype: int64

In [125]:
def calculate_delivery_time(row):
    if row['delivery_information'] == 'morgen geliefert':
        return 1
    elif row['delivery_information'] == 'übermorgen geliefert':
        return 2
    else:
        # Extract the first date from the string
        match = re.search(r'\d{1,2}\.\d{1,2}\.', row['delivery_information'])
        if match:
            start_date_str = match.group()
            # Parse the dates
            start_date = datetime.strptime(start_date_str + '2023', '%d.%m.%Y').date()  
            order_date = row["date"]
            # Calculate the difference in days
            return (start_date - order_date).days
        else:
            return "Lieferung nicht möglich"  #if the phone cant be delivered


cleaned_df['delivery_time_days'] = cleaned_df.apply(calculate_delivery_time, axis=1)
cleaned_df.head()

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date,delivery_time_days
0,ASUS,ROG Phone 6,599.0,morgen geliefert,4.4,512.0,Phantom Black,2023-11-24,1
1,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24,5
2,ASUS,Zenfone 10,649.0,morgen geliefert,3.6,256.0,Midnight Black,2023-11-24,1
3,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24,5
4,ASUS,ROG Phone 7,1040.0,morgen geliefert,4.5,512.0,Storm White,2023-11-24,1


In [126]:
cleaned_df["delivery_time_days"].value_counts()

delivery_time_days
1                          2687
9                           420
15                          310
12                          234
3                           226
10                          199
8                           182
11                          177
2                           149
5                           145
6                           133
4                           131
Lieferung nicht möglich     127
19                          126
13                           91
18                           87
17                           80
20                           72
7                            45
-327                         44
14                           42
34                           27
22                           17
25                           14
-319                          5
-320                          5
-328                          5
16                            4
-283                          4
26                            4
-325                 

## check for duplicates

In [127]:
cleaned_df.duplicated().sum()

306

In [128]:
#remove the duplicates, since these will be irrelevant for our analysis
cleaned_df = cleaned_df.drop_duplicates()

cleaned_df.duplicated().sum()

0

In [129]:
cleaned_df

Unnamed: 0,brand,name,price,delivery_information,extracted_rating,extracted_gb,phone_color,date,delivery_time_days
0,ASUS,ROG Phone 6,599.00,morgen geliefert,4.4,512.0,Phantom Black,2023-11-24,1
1,ASUS,ROG Phone 6 Diablo Immortal Edition,913.08,Zwischen Mi. 29.11. und Fr. 1.12. geliefert,3.5,512.0,Diablo Immortal Edition,2023-11-24,5
2,ASUS,Zenfone 10,649.00,morgen geliefert,3.6,256.0,Midnight Black,2023-11-24,1
4,ASUS,ROG Phone 7,1040.00,morgen geliefert,4.5,512.0,Storm White,2023-11-24,1
5,ASUS,Zenfone 10,828.00,morgen geliefert,3.9,512.0,Midnight Black,2023-11-24,1
...,...,...,...,...,...,...,...,...,...
5824,realme,C35,149.00,Zwischen Sa. 9.12. und Fr. 15.12. geliefert,5.0,64.0,Glowing green,2023-12-04,5
5825,realme,9i,114.00,Zwischen Mi. 13.12. und Fr. 15.12. geliefert,5.0,64.0,Prism Blue,2023-12-04,9
5826,realme,C33 4/64GB Dual SIM Night Sea,115.00,Zwischen Fr. 15.12. und Di. 19.12. geliefert,not rated,64.0,Black,2023-12-04,11
5827,realme,Narzo 50A Prime 64GB Flash Black [16.7cm (6.6 ...,211.00,Zwischen Do. 7.12. und Mo. 11.12. geliefert,not rated,4.0,Schwarz,2023-12-04,3


## save cleaned data

In [130]:
cleaned_df.to_csv('data/robin_portmann_stage2.csv', index=False)