In [134]:
import json
import pandas as pd
from copy import deepcopy
import os
import re 

In [82]:
def convert_list_to_dict(obj):
    obj2 = deepcopy(obj)
    if isinstance(obj, dict):
        for key, value in obj.items():
            obj[key], obj2[key] = convert_list_to_dict(value)
    elif isinstance(obj, list) and all(isinstance(item, dict) for item in obj):
        dict1 = dict()
        dict2 = dict()
        for item in obj:
            if list(item.keys())[0] not in dict1:
                dict1.update(item)
            else:
                dict2.update(item)
        return dict1, dict2
    return obj, obj2

class Cleaner:
    data = dict()
    data_ = dict()

    def add_brand(self, data):
        brand = list(data.keys())[0]
        vals = convert_list_to_dict(data[brand])
        Cleaner.data[brand] = vals[0]
        Cleaner.data_[brand] = vals[1]
        
    def get_dataframes(self):
        dfs = dict()

        def find_attrs(data, attrs):
            for row in data:
                if isinstance(data[row], dict):
                    find_attrs(data[row], attrs)
                else:
                    attrs[row] = data[row]
        
        for brand in Cleaner.data:
            for phone in Cleaner.data[brand]:
                attrs = dict()
                missings = dict()
                find_attrs(Cleaner.data[brand][phone], attrs)
                find_attrs(Cleaner.data_[brand][phone], missings)
                for attr in attrs:
                    if attr not in dfs:
                        dfs[attr] = {'phone': [phone], attr: [attrs[attr]]}
                    else:
                        dfs[attr]['phone'] += [phone]
                        dfs[attr][attr] += [attrs[attr]]
                    if attr in missings:
                        dfs[attr]['phone'] += [phone]
                        dfs[attr][attr] += [missings[attr]]

        for attr in dfs:
            dfs[attr] = pd.DataFrame(dfs[attr])

        return dfs

cleaner = Cleaner()

In [83]:
for _, _, paths in os.walk('./data'):
    for path in paths:
        with open(os.path.join('./data', path), 'r') as f:
            data = json.load(f)
            cleaner.add_brand(data)

In [84]:
dfs = cleaner.get_dataframes()

In [85]:
def find_mAh(x):
    for i, val in enumerate(x):
        if 'mAh' in val:
            return x[i - 1]
dfs['Type']['BatterymAh'] = dfs['Type']['Type'].str.split().apply(find_mAh)
BatterymAhs = dfs['Type'].drop(columns='Type').dropna()

In [86]:
BatterymAhs

Unnamed: 0,phone,BatterymAh
0,1B (2022),3000
1,1L Pro (2021),3000
2,1 (2021),2000
3,3L (2021),4000
4,1S (2021),4000
...,...,...
5343,Wildfire R70,4000
5346,Tiara,1800
5348,Zeta,1830
5349,Ville,1650


In [87]:
dfs['Price']['price_prep'] = dfs['Price']['Price'].str.findall(r'\d+ EUR|\d+$')
prices = dfs['Price'].explode('price_prep')[['phone', 'price_prep']].dropna()

In [88]:
prices

Unnamed: 0,phone,price_prep
0,1B (2022),100 EUR
1,1L Pro (2021),110 EUR
2,1 (2021),60 EUR
3,3L (2021),330 EUR
4,1S (2021),130 EUR
...,...,...
4360,P3300,350 EUR
4361,S620,240 EUR
4362,S310,130 EUR
4363,TyTN,420 EUR


In [89]:
chipsets = dfs['Chipset'].dropna()

In [90]:
chipsets

Unnamed: 0,phone,Chipset
0,1B (2022),Mediatek MT6761 Helio A22 (12 nm)
1,1L Pro (2021),Unisoc SC9863A (28nm)
2,1 (2021),Mediatek MT6739 (28 nm)
3,3L (2021),Mediatek MT6762D Helio P22 (12 nm)
4,1S (2021),Mediatek MT6762D Helio P22 (12 nm)
...,...,...
3257,S620,TI OMAP 850
3258,S310,TI OMAP 850
3259,Wildfire R70,Mediatek MT6763 Helio P23 (16 nm)
3260,One (M8i),Qualcomm MSM8939 Snapdragon 615 (28 nm)


In [91]:
pre_df = {'phone': [], 'display_type': []}

for brand in Cleaner.data:
    for phone in Cleaner.data[brand]:
        pre_df['phone'] += [phone]
        pre_df['display_type'] += [Cleaner.data[brand][phone]['Display']['Type']]

pre_df =  pd.DataFrame(pre_df)

def find_displayType(x):
    if ',' in x:
        x = x[:x.find(',')]
    elif '(' in x:
        x = x[:x.find('(')]
        
    for val in x.split():
        for let in val:
            if let.isupper():
                return val
    
pre_df['display_type_prep'] = pre_df['display_type'].apply(find_displayType)

display_types = pre_df[['phone', 'display_type_prep']].dropna()

In [92]:
display_types

Unnamed: 0,phone,display_type_prep
0,1B (2022),TFT
1,1L Pro (2021),IPS
2,1 (2021),TFT
3,3L (2021),IPS
4,1S (2021),IPS
...,...,...
5348,Zeta,IPS
5349,Ville,Super
5350,Ignite,S-LCD
5351,Desire HD2,S-LCD


In [93]:
gpus = dfs['GPU']
gpus = gpus.replace('No', None).dropna()

In [94]:
gpus

Unnamed: 0,phone,GPU
0,1B (2022),PowerVR GE8300
1,1L Pro (2021),IMG8322
2,1 (2021),PowerVR GE8100
3,3L (2021),PowerVR GE8320
4,1S (2021),PowerVR GE8320
...,...,...
3154,TyTN II,Adreno 130
3155,Wildfire R70,Mali-G71 MP2
3156,One (M8i),Adreno 405
3157,Desire HD2,Adreno 205


In [95]:
dfs['Internal']['Internal_prep'] = dfs['Internal']['Internal'].str.split(r', ')

def find_ram_storage(x):
    for val in x:
        if 'RAM' in val:
            return val[:val.find('RAM')]


dfs['Internal']['Internal_prep'] = dfs['Internal']['Internal_prep'].apply(find_ram_storage)
dfs['Internal']['Internal_prep'] = dfs['Internal']['Internal_prep'].str.split()

def to_standard_format(x):
    if x:
        if len(x) < 3:
            if len(x) == 1:
                storage, ram = None, x[0]
            if len(x) == 2:
                storage, ram = x[0], x[1]
            return pd.Series([storage, ram])
    return pd.Series([None, None])

dfs['Internal'][['Storage', 'RAM']] = dfs['Internal']['Internal_prep'].apply(to_standard_format)
memory_specifications = dfs['Internal'][['phone', 'Storage', 'RAM']].dropna(thresh=2)

In [96]:
memory_specifications

Unnamed: 0,phone,Storage,RAM
0,1B (2022),32GB,2GB
2,1L Pro (2021),32GB,2GB
4,1 (2021),8GB,1GB
6,3L (2021),64GB,4GB
8,1S (2021),32GB,3GB
...,...,...,...
6688,Primo,,512MB
6689,Zeta,32GB,1GB
6690,Ignite,,512MB
6691,Desire HD2,8GB,512MB


In [97]:
dfs['SIM']['SIM_prep'] = dfs['SIM']['SIM'].str.split(' ')

def find_sim_types(x):
    st = set()
    for i, val in enumerate(x):
        if '(' in val:
            break
        if 'SIM' in val:
            st.add(val)
        if val == 'SIM':
            return x[i - 1]
    return list(st)

dfs['SIM']['SIM_prep'] = dfs['SIM']['SIM_prep'].apply(find_sim_types)

sims = dfs['SIM'].explode('SIM_prep')

In [98]:
sims = sims[['phone', 'SIM_prep']].dropna()

In [99]:
sims

Unnamed: 0,phone,SIM_prep
0,1B (2022),Single
1,1L Pro (2021),Single
2,1 (2021),Single
3,3L (2021),Single
4,1S (2021),Single
...,...,...
6415,Zeta,Mini-SIM
6416,Ville,Mini-SIM
6417,Ignite,Mini-SIM
6418,Desire HD2,Mini-SIM


In [100]:
dfs['Technology']['Tech_prep'] = dfs['Technology']['Technology'].str.split(' / ')
technologies = dfs['Technology'].explode('Tech_prep')[['phone', 'Tech_prep']]
technologies

Unnamed: 0,phone,Tech_prep
0,1B (2022),GSM
0,1B (2022),HSPA
0,1B (2022),LTE
1,1L Pro (2021),GSM
1,1L Pro (2021),HSPA
...,...,...
5351,Desire HD2,GSM
5351,Desire HD2,HSPA
5352,A12,GSM
5352,A12,HSPA


In [101]:
BatterymAhs.to_csv('./db_data/BatterymAh.csv', index=False)

In [102]:
prices.to_csv('./db_data/Price.csv', index=False)

In [103]:
chipsets.to_csv('./db_data/Chipset.csv', index=False)

In [104]:
display_types.to_csv('./db_data/DisplayType.csv', index=False)

In [105]:
gpus.to_csv('./db_data/GPU.csv', index=False)

In [106]:
memory_specifications.to_csv('./db_data/MemorySpecification.csv', index=False)

In [107]:
sims.to_csv('./db_data/SIM.csv', index=False)

In [108]:
technologies.to_csv('./db_data/Technology.csv', index=False)

In [109]:
phones = []
for brand in cleaner.data:
    for phone in cleaner.data[brand]:
        phones += [phone]

phones = pd.Series(phones, name='phone')

In [110]:
sims.merge(phones)

Unnamed: 0,phone,SIM_prep
0,1B (2022),Single
1,1L Pro (2021),Single
2,1 (2021),Single
3,3L (2021),Single
4,1S (2021),Single
...,...,...
5226,Zeta,Mini-SIM
5227,Ville,Mini-SIM
5228,Ignite,Mini-SIM
5229,Desire HD2,Mini-SIM


In [111]:
len(set(technologies['phone'].unique()).intersection(set(sims['phone'].unique())))

5063

In [112]:
sims['SIM_prep'].value_counts()

SIM_prep
Mini-SIM     2161
Dual         1539
Single        539
Micro-SIM     437
Nano-SIM      349
eSIM           91
SIM,            9
USADual         8
Nano-SIM,       7
Triple          6
Quad            2
H542Dual        1
MIni-SIM        1
Name: count, dtype: int64

In [113]:
phones

0           1B (2022)
1       1L Pro (2021)
2            1 (2021)
3           3L (2021)
4           1S (2021)
            ...      
5348             Zeta
5349            Ville
5350           Ignite
5351       Desire HD2
5352              A12
Name: phone, Length: 5353, dtype: object

In [114]:
brand_phones = {'brand': [], 'phone': []}
for brand in cleaner.data:
    for phone in cleaner.data[brand]:
        brand_phones['brand'] += [brand]
        brand_phones['phone'] += [phone]

brand_phones_df = pd.DataFrame(brand_phones)
brand_phones_df

Unnamed: 0,brand,phone
0,alcatel409,1B (2022)
1,alcatel409,1L Pro (2021)
2,alcatel409,1 (2021)
3,alcatel409,3L (2021)
4,alcatel409,1S (2021)
...,...,...
5348,HTC287,Zeta
5349,HTC287,Ville
5350,HTC287,Ignite
5351,HTC287,Desire HD2


In [115]:
brands_to_estimate = ['Apple118', 'Samsung1383', 'Huawei441', 'Xiaomi377', 'Nokia576']
means = []

def price_corrector(x):
    if x:
        val = x[0].split()
        if val[-1] == 'EUR':
            return int(val[0])
        else:
            return 0.9 * int(val[0])
    else:
        return None

dfs['Price']['Numeric_Price'] = dfs['Price']['price_prep'].apply(price_corrector).dropna()


for brand in brands_to_estimate:
    df = brand_phones_df[brand_phones_df['brand'] == brand]
    merged_df = df.merge(dfs['Price'], on='phone')
    means += [merged_df['Numeric_Price'].mean()]

In [116]:
means = pd.Series(dict(zip(brands_to_estimate, means)), name='Means')

In [117]:
means.to_csv('Means.csv')

In [162]:
def find_year(x):
    if 'Not' in x:
        return None
    else:
        try:
            return int(re.findall(r'\b\d+\b', x)[0])
        except:
            return None

dfs['Announced']['Year'] = dfs['Announced']['Announced'].apply(find_year)
dfs['Announced'] = dfs['Announced'].dropna()

merged_df = dfs['Announced'].merge(dfs['Price'], on='phone')

prep_df = merged_df[merged_df['Year'] == 2023]

prices_2023 = prep_df['Numeric_Price']

4360.0

In [164]:
print('lower-bound:', prices_2023.quantile(0.99))
print('upper-bound:', prices_2023.quantile(0.1))

lower-bound: 1342.4999999999968
upper-bound: 20.21
