In [None]:
import json
import numpy as np
import pandas as pd
import re

## Functions

In [None]:
# Exact-match corrections applied to the normalized brand string.
_BRAND_EXACT_ALIASES = {
    "børn": "born",
    "propét": "propet",
    "chloé": "chloe",
    "diana ferrari supersoft": "diana ferrari",
    "life stride": "lifestride",
    "sperry": "sperry top sider",
    "kenneth cole reaction": "kenneth cole",
    "twisted x boots": "twisted x",
    "merrel": "merrell",
    "b o c": "boc",
    "b ø c": "boc",
    "b o c born concepts": "boc",
    "b o c born of concepts": "boc",
    "boc born": "boc",
    "boc born concept": "boc",
    "boc by born": "boc",
    "boc børn concept": "boc",
    "bobs by skechers": "bobs",
}

# (substrings, canonical name) pairs; the first pair with a matching substring wins.
_BRAND_SUBSTRING_RULES = (
    (("ann taylor",), "ann taylor"),
    (("louboutin", "loboutin"), "louboutin"),
    (("clarks",), "clarks"),
    (("saint laurent",), "saint laurent"),
    (("stuart weitzman",), "stuart weitzman"),
)

# Placeholder values that mean "no brand".
_BRAND_PLACEHOLDERS = ("unbranded", "undisclosed")


def clean_brand_str(brand):
    """Normalize a raw brand value to a canonical lowercase brand name.

    The value is trimmed, lowercased, stripped of punctuation (``.``, ``,``,
    ``&``, ``-``, apostrophes, the word " and "), whitespace-collapsed, and
    then passed through a set of manual spelling/alias corrections.

    Parameters
    ----------
    brand : object
        Raw brand value. Anything that is not a string (None, NaN,
        numbers, ...) is treated as missing.

    Returns
    -------
    str or None
        The canonical brand name, or None when the value is missing, is a
        placeholder ("unbranded", "undisclosed"), or is empty after cleaning.
    """
    # isinstance also rejects None/NaN, and (unlike ``type(x) != str``)
    # accepts str subclasses.
    if not isinstance(brand, str):
        return None

    brand = brand.strip().strip(",./").lower()
    for old, new in (
        (".", " "),
        (",", " "),
        (" and ", " "),
        ("&", " "),
        ("-", " "),
        ("'", ""),
        ("’", ""),
    ):
        brand = brand.replace(old, new)
    # Punctuation was turned into spaces above, so trim again: otherwise
    # "Nike-" would become "nike " and never match "nike".
    brand = re.sub(r"\s+", " ", brand).strip()

    if not brand or brand in _BRAND_PLACEHOLDERS:
        return None

    # Manual adjustments: exact aliases first, then substring rules.
    if brand in _BRAND_EXACT_ALIASES:
        return _BRAND_EXACT_ALIASES[brand]
    if "ugg" in brand:
        # "ugg australia" is kept as its own brand; everything else
        # containing "ugg" collapses to "ugg".
        return brand if brand == "ugg australia" else "ugg"
    for needles, canonical in _BRAND_SUBSTRING_RULES:
        if any(needle in brand for needle in needles):
            return canonical
    return brand

In [None]:
def write_all_seen_brands_to_file(df):
    """Count cleaned brand names in ``df`` and save the counts as JSON.

    Every value of ``df.brand`` is passed through ``clean_brand_str``; the
    resulting names are counted with ``value_counts`` and written, in that
    order, to ``new_clean_brands_from_train.json`` as ``{brand: count}``.
    """
    occurrences = df["brand"].apply(clean_brand_str).value_counts()
    # Counts come back as numpy integers, which json cannot serialize.
    serializable_counts = {name: int(n) for name, n in occurrences.items()}
    with open("new_clean_brands_from_train.json", "w") as out_file:
        json.dump(serializable_counts, out_file)

In [None]:
def write_brands_mean_prices_to_file(df):
    """Save per-brand mean and max ``log_price`` as two JSON files.

    ``df`` is first passed through ``clean_brand_and_extract_brand_from_title``
    (which rewrites its ``brand`` column). The frame is then grouped by brand
    and, for ``log_price``, the mean and the max are each sorted in descending
    order and written as ``{brand: value}`` to ``brand_mean_price_dict.json``
    and ``brand_max_price_dict.json`` respectively.
    """
    df = clean_brand_and_extract_brand_from_title(df)

    def _ranked_dict(stat_name):
        # Aggregate log_price per brand, highest first, as plain floats.
        per_brand = getattr(df.groupby("brand")["log_price"], stat_name)()
        ranked = per_brand.sort_values(ascending=False)
        return {brand: float(value) for brand, value in ranked.items()}

    mean_by_brand = _ranked_dict("mean")
    max_by_brand = _ranked_dict("max")

    with open("brand_mean_price_dict.json", "w") as mean_file:
        json.dump(mean_by_brand, mean_file)
    with open("brand_max_price_dict.json", "w") as max_file:
        json.dump(max_by_brand, max_file)

In [None]:
def clean_brand_and_extract_brand_from_title(df):
    """Clean the ``brand`` column, falling back to a brand found in ``title``.

    For each row:
    * if ``brand`` is present it is normalized with ``clean_brand_str``;
    * otherwise the title is normalized and the first known brand (in the
      order stored in ``new_clean_brands_from_train.json``) that occurs as a
      substring of the title is used;
    * if nothing matches, or the title is missing too, the brand is None.

    Known brands of two characters or fewer are ignored for title matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``brand`` and ``title`` columns. The ``brand`` column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its ``brand`` column replaced.
    """
    with open("new_clean_brands_from_train.json", "r") as f:
        # Filter once here instead of re-checking the length for every row.
        known_brands = [name for name in json.load(f) if len(name) > 2]

    def _normalize_title(title):
        # Mirror the normalization done by clean_brand_str so that titles and
        # stored brand names are comparable.
        text = str(title).lower().strip().strip(",./")
        for old, new in (
            (".", " "),
            (",", " "),
            (" and ", " "),
            ("&", " "),
            ("-", " "),
            ("'", ""),
            ("’", ""),
            # Transliterate rather than delete: stored names use "born",
            # "chloe", "propet", so dropping the letter could never match.
            ("ø", "o"),
            ("é", "e"),
        ):
            text = text.replace(old, new)
        # Collapse runs of spaces left by the replacements ("dr. martens"
        # would otherwise become "dr  martens").
        return re.sub(r"\s+", " ", text).strip()

    def _extract_brand(brand, title):
        if pd.notnull(brand):
            return clean_brand_str(brand)
        if pd.isna(title):
            # Nothing to search; avoid matching against the string "nan".
            return None
        normalized_title = _normalize_title(title)
        for candidate in known_brands:
            if candidate in normalized_title:
                return candidate
        return None

    df['brand'] = [
        _extract_brand(brand, title)
        for brand, title in zip(df["brand"], df["title"])
    ]
    return df

In [None]:
def map_brand(data):
    """Build the brand -> bin mapping and write it to ``brand_dict.json``.

    The first 15 brands listed in ``new_clean_brands_from_train.json`` are
    treated as "common" and map to themselves. Every other brand in ``data``
    is assigned an integer group according to the mean of ``log(price)`` over
    its rows:

        group 1: < 2.2          group 7:  [3.5, 3.75)
        group 2: [2.2, 2.8)     group 8:  [3.75, 4.1)
        group 3: [2.8, 3)       group 10: [4.1, 4.6)
        group 4: [3, 3.15)      group 11: [4.6, 5.1)
        group 5: [3.15, 3.3)    group 12: [5.1, 5.6)
        group 6: [3.3, 3.5)     group 13: [5.6, 6.1)
                                group 14: >= 6.1

    Group id 9 is never assigned; the numbering is kept as-is so the labels
    stay identical to those produced by earlier versions of this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``brand`` and ``price`` columns. Not modified.

    Returns
    -------
    None
        The mapping is only written to ``brand_dict.json``.
    """
    col_name = "brand"

    with open("./new_clean_brands_from_train.json", "r") as f:
        brand_counts = json.load(f)
    # Relies on the file's key order to pick the first 15 brands.
    common_brands = list(brand_counts)[:15]

    rare_rows = data[~data[col_name].isin(common_brands)].copy()
    rare_rows["log_price"] = np.log(rare_rows["price"])
    mean_log_price = (
        rare_rows.groupby(col_name)["log_price"].mean().sort_values(ascending=True)
    )

    # upper_bounds[i] is the exclusive upper edge of group_ids[i]; the last
    # group id is open-ended.
    upper_bounds = (2.2, 2.8, 3, 3.15, 3.3, 3.5, 3.75, 4.1, 4.6, 5.1, 5.6, 6.1)
    group_ids = (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14)

    def _group_for(l_price):
        for upper, group in zip(upper_bounds, group_ids):
            if l_price < upper:
                return group
        if l_price >= upper_bounds[-1]:
            return group_ids[-1]
        # Only reachable when every comparison is False, i.e. NaN.
        return None

    brand_to_bin = {}
    # Iterate the Series directly: calling float() on a one-element Series
    # (the previous ``float(frame.iloc[i])``) is deprecated in pandas.
    for raw_brand, l_price in mean_log_price.items():
        brand = str(raw_brand).lower().strip()
        group = _group_for(float(l_price))
        if group is None:
            print("brand problem")
            continue
        brand_to_bin[brand] = group

    for cb in common_brands:
        brand_to_bin[cb] = cb
    with open("brand_dict.json", "w") as t:
        json.dump(brand_to_bin, t)

In [None]:
# Returns new brand column with binning
def process_brand(df):
    """Add a categorical ``f_brand`` column holding each row's brand bin.

    The bin mapping is read from ``brand_dict.json``. A brand is looked up by
    its stripped, lowercased string form; missing brands and brands absent
    from the mapping get bin ``0``. ``df`` is modified in place and returned.
    """
    with open('brand_dict.json', 'rb') as f:
        bins_by_brand = json.load(f)

    def _lookup_bin(raw_brand):
        if pd.isna(raw_brand):
            return 0
        key = str(raw_brand).strip().lower()
        return bins_by_brand.get(key, 0)

    df["f_brand"] = df["brand"].apply(_lookup_bin).astype('category')
    return df

In [None]:
def process_brands_mean_prices(df):
    """Add ``f_pricy_brands_mean_log_price`` based on stored brand means.

    Reads ``brand_mean_price_dict.json``. Brands whose stored value is above
    4.0 receive their own stored value; every other row (missing brand,
    unknown brand, or a brand at or below the threshold) receives the average
    of all stored values that are at or below the threshold. ``df`` is
    modified in place and returned.
    """
    with open("brand_mean_price_dict.json", "r") as f:
        stored_means = json.load(f)

    price_threshold = 4.0
    pricey_means = {}
    remaining_means = []
    for name, value in stored_means.items():
        if value > price_threshold:
            pricey_means[name] = value
        elif value <= price_threshold:
            remaining_means.append(value)
    fallback_mean = np.mean(remaining_means)

    def _mean_price_for(brand):
        # Missing brands share the fallback with non-pricey brands.
        if pd.isna(brand):
            return fallback_mean
        return pricey_means.get(brand, fallback_mean)

    df["f_pricy_brands_mean_log_price"] = df["brand"].apply(_mean_price_for)
    return df


def process_most_expensive_brands(df):
    """Add a 0/1 ``f_is_luxury_brand`` flag column to ``df``.

    The first 30 brands listed in ``brand_mean_price_dict.json`` (in file
    order) are flagged with 1; all other brands, and missing brands, get 0.
    ``df`` is modified in place and returned.
    """
    with open("brand_mean_price_dict.json", "r") as f:
        stored_means = json.load(f)

    # Iterating the mapping yields its keys in stored order.
    luxury_brands = list(stored_means)[:30]

    def _luxury_flag(brand):
        is_luxury = (not pd.isna(brand)) and brand in luxury_brands
        return 1 if is_luxury else 0

    df["f_is_luxury_brand"] = df["brand"].apply(_luxury_flag)
    return df


def process_brands_rankings(df):
    """Add an integer ``f_brand_ranking`` column to ``df``.

    A brand's rank is its zero-based position in
    ``brand_mean_price_dict.json`` (file order). Missing brands and brands
    not present in the file receive the truncated mean of all ranks, or 0
    when the file contains no brands.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``brand`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``f_brand_ranking`` added.
    """
    with open("brand_mean_price_dict.json", "r") as f:
        brand_mean_price_dict = json.load(f)

    brand_ranks_dict = {name: rank for rank, name in enumerate(brand_mean_price_dict)}
    # Guard the empty case: the mean of an empty list is NaN, and int(NaN)
    # raises ValueError.
    if brand_ranks_dict:
        mean_rank = int(np.mean(list(brand_ranks_dict.values())))
    else:
        mean_rank = 0

    def _assign_rank(x):
        if pd.isna(x):
            return mean_rank
        # Direct dict lookup instead of rebuilding a key list for every row.
        return brand_ranks_dict.get(x, mean_rank)

    df["f_brand_ranking"] = df["brand"].apply(_assign_rank)
    return df