In [None]:
import json
import numpy as np
import pandas as pd
import re

%run brands_preprocess_functions.ipynb

## Functions

In [None]:
#######################
### help functions
#######################

def bool_convert(x):
    """Translate a 0/1 flag into the string label "False"/"True".

    Missing values (as detected by ``pd.isna``) become "null". Any other
    value is reported on stdout and also mapped to "null".
    """
    if pd.isna(x):
        return "null"
    for flag_value, label in ((0, "False"), (1, "True")):
        if x == flag_value:
            return label
    # Neither 0 nor 1: report it and treat it like a missing value.
    print("other category")
    return "null"

def processVintage(x):
    """Normalise a free-text "vintage" value to "True", "False" or "null".

    The value is converted to a string, stripped and lower-cased before it
    is compared against the known yes/no spellings, so every entry in the
    lookup sets must itself be lower-case.

    Returns "null" for missing values and for anything not recognised.
    """
    yes_values = {"yes", "ja"}
    # "not" was previously spelled "Not" and therefore could never match
    # the lower-cased input.
    no_values = {"no", "nein", "non", "not"}
    if pd.isna(x):
        return "null"
    normalized = str(x).strip().lower()
    if normalized in no_values:
        return "False"
    if normalized in yes_values:
        return "True"
    return "null"

def processOccasion(x):
    """Bucket a free-text "occasion" value into one coarse category.

    The lower-cased text is tested against keyword groups in a fixed
    order; the first group with any keyword occurring as a substring
    wins. Missing values and unmatched text both yield "other".
    """
    if pd.isna(x):
        return "other"

    text = str(x).lower()

    # Order matters: earlier rules shadow later ones (e.g. "various" is
    # claimed by "casual" before the "any" rule is ever consulted).
    rules = (
        ("casual", ("casual", "work", "various", "school", "office", "everyday")),
        ("party", ("party", "club")),
        ("formal", ("formal", "evening", "special occasion", "formelle", "tie", "elegant")),
        ("any", ("any", "versatile", "all", "multiple", "varies", "suitable")),
        ("dress", ("dress",)),
        ("wedding", ("wedding", "prom", "heels")),
        ("business", ("business",)),
        ("outdoor", ("outdoor", "travel", "hiking", "sport")),
        ("flat", ("non-heeled", "flat")),
    )
    for label, keywords in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return "other"


def processMaterial(x):
    """Reduce a free-text material description to one known material.

    The stripped, lower-cased text is scanned for known material names in
    priority order and the first hit is returned. The "faux" variants are
    listed first so they are not swallowed by plain "leather"/"suede".
    Missing values and unrecognised text yield "other".
    """
    if pd.isna(x):
        return "other"

    text = str(x).lower().strip()

    # (label, substrings that select it) in priority order.
    priorities = (
        ("faux leather", ("faux leather",)),
        ("faux suede", ("faux suede",)),
        ("leather", ("leather", "echtleder")),
        ("suede", ("suede",)),
        ("canvas", ("canvas",)),
        ("satin", ("satin",)),
        ("fabric", ("fabric",)),
        ("textile", ("textile",)),
        ("velvet", ("velvet",)),
        ("manmade", ("manmade",)),
        ("nubuck", ("nubuck",)),
        ("pelle", ("pelle",)),
        ("rubber", ("rubber",)),
        ("cotton", ("cotton",)),
        ("nylon", ("nylon",)),
        ("linen", ("linen",)),
    )
    for label, needles in priorities:
        for needle in needles:
            if needle in text:
                return label
    return "other"


def mergeMaterial(a,b):
    """Combine two material labels into one.

    ``a`` is preferred. ``b`` is used only when ``a`` is the placeholder
    "other" and ``b`` carries real information. When both carry
    information and disagree, ``a`` still wins.
    """
    a_is_uninformative = (a == "other")
    b_is_informative = (b != "other")
    return b if (a_is_uninformative and b_is_informative) else a



def preprocess_country_region_of_manufacture(df, top_n=15):
    """Persist the ``top_n`` most frequent manufacture countries as JSON.

    Values of ``df.country_region_of_manufacture`` are lower-cased BEFORE
    they are counted, so spellings that differ only in case ("China" /
    "china") are counted together and cannot show up as duplicate entries
    in the stored list. Missing values are ignored.

    The resulting list (most frequent first) is written to
    "top_n_country_region_of_manufacture.json" in the working directory.
    """
    countries = df.country_region_of_manufacture.dropna().astype(str).str.lower()
    top_n_countries = list(countries.value_counts()[:top_n].index)
    with open("top_n_country_region_of_manufacture.json", "w") as f:
        json.dump(top_n_countries, f)

In [None]:
#######################
### user functions
#######################

def process_bool(data, col_name):
    """Add a categorical "f_<col_name>" column derived from a 0/1 column.

    Each value of ``data[col_name]`` is passed through ``bool_convert``.
    ``data`` is modified in place and also returned.
    """
    feature = "f_" + col_name
    data[feature] = data[col_name].apply(bool_convert)
    data[feature] = data[feature].astype('category')
    return data

def process_vintage(data):
    """Add the categorical "f_vintage" column built from "vintage".

    Each value is normalised with ``processVintage``. ``data`` is
    modified in place and also returned.
    """
    source = "vintage"
    feature = "f_" + source
    data[feature] = data[source].apply(processVintage)
    data[feature] = data[feature].astype('category')
    return data

def process_occasion(data):
    """Add the categorical "f_occasion" column built from "occasion".

    Each value is bucketed with ``processOccasion``. ``data`` is modified
    in place and also returned.
    """
    source = "occasion"
    feature = "f_" + source
    data[feature] = data[source].apply(processOccasion)
    data[feature] = data[feature].astype('category')
    return data


def process_category(df):
    """Expose "category" unchanged as the feature column "f_category".

    ``df`` is modified in place and also returned.
    """
    source = "category"
    df["f_" + source] = df[source]
    return df


def process_material(data):
    """Add the categorical "f_material_upper_material" feature column.

    "material" and "upper_material" are each normalised with
    ``processMaterial`` into temporary categorical columns, merged row by
    row with ``mergeMaterial`` ("material" takes precedence), and the
    temporary columns are dropped again. ``data`` is modified in place
    and also returned.
    """
    primary, secondary = "material", "upper_material"
    primary_feature = "f_" + primary
    secondary_feature = "f_" + secondary
    merged_feature = "f_" + primary + '_' + secondary

    # Normalise both raw columns into temporary categorical columns.
    for source, feature in ((primary, primary_feature), (secondary, secondary_feature)):
        data[feature] = data[source].apply(processMaterial)
        data[feature] = data[feature].astype('category')

    # Row-wise merge of the two normalised labels.
    data[merged_feature] = data.apply(
        lambda row: mergeMaterial(row[primary_feature], row[secondary_feature]),
        axis=1,
    )
    data[merged_feature] = data[merged_feature].astype('category')

    data.drop(columns=[primary_feature, secondary_feature], inplace=True)
    return data


# Process country_region_of_manufacture attribute
def process_country_region_of_manufacture(df):
    """Add "f_country_region_of_manufacture" using the stored top-n list.

    The list of frequent countries is read from
    "top_n_country_region_of_manufacture.json". Each value is lower-cased;
    missing values become "None", "usa" becomes "united states", values
    found in the list are kept, and everything else becomes "Other".
    ``df`` is modified in place and also returned.
    """
    with open("top_n_country_region_of_manufacture.json", "r") as handle:
        frequent_countries = json.load(handle)

    def assign_new_country(value):
        if pd.isna(value):
            return "None"
        country = value.lower()
        if country == "usa":
            return "united states"
        if country in frequent_countries:
            return country
        return "Other"

    df["f_country_region_of_manufacture"] = df.country_region_of_manufacture.apply(assign_new_country)
    return df


# Process heel_type attribute (with ~90% missing values) to be one of: None, Flat, Medium/Wide, High/Slim and Other
def process_heel_type(df):
    """Add "f_heel_type": one of "None", "Flat", "Medium/Wide", "High/Slim", "Other".

    Side effect kept from the original implementation: the "heel_type"
    column itself is overwritten with its lower-cased values, with missing
    entries replaced by the sentinel string "None".

    All keyword sets are lower-case because they are compared against the
    lower-cased column. Previously several entries were capitalised
    ("Comfort", "Clog", "Stacked", "Chunky", "Spiked", "Flare",
    "Sculptural") and could never match.

    ``df`` is modified in place and also returned.
    """
    flats = {"flat", "flats", "no heel", "none", "flat heel", "comfort", "flat platform"}
    medium_wide = {"wedge", "kitten", "cuban", "cone", "short heel", "clog", "low", "stacked"}
    high_slim = {"stiletto", "block", "slim", "platform", "pump", "chunky", "spiked", "flare", "sculptural"}

    def assign_new_heel_type(x):
        # Capitalised "None" is the missing-value sentinel written below; a
        # real "none" entry has been lower-cased and is treated as a flat.
        if x == "None":
            return "None"
        x = x.strip()
        if x in flats or "flat" in x:
            return "Flat"
        if x in medium_wide:
            return "Medium/Wide"
        if x in high_slim:
            return "High/Slim"
        return "Other"

    df["heel_type"] = df.heel_type.apply(lambda x: x.lower() if pd.notnull(x) else "None")
    df["f_heel_type"] = df.heel_type.apply(assign_new_heel_type)

    return df


# Convert raw heel_height values into height levels (helper for process_heel_height)
def extract_height(data):
    """Map raw heel-height descriptions to coarse height levels.

    Args:
        data: iterable of raw values (strings or anything else).

    Returns:
        A list with exactly one entry per input element, IN INPUT ORDER:
        ``None`` for non-string values; "Medium", "Low", "Very High",
        "High" or "Flat" when the text contains a level keyword or a
        parsable measurement; "Unknown" otherwise.

    The earlier implementation emitted keyword/non-string results first
    and appended numeric/unknown results afterwards, so the output was not
    aligned with the input rows. Each element is now classified in a
    single pass.
    """
    inch_to_cm_scale = 2.54
    inch_units = ('inch', 'inches', 'in', '"', '”')
    mm_units = ('mm', 'mmmm')

    # Number followed by a unit. The numeric group only accepts text that
    # float() can parse (the old "[.\d]+" could capture a lone ".").
    pattern = re.compile(r'(\d+\.?\d*|\.\d+)\s*(mm|cm|mmmm|inch|inches|in|"|”)')

    # Checked in this order; the first keyword group found in the text wins.
    keyword_levels = (
        (("mid", "med", "medium"), "Medium"),
        (("low",), "Low"),
        (("very high",), "Very High"),
        (("high",), "High"),
        (("flat",), "Flat"),
    )

    def classify(raw):
        if not isinstance(raw, str):
            return None
        text = raw.lower()

        for keywords, level in keyword_levels:
            if any(keyword in text for keyword in keywords):
                return level

        match = pattern.search(text)
        if match is None:
            return "Unknown"

        # Convert the first measurement found to centimetres.
        num = float(match.group(1))
        unit = match.group(2)
        if unit in inch_units:
            num = num * inch_to_cm_scale
        if unit in mm_units:
            num = num / 10

        # Sanity check: values above 20 are scaled down by 10
        # (presumably millimetres entered with the wrong unit -- TODO confirm).
        if num > 20:
            num = num / 10

        if num < 1.5:
            return "Flat"
        if num < 3.5:
            return "Low"
        if num < 7:
            return "Medium"
        if num < 11:
            return "High"
        return "Very High"

    return [classify(item) for item in data]


# Process heel_height attribute
def process_heel_height(df):
    """Add "f_heel_height" with the height level of each "heel_height" value.

    Raw values are first cleaned (missing -> "None", otherwise lower-cased
    and stripped) and then converted to levels by ``extract_height``. The
    levels are assigned positionally. ``df`` is modified in place and also
    returned.
    """

    def assign_new_heel_height(raw):
        # The string "None" stands in for missing values.
        return "None" if pd.isna(raw) else raw.lower().strip()

    df["f_heel_height"] = df["heel_height"].apply(assign_new_heel_height)
    levels = extract_height(df["f_heel_height"])
    df["f_heel_height"] = pd.Series(levels).values
    return df


# Flag style values that contain a numeric code (six digits, underscore, seven digits)
def process_style_number_field(df):
    """Add the boolean column "f_has_numbered_style".

    The flag is True when the "style" value is a string containing six
    digits, an underscore and seven digits; it is False for missing or
    non-string values and for strings without that pattern.
    ``df`` is modified in place and also returned.
    """
    # Raw string avoids invalid-escape warnings; compiled once rather than
    # once per row.
    numbered_style = re.compile(r'\d{6}_\d{7}')

    def assign_numbered_style_attr(x):
        # NaN/None are not str instances, so this also covers missing values.
        if not isinstance(x, str):
            return False
        return numbered_style.search(x) is not None

    df["f_has_numbered_style"] = df["style"].apply(assign_numbered_style_attr)
    return df


# Process style attribute
def process_style(df):
    """Add "f_style": a coarse style group for each "style" value.

    Missing values become "None". The lower-cased value is then tested in
    this priority order:

    1. exact match in the "casual" or "comfort" name sets;
    2. contains "oxford"                       -> "oxford";
    3. contains "heel" but not "low heel"      -> "heels";
    4. contains "flat"                         -> "flats";
    5. exact match in the "ballet", "sandals", "more_formal" or "formal"
       name sets;
    6. otherwise "Other".

    The substring rules replace per-frame lists of every style containing
    the substring followed by a list-membership test per row. The result
    is the same, but the work is linear instead of quadratic in the number
    of rows. ``df`` is modified in place and also returned.
    """
    exact_before_substrings = (
        ("casual", frozenset(["sneakers", "platform", "slip-on", "slip on", "casual"])),
        ("comfort", frozenset([
            "comfort",
            "clogs",
            "loafers",
            "loafer",
            "mules",
            "loafers, moccasins",
            "loafers & slip ons",
            "moccasins",
            "boat shoes",
            "loafers & moccasins",
            "mule",
        ])),
    )
    exact_after_substrings = (
        ("ballet", frozenset(["ballet", "ballerinas"])),
        ("sandals", frozenset(["sandals", "sandal"])),
        ("more_formal", frozenset([
            "pumps",
            "mary janes",
            "mary jane",
            "pump",
            "court shoes",
            "slingbacks",
            "slingback",
            "pumps, classics",
            "platforms & wedges",
            "ankle strap",
        ])),
        ("formal", frozenset(["strappy", "strappy, ankle straps"])),
    )

    def assign_new_style(x):
        if pd.isna(x):
            return "None"
        x = x.lower()

        for label, names in exact_before_substrings:
            if x in names:
                return label
        if "oxford" in x:
            return "oxford"
        if "heel" in x and "low heel" not in x:
            return "heels"
        if "flat" in x:
            return "flats"
        for label, names in exact_after_substrings:
            if x in names:
                return label
        return "Other"

    df["f_style"] = df["style"].apply(assign_new_style)
    return df


def process_title(df):
    """Add 0/1 flags for price-suggesting words in "title" and "seller_notes".

    "f_pricy_words" is computed from "title" and "f_pricy_words_notes"
    from "seller_notes". A flag is 1 when the lower-cased text contains
    any of the listed words as a substring, and 0 otherwise (including for
    missing values). ``df`` is modified in place and also returned.
    """
    pricy_words = (
        "authentic", "auth", "original", "orig", "rare", "patent",
        "luxurious", "designer", "real", "limited", "satin",
    )

    def assign_pricy_words(text):
        if pd.isna(text):
            return 0
        lowered = text.lower()
        return 1 if any(word in lowered for word in pricy_words) else 0

    for feature, source in (("f_pricy_words", "title"), ("f_pricy_words_notes", "seller_notes")):
        df[feature] = df[source].apply(assign_pricy_words)
    return df
    
    
def process_fancy_colors(df):
    """Add the 0/1 column "f_fancy_colors".

    Side effect kept from the original implementation: missing "color"
    values are first filled from "colour", and the "color" column is
    overwritten with the result.

    The flag is 1 when the lower-cased colour text contains any of the
    listed keywords as a substring, 0 otherwise (including for missing
    values). Previously the lower-casing result was discarded, so values
    such as "Leather" never matched.

    ``df`` is modified in place and also returned.
    """
    # NOTE(review): "blu" also matches plain "blue" -- confirm this is intended.
    fancy_colors = ["leather", "satin", "suede", "nero", "oro", "premium", "blu", "samtchevro"]

    def _processColors(x):
        if pd.isna(x):
            return 0
        x = x.lower()
        for col in fancy_colors:
            if col in x:
                return 1
        return 0

    df["color"] = df.color.fillna(df.colour)
    df["f_fancy_colors"] = df["color"].apply(_processColors)
    return df


def process_na(df, group_size =5):
    """Return a copy of ``df`` with the extra column "f_na_vals".

    "f_na_vals" is the number of null cells in each row, floor-divided by
    ``group_size``. The input frame is left untouched.
    """
    nulls_per_row = df.isnull().sum(axis=1)
    return df.assign(f_na_vals=nulls_per_row // group_size)

In [None]:
#######################
### Building Model 3
#######################

def process_df_for_model_3(df):
    """Run the full feature pipeline for model 3.

    Applies every feature-building step in a fixed order and then drops
    all columns except the engineered "f_*" features and "log_price".
    Returns the resulting frame.
    """

    # Attribute-level features
    df = process_style(df)
    df = process_heel_type(df)
    df = process_heel_height(df)
    df = process_country_region_of_manufacture(df)
    df = process_material(df)

    # Brand features
    df = clean_brand_and_extract_brand_from_title(df)
    df = process_brands_mean_prices(df)
    df = process_most_expensive_brands(df)
    df = process_brands_rankings(df)
    df = process_brand(df)

    df = process_category(df)
    df = process_occasion(df)
    df = process_vintage(df)
    df = process_title(df)

    # Note on process_style_number_field:
    # We added this feature after carefully examining the data and seeing that there seems to be a particular
    # seller that uses this field in a specific way and also sells mainly expensive shoes.
    # It fits our purpose, which is why we added it, but we recognize that it might be a problematic feature
    # for a model that will be run on a lot of previously unseen data. In that case we would not use this
    # feature as-is; we would research this particular pattern further before using it (or not use it at all).
    df = process_style_number_field(df)
    df = process_fancy_colors(df)
    df = process_na(df)

    # Boolean flag columns
    for flag_column in (
        "free_shipping",
        "longtime_member",
        "fast_safe_shipping",
        "same_day_shipping",
        "returns",
    ):
        df = process_bool(df, flag_column)

    # Keep only the engineered features and the target column
    obsolete = [
        col for col in df.columns
        if not (col.startswith("f_") or col == "log_price")
    ]
    df.drop(columns=obsolete, inplace=True)

    return df