In [1]:
# Data handling and regular expressions.
import pandas as pd
import re
# Raise the pandas display limits (columns / rows) used when frames are rendered.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)

# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply the seaborn theme with the "magma_r" colour palette.
sns.set(palette="magma_r")

In [2]:
# Open the raw workbook; the cells below parse individual sheets from this handle.
xlsx = pd.ExcelFile('../../data/raw/mar19.xlsx')
# Show the names of the sheets contained in the workbook.
xlsx.sheet_names

['1. top_100_brands(brandname+syn',
 '2. reference_color',
 '3. personalization_rules(exampl',
 '4. sample_occasion',
 '5. influencer_color_rules',
 '6. category_and_subcategory',
 '7. user_subset',
 '8. item_subset',
 '9. category_ids',
 '10. brands_affinity',
 '11. styling_segments',
 '12. wishlist_items',
 '13. user_influencer',
 '14. 100_users_set',
 '15. 100_users_item_set',
 '16. 100_user_influencer']

In [3]:
# Sheet names are kept in variables so that a renamed tab only has to be
# updated in this one place (listed here in workbook order).
top_brands_sheet = '1. top_100_brands(brandname+syn'
category_sheet = '9. category_ids'
wishlist_sheet = '12. wishlist_items'
user_sheet = '14. 100_users_set'
items_sheet = '15. 100_users_item_set'
influencer_sheet = '16. 100_user_influencer'

# Load Category ID data

In [4]:
# Parse the category sheet, discard every row that has a missing value and
# store the category IDs as 64-bit integers.
df_cat = (
    xlsx.parse(category_sheet)
    .dropna()
    .astype({"Category ID": "int64"})
)
df_cat.head()

Unnamed: 0,Category Name,Category ID
0,Tops,110
1,Blouses,111
2,T Shirts,112
3,Tanks,113
4,Knits,114


# Load unique brandID strings for top brands

In [None]:
# Build a sorted, de-duplicated Series of normalised brand keys (all whitespace
# removed, lower-cased) from the brand names and their synonyms.
# `Series.append` was removed in pandas 2.0, so the two columns are stacked with
# `pd.concat`, which behaves the same on older pandas releases.
df_topbrands = xlsx.parse(top_brands_sheet)
df_topbrands = (
    pd.concat([df_topbrands["brand_name"], df_topbrands["brand_name_synonym"]])
    .dropna()  # a missing name/synonym is not a string and would break .split() below
    .map(lambda name: "".join(name.split()).lower())
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
df_topbrands.sample(15)


71            madewell
116    victoriassecret
24            colehaan
7            anntaylor
112               vans
43           hollister
104               toms
29           dolcevita
114               vici
96           stelladot
113              venus
120        warbyparker
103               tobi
13      bananarepublic
70               lulus
dtype: object

# Load influencer data to attach to users 

In [None]:
# Influencer handles searched for in each user's "style_who_inspiries" answer.
influencers = [
    "ariellecharnas",
    "blaireadiebee",
    "blakevond",
    "chiaraferragni",
    "hannahbronfman",
    "jordynwoods",
    "manrepeller",
    "mayemusk",
    "nicolettemason",
    "seaofshoes",
    "somethingnavy",
    "weworewhat"
]

# Result: one row per sheet row with the trimmed user_id and one 0/1 column per
# influencer handle.
df_influencers = xlsx.parse(influencer_sheet)
# Keep only the last 13 characters of the raw user id.
df_influencers["user_id"] = df_influencers["user_id"].map(lambda x: x[-13:])

# A missing answer is treated as "no influencer mentioned". Previously a NaN
# answer was handed to re.search, which raises TypeError, so the fillna(0) at
# the end of the cell could never take effect for such rows.
inspiration_text = df_influencers["style_who_inspiries"].fillna("").astype(str)
for handle in influencers:
    # regex=True keeps the re.search semantics of the original matching.
    df_influencers[handle] = inspiration_text.str.contains(handle, regex=True).astype("int64")

# Remove the source text column (and any helper column named "influencers"),
# then replace remaining missing values with 0.
df_influencers = df_influencers.drop(
    ["style_who_inspiries", "influencers"], axis=1, errors="ignore"
).fillna(0)

In [None]:
# Preview the first rows of the influencer table.
df_influencers.head()

# User data EDA

In [None]:
# Parse the user sheet and list the columns it provides.
df_users = xlsx.parse(user_sheet)

df_users.columns

In [None]:
# Build the cleaned per-user table:
#   1. keep the questionnaire fields listed in user_fields,
#   2. fill gaps in the indicator columns and store them as integers,
#   3. drop the fields listed in user_drop_columns,
#   4. left-join the per-user influencer indicators.
df_users = xlsx.parse(user_sheet)

# Indicator columns grouped by questionnaire section. The "skipped" flags of the
# size and looks sections are listed separately further down because they are
# not part of user_fillna_zero_columns.
size_preference_columns = [
    "style_size_preference_none",
    "style_size_preference_petite",
    "style_size_preference_extra_long",
    "style_size_preference_plus",
    "style_size_preference_maternity",
]
looks_wanted_columns = [
    "style_looks_wanted_dates",
    "style_looks_wanted_everyday",
    "style_looks_wanted_formal",
    "style_looks_wanted_nights",
    "style_looks_wanted_other",
    "style_looks_wanted_summer",
    "style_looks_wanted_travel",
    "style_looks_wanted_winter",
    "style_looks_wanted_work",
    "style_looks_wanted_workouts",
]
most_important_columns = [
    "style_most_important_active",
    "style_most_important_any",
    "style_most_important_beach",
    "style_most_important_dress",
    "style_most_important_bags",
    "style_most_important_jeans",
    "style_most_important_jump",
    "style_most_important_nothing",
    "style_most_important_outwear",
    "style_most_important_pants",
    "style_most_important_shoes",
    "style_most_important_tops",
    "style_most_important_skipped",
]

# Columns read from the sheet, in output order.
user_fields = (
    [
        "user_id",
        "style_age_range",
        "style_age_range_group",
        "items_in_wishlist",
        "style_brands_selected",
    ]
    + size_preference_columns
    + [
        "style_size_preference_skipped",
        "style_vibe",
        "has_stype_vibe",
        "style_who_inspiries_skipped",
    ]
    + looks_wanted_columns
    + ["style_looks_wanted_skipped"]
    + most_important_columns
)

# Indicator columns whose missing values become 0.
user_fillna_zero_columns = (
    size_preference_columns + looks_wanted_columns + most_important_columns
)

# Columns removed from the final table.
user_drop_columns = [
    "style_age_range",
    "style_brands_selected",
    "has_stype_vibe",
    "style_who_inspiries_skipped",
    "items_in_wishlist"
]

# Columns stored as int64.
user_int_conversion_columns = (
    ["style_age_range_group"]
    + size_preference_columns
    + ["style_size_preference_skipped"]
    + looks_wanted_columns
    + ["style_looks_wanted_skipped"]
    + most_important_columns
)

# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of the parsed sheet (avoids SettingWithCopyWarning).
df_users = df_users[user_fields].copy()
# Keep only the last 13 characters of the raw user id.
df_users["user_id"] = df_users["user_id"].map(lambda x: x[-13:])
# Missing age-range groups get the value 5
# (presumably an "unknown" bucket -- TODO confirm).
df_users["style_age_range_group"] = df_users["style_age_range_group"].fillna(5)
df_users["style_vibe"] = df_users["style_vibe"].fillna("None")
df_users[user_fillna_zero_columns] = df_users[user_fillna_zero_columns].fillna(0)

df_users = df_users.drop(user_drop_columns, axis=1)

# The two "*_skipped" flags above are cast to int but are not in the zero-fill
# list; filling here keeps astype("int64") from failing on a missing value.
# NOTE(review): 0 is assumed to be the right default for those flags -- confirm.
df_users[user_int_conversion_columns] = (
    df_users[user_int_conversion_columns].fillna(0).astype("int64")
)

# Attach influencer indicators; users without an influencer row get 0s.
df_users = pd.merge(df_users, df_influencers, on="user_id", how="left")
df_users[influencers] = df_users[influencers].fillna(0).astype("int64")
df_users.sample(5)

In [None]:
""" This is a concern to be addressed """

# Count rows that repeat an earlier row on every column except the first one.
profile_columns = df_users.columns[1:]
df_users.duplicated(profile_columns).sum()

In [None]:
"""Style vibe hardly repeats (about 6). Having so many values will throw the model off once numeric encoded, 
as there is not enough repetition across observations 
This column in current state is not worth cleaning 
"""
# Frequency of each distinct style_vibe value.
df_users["style_vibe"].value_counts()

In [None]:
# Drop the style_vibe column. The frame is rebound instead of mutated in place
# and a missing column is ignored, so re-running this cell does not raise KeyError.
df_users = df_users.drop("style_vibe", axis=1, errors="ignore")

In [None]:
"""
style_looks wanted columns seem interesting, but useless as none of the users have any data.
The best we can possibly do here is to impute 
"""

# Draw one count plot per column, skipping the first column.
# (The per-column value_counts() result was never used and has been removed.)
for column in df_users.columns[1:]:
    sns.countplot(x=column, data=df_users)
    plt.show()

In [None]:
"""
placeholder to impute style looks wanted columns
for now drop the columns
"""

# The style_looks_wanted_* indicator columns removed for now.
style_looks_columns = [
    "style_looks_wanted_dates",
    "style_looks_wanted_everyday",
    "style_looks_wanted_formal",
    "style_looks_wanted_nights",
    "style_looks_wanted_other",
    "style_looks_wanted_summer",
    "style_looks_wanted_travel",
    'style_looks_wanted_winter',
    "style_looks_wanted_work",
    "style_looks_wanted_workouts",
]

# Rebinding with errors="ignore" (instead of an in-place drop) keeps the cell
# re-runnable: columns that are already gone no longer raise KeyError.
df_users = df_users.drop(style_looks_columns, axis=1, errors="ignore")

In [None]:
## Write dataframe to CSV file
# index=False keeps the DataFrame index out of the written file.
# NOTE(review): the path contains a doubled slash ("../..//data"), presumably a
# typo -- confirm before normalising it.
df_users.to_csv("../..//data/processed/users.csv",index=False)

# EDA for Item Data

In [None]:
# Columns of the items sheet that are used in this notebook.
item_columns_tokeep = [
        'user_id', 'brand_id', 'user_provided_brand_name', 'parsed_brand_name',
        'store_id', 'user_provided_store_name','parsed_store_name','product_id',
        'item_name_lower', 'product_category_id', 'paid_price',
        'list_price', 'sale_price',
        'order_total_amt', 'size', 'email_dt', 'color_parsed']


def _normalize_name(name):
    """Return `name` with all whitespace removed and lower-cased."""
    return "".join(name.split()).lower()


# usecols did not work with parse (seems to be a bug), so the whole sheet is
# read and the columns are selected afterwards.
df_items = xlsx.parse(items_sheet)
# Rows with a null product ID are dropped. .copy() yields an independent frame so
# the column assignments below do not operate on a slice.
df_items = df_items.loc[df_items["product_id"].notnull(), item_columns_tokeep].copy()

# Keep the last 13 / 8 characters of the raw user / product ids.
df_items["user_id"] = df_items["user_id"].map(lambda x: x[-13:])
df_items["product_id"] = df_items["product_id"].map(lambda x: x[-8:])

# Missing category IDs become 0; assigned back instead of a chained in-place
# fillna, which does not reliably modify the frame.
df_items["product_category_id"] = (
    df_items["product_category_id"].fillna(0).astype("int64")
)

# "on_sale" marks purchases with a positive sale price; "part_of_order" marks
# items whose order total exceeds the price paid. Comparisons against a missing
# value evaluate to False, as before.
df_items["on_sale"] = df_items["sale_price"] > 0
df_items["part_of_order"] = df_items["order_total_amt"] > df_items["paid_price"]

# Brand id: fall back to the parsed name, then the user-provided name; normalise
# whatever is available and use the string "None" when nothing is.
df_items["brand_id"] = (
    df_items["brand_id"]
    .combine_first(df_items["parsed_brand_name"])
    .combine_first(df_items["user_provided_brand_name"])
    .map(_normalize_name, na_action="ignore")
    .fillna("None")
)

# Store id: same fallback chain on the store columns, then the brand id, then
# "None". The normalisation used to be guarded by the brand_id null mask, which
# is never null at this point, so a missing store name reached .split() and
# raised; missing values are now skipped by the normalisation step.
df_items["store_id"] = (
    df_items["store_id"]
    .combine_first(df_items["parsed_store_name"])
    .combine_first(df_items["user_provided_store_name"])
    .map(_normalize_name, na_action="ignore")
    .combine_first(df_items["brand_id"])
    .fillna("None")
)

# One vectorised membership test instead of rebuilding the brand list per row.
# NOTE(review): the flag is derived from store_id, not brand_id -- confirm intended.
df_items["top_brand"] = df_items["store_id"].isin(df_topbrands.tolist())


# Helper columns that are no longer needed once the derived fields exist.
item_drop_columns = [
    "sale_price",
    "order_total_amt",
    "user_provided_brand_name",
    "parsed_brand_name",
    "user_provided_store_name",
    "parsed_store_name"
]
df_items = df_items.drop(item_drop_columns, axis=1)


df_items.sample(15)


In [None]:
""" Seems like this part of order column can be dropped"""
# Distribution of the part_of_order flag.
df_items["part_of_order"].value_counts()

In [None]:
""" Leave on sale as is for now"""
# Distribution of the on_sale flag.
df_items["on_sale"].value_counts()

In [None]:
""" Category ID have lot of in correct values. Drop the rows with category ID that is not present in master list"""
# Number of items per product category ID.
df_items["product_category_id"].value_counts()

In [None]:
# Keep only the items whose category ID appears in the category master list.
# isin performs a single vectorised membership test; the previous per-row lambda
# rebuilt the full ID list for every item.
condition = df_items["product_category_id"].isin(df_cat["Category ID"])
df_items = df_items[condition]

In [None]:
# Item count per product category ID.
# The variable is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
plt.figure(figsize=(12,8))
g = sns.countplot(x="product_category_id", data=df_items)
# Rotate the tick labels so the category IDs stay readable.
loc, labels = plt.xticks(rotation=90)

In [None]:
# Round every category ID down to its hundreds bucket (e.g. 112 -> 100) and plot
# the item count per bucket.
major_cats = (df_items["product_category_id"] // 100) * 100
plt.figure(figsize=(8,8))
# x is passed by keyword: newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
g = sns.countplot(x=major_cats)
loc, labels = plt.xticks(rotation=90)

In [None]:
# Placeholder: no column subset is selected for the wishlist sheet.
wishlist_columns_tokeep = []

# Raw id columns that are replaced by the trimmed keys below.
wish_drop_columns = ["userid", "itemid"]

# Derive user_id / product_id from the last 13 / 8 characters of the raw ids,
# then discard the raw columns.
df_wish = (
    xlsx.parse(wishlist_sheet)
    .assign(user_id=lambda frame: frame["userid"].map(lambda raw: raw[-13:]))
    .assign(product_id=lambda frame: frame["itemid"].map(lambda raw: raw[-8:]))
    .drop(wish_drop_columns, axis=1)
)

df_wish.head()

In [None]:
# Number of distinct brand_id values (a missing value counts as one value).
df_items["brand_id"].nunique(dropna=False)

In [None]:
# Distinct store_id values, in order of first appearance.
df_items["store_id"].unique()

In [None]:
# Preview the first rows of the item table.
df_items.head()