#### This notebook contains code to clean and consolidate all data into a single dataset. Please upvote if you find it useful and share your comments if you have any suggestions

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the three raw competition tables: customers, articles and the
# training transactions that link the two.
df_cust = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/customers.csv"
)
df_articles = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/articles.csv"
)
df_trx = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
)

## Customer Data Cleaning

In [None]:
df_cust.info()

In [None]:
df_cust.head()

#### We can see there are some null values in columns: 'FN','Active','club_member_status','fashion_news_frequency'  
#### And column 'fashion_news_frequency' has 2 'None' values instead of 'NONE'

In [None]:
# For each customer column that contains missing values, show how many
# entries are null and how the remaining values are distributed.
for column in ('FN', 'Active', 'club_member_status', 'fashion_news_frequency'):
    column_values = df_cust[column]
    print("null values: ", column_values.isna().sum())
    print(column_values.value_counts())
    print("----------------------------")

#### Replacing NaN values in FN and Active columns with 0
#### Replacing NaN values in the club_member_status column with "ACTIVE" and in the fashion_news_frequency column with "NONE", as these are the modal (most frequent) values

In [None]:
# Fill the missing customer attributes and assign the result back to df_cust.
# Calling fillna(..., inplace=True) on df_cust[col] works on an intermediate
# Series; that chained form is deprecated and does not update the frame when
# pandas Copy-on-Write is active, so the filled frame is rebound explicitly.
df_cust = df_cust.fillna(
    value={
        'FN': 0,                            # flag column: missing treated as 0
        'Active': 0,                        # flag column: missing treated as 0
        'club_member_status': "ACTIVE",     # most frequent status
        'fashion_news_frequency': "NONE",   # most frequent frequency
    }
)

#### Replacing 2 "None" values with "NONE"

In [None]:
# Fold the mixed-case "None" spelling into the canonical "NONE" category;
# every other value (including missing ones) is left untouched.
df_cust['fashion_news_frequency'] = df_cust['fashion_news_frequency'].replace({"None": "NONE"})

In [None]:
# Re-run the null / value-count report to confirm the cleaning above took effect.
cleaned_columns = ['FN', 'Active', 'club_member_status', 'fashion_news_frequency']
for col_name in cleaned_columns:
    n_missing = df_cust[col_name].isna().sum()
    print("null values: ", n_missing)
    print(df_cust[col_name].value_counts())
    print("----------------------------")

## Articles Data Cleaning

In [None]:
df_articles.info()

In [None]:
df_articles.head()

#### We see that the number of codes does not match the number of respective names for some columns, such as product name and code

In [None]:
df_articles.nunique().sort_values()

#### Creating dictionaries for name and code combinations

In [None]:
def _build_code_lookup(code_col, name_col):
    """Return a ``{code: name}`` dict for one code/name column pair of ``df_articles``.

    The names are indexed by their code, sorted by code and converted to a
    dict, so a code that appears on several rows ends up with a single name.
    """
    names_by_code = pd.Series(df_articles[name_col].values, index=df_articles[code_col])
    return names_by_code.sort_index().to_dict()


# (code column, name column) for every attribute that has both a code and a name.
_lookup_columns = {
    'graphical_appearance': ('graphical_appearance_no', 'graphical_appearance_name'),
    'index_group': ('index_group_no', 'index_group_name'),
    'perceived_colour_value': ('perceived_colour_value_id', 'perceived_colour_value_name'),
    'index': ('index_code', 'index_name'),
    'perceived_colour_master': ('perceived_colour_master_id', 'perceived_colour_master_name'),
    'garment_group': ('garment_group_no', 'garment_group_name'),
    'colour_group': ('colour_group_code', 'colour_group_name'),
    'section': ('section_no', 'section_name'),
    'department': ('department_no', 'department_name'),
    'product_type': ('product_type_no', 'product_type_name'),
    "product": ('product_code', 'prod_name'),
}

# All lookups, keyed by attribute name.
dict_list = {
    attribute: _build_code_lookup(code_col, name_col)
    for attribute, (code_col, name_col) in _lookup_columns.items()
}

# Expose each lookup under its own name as well; these are the very same dict
# objects that live in dict_list, so later updates are visible through both.
graphical_appearance = dict_list['graphical_appearance']
index_group = dict_list['index_group']
perceived_colour_value = dict_list['perceived_colour_value']
index = dict_list['index']
perceived_colour_master = dict_list['perceived_colour_master']
garment_group = dict_list['garment_group']
colour_group = dict_list['colour_group']
section = dict_list['section']
department = dict_list['department']
product_type = dict_list['product_type']
product = dict_list["product"]

In [None]:
dict_list    

#### Finding names having multiple codes each and replacing with single code

In [None]:

def find_duplicate_value(col_dict):
    """Return the values of ``col_dict`` that occur under more than one key.

    Every value is converted with ``str`` before counting, so the returned
    pandas index holds the string form of each repeated value.
    """
    names_as_text = pd.array([str(value) for value in col_dict.values()])
    occurrences = names_as_text.value_counts()
    repeated = occurrences[occurrences.values > 1]
    return repeated.index

In [None]:
def get_duplicate_value_keys(col,col_dict):
    """Map every name that has several codes to the list of those codes.

    Parameters
    ----------
    col : unused
        Not read by this function; kept so existing calls keep working.
    col_dict : dict
        ``{code: name}`` mapping to inspect.

    Returns
    -------
    dict
        ``{name: [code, ...]}`` for each name reported by
        ``find_duplicate_value``, with codes in ``col_dict`` iteration order.
    """
    # Invert the mapping once instead of rescanning col_dict for every
    # duplicated name. Only string values are grouped: the duplicated names are
    # strings, so a non-string value could never compare equal to one of them.
    codes_by_name = {}
    for code, value in col_dict.items():
        if isinstance(value, str):
            codes_by_name.setdefault(value, []).append(code)

    return {name: codes_by_name.get(name, []) for name in find_duplicate_value(col_dict)}



In [None]:
# For section, product type, department and product: names carrying several codes.
dup_section_dict = get_duplicate_value_keys(df_articles['section_name'], section)
dup_prodtype_dict = get_duplicate_value_keys(df_articles['product_type_name'], product_type)
dup_dept_dict = get_duplicate_value_keys(df_articles['department_name'], department)
dup_prod_dict = get_duplicate_value_keys(df_articles['prod_name'], product)

# Print the four mappings, one per line, separated by a divider line.
divider = "---------------------------------------------------------------------------------------------------------"
print(
    dup_section_dict,
    divider,
    dup_prodtype_dict,
    divider,
    dup_dept_dict,
    divider,
    dup_prod_dict,
    sep="\n",
)

In [None]:
def replace_duplicate_codes(df,name_col,code_col,dup_dict):
    """Give every row whose name has several codes the first of those codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place and also returned.
    name_col : str
        Column holding the names to look up in ``dup_dict``.
    code_col : str
        Column whose value is overwritten for matching rows.
    dup_dict : dict
        ``{name: [code, ...]}``; the first code of each list is the one kept.
        Names with an empty code list are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the update.
    """
    canonical_code = {name: codes[0] for name, codes in dup_dict.items() if len(codes) > 0}
    affected = df[name_col].isin(list(canonical_code))
    if affected.any():
        # One vectorised .loc assignment instead of chained df[col][i] = ... per
        # row: chained assignment does not reliably write back to the frame, and
        # the positional-looking df[col][i] lookup only works on a 0..n-1 index.
        # to_numpy() drops the index so rows are matched by position in the mask.
        df.loc[affected, code_col] = df.loc[affected, name_col].map(canonical_code).to_numpy()
    return df

In [None]:
# Collapse the duplicated codes of each attribute onto a single code,
# one (name column, code column, duplicates) triple at a time.
for name_col, code_col, duplicates in (
    ('section_name', 'section_no', dup_section_dict),
    ('product_type_name', 'product_type_no', dup_prodtype_dict),
    ('department_name', 'department_no', dup_dept_dict),
    ('prod_name', 'product_code', dup_prod_dict),
):
    df_articles = replace_duplicate_codes(df_articles, name_col, code_col, duplicates)

In [None]:
df_articles[df_articles.section_name=='Ladies Other']['section_no'].values

In [None]:
df_articles[df_articles.product_type_name=='Umbrella']['product_type_no'].values

In [None]:
df_articles[df_articles.department_name=='Knitwear']['department_no'].values

In [None]:
df_articles[df_articles.prod_name=='Molly dress']['product_code'].values

#### The number of names and codes now match for section, product type and department, but there is still a difference between product name and code. This means that some codes are assigned to more than one name

In [None]:
df_articles.nunique().sort_values()

#### Finding missing product names from the dictionary

In [None]:
# Product names that occur in df_articles but are absent from the values of
# the `product` code -> name dictionary.
# The known names are collected into a set once, so each membership test is
# constant time instead of a scan over all dictionary values. Only string
# values are collected: if this cell is re-run after some entries of `product`
# have been replaced by sets of names, those (unhashable) sets are skipped.
known_prod_names = {name for name in product.values() if isinstance(name, str)}
missing_prod_name = [
    name for name in df_articles.prod_name.unique() if name not in known_prod_names
]

missing_prod_name

#### Finding product codes associated with missing product names and adding those names to the dictionary

In [None]:

# For every missing product name, look up the product code of its first row
# and store under that code the set of ALL names sharing the code.
# Both lookups are computed once up front rather than re-filtering the whole
# frame for every name.
names_by_code = df_articles.groupby('product_code')['prod_name'].agg(set)
first_code_by_name = (
    df_articles.drop_duplicates('prod_name').set_index('prod_name')['product_code']
)

covered_names = set()
for name in missing_prod_name:
    code = first_code_by_name[name]
    product[code] = set(names_by_code[code])
    covered_names |= product[code]

# Keep only the names that are still unaccounted for (expected to be none,
# since each name belongs to the set stored under its own code).
missing_prod_name = [name for name in missing_prod_name if name not in covered_names]
    

In [None]:
missing_prod_name

#### The missing product names are now stored in the dictionary as a set of names under their shared product code

In [None]:
product

#### Some of the columns have -1 values probably referring to missing data

In [None]:
print("columns having -1 values: \n")
# A column qualifies when -1 is one of its distinct (non-null) values,
# i.e. -1 appears among the labels of its value counts.
cols_missing_value = [
    column
    for column in df_articles.columns
    if -1 in df_articles[column].value_counts().index
]
print(cols_missing_value)

#### The -1 value in all code columns refers to the 'Unknown' category. Therefore the -1 values are kept as they are

In [None]:
product_type[-1]

In [None]:
graphical_appearance[-1]

In [None]:
colour_group[-1]

In [None]:
perceived_colour_value[-1]

In [None]:
perceived_colour_master[-1]

In [None]:
# Write the cleaned customer and article tables to CSV without the row index.
for cleaned_frame, out_name in (
    (df_cust, "customers_clean.csv"),
    (df_articles, "articles_clean.csv"),
):
    cleaned_frame.to_csv(out_name, index=False)

## Merging Transaction data with Customer and Articles data to form final dataset

In [None]:
df_trx.info()

In [None]:
df_trx.head()

In [None]:
# Start from the customers, attach their transactions, then attach the
# article attributes of each purchased article. Both joins are left joins,
# so every customer row is kept even when no matching transaction exists.
df = (
    df_cust
    .merge(df_trx, on="customer_id", how="left")
    .merge(df_articles, on='article_id', how='left')
)

#### Final dataset: df

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
df.to_csv("hm_data.csv",index=False)