# Dataset creation + formatting

The purpose of this notebook is to **run some initial data preparation prior to the modeling notebooks in this folder**. 

Some of the data preparation steps in this notebook include:
- Creation of the 5% sample and 10% sample datasets
- Creation of the tf-idf PCA columns based on the article descriptions
- Creation of the association analysis dataframe

# Import statements

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

from scipy import sparse 
from pandas.api.types import CategoricalDtype 

from sklearn.neighbors import NearestNeighbors
from scipy.spatial import KDTree

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score,f1_score,precision_score,recall_score
from sklearn.model_selection import GridSearchCV,GroupKFold
from sklearn.calibration import CalibratedClassifierCV

import xgboost as xgb

import nltk

from nltk import *
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from tqdm import tqdm
tqdm.pandas()

# Read in data + fix data types

In [None]:
# File-name suffix of the pre-built sample to load ('_05' selects the *_05.csv files)
sample = '_05'

# Load the article, customer and transaction tables for that sample
df_art = pd.read_csv(f'../Data/articles/articles{sample}.csv')
df_cust = pd.read_csv(f'../Data/customers/customers{sample}.csv')
df_trans = pd.read_csv(f'../Data/transactions_train/transactions_train{sample}.csv')

In [None]:
# Article IDs are handled as 10-character, zero-padded strings in both tables
df_art['article_id'] = df_art['article_id'].astype(str).str.zfill(10)
df_trans['article_id'] = df_trans['article_id'].astype(str).str.zfill(10)

# Descriptions are forced to str (a missing description becomes the literal 'nan')
df_art['detail_desc'] = df_art['detail_desc'].astype(str)

# Transaction dates as real datetimes
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])

# Age brackets 1..6 over right-closed bins (1,19], (19,29], ... (59,200];
# ages that are missing or fall outside the bins are assigned bracket 2
df_cust['Age_Bracket'] = pd.cut(
    df_cust['age'],
    bins=[1, 19, 29, 39, 49, 59, 200],
    labels=[1, 2, 3, 4, 5, 6],
).fillna(2)

# Collapse related master colours into one family; any colour not listed keeps its own name
colour_master = df_art['perceived_colour_master_name']
colour_families = {
    'Blue': ['Blue', 'Turquoise', 'Bluish Green'],
    'Green': ['Green', 'Yellowish Green', 'Khaki green'],
    'Brown': ['Brown', 'Beige', 'Mole'],
    'Grey': ['Grey', 'Metal'],
}
df_art['color'] = np.select(
    [colour_master.isin(shades) for shades in colour_families.values()],
    list(colour_families.keys()),
    default=colour_master.to_numpy(),
)

# Identify age + gender category of an article

In [None]:
# --- Age category: Baby/Children -> Kids, Divided -> YA, everything else -> Adult ---
index_group = df_art['index_group_name']
df_art['Age_Category'] = np.select(
    [index_group == 'Baby/Children', index_group == 'Divided'],
    ['Kids', 'YA'],
    default='Adult',
)

# --- Keyword lists used by the gender rules ---
femaleproducts = [
    'Dress', 'Leggings/Tights', 'Bag', 'Skirt', 'Bra', 'Hair/alice band', 'Blouse', 'Earring', 'Bikini top',
    'Hair string', 'Necklace', 'Bodysuit', 'Ballerinas', 'Pumps', 'Underwear Tights', 'Bracelet', 'Ring', 'Wedge',
]

fDepts = ['Ladies', 'Girls']
mDepts = ['Men', 'Boys']


def _values_containing(column, keywords):
    """Return the unique values of df_art[column] that contain any of `keywords` as a substring."""
    return [value for value in df_art[column].unique() if any(key in value for key in keywords)]


female_depts = _values_containing('department_name', fDepts)
male_depts = _values_containing('department_name', mDepts)

female_sect = _values_containing('section_name', fDepts)
male_sect = _values_containing('section_name', mDepts)

# Descriptions mentioning a female department or product as a lower-case, space-delimited word
_female_tokens = [' ' + word.lower() + ' ' for word in fDepts + femaleproducts]
female_desc = [desc for desc in df_art['detail_desc'].unique() if any(tok in desc for tok in _female_tokens)]


def _male_female_unknown(column, male_values, female_values):
    """Return 'M' / 'F' / 'U' per row of df_art[column]; the male list is checked first."""
    values = df_art[column]
    return np.select([values.isin(male_values), values.isin(female_values)], ['M', 'F'], default='U')


# --- Gender category ---
# Rules in priority order. Each yields 'F', 'M' or 'U' (unknown) per article; a later rule is
# consulted only for articles that every earlier rule left as 'U'.
gender_rules = [
    # 1. index name
    _male_female_unknown('index_name', ['Menswear'], ['Ladieswear', 'Ladies Accessories', 'Lingeries/Tights']),
    # 2. garment group
    _male_female_unknown('garment_group_name', [], ['Dresses Ladies', 'Blouses', 'Skirts']),
    # 3. product type
    _male_female_unknown('product_type_name', [], femaleproducts),
    # 4. department name
    _male_female_unknown('department_name', male_depts, female_depts),
    # 5. section name
    _male_female_unknown('section_name', male_sect, female_sect),
    # 6. description keywords
    _male_female_unknown('detail_desc', [], female_desc),
    # 7. Baby/Children articles only: Blue -> M, Pink -> F
    np.select(
        [index_group != 'Baby/Children', df_art['color'] == 'Blue', df_art['color'] == 'Pink'],
        ['U', 'M', 'F'],
        default='U',
    ),
]

gender = gender_rules[0]
for fallback in gender_rules[1:]:
    gender = np.where(gender != 'U', gender, fallback)

df_art['Gender_Category'] = gender

In [None]:
# Persist the articles table, now carrying the added columns, back to the file it was loaded from
df_art.to_csv(f'../Data/articles/articles{sample}.csv', index=False)

# Article description Tf-idf + PCA

In [None]:
# Tokens that contain a digit are excluded from the tf-idf vocabulary.
# The candidate vocabulary comes from a first-pass fit that uses only the built-in English stop
# words, so this cell does not depend on `feature_names` from a later cell (which raised a
# NameError on a fresh kernel).
first_pass_vocab = TfidfVectorizer(stop_words='english').fit(df_art['detail_desc']).vocabulary_
bad_words = sorted(token for token in first_pass_vocab if any(ch in '0123456789' for ch in token))

In [None]:
# Fit tf-idf on the article descriptions, excluding English stop words and `bad_words`
corpus = df_art['detail_desc']

# Passed as a list: recent scikit-learn validates `stop_words` as 'english', a list or None
# and rejects the frozenset produced by `.union(...)`
my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(bad_words))

vectorizer = TfidfVectorizer(stop_words=my_stop_words)
vectors = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed from scikit-learn in favour of `get_feature_names_out()`;
# fall back to the old name so older installations keep working
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(len(feature_names))

In [None]:
# Dense tf-idf table: one row per article description, one column per vocabulary term.
# Built straight from the dense array; converting the whole matrix to nested Python lists first
# only added memory and time without changing the values.
df = pd.DataFrame(vectors.toarray(), columns=feature_names)
df

In [None]:
# Keep only the terms whose summed tf-idf weight over all articles exceeds 500
# (columns end up ordered by ascending total weight)
term_totals = df.sum().sort_values()
heavy_terms = term_totals[term_totals > 500].index
df2 = df.loc[:, heavy_terms]
df2

In [None]:
# Reduce the filtered tf-idf columns to `components` principal components per article
components = 10

transformer = PCA(n_components=components, random_state=0)

# fit_transform fits the model itself, so no separate fit() call is needed beforehand
pca_scores = transformer.fit_transform(df2)

# One row per article: article_id followed by PCA1..PCA<components>
df3 = pd.DataFrame(
    pca_scores,
    columns=['PCA' + str(i) for i in range(1, components + 1)],
    index=df_art['article_id'],
).reset_index()

In [None]:
# Save the per-article PCA columns for later use
pca_output_path = '../Datasets/PCA_Vectorizer.csv'
df3.to_csv(pca_output_path, index=False)

# BUILD SAMPLE DATASET

Build the 5% and 10% sample datasets from active customers, defined as customers with at least 5 purchases whose most recent purchase falls within the last 365 days (on or after 2019-09-22 in the code below).

In [None]:
# Size of the customer sample to build, as a percentage of the number of rows in df_cust;
# the same value is used in the names of the output files written further down.
percentage = 10

In [None]:
# Per-customer purchase summary: number of distinct shopping days, most recent shopping day,
# and number of purchased article rows
df_num_purchases = (
    df_trans
    .groupby('customer_id')
    .agg(
        NumShoppingDays=('t_dat', 'nunique'),
        LastShoppingDay=('t_dat', 'max'),
        NumPurchases=('article_id', 'count'),
    )
    .reset_index()
)

In [None]:
# Active customers: last purchase on or after 2019-09-22 and at least 5 purchases
purchased_recently = df_num_purchases['LastShoppingDay'] >= '2019-09-22'
has_enough_purchases = df_num_purchases['NumPurchases'] >= 5

valid_customers = df_num_purchases.loc[purchased_recently & has_enough_purchases, ['customer_id']]
valid_customers

In [None]:
# Number of customers to draw: `percentage` percent of the customer table.
# Converted to int because DataFrame.sample only accepts an integer `n`
# (len(df_cust) * (percentage / 100) is a float).
n_sampled_customers = int(len(df_cust) * percentage / 100)

# Fixed seed so that re-running the notebook reproduces the same sample
df_cust_sample = valid_customers.sample(n=n_sampled_customers, replace=False, random_state=0)

# Restrict each table to the sampled customers / the articles they bought
df_trans_sample = df_trans.loc[df_trans['customer_id'].isin(df_cust_sample['customer_id'])]
df_cust_final = df_cust.loc[df_cust['customer_id'].isin(df_cust_sample['customer_id'])]
df_art_sample = df_art.loc[df_art['article_id'].isin(df_trans_sample['article_id'])]
df_trans_sample

In [None]:
# Write the sampled tables. The percentage is zero-padded to two digits so that a 5% sample is
# written as '*_05.csv', the name the loading cell at the top of this notebook reads
# (sample = '_05'); two-digit percentages such as 10 are unaffected.
sample_suffix = '_' + str(percentage).zfill(2)

df_art_sample.to_csv('../Data/articles/articles' + sample_suffix + '.csv', index=False)
df_trans_sample.to_csv('../Data/transactions_train/transactions_train' + sample_suffix + '.csv', index=False)
df_cust_final.to_csv('../Data/customers/customers' + sample_suffix + '.csv', index=False)

# Association analysis - filter down to articles sold in the 2 weeks prior

In [None]:
# For scalability, keep only transactions of articles that were sold on or after 2020-09-07
# (all transactions of those articles are kept, not just the recent ones)
is_recent = df_trans['t_dat'] >= '2020-09-07'
sold_last_week = df_trans.loc[is_recent, 'article_id'].unique()

keep_rows = df_trans['article_id'].isin(sold_last_week)
df_trans_train2 = df_trans.loc[keep_rows].copy()

In [None]:
# Association analysis: for each of the most-purchased articles, find the other articles that
# appear most often in the transactions of the customers who bought it.
# NOTE: the original timing note was 1.1 hours for 1500 articles; this cell processes up to
# 15000 articles, so expect a long run.
# NOTE(review): the counts below are transaction rows, not distinct customers - confirm that
# this is the intended measure.

top_articles = df_trans_train2['article_id'].value_counts()[:15000]

# Unique buyers of every article, computed once instead of re-scanning the transactions for
# each article inside the loop
buyers_by_article = df_trans_train2.groupby('article_id')['customer_id'].unique()

# Start from an empty dictionary so the cell also runs on a fresh kernel
art_dict = {}

for art_id in tqdm(top_articles.index):
    buyers = buyers_by_article[art_id]
    from_same_buyers = df_trans_train2['customer_id'].isin(buyers)
    is_other_article = df_trans_train2['article_id'] != art_id
    # Up to 12 most frequent other articles among those buyers' transactions
    others = df_trans_train2.loc[from_same_buyers & is_other_article, 'article_id'].value_counts()[:12]
    art_dict[art_id] = others

# Keep only the associated article ids (most frequent first), dropping the counts
art_dict2 = {art_id: list(counts.index) for art_id, counts in art_dict.items()}



# Convert the association dictionary into a CSV so this long process does not need repeating.
# One row per article: article_id followed by its associated article ids in columns 0, 1, 2, ...
# from_dict(orient='index') pads shorter lists with missing values, whereas building the frame
# column-wise and transposing raises when the lists differ in length.
df_artdict = (
    pd.DataFrame.from_dict(art_dict2, orient='index')
    .reset_index()
    .rename(columns={'index': 'article_id'})
)
df_artdict.to_csv('../Data/association_v2.csv', index=False)