In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
import gensim
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
DBFO = "/dbfs/FileStore/tables/OFFSHORE/"
DBFM = "/dbfs/FileStore/tables/MALLIK/"
DBFR = "/dbfs/FileStore/tables/OFFSHORE_RESULTS/"
path = 'C:\\users\\iny2819\\kroger\\Data\\'   
DBFS = path
DBFO = path
DBFM = path
DBFR = path


In [3]:
# Reading PIMMART data
pim_gtin_mapped = pd.read_csv(DBFR + "PIM_Data_New_50_82Mn.csv", dtype=object)
for i in ['SUBCOM_CD', 'DPT_CD', 'COM_CD','PMY_DPT_CD', 'REC_DPT_CD', 'ITM_ID', 'GTIN']:
    pim_gtin_mapped[i] = pim_gtin_mapped[i].astype(np.float64)

# Reading Syndigo 259K data
synd_ALL = pd.read_csv(DBFR + 'Syndigo_Final_ALL.csv') # 259k Syndigo Data
for i in ['SUBCOM_CD', 'DPT_CD', 'COM_CD', 'GTIN', 'ITM_ID', 'PMY_DPT_CD']:
    synd_ALL[i] = synd_ALL[i].astype(np.float64)

# Trimming empty spaces from all columns
df_obj = synd_ALL.select_dtypes(['object'])
synd_ALL[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

  synd_ALL = pd.read_csv(DBFR + 'Syndigo_Final_ALL.csv') # 259k Syndigo Data


In [4]:
syndigo_mapped = synd_ALL
pimmart = pim_gtin_mapped

In [5]:
syndigo_mapped.drop_duplicates('GTIN', inplace = True)

In [6]:
syndigo_mapped['ITEM_SUBCOM_text'] = \
(syndigo_mapped.VND_ECOM_DSC + ' ' + syndigo_mapped.SUBCOM_DSC).fillna('').str.lower()

In [7]:
syndigo_mapped['Level 1'].value_counts()

Food / Beverages                            131593
Health & Beauty                              62259
Beer / Wine / Spirits                        17992
Cleaning & Janitorial                         9084
Livestock & Pet Supplies                      6325
Kitchen & Bathroom                            5268
Home & Venue Decoration                       3924
Toys / Games / Hobbies                        3191
Gardening & Outdoors                          2208
Childcare                                     2111
Office Supplies                               1671
Electronics                                   1499
Apparel                                       1283
Lighting & Fans                               1252
Tobacco Products                              1252
Not classified                                1018
Hardware                                       923
Arts & Crafts                                  803
Automotive                                     778
Appliances                     

from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(ngram_range= (1,2), max_features = 800000)

syndigo_mapped['Level 1'].value_counts()

In [11]:
### import word2vec model trained on entire PIMMART data
from gensim.models import Word2Vec, KeyedVectors
#model = KeyedVectors.load_word2vec_format('/dbfs/FileStore/tables/DATA_SCIENCE/w2vmodel_053123_PIM_ALL.bin', binary=True)
model = KeyedVectors.load_word2vec_format(path + 'w2vmodel_053123_PIM_ALL.bin', binary=True)                                        
model.most_similar('chips')

[('chips,', 0.7620069980621338),
 ('puffed/popped', 0.6928106546401978),
 ('veggie/grain', 0.6568589210510254),
 ('chips.', 0.6461755037307739),
 ('popped', 0.6434525847434998),
 ('snacks', 0.6221341490745544),
 ('crisps', 0.6071634292602539),
 ('thins', 0.5962806344032288),
 ('<', 0.570486307144165),
 ('kurokirishima', 0.5686667561531067)]

In [12]:
#Build item vectors
def get_item_vector(item_vocab):
    vect = np.zeros_like(model.get_vector('chips'))
    for word in item_vocab:
        if word in model:
            vect += model.get_vector(word)
    return vect#/max(1,len(item_vocab))

## RUN UNTIL HERE only ONCE

In [13]:
level__1 = "Beer / Wine / Spirits"
filenamee = "BEER_WINE_SPIRITS"

In [14]:
# level__1 = "Beer / Wine / Spirits"
# filenamee = "BEER_WINE_SPIRITS"

level__1 = "Cleaning & Janitorial"
filenamee = "CLEANING_JANITORIAL"

level__1 = "Toys / Games / Hobbies"
filenamee = "TOYS_GAMES_HOBBIES"

level__1 = "Office Supplies"

level__1 = "Building Supplies"
filenamee = "Building_Supplies"

level__1 = "Flooring"
filenamee = "FLOORING"

level__1 = "Beer / Wine / Spirits"
filenamee = "BEER_WINE_SPIRITS"


# Electronics

level__1 = "Food / Beverages"
filenamee = "Food / Beverages"
syndigo_mapped[syndigo_mapped['Level 1']==level__1]['Level 2'].value_counts(ascending=True)

Meal Kits (Perishable)                                        309
Baby & Toddler Food                                          1623
Produce                                                      1649
Meat / Poultry / Seafood / Meat Substitutes (Perishable)     3976
Bakery / Deli                                                5333
Dairy & Egg Products                                         9018
Frozen Foods                                                13582
Beverages                                                   21240
Grocery                                                     74863
Name: Level 2, dtype: int64

#### word2vec Vectorization

In [15]:
level_1 = level__1
subset_df = syndigo_mapped[syndigo_mapped['Level 1'] == level_1]
x_subset_w2v = np.array(list(subset_df.ITEM_SUBCOM_text.apply(lambda x: get_item_vector(x.split(' ')))))
level2_id_map = dict(zip(subset_df['Level 2'].fillna('Other').unique(), range(subset_df['Level 2'].fillna('Other').nunique())))
id2_level_map = dict(zip(range(subset_df['Level 2'].fillna('Other').nunique()), subset_df['Level 2'].fillna('Other').unique()))
y_subset_w2v  = subset_df['Level 2'].fillna('Other').map(level2_id_map)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x_subset_w2v, y_subset_w2v.values, test_size= 0.2, stratify=y_subset_w2v, random_state=42)

### MLP Algorithm

In [17]:
mlp = MLPClassifier(hidden_layer_sizes=(200,), activation = 'relu', learning_rate= 'adaptive', early_stopping= True)
#print(f"Test size: {len(A_test)}\nTrain size: {len(A_train)}\n")
print(f"Training - w2v + MLP \"{level_1}\"")
warnings.filterwarnings("ignore", category=UserWarning)
mlp.fit(X_train, y_train)

Training - w2v + MLP "Food / Beverages"


In [19]:
print(' Display MLP test  metrics')
preds_test_mlp = mlp.predict(X_test)
preds_train_mlp = mlp.predict(X_train)
print(classification_report(y_test, preds_test_mlp,labels = mlp.classes_, target_names = [id2_level_map[i] for i in mlp.classes_]))

 Display MLP test  metrics
                                                          precision    recall  f1-score   support

                                    Dairy & Egg Products       0.89      0.93      0.91      1804
                                                 Grocery       0.93      0.96      0.95     14973
                                               Beverages       0.97      0.95      0.96      4248
Meat / Poultry / Seafood / Meat Substitutes (Perishable)       0.70      0.73      0.71       795
                                            Frozen Foods       0.89      0.88      0.88      2716
                                           Bakery / Deli       0.67      0.41      0.51      1067
                                     Baby & Toddler Food       0.85      0.90      0.88       324
                                                 Produce       0.64      0.61      0.62       330
                                  Meal Kits (Perishable)       0.30      0.19      0.24   

### Oversampling minority classes

In [None]:
%pip install -U imbalanced-learn

In [20]:
#Original Level Counts
dict(pd.Series(y_train).value_counts())

{1: 59890,
 2: 16992,
 4: 10866,
 0: 7214,
 5: 4266,
 3: 3181,
 7: 1319,
 6: 1299,
 8: 247}

In [21]:
dic_y_train = dict(pd.Series(y_train).value_counts())

In [23]:
max_cnt = max(dic_y_train.values())
max_cnt

59890

In [28]:
def get_smp_strgy
    """
        get_smp_strgy  :  get sampling_strategy for SMOTE
    """
    des_lvl_cnt = {} 
    adj_ratio = 0.2
    adj_cnt =  int(max(dic_y_train.values()) * adj_ratio)
    for key, values in dic_y_train.items():
        if values < adj_cnt:
           print(f'change value key = {key}, values = {values}')
           des_lvl_cnt[key] = adj_cnt
        else: 
            print(f'no change value key = {key}, values = {values}')
            des_lvl_cnt [key] = values
            
    des_lvl_cnt.items()    

11978

In [47]:
des_lvl_cnt = {} 
adj_ratio = 0.2
adj_cnt =  int(max(dic_y_train.values()) * adj_ratio)
for key, values in dic_y_train.items():
    if values < adj_cnt:
       print(f'change value key = {key}, values = {values}')
       des_lvl_cnt[key] = adj_cnt
    else: 
        print(f'no change value key = {key}, values = {values}')
        des_lvl_cnt [key] = values
        
des_lvl_cnt.items()         
    

no change value key = 1, values = 59890
no change value key = 2, values = 16992
change value key = 4, values = 10866
change value key = 0, values = 7214
change value key = 5, values = 4266
change value key = 3, values = 3181
change value key = 7, values = 1319
change value key = 6, values = 1299
change value key = 8, values = 247


dict_items([(1, 59890), (2, 16992), (4, 11978), (0, 11978), (5, 11978), (3, 11978), (7, 11978), (6, 11978), (8, 11978)])

In [34]:
des_lvl_cnt

{1: 59890,
 2: 11978,
 4: 11978,
 0: 11978,
 5: 11978,
 3: 11978,
 7: 11978,
 6: 11978,
 8: 11978}

In [48]:
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE(random_state= 42, sampling_strategy= des_lvl_cnt ).fit_resample(X_train, y_train)
print('Before Resampling:\n',pd.Series(y_train).value_counts())
print('After Resampling:\n',pd.Series(y_resampled).value_counts())

Before Resampling:
 1    59890
2    16992
4    10866
0     7214
5     4266
3     3181
7     1319
6     1299
8      247
dtype: int64
After Resampling:
 1    59890
2    16992
0    11978
4    11978
3    11978
7    11978
5    11978
6    11978
8    11978
dtype: int64


In [49]:
mlp_smote = MLPClassifier(hidden_layer_sizes=(200,), activation = 'relu', learning_rate= 'adaptive', early_stopping= True)
#print(f"Test size: {len(A_test)}\nTrain size: {len(A_train)}\n")
print(f"Training - w2v + MLP \"{level_1}\"")
warnings.filterwarnings("ignore", category=UserWarning)
mlp_smote.fit(X_resampled, y_resampled)

Training - w2v + MLP "Food / Beverages"


In [43]:
print(' Display MLP + SMOTE test  metrics')
preds_test_mlp_smote = mlp_smote.predict(X_test)
preds_train_mlp_smote = mlp_smote.predict(X_resampled)
print(classification_report(y_test, preds_test_mlp_smote,labels = mlp_smote.classes_, target_names = [id2_level_map[i] for i in mlp_smote.classes_]))

 Display MLP + SMOTE test  metrics
                                                          precision    recall  f1-score   support

                                    Dairy & Egg Products       0.92      0.90      0.91      1804
                                                 Grocery       0.94      0.95      0.95     14973
                                               Beverages       0.97      0.95      0.96      4248
Meat / Poultry / Seafood / Meat Substitutes (Perishable)       0.68      0.76      0.72       795
                                            Frozen Foods       0.89      0.88      0.88      2716
                                           Bakery / Deli       0.64      0.46      0.53      1067
                                     Baby & Toddler Food       0.81      0.94      0.87       324
                                                 Produce       0.54      0.70      0.61       330
                                  Meal Kits (Perishable)       0.28      0.32     

In [50]:

test_metrics = pd.DataFrame(classification_report(y_test, preds_test_mlp_smote,labels = mlp_smote.classes_, target_names = [id2_level_map[i] for i in mlp_smote.classes_],  output_dict= True))
#test_metrics = pd.DataFrame(classification_report(y_test, preds,labels = lr_tf.classes_, target_names = [id_l1_l2_map[i] for i in lr_tf.classes_],  output_dict= True)).T

In [54]:
t_test_metrics = test_metrics.T

Unnamed: 0,precision,recall,f1-score,support
Dairy & Egg Products,0.923599,0.904656,0.91403,1804.0
Grocery,0.940012,0.951312,0.945628,14973.0
Beverages,0.965822,0.951271,0.958491,4248.0
Meat / Poultry / Seafood / Meat Substitutes (Perishable),0.684869,0.757233,0.719235,795.0
Frozen Foods,0.886506,0.882916,0.884708,2716.0
Bakery / Deli,0.641261,0.457357,0.533917,1067.0
Baby & Toddler Food,0.81016,0.935185,0.868195,324.0
Produce,0.541176,0.69697,0.609272,330.0
Meal Kits (Perishable),0.28169,0.322581,0.300752,62.0
accuracy,0.910293,0.910293,0.910293,0.910293


In [53]:
test_metrics

Unnamed: 0,Dairy & Egg Products,Grocery,Beverages,Meat / Poultry / Seafood / Meat Substitutes (Perishable),Frozen Foods,Bakery / Deli,Baby & Toddler Food,Produce,Meal Kits (Perishable),accuracy,macro avg,weighted avg
precision,0.923599,0.940012,0.965822,0.684869,0.886506,0.641261,0.81016,0.541176,0.28169,0.910293,0.741678,0.909563
recall,0.904656,0.951312,0.951271,0.757233,0.882916,0.457357,0.935185,0.69697,0.322581,0.910293,0.762165,0.910293
f1-score,0.91403,0.945628,0.958491,0.719235,0.884708,0.533917,0.868195,0.609272,0.300752,0.910293,0.748248,0.909032
support,1804.0,14973.0,4248.0,795.0,2716.0,1067.0,324.0,330.0,62.0,0.910293,26319.0,26319.0


In [32]:
desired_levelcount = {1: 59890,
 2: 16992,
 4: 10866,
 0: 10000,
 5: 10000,
 3: 10000,
 7: 10000,
 6: 10000,
 8: 10000}

#### IGNORE BELOW

preds = lr_tf.predict(X_test)
preds_lrtf = preds
probs = lr_tf.predict_proba(X_test)
preds_train = lr_tf.predict(X_train)
probs_train = lr_tf.predict_proba(X_train)
print(f"Done Training - \"{level_1}\"")

level_num = []
testProobs = []
number = 1

print(len(preds))
for i in range(len(preds)):
    for j in range(number):
        try:
            level_num.append(int(np.where(probs[i]==probs[i][np.argsort(probs[i])][::-1][:3][j])[0]))
        except TypeError:
            level_num.append(int(np.where(probs[i]==probs[i][np.argsort(probs[i])][::-1][:3][2])[0][0]))
            print("problem occurred with: ", i, j, flush=True)
    testProobs.append(probs[i][np.argsort(probs[i])][::-1][:1])

test_new_proobs = []
for i in range(len(testProobs)):
    test_new_proobs.append(testProobs[i].tolist())
test_new_proobs = [element for sublist in list(test_new_proobs) for element in sublist]

level_num = []
trainProobs = []

print(len(preds_train))
for i in range(len(preds_train)):
    for j in range(number):
        try:
            level_num.append(int(np.where(probs_train[i]==probs_train[i][np.argsort(probs_train[i])][::-1][:3][j])[0]))
        except TypeError:
            level_num.append(int(np.where(probs_train[i]==probs_train[i][np.argsort(probs_train[i])][::-1][:3][2])[0][0]))
            print("problem occurred with: ", i, j, flush=True)
    trainProobs.append(probs_train[i][np.argsort(probs_train[i])][::-1][:1]) 

train_new_proobs = []
for i in range(len(trainProobs)):
    train_new_proobs.append(trainProobs[i].tolist())
train_new_proobs = [element for sublist in list(train_new_proobs) for element in sublist]

testLevels = []
for j in y_test:
    testLevels.append([i for i in level2_id_map if level2_id_map[i]==j][0])

testLevelss = []
for j in preds:
    testLevelss.append([i for i in level2_id_map if level2_id_map[i]==j][0])

trainLevelss = []
for j in preds_train:
    trainLevelss.append([i for i in level2_id_map if level2_id_map[i]==j][0])

print("---------------------------------------------\nFINAL COUNTS:\n---------------------------------------------")
print("1)", len(A_test + A_train))
print("2)", len(['Test']*len(A_test) + ['Train']*len(A_train)))
print("3)", len(B_test + B_train))
print("4)", len(testLevels + B_train))
print("5)", len(testLevelss + trainLevelss))
print("6)", int(len(test_new_proobs + train_new_proobs)))

data = {
    'GTIN' : A_test + A_train,
    'Source': ['Test']*len(A_test) + ['Train']*len(A_train),
    'Actual Level 2' : B_test + B_train,
    'Actuals' : testLevels + B_train,
    'Predictions' : testLevelss + trainLevelss,
    'Scores' : test_new_proobs + train_new_proobs,
    'L1 Name' : level_1
}

df = pd.DataFrame(data)
df = df.merge(pim_gtin_mapped[['GTIN', 'ITM_ID', 'PMY_DPT_CD', 'PMY_DPT_DSC', 'REC_DPT_CD',
    'REC_DPT_DSC', 'DPT_CD', 'DPT_DSC', 'COM_CD', 'COM_DSC', 'SUBCOM_CD',
    'SUBCOM_DSC', 'VND_ECOM_DSC']], on='GTIN', how='left')
df = df[['ITM_ID', 'GTIN', 'PMY_DPT_CD', 'PMY_DPT_DSC', 'REC_DPT_CD', 'REC_DPT_DSC', 'DPT_CD',
    'DPT_DSC', 'COM_CD', 'COM_DSC', 'SUBCOM_CD', 'SUBCOM_DSC',
    'VND_ECOM_DSC', 'Source', 'Actual Level 2', 'Actuals', 'Predictions', 'Scores', 'L1 Name']]

## Keep the output of the below cell for all Level 1 executed

df.to_csv(DBFO + filenamee + '.csv', index=False)
print(f"{filenamee}.csv")