# 1.2 - Multi-Class Classification using Random Forest

In [20]:
import pandas as pd
import regex
from IPython.core.display import display
df = pd.read_csv('../data/interim/ecommerce_data-cleaned-0.2.3.csv', index_col=0)
df.head()

Unnamed: 0,brand,name,description,category_raw,price_raw,discount_raw
0,La Costeï¿½ï¿½a,"La Costena Chipotle Peppers, 7 OZ (Pack of 12)",We aim to show you accurate product informati...,"Food | Meal Solutions, Grains & Pasta | Canned...",31.93,31.93
1,Equate,Equate Triamcinolone Acetonide Nasal Allergy S...,We aim to show you accurate product informati...,Health | Equate | Equate Allergy | Equate Sinu...,10.48,10.48
2,AduroSmart ERIA,AduroSmart ERIA Soft White Smart A19 Light Bul...,We aim to show you accurate product informati...,Electronics | Smart Home | Smart Energy and Li...,10.99,10.99
3,lowrider,"24"" Classic Adjustable Balloon Fender Set Chro...",We aim to show you accurate product informati...,Sports & Outdoors | Bikes | Bike Accessories |...,38.59,38.59
4,Anself,Elephant Shape Silicone Drinkware Portable Sil...,We aim to show you accurate product informati...,Baby | Feeding | Sippy Cups: Alternatives to P...,5.81,5.81


In [21]:
# Setup constants.
DATA_DIR = "./../data/interim/"
BASENAME = "ecommerce_data-cleaned-0.2.3"
EXT = "csv"

## 1.2.1 - Finding the predictors for classification

Based on the features described in the dataframe above, we need to select the appropriate features for classification
of a Walmart product into the appropriate list price label. Currently, let us consider the feature (or column) - 
'name' for the classification. The 'name' of the Walmart product will act as our output that needs to be classified.

First, we need to remove the missing values from the column 'name' and add a column for the output category. As 
described in the notebook - "0.2-rimij405-feature-eda.ipynb", we can consider the following range labels as list price 
categories, and assign them an integer label between 0-9. 

**List Price Range** | **Class**
:--------------------:|:-------------:
*price <= 10*        | 0
*10 < price <= 20*  | 1
*20 < price <= 25* | 2
*25 < price <= 30* | 3
*30 < price <= 35* | 4
*35 < price <= 40* | 5
*40 < price <= 45* | 6
*45 < price <= 50* | 7
*50 < price <= 100* | 8
*price > 100* | 9

## 1.2.2 - Text Preprocessing and adding Classification Labels

Now we will remove punctuation, numbers and special characters from the name column as
we need to use the keywords from it in our classifier. All the words will be converted into lower-case for uniformity 
and stemmed using NLTK package's Porter Stemmer. This is done to reduce the size of vocabulary space and improve volume
of feature space. 

Additionally, we will add a column with the integer labels for list price classification to the data frame.

In [22]:
import pandas as pd
import re
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle

stemmer = PorterStemmer()
words = stopwords.words("english")

df = pd.read_csv('../data/interim/ecommerce_data-cleaned-0.2.3.csv', index_col=0)

def get_range_label(price):
    value = np.round(price, decimals=1)
    if value <= 10:
        return 0
    elif 10 < value <= 20:
        return 1
    elif 20 < value <= 25:
        return 2
    elif 25 < value <= 30:
        return 3
    elif 30 < value <= 35:
        return 4
    elif 35 < value <= 40:
        return 5
    elif 40 < value <= 45:
        return 6
    elif 45 < value <= 50:
        return 7
    elif 50 < value <= 100:
        return 8
    else:
        return 9

df['labels'] = df['price_raw'].apply(lambda x: get_range_label(x))
display(df)

# df[cleaned_description] = df['description'].apply(lambda x: " ".join([stemmer.stem(i) 
#                                                                     for i in re.sub("[^a-zA-Z]", " ", x).split() 
#                                                                     if i not in words]).lower())
df['cleaned_name'] = df.name.apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
# df.head()
display(df)

vectorizer = TfidfVectorizer(min_df= 5, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(df['cleaned_name']).toarray()
final_features.shape

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sheenambhatia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,brand,name,description,category_raw,price_raw,discount_raw,labels
0,La Costeï¿½ï¿½a,"La Costena Chipotle Peppers, 7 OZ (Pack of 12)",We aim to show you accurate product informati...,"Food | Meal Solutions, Grains & Pasta | Canned...",31.93,31.93,4
1,Equate,Equate Triamcinolone Acetonide Nasal Allergy S...,We aim to show you accurate product informati...,Health | Equate | Equate Allergy | Equate Sinu...,10.48,10.48,1
2,AduroSmart ERIA,AduroSmart ERIA Soft White Smart A19 Light Bul...,We aim to show you accurate product informati...,Electronics | Smart Home | Smart Energy and Li...,10.99,10.99,1
3,lowrider,"24"" Classic Adjustable Balloon Fender Set Chro...",We aim to show you accurate product informati...,Sports & Outdoors | Bikes | Bike Accessories |...,38.59,38.59,5
4,Anself,Elephant Shape Silicone Drinkware Portable Sil...,We aim to show you accurate product informati...,Baby | Feeding | Sippy Cups: Alternatives to P...,5.81,5.81,0
...,...,...,...,...,...,...,...
29994,NineChef,Sheng Xiang Zhen (ShengXiangZhen) Snack + OneN...,We aim to show you accurate product informati...,"Food | Snacks, Cookies & Chips | Chips & Crisp...",45.99,45.99,7
29996,Shock Sox,Shock Sox Fork Seal Guards 29-36mm Fork Tube 4...,We aim to show you accurate product informati...,Sports & Outdoors | Bikes | Bike Components | ...,33.25,33.25,4
29997,Princes,Princes Gooseberries 300g,We aim to show you accurate product informati...,"Food | Meal Solutions, Grains & Pasta | Canned...",8.88,8.88,0
29998,Create Ion,Create Ion Grace 3/4 Inches Straight Hair Iron...,We aim to show you accurate product informati...,Beauty | Hair Care | Hair Styling Tools | Flat...,50.00,24.50,7


Unnamed: 0,brand,name,description,category_raw,price_raw,discount_raw,labels,cleaned_name
0,La Costeï¿½ï¿½a,"La Costena Chipotle Peppers, 7 OZ (Pack of 12)",We aim to show you accurate product informati...,"Food | Meal Solutions, Grains & Pasta | Canned...",31.93,31.93,4,la costena chipotl pepper oz pack
1,Equate,Equate Triamcinolone Acetonide Nasal Allergy S...,We aim to show you accurate product informati...,Health | Equate | Equate Allergy | Equate Sinu...,10.48,10.48,1,equat triamcinolon acetonid nasal allergi spra...
2,AduroSmart ERIA,AduroSmart ERIA Soft White Smart A19 Light Bul...,We aim to show you accurate product informati...,Electronics | Smart Home | Smart Energy and Li...,10.99,10.99,1,adurosmart eria soft white smart a light bulb ...
3,lowrider,"24"" Classic Adjustable Balloon Fender Set Chro...",We aim to show you accurate product informati...,Sports & Outdoors | Bikes | Bike Accessories |...,38.59,38.59,5,classic adjust balloon fender set chrome bicyc...
4,Anself,Elephant Shape Silicone Drinkware Portable Sil...,We aim to show you accurate product informati...,Baby | Feeding | Sippy Cups: Alternatives to P...,5.81,5.81,0,eleph shape silicon drinkwar portabl silicon c...
...,...,...,...,...,...,...,...,...
29994,NineChef,Sheng Xiang Zhen (ShengXiangZhen) Snack + OneN...,We aim to show you accurate product informati...,"Food | Snacks, Cookies & Chips | Chips & Crisp...",45.99,45.99,7,sheng xiang zhen shengxiangzhen snack onenin c...
29996,Shock Sox,Shock Sox Fork Seal Guards 29-36mm Fork Tube 4...,We aim to show you accurate product informati...,Sports & Outdoors | Bikes | Bike Components | ...,33.25,33.25,4,shock sox fork seal guard mm fork tube green y...
29997,Princes,Princes Gooseberries 300g,We aim to show you accurate product informati...,"Food | Meal Solutions, Grains & Pasta | Canned...",8.88,8.88,0,princ gooseberri g
29998,Create Ion,Create Ion Grace 3/4 Inches Straight Hair Iron...,We aim to show you accurate product informati...,Beauty | Hair Care | Hair Styling Tools | Flat...,50.00,24.50,7,creat ion grace inch straight hair iron ci r


(29602, 11892)

## 1.2.3 - Creating RF Classifier 

Now that we have cleaned and encoded our data set, we can split the data into testing and training data sets and build 
classifiers for KNN and Random Forest.

Below we have created the RF classifier using the scikit-learn package and saved the model for future use using pickle.

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

data_uri = '../data/interim/ecommerce_data-cleaned-0.2.3.csv'
products_raw = pd.read_csv(data_uri, index_col=0)

words = stopwords.words("english")
stemmer = PorterStemmer()
def clean_text(feature):
    return " ".join([stemmer.stem(i) for i in regex.sub("[^a-zA-Z0-9]", " ", feature).split() if i not in words]).lower()


products = products_raw.copy() #.iloc[:,].copy()
products['name'] = products.name.fillna("").apply(clean_text)
products['brand'] = products.brand.fillna("").apply(clean_text)
products['description'] = products.description.fillna("").apply(clean_text)
products['category_raw'] = products.category_raw.fillna("")
display(products)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = products.loc[:,'brand':'category_raw']
Y = products['labels']
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(X, Y, test_size=0.25)


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True, norm='l2')
column_transformer = ColumnTransformer([('name', vectorizer, 'name'),
                                        ('description', vectorizer, 'description'),
                                        ('brand', vectorizer, 'brand'),
                                        ('category_raw', vectorizer, 'category_raw'),
                                       ], remainder='drop', verbose_feature_names_out=False)


pipeline_RF = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', RandomForestClassifier())])

modelRF = pipeline_RF.fit(X_train_RF, y_train_RF)
with open('RandomForest.pickle', 'wb') as f:
    pickle.dump(modelRF, f)
    
ytest_RF = np.array(y_test_RF)

print(classification_report(ytest_RF, modelRF.predict(X_test_RF)))
print(confusion_matrix(ytest_RF, modelRF.predict(X_test_RF)))

              precision    recall  f1-score   support

           0       0.46      0.47      0.46      1626
           1       0.36      0.49      0.42      1978
           2       0.16      0.13      0.14       589
           3       0.17      0.13      0.15       476
           4       0.14      0.08      0.11       354
           5       0.16      0.13      0.14       312
           6       0.19      0.10      0.13       221
           7       0.08      0.06      0.07       188
           8       0.32      0.28      0.30       866
           9       0.51      0.47      0.49       791

    accuracy                           0.35      7401
   macro avg       0.26      0.24      0.24      7401
weighted avg       0.33      0.35      0.34      7401

[[764 565  61  46  30  32  12  16  62  38]
 [461 970 123  89  49  54  29  21 108  74]
 [110 255  75  36  17  17   5   7  45  22]
 [ 66 186  28  63  11  20  11  10  59  22]
 [ 51 111  37  29  30  17   3  13  36  27]
 [ 42 101  21  12  12  41 

## 1.2.4 -  Performing PCA

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.display import display
data_uri = '../data/interim/ecommerce_data-cleaned-0.1.4.csv'
# data_uri = '../data/interim/ecommerce_data-cleaned-0.2.2.csv'
products_raw = pd.read_csv(data_uri, index_col=0, keep_default_na=False)
products_raw.head()

# products_raw = products_raw.head(1500)
# Getting a stratified sample of 1500 products:
N = 1500
products_raw = products_raw.groupby('price_range', group_keys=False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(products_raw))))).sample(frac=1).reset_index(drop=True)
products_raw.head()


# Get the stopwords.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# import regex

# Setup function for cleaning text fields of stop words.
# words = stopwords.words("english")
# stemmer = PorterStemmer()
# def clean_text(feature):
#     return " ".join([stemmer.stem(i) for i in regex.sub("[^a-zA-Z0-9]", " ", feature).split() if i not in words]).lower()
# # Clean the brand, name, and description fields.
# products = products_raw.copy() #.iloc[:,].copy()
# products['name'] = products.name.fillna("").apply(clean_text)
# products['brand'] = products.brand.fillna("").apply(clean_text)
# products['description'] = products.description.fillna("").apply(clean_text)
# products['category_raw'] = products.category_raw.fillna("")
# display(products)

# Prepare the train/test splits.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split data into features and labels:
# X = products.loc[:,'brand':'category_raw']
X = products_raw.drop(columns=['price_raw', 'discount_raw'])
y = products_raw['price_raw'].astype('int')

# Encode labels.
# le = LabelEncoder()
# le.fit(products.loc[:,'price_range'].unique())
# y = le.transform(products.loc[:,'price_range'])
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)


# Split into the train test splits.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
display(f"X train: {X_train.shape}")
display(f"y train: {y_train.shape}")
display(f"X test: {X_test.shape}")
display(f"y test: {y_test.shape}")
display(X,y)
# display(le.inverse_transform(y_test))

'X train: (1125, 8)'

'y train: (1125,)'

'X test: (375, 8)'

'y test: (375,)'

Unnamed: 0,brand,name,description,category_1,category_2,category_3,keywords,price_range
0,trend lab,trend lab grace 5 piec crib bed set,we aim show accur product inform manufactur su...,babi,nurseri decor,babi bed,crib bed set all crib bed set,"(50, 100]"
1,air wick,air wick pumpkin spice freshmat ultra automat ...,we aim show accur product inform manufactur su...,household essenti,air freshen,air wick freshmat spray air freshen,unknown,"(0, 25]"
2,countri life,countri life coenzym b complex cap 60 vegan ca...,we aim show accur product inform manufactur su...,health,vitamin supplement,letter vitamin,vitamin b,"(0, 25]"
3,accessori avenu,soccer sport ball collag holder pouch gold silver,we aim show accur product inform manufactur su...,sport outdoor,sport,soccer,soccer ball,"(0, 25]"
4,master massag,mar sport treatment tabl,we aim show accur product inform manufactur su...,sport outdoor,exercis fit,exercis fit accessori,all exercis fit accessori,"(100, 100+]"
...,...,...,...,...,...,...,...,...
1495,bioton,bioton fig vanilla perfect grain bodi exfoli 1...,we aim show accur product inform manufactur su...,person care,bath bodi,bodi scrub exfoli,unknown,"(0, 25]"
1496,philip hue,philip hue white ambianc e12 smart light cande...,we aim show accur product inform manufactur su...,electron,smart home,smart energi light,smart light smart light bulb,"(25, 50]"
1497,american crew,american crew firm hold style gel tube 8 4 oz,we aim show accur product inform manufactur su...,beauti,halloween makeup hair,best halloween makeup,unknown,"(0, 25]"
1498,mckesson,mckesson brand adult absorb underwear mckesson...,we aim show accur product inform manufactur su...,person care,incontin,all incontin,unknown,"(25, 50]"


0        96
1         4
2        14
3        21
4       439
       ... 
1495     24
1496     29
1497     13
1498     36
1499    814
Name: price_raw, Length: 1500, dtype: int64

In [6]:
# Prepare the pipeline.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD, DictionaryLearning, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

def get_feature_transformer(columns, vectorizer):
    return ColumnTransformer([(feature, vectorizer, feature) for feature in columns], remainder='drop', verbose_feature_names_out=True)

vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True, norm='l2')

column_transformer = get_feature_transformer(['brand', 'name', 'description', 'category_1', 'category_2', 'category_3', 'keywords'], vectorizer)

from sklearn.metrics import classification_report, confusion_matrix
def show_metrics(clf, test_X, test_y):
    print(f'Classification score: {clf.score(test_X, test_y) * 100}%')
    # print(classification_report(np.array(test_y), clf.predict(test_X), zero_division=0))
    # print(confusion_matrix(np.array(test_y), clf.predict(test_X)))
    
from sklearn.ensemble import RandomForestClassifier

def get_pipeline():
    """Get the composed Pipeline"""
    return Pipeline([
        ("vect", column_transformer),
        ("dim", "passthrough"),
        ("clf", RandomForestClassifier())
    ])

N_FEATURES = [2, 4, 10]
# C_OPTIONS = [1, 10, 100, 1000]
# 
def get_param_grid():
    return [
        {
            "dim": [TruncatedSVD(n_iter=1), LatentDirichletAllocation(max_iter=1)],
            "dim__n_components": N_FEATURES,
        },
        {
            "dim": [SelectKBest(chi2)],
            "dim__k": N_FEATURES,
        },
    ]

reducer_labels = ["TruncatedSVD", "LDA", "KBest(chi2)"]

# GridSearchCV
grid = GridSearchCV(get_pipeline(), n_jobs=1, param_grid=get_param_grid())

grid.fit(X_train, y_train)

show_metrics(grid, X_test, y_test)

mean_scores = np.array(grid.cv_results_["mean_test_score"])

# scores are in the order of param_grid iteration, which is alphabetical
# mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES))
# select score for best C
# mean_scores = mean_scores.max(axis=0)
# bar_offsets = np.arange(len(N_FEATURES)) * (len(reducer_labels) + 1) + 0.5

# plt.figure()
# COLORS = "bgrcmyk"
# for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
#     plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
# 
# plt.title("Comparing feature reduction techniques")
# plt.xlabel("Reduced number of features")
# plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES)
# plt.ylabel("Digit classification accuracy")
# plt.ylim((0, 1))
# plt.legend(loc="upper left")
# 
# plt.show()
# Metric calculation function:




Classification score: 4.0%


In [30]:
# from sklearn.decomposition import TruncatedSVD

# PCA gives the following error:
# TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

# Create the pipeline.

# vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True, norm='l2')
# column_transformer = ColumnTransformer([('name', vectorizer, 'name'),
#                                         ('description', vectorizer, 'description'),
#                                         ('brand', vectorizer, 'brand'),
#                                         ('category_raw', vectorizer, 'category_raw'),
#                                         # ('category_1', vectorizer, 'category_1'),
#                                         # ('category_2', vectorizer, 'category_2'),
#                                         # ('category_3', vectorizer, 'category_3'),
#                                         # ('keywords', vectorizer, 'keywords'),
#                                        ], remainder='drop', verbose_feature_names_out=False)

clf_RF = Pipeline([('vect', column_transformer),
                   ('chi',  SelectKBest(chi2, k=20000)),
                   ('clf', RandomForestClassifier())])
# Fit the classifier.
clf_RF.fit(X_train, y_train)
show_metrics(clf_RF, X_test, y_test)

Classification score: 10.403999459532494%
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         9
           1       0.22      0.16      0.19        25
           2       0.17      0.17      0.17        60
           3       0.20      0.17      0.19        80
           4       0.17      0.15      0.16       157
           5       0.13      0.22      0.16       195
           6       0.11      0.20      0.14       248
           7       0.12      0.24      0.16       280
           8       0.09      0.15      0.12       260
           9       0.08      0.16      0.11       281
          10       0.08      0.10      0.09       193
          11       0.11      0.12      0.12       236
          12       0.05      0.08      0.06       192
          13       0.10      0.12      0.11       216
          14       0.12      0.16      0.14       243
          15       0.05      0.07      0.06       166
          16       0.07      0.07      

In [None]:
clf_LDA = Pipeline([('vect', column_transformer),
                    ('svd', LatentDirichletAllocation()),
                   ('clf', RandomForestClassifier())])
# Fit the classifier.
clf_LDA.fit(X_train, y_train)
show_metrics(clf_LDA, X_test, y_test)


clf_DL = Pipeline([('vect', column_transformer),
                    ('svd', DictionaryLearning()),
                   ('clf', RandomForestClassifier())])
# Fit the classifier.
clf_DL.fit(X_train, y_train)
show_metrics(clf_DL, X_test, y_test)

In [None]:
# ValueError: k should be >=0, <= n_features = 100; got 7000. Use k='all' to return all features.
# ('chi', SelectKBest(chi2, k=100)),

# Classification score: 43.35900553979192%
# with n_components = 100

# Classification score: 42.57532765842454%
# with n_components = 1000