In [1]:
import numpy as np
from IPython.core.display import display
import pandas as pd
import nltk
import ssl
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
import regex

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vaibhavidharashivkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("../data/interim/ecommerce_data-cleaned-0.1.4.csv", index_col=0, encoding='utf-8',keep_default_na=False)
display(df)

Unnamed: 0,brand,name,description,category_1,category_2,category_3,keywords,price_raw,discount_raw,price_range
0,la cost,la costena chipotl pepper 7 oz pack 12,we aim show accur product inform manufactur su...,food,meal solut grain pasta,can good,can veget,31.93,31.93,"(25, 50]"
1,equat,equat triamcinolon acetonid nasal allergi spra...,we aim show accur product inform manufactur su...,health,equat,equat allergi,equat sinu congest nasal care,10.48,10.48,"(0, 25]"
2,adurosmart eria,adurosmart eria soft white smart a19 light bul...,we aim show accur product inform manufactur su...,electron,smart home,smart energi light,smart light smart light bulb,10.99,10.99,"(0, 25]"
3,lowrid,24 classic adjust balloon fender set chrome bi...,we aim show accur product inform manufactur su...,sport outdoor,bike,bike accessori,bike fender,38.59,38.59,"(25, 50]"
4,anself,eleph shape silicon drinkwar portabl silicon c...,we aim show accur product inform manufactur su...,babi,feed,sippi cup altern plastic,unknown,5.81,5.81,"(0, 25]"
...,...,...,...,...,...,...,...,...,...,...
29994,ninechef,sheng xiang zhen shengxiangzhen snack onenin c...,we aim show accur product inform manufactur su...,food,snack cooki chip,chip crisp,chip crisp,45.99,45.99,"(25, 50]"
29996,shock sox,shock sox fork seal guard 29 36mm fork tube 4 ...,we aim show accur product inform manufactur su...,sport outdoor,bike,bike compon,bike fork,33.25,33.25,"(25, 50]"
29997,princ,princ gooseberri 300g,we aim show accur product inform manufactur su...,food,meal solut grain pasta,can good,can fruit,8.88,8.88,"(0, 25]"
29998,creat ion,creat ion grace 3 4 inch straight hair iron ci...,we aim show accur product inform manufactur su...,beauti,hair care,hair style tool,flat iron hair flat iron,50.00,24.50,"(25, 50]"


In [3]:
def get_range_label(price):
    value = np.round(price, decimals=1)
    if value <= 25:
        return 0
    elif 25 < value <= 50:
        return 1
    elif 50 < value <= 100:
        return 2
    else:
        return 3

df['labels'] = df['price_raw'].apply(lambda x: get_range_label(x))

In [4]:
stemmer = PorterStemmer()
words = stopwords.words("english")
cleaned_text = lambda x: " ".join([stemmer.stem(i) for i in regex.sub("[^a-zA-Z0-9]", " ", x).split() if i not in words]).lower()
df['cleaned_name'] = df.name.apply(cleaned_text)
df['cleaned_brand'] = df.brand.fillna("").apply(cleaned_text)
df['cleaned_description'] = df.description.fillna("").apply(cleaned_text)
df['cleaned_category_1'] = df.category_1.fillna("").apply(cleaned_text)
df['cleaned_category_2'] = df.category_2.fillna("").apply(cleaned_text)
df['cleaned_category_3'] = df.category_3.fillna("").apply(cleaned_text)
df['cleaned_keywords'] = df.keywords.fillna("").apply(cleaned_text)
display(df)

Unnamed: 0,brand,name,description,category_1,category_2,category_3,keywords,price_raw,discount_raw,price_range,labels,cleaned_name,cleaned_brand,cleaned_description,cleaned_category_1,cleaned_category_2,cleaned_category_3,cleaned_keywords
0,la cost,la costena chipotl pepper 7 oz pack 12,we aim show accur product inform manufactur su...,food,meal solut grain pasta,can good,can veget,31.93,31.93,"(25, 50]",1,la costena chipotl pepper 7 oz pack 12,la cost,aim show accur product inform manufactur suppl...,food,meal solut grain pasta,good,veget
1,equat,equat triamcinolon acetonid nasal allergi spra...,we aim show accur product inform manufactur su...,health,equat,equat allergi,equat sinu congest nasal care,10.48,10.48,"(0, 25]",0,equat triamcinolon acetonid nasal allergi spra...,equat,aim show accur product inform manufactur suppl...,health,equat,equat allergi,equat sinu congest nasal care
2,adurosmart eria,adurosmart eria soft white smart a19 light bul...,we aim show accur product inform manufactur su...,electron,smart home,smart energi light,smart light smart light bulb,10.99,10.99,"(0, 25]",0,adurosmart eria soft white smart a19 light bul...,adurosmart eria,aim show accur product inform manufactur suppl...,electron,smart home,smart energi light,smart light smart light bulb
3,lowrid,24 classic adjust balloon fender set chrome bi...,we aim show accur product inform manufactur su...,sport outdoor,bike,bike accessori,bike fender,38.59,38.59,"(25, 50]",1,24 classic adjust balloon fender set chrome bi...,lowrid,aim show accur product inform manufactur suppl...,sport outdoor,bike,bike accessori,bike fender
4,anself,eleph shape silicon drinkwar portabl silicon c...,we aim show accur product inform manufactur su...,babi,feed,sippi cup altern plastic,unknown,5.81,5.81,"(0, 25]",0,eleph shape silicon drinkwar portabl silicon c...,anself,aim show accur product inform manufactur suppl...,babi,feed,sippi cup altern plastic,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29994,ninechef,sheng xiang zhen shengxiangzhen snack onenin c...,we aim show accur product inform manufactur su...,food,snack cooki chip,chip crisp,chip crisp,45.99,45.99,"(25, 50]",1,sheng xiang zhen shengxiangzhen snack onenin c...,ninechef,aim show accur product inform manufactur suppl...,food,snack cooki chip,chip crisp,chip crisp
29996,shock sox,shock sox fork seal guard 29 36mm fork tube 4 ...,we aim show accur product inform manufactur su...,sport outdoor,bike,bike compon,bike fork,33.25,33.25,"(25, 50]",1,shock sox fork seal guard 29 36mm fork tube 4 ...,shock sox,aim show accur product inform manufactur suppl...,sport outdoor,bike,bike compon,bike fork
29997,princ,princ gooseberri 300g,we aim show accur product inform manufactur su...,food,meal solut grain pasta,can good,can fruit,8.88,8.88,"(0, 25]",0,princ gooseberri 300g,princ,aim show accur product inform manufactur suppl...,food,meal solut grain pasta,good,fruit
29998,creat ion,creat ion grace 3 4 inch straight hair iron ci...,we aim show accur product inform manufactur su...,beauti,hair care,hair style tool,flat iron hair flat iron,50.00,24.50,"(25, 50]",1,creat ion grace 3 4 inch straight hair iron ci...,creat ion,aim show accur product inform manufactur suppl...,beauti,hair care,hair style tool,flat iron hair flat iron


In [5]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29604 entries, 0 to 29999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                29604 non-null  object 
 1   name                 29604 non-null  object 
 2   description          29604 non-null  object 
 3   category_1           29604 non-null  object 
 4   category_2           29604 non-null  object 
 5   category_3           29604 non-null  object 
 6   keywords             29604 non-null  object 
 7   price_raw            29604 non-null  float64
 8   discount_raw         29604 non-null  float64
 9   price_range          29604 non-null  object 
 10  labels               29604 non-null  int64  
 11  cleaned_name         29604 non-null  object 
 12  cleaned_brand        29604 non-null  object 
 13  cleaned_description  29604 non-null  object 
 14  cleaned_category_1   29604 non-null  object 
 15  cleaned_category_2   29604 non-null 

In [7]:
from sklearn.tree import DecisionTreeClassifier

X = [df['cleaned_name'], df['cleaned_description'], df['cleaned_brand'], df['cleaned_category_1'], 
     df['cleaned_category_2'], df['cleaned_category_3'], df['cleaned_keywords']]

for i in X:
    X_train, X_test, y_train, y_test = train_test_split(i, df['labels'], test_size=0.25)
    
    pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', DecisionTreeClassifier(random_state=0))])

    model = pipeline.fit(X_train, y_train)

    ytest = np.array(y_test)

    print(classification_report(ytest, model.predict(X_test)))
    print(confusion_matrix(ytest, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74      4111
           1       0.35      0.32      0.33      1571
           2       0.32      0.26      0.28       897
           3       0.47      0.45      0.46       822

    accuracy                           0.57      7401
   macro avg       0.46      0.45      0.45      7401
weighted avg       0.56      0.57      0.57      7401

[[3149  625  185  152]
 [ 767  504  194  106]
 [ 310  195  232  160]
 [ 209  122  122  369]]
              precision    recall  f1-score   support

           0       0.70      0.74      0.72      4180
           1       0.32      0.31      0.32      1581
           2       0.29      0.25      0.27       864
           3       0.44      0.42      0.43       776

    accuracy                           0.56      7401
   macro avg       0.44      0.43      0.43      7401
weighted avg       0.55      0.56      0.55      7401

[[3074  705  242  159]
 [ 776  494  1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72      4179
           1       0.17      0.00      0.00      1537
           2       0.00      0.00      0.00       855
           3       0.00      0.00      0.00       830

    accuracy                           0.56      7401
   macro avg       0.18      0.25      0.18      7401
weighted avg       0.35      0.56      0.41      7401

[[4175    4    0    0]
 [1536    1    0    0]
 [ 854    1    0    0]
 [ 830    0    0    0]]
              precision    recall  f1-score   support

           0       0.57      0.98      0.72      4159
           1       0.44      0.06      0.11      1569
           2       0.57      0.03      0.06       866
           3       0.44      0.02      0.04       807

    accuracy                           0.57      7401
   macro avg       0.51      0.27      0.23      7401
weighted avg       0.53      0.57      0.44      7401

[[4078   70    3    8]
 [1455   97   

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

X = df
X_train, X_test, y_train, y_test = train_test_split(X, df['labels'], test_size=0.25)
column_trans = ColumnTransformer([('name', vectorizer, 'cleaned_name'),('description', vectorizer, 'cleaned_description'), 
                                  ('brand', vectorizer, 'cleaned_brand'), ('category_1', vectorizer, 'cleaned_category_1'),
                                  ('category_2', vectorizer, 'cleaned_category_2'), ('category_3', vectorizer, 'cleaned_category_3'), 
                                  ('keywords', vectorizer, 'cleaned_keywords')],
                                remainder='drop', verbose_feature_names_out=False)

pipeline = Pipeline([('vect', column_trans),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', DecisionTreeClassifier(random_state=0))])

model = pipeline.fit(X_train, y_train)

ytest = np.array(y_test)

print(classification_report(ytest, model.predict(X_test)))
print(confusion_matrix(ytest, model.predict(X_test)))


              precision    recall  f1-score   support

           0       0.73      0.76      0.74      4192
           1       0.34      0.35      0.34      1507
           2       0.30      0.26      0.28       882
           3       0.49      0.45      0.47       820

    accuracy                           0.58      7401
   macro avg       0.46      0.45      0.46      7401
weighted avg       0.57      0.58      0.58      7401

[[3176  665  228  123]
 [ 706  522  159  120]
 [ 286  225  228  143]
 [ 181  133  140  366]]


In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X = [df['cleaned_name'], df['cleaned_description'], df['cleaned_brand'], df['cleaned_category_1'], 
     df['cleaned_category_2'], df['cleaned_category_3'], df['cleaned_keywords']]

for i in X:
    X_train, X_test, y_train, y_test = train_test_split(i, df['labels'], test_size=0.25)
    
    pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1))])

    model = pipeline.fit(X_train, y_train)

    ytest = np.array(y_test)

    print(classification_report(ytest, model.predict(X_test)))
    print(confusion_matrix(ytest, model.predict(X_test)))


              precision    recall  f1-score   support

           0       0.68      0.86      0.76      4199
           1       0.34      0.21      0.26      1556
           2       0.31      0.22      0.26       853
           3       0.55      0.35      0.43       793

    accuracy                           0.60      7401
   macro avg       0.47      0.41      0.43      7401
weighted avg       0.55      0.60      0.56      7401

[[3620  354  144   81]
 [1010  321  148   77]
 [ 427  164  191   71]
 [ 274  116  124  279]]
              precision    recall  f1-score   support

           0       0.67      0.86      0.75      4113
           1       0.28      0.17      0.21      1571
           2       0.27      0.24      0.25       859
           3       0.64      0.33      0.44       858

    accuracy                           0.58      7401
   macro avg       0.47      0.40      0.41      7401
weighted avg       0.54      0.58      0.54      7401

[[3520  377  159   57]
 [1083  267  1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72      4120
           1       0.20      0.00      0.00      1550
           2       0.00      0.00      0.00       873
           3       0.00      0.00      0.00       858

    accuracy                           0.56      7401
   macro avg       0.19      0.25      0.18      7401
weighted avg       0.35      0.56      0.40      7401

[[4118    2    0    0]
 [1549    1    0    0]
 [ 873    0    0    0]
 [ 856    2    0    0]]
              precision    recall  f1-score   support

           0       0.58      0.98      0.73      4192
           1       0.41      0.06      0.10      1544
           2       0.53      0.03      0.06       842
           3       0.43      0.03      0.05       823

    accuracy                           0.57      7401
   macro avg       0.49      0.27      0.23      7401
weighted avg       0.52      0.57      0.45      7401

[[4093   84    7    8]
 [1429   91   

In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

X = df
X_train, X_test, y_train, y_test = train_test_split(X, df['labels'], test_size=0.25)
column_trans = ColumnTransformer([('name', vectorizer, 'cleaned_name'),('description', vectorizer, 'cleaned_description'), 
                                  ('brand', vectorizer, 'cleaned_brand'), ('category_1', vectorizer, 'cleaned_category_1'),
                                  ('category_2', vectorizer, 'cleaned_category_2'), ('category_3', vectorizer, 'cleaned_category_3'), 
                                  ('keywords', vectorizer, 'cleaned_keywords')],
                                remainder='drop', verbose_feature_names_out=False)

pipeline = Pipeline([('vect', column_trans),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1))])

model = pipeline.fit(X_train, y_train)

ytest = np.array(y_test)

print(classification_report(ytest, model.predict(X_test)))
print(confusion_matrix(ytest, model.predict(X_test)))


              precision    recall  f1-score   support

           0       0.69      0.85      0.76      4188
           1       0.29      0.21      0.24      1530
           2       0.28      0.22      0.25       875
           3       0.65      0.35      0.45       808

    accuracy                           0.59      7401
   macro avg       0.48      0.41      0.43      7401
weighted avg       0.55      0.59      0.56      7401

[[3563  423  153   49]
 [1006  321  172   31]
 [ 387  222  192   74]
 [ 211  146  168  283]]


In [11]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
X = [df['cleaned_name'], df['cleaned_description'], df['cleaned_brand'], df['cleaned_category_1'], 
     df['cleaned_category_2'], df['cleaned_category_3'], df['cleaned_keywords']]

for i in X:
    X_train, X_test, y_train, y_test = train_test_split(i, df['labels'], test_size=0.25)
    
    pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME"))])

    model = pipeline.fit(X_train, y_train)

    ytest = np.array(y_test)

    print(classification_report(ytest, model.predict(X_test)))
    print(confusion_matrix(ytest, model.predict(X_test)))


              precision    recall  f1-score   support

           0       0.58      0.99      0.74      4213
           1       0.61      0.03      0.05      1523
           2       0.80      0.02      0.04       865
           3       0.65      0.11      0.18       800

    accuracy                           0.59      7401
   macro avg       0.66      0.29      0.25      7401
weighted avg       0.62      0.59      0.46      7401

[[4191    4    0   18]
 [1463   43    3   14]
 [ 814   17   20   14]
 [ 705    7    2   86]]
              precision    recall  f1-score   support

           0       0.62      0.96      0.75      4147
           1       0.34      0.08      0.13      1578
           2       0.50      0.06      0.11       882
           3       0.55      0.38      0.45       794

    accuracy                           0.60      7401
   macro avg       0.50      0.37      0.36      7401
weighted avg       0.54      0.60      0.51      7401

[[3965  100   12   70]
 [1353  127   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72      4151
           1       0.50      0.00      0.00      1514
           2       0.00      0.00      0.00       876
           3       0.00      0.00      0.00       860

    accuracy                           0.56      7401
   macro avg       0.27      0.25      0.18      7401
weighted avg       0.42      0.56      0.40      7401

[[4149    2    0    0]
 [1512    2    0    0]
 [ 876    0    0    0]
 [ 860    0    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.57      0.99      0.72      4163
           1       0.00      0.00      0.00      1539
           2       0.89      0.02      0.04       900
           3       0.27      0.06      0.10       799

    accuracy                           0.56      7401
   macro avg       0.43      0.27      0.21      7401
weighted avg       0.46      0.56      0.42      7401

[[4101    0    0   62]
 [1493    0    1   45]
 [ 864    0   17   19]
 [ 751    0    1   47]]
              precision    recall  f1-score   support

           0       0.58      0.99      0.73      4223
           1       0.59      0.01      0.01      1546
           2       0.56      0.03      0.06       811
           3       0.70      0.11      0.19       821

    accuracy                           0.58      7401
   macro avg       0.61      0.29      0.25      7401
weighted avg       0.59      0.58      0.45      7401

[[4197    3    3   20]
 [1519   10   

In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

X = df
X_train, X_test, y_train, y_test = train_test_split(X, df['labels'], test_size=0.25)
column_trans = ColumnTransformer([('name', vectorizer, 'cleaned_name'),('description', vectorizer, 'cleaned_description'), 
                                  ('brand', vectorizer, 'cleaned_brand'), ('category_1', vectorizer, 'cleaned_category_1'),
                                  ('category_2', vectorizer, 'cleaned_category_2'), ('category_3', vectorizer, 'cleaned_category_3'), 
                                  ('keywords', vectorizer, 'cleaned_keywords')],
                                remainder='drop', verbose_feature_names_out=False)

pipeline = Pipeline([('vect', column_trans),
                     ('chi',  SelectKBest(chi2, k='all')),
                     ('clf', AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME"))])

model = pipeline.fit(X_train, y_train)

ytest = np.array(y_test)

print(classification_report(ytest, model.predict(X_test)))
print(confusion_matrix(ytest, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.65      0.95      0.77      4173
           1       0.35      0.13      0.19      1551
           2       0.49      0.10      0.17       853
           3       0.65      0.44      0.52       824

    accuracy                           0.62      7401
   macro avg       0.54      0.40      0.41      7401
weighted avg       0.57      0.62      0.55      7401

[[3969  156   11   37]
 [1258  197   32   64]
 [ 566  109   86   92]
 [ 324   94   47  359]]
