In [1]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import regex

data_uri = '../data/interim/ecommerce_data-cleaned-0.1.4.csv'

products_raw = pd.read_csv(data_uri, index_col=0, keep_default_na=False)
products_raw.head()

N = 1500
products_raw = products_raw.groupby('price_range', group_keys=False).apply(lambda x: x.sample(int(np.rint(N*len(x)/len(products_raw))))).sample(frac=1).reset_index(drop=True)
products_raw.head()

# Prepare the train/test splits.
from sklearn.model_selection import train_test_split

def get_range_label(price):
    value = np.round(price, decimals=1)
    if value <= 25:
        return 0
    elif 25 < value <= 50:
        return 1
    elif 50 < value <= 100:
        return 2
    else:
        return 3

products_raw['labels'] = products_raw['price_raw'].apply(lambda x: get_range_label(x))

stemmer = PorterStemmer()
words = stopwords.words("english")
cleaned_text = lambda x: " ".join([stemmer.stem(i) for i in regex.sub("[^a-zA-Z0-9]", " ", x).split() if i not in words]).lower()
products_raw['cleaned_name'] = products_raw.name.apply(cleaned_text)
products_raw['cleaned_brand'] = products_raw.brand.fillna("").apply(cleaned_text)
products_raw['cleaned_description'] = products_raw.description.fillna("").apply(cleaned_text)
products_raw['cleaned_category_1'] = products_raw.category_1.fillna("").apply(cleaned_text)
products_raw['cleaned_category_2'] = products_raw.category_2.fillna("").apply(cleaned_text)
products_raw['cleaned_category_3'] = products_raw.category_3.fillna("").apply(cleaned_text)
products_raw['cleaned_keywords'] = products_raw.keywords.fillna("").apply(cleaned_text)
# display(products_raw)

X = products_raw
X_train, X_test, y_train, y_test = train_test_split(X, products_raw['labels'], test_size=0.25)
# X = products_raw.drop(columns=['price_raw', 'discount_raw'])
# y = products_raw['price_raw'].astype('int')

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [2]:
# Prepare the pipeline.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD, DictionaryLearning, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

def get_feature_transformer(columns, vectorizer):
    return ColumnTransformer([(feature, vectorizer, feature) for feature in columns], remainder='drop', verbose_feature_names_out=True)

vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))

# column_transformer = get_feature_transformer(['brand', 'name', 'description', 'category_1', 'category_2', 'category_3', 'keywords'], vectorizer)

column_transformer = ColumnTransformer([('name', vectorizer, 'cleaned_name'),('description', vectorizer, 'cleaned_description'), 
                                  ('brand', vectorizer, 'cleaned_brand'), ('category_1', vectorizer, 'cleaned_category_1'),
                                  ('category_2', vectorizer, 'cleaned_category_2'), ('category_3', vectorizer, 'cleaned_category_3'), 
                                  ('keywords', vectorizer, 'cleaned_keywords')],
                                remainder='drop', verbose_feature_names_out=False)

def show_metrics(clf, test_X, test_y):
    print(f'Classification score: {clf.score(test_X, test_y) * 100}%')
    print(classification_report(np.array(test_y), clf.predict(test_X), zero_division=0))
    print(confusion_matrix(np.array(test_y), clf.predict(test_X)))
    
from sklearn.ensemble import RandomForestClassifier

def get_pipeline():
    """Get the composed Pipeline"""
    return Pipeline([
        ("vect", column_transformer),
        ("dim", "passthrough"),
        ("clf", RandomForestClassifier())
    ])

N_FEATURES = [2, 4, 10]

def get_param_grid():
    return [
        {
            "dim": [TruncatedSVD(), LatentDirichletAllocation()],
            "dim__n_components": N_FEATURES,
        },
        {
            "dim": [SelectKBest(chi2, k=7000)],
            "dim__k": N_FEATURES,
        },
    ]

reducer_labels = ["TruncatedSVD", "LDA", "KBest(chi2)"]
grid = GridSearchCV(get_pipeline(), n_jobs=1, param_grid=get_param_grid())
grid.fit(X_train, y_train)

# show_metrics(grid, X_test, y_test)

ytest_RF = np.array(y_test)

print(classification_report(ytest_RF, grid.predict(X_test)))
print(confusion_matrix(ytest_RF, grid.predict(X_test)))


# clf_RF = Pipeline([('vect', column_transformer),
#                    ('chi',  SelectKBest(chi2, k=7000)),
#                    ('clf', RandomForestClassifier())])
# # Fit the classifier.
# clf_RF = clf_RF.fit(X_train, y_train)

# ytest_RF = np.array(y_test)
# 
# print(classification_report(ytest_RF, clf_RF.predict(X_test)))
# print(confusion_matrix(ytest_RF, clf_RF.predict(X_test)))

# show_metrics(clf_RF, X_test, y_test)

              precision    recall  f1-score   support

           0       0.57      1.00      0.73       214
           1       0.00      0.00      0.00        75
           2       0.00      0.00      0.00        42
           3       0.00      0.00      0.00        44

    accuracy                           0.57       375
   macro avg       0.14      0.25      0.18       375
weighted avg       0.33      0.57      0.41       375

[[214   0   0   0]
 [ 75   0   0   0]
 [ 42   0   0   0]
 [ 44   0   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# column_transformer = ColumnTransformer([('name', vectorizer, 'cleaned_name'),('description', vectorizer, 'cleaned_description'), 
#                                   ('brand', vectorizer, 'cleaned_brand'), ('category_1', vectorizer, 'cleaned_category_1'),
#                                   ('category_2', vectorizer, 'cleaned_category_2'), ('category_3', vectorizer, 'cleaned_category_3'), 
#                                   ('keywords', vectorizer, 'cleaned_keywords')],
#                                 remainder='drop', verbose_feature_names_out=False)
# 
# 
# 
# clf_LDA = Pipeline([('vect', column_transformer),
#                     ('svd', LatentDirichletAllocation()),
#                    ('clf', RandomForestClassifier())])
# # Fit the classifier.
# clf_LDA = clf_LDA.fit(X_train, y_train)
# # show_metrics(clf_LDA, X_test, y_test)
# ytest_RF = np.array(y_test)

# print(classification_report(ytest_RF, clf_LDA.predict(X_test)))
# print(confusion_matrix(ytest_RF, clf_LDA.predict(X_test)))

# clf_DL = Pipeline([('vect', column_transformer),
#                     ('svd', DictionaryLearning()),
#                    ('clf', RandomForestClassifier())])
# # Fit the classifier.
# clf_DL.fit(X_train, y_train)
# show_metrics(clf_DL, X_test, y_test)



KeyboardInterrupt



In [4]:
# clf_SVD = Pipeline([('vect', column_transformer),
#                     ('svd', TruncatedSVD()),
#                    ('clf', RandomForestClassifier())])
# # Fit the classifier.
# clf_SVD = clf_SVD.fit(X_train, y_train)
# # show_metrics(clf_SVD, X_test, y_test)
# 
# ytest_RF = np.array(y_test)
# 
# print(classification_report(ytest_RF, clf_SVD.predict(X_test)))
# print(confusion_matrix(ytest_RF, clf_SVD.predict(X_test)))
# 

Classification score: 4.485880286447777%
              precision    recall  f1-score   support

           0       0.12      0.17      0.14         6
           1       0.17      0.07      0.10        29
           2       0.03      0.05      0.04        44
           3       0.07      0.07      0.07        95
           4       0.03      0.03      0.03       141
           5       0.06      0.07      0.06       215
           6       0.08      0.10      0.09       256
           7       0.08      0.10      0.09       268
           8       0.07      0.08      0.07       270
           9       0.04      0.05      0.05       282
          10       0.05      0.05      0.05       191
          11       0.05      0.05      0.05       234
          12       0.09      0.08      0.08       246
          13       0.06      0.06      0.06       202
          14       0.08      0.07      0.08       240
          15       0.06      0.06      0.06       165
          16       0.01      0.01      0