## Level 1: Query Classification
generate training data - mapping the leaf category ids of queries to ancestor categories.

### Task 1: Prune the category taxonomy

In [84]:
import xml.etree.ElementTree as ET
import pandas as pd

# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'
categories_file_name = r'/Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'
queries_file_name = r'/Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/train.csv'
tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
categories = []
parents = []
for child in root:
    id = child.find('id').text
    cat_path = child.find('path')
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame(list(zip(categories, parents)), columns =['category', 'parent'])

# Read the training data into pandas, only keeping queries with non-root categories in our category tree.
df = pd.read_csv(queries_file_name)[['category', 'query']]
df = df[df['category'].isin(categories)]

In [85]:
# Prepare Parent Dictionary
parents_dict_df = parents_df.set_index("category", drop=True, inplace=False)
parent_dict = parents_dict_df.to_dict()['parent']

# IMPLEMENT ME: Convert queries to lowercase, and optionally implement other normalization, like stemming.
import nltk
stemmer = nltk.stem.PorterStemmer()
df['query_norm'] = df['query'].apply(lambda x: stemmer.stem(x))
df_norm = df[['category', 'query_norm']]
df_norm.set_index('category').sort_index()

Unnamed: 0_level_0,query_norm
category,Unnamed: 1_level_1
abcat0011000,portable dvd play
abcat0011000,pink
abcat0011000,princess dvd play
abcat0011000,pink
abcat0011000,portable pink dvd play
...,...
pcmcat99000050002,wired rout
pcmcat99000050002,rotor
pcmcat99000050002,wired rout
pcmcat99000050002,wireless rout


In [86]:
# Roll up categories with fewer queries than the threshold to their parents, and keep doing so until every remaining category satisfies the threshold number of queries.
threshold = 1000
threshold_reached = False
df_cat_count = df_norm.groupby(['category']).count()
# print(df_cat_count[df_cat_count['query_norm'] < 100])
parent_dict[root_category_id] = root_category_id
while not threshold_reached:
    list_to_replace = list(df_cat_count[df_cat_count['query_norm'] < threshold].index)
    print(f"list_to_replace: {len(list_to_replace)}")
    replace_dict = { k: parent_dict[k] for k in list_to_replace}
    df_norm["category"] = df_norm["category"].replace(replace_dict)
    df_cat_count = df_norm.groupby(['category']).count()
    len_low_count = len(list(df_cat_count[df_cat_count['query_norm'] < threshold].index))
    print(f"len_low_count: {len_low_count}")
    threshold_reached = (len_low_count <= 0)

list_to_replace: 1188


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)


len_low_count: 236
list_to_replace: 236


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)


len_low_count: 48
list_to_replace: 48


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)


len_low_count: 6
list_to_replace: 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)


len_low_count: 1
list_to_replace: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)


len_low_count: 0


In [87]:
df_norm

Unnamed: 0,category,query_norm
0,abcat0101001,televisiones panasonic 50 pulgada
1,abcat0101001,sharp
2,pcmcat193100050014,nook
3,abcat0101001,rca
4,abcat0101005,rca
...,...,...
1865264,pcmcat247400050000,ttv
1865265,pcmcat218000050000,incas
1865266,pcmcat248500050020,ds game
1865267,pcmcat209000050008,archo


In [89]:
df_norm.groupby(['category']).count()

Unnamed: 0_level_0,query_norm
category,Unnamed: 1_level_1
abcat0100000,4359
abcat0101001,80213
abcat0101005,1042
abcat0102003,8303
abcat0102005,2533
...,...
pcmcat248700050021,3753
pcmcat252700050006,2619
pcmcat254000050002,1528
pcmcat254000050005,2256


In [93]:
import csv

df = df_norm
df['label'] = '__label__' + df['category']
output_file_name = r'/Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data.txt'
df = df[df['category'].isin(categories)]
df['output'] = df['label'] + ' ' + df['query_norm']
df[['output']].to_csv(output_file_name, header=False, sep='|', escapechar='\\', quoting=csv.QUOTE_NONE, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['output'] = df['label'] + ' ' + df['query_norm']


In [95]:
!wc -l /Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data.txt

 1854998 /Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data.txt


In [96]:
!head /Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data.txt

__label__abcat0101001 televisiones panasonic  50 pulgada
__label__abcat0101001 sharp
__label__pcmcat193100050014 nook
__label__abcat0101001 rca
__label__abcat0101005 rca
__label__pcmcat143200050016 flat screen tv
__label__pcmcat247400050001 macbook
__label__pcmcat171900050028 blue tooth headphon
__label__abcat0107004 tv antenna
__label__pcmcat186100050006 memory card


In [None]:
!python create_labeled_queries.py --min_queries 1000 --output '/Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data_min1000.txt'

### Task 2: Train a query classifier

In [98]:
!shuf /Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data_min1000.txt > min1000_shuffled_labeled_query_data.txt

In [100]:
!head -n 100000 min1000_shuffled_labeled_query_data.txt > min1000_labeled_query_training_data.txt
!tail -n 10000 min1000_shuffled_labeled_query_data.txt > min1000_labeled_query_test_data.txt

In [110]:
import fasttext

cat_classifier_v1 = fasttext.train_supervised(input="min1000_labeled_query_training_data.txt", lr=0.4, epoch=25)
print(cat_classifier_v1.test("min1000_labeled_query_test_data.txt", k = 1))
print(cat_classifier_v1.test("min1000_labeled_query_test_data.txt", k = 2))
print(cat_classifier_v1.test("min1000_labeled_query_test_data.txt", k = 3))

Read 0M words
Number of words:  12891
Number of labels: 388
Progress: 100.0% words/sec/thread:   34644 lr:  0.000000 avg.loss:  2.965291 ETA:   0h 0m 0s  8.2% words/sec/thread:   40355 lr:  0.367038 avg.loss:  5.267332 ETA:   0h 0m14s 78.9% words/sec/thread:   35925 lr:  0.084556 avg.loss:  3.326733 ETA:   0h 0m 3s

(10000, 0.5265, 0.5265)
(10000, 0.3259, 0.6518)
(10000, 0.2373, 0.7119)


 avg.loss:  2.965291 ETA:   0h 0m 0s


In [111]:
cat_classifier_v2 = fasttext.train_supervised(input="min1000_labeled_query_training_data.txt", lr=0.5, epoch=100, wordNgrams=3)
print(cat_classifier_v2.test("min1000_labeled_query_test_data.txt", k = 1))
print(cat_classifier_v2.test("min1000_labeled_query_test_data.txt", k = 2))
print(cat_classifier_v2.test("min1000_labeled_query_test_data.txt", k = 3))

Read 0M words
Number of words:  12891
Number of labels: 388
Progress: 100.0% words/sec/thread:   24969 lr:  0.000000 avg.loss:  2.079734 ETA:   0h 0m 0s% words/sec/thread:   27111 lr:  0.168248 avg.loss:  2.484065 ETA:   0h 0m32s 76.0% words/sec/thread:   26126 lr:  0.119887 avg.loss:  2.346501 ETA:   0h 0m23s


(10000, 0.5263, 0.5263)
(10000, 0.32565, 0.6513)
(10000, 0.23803333333333335, 0.7141)


#### Try regenerating the training data with minimum-query values of 10,000

In [173]:
!python create_labeled_queries.py --min_queries 10000 --output '/Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data_min10000.txt'

--min_queries:10000
list_to_replace: 1457
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)
len_low_count: 337
list_to_replace: 337
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm["category"] = df_norm["category"].replace(replace_dict)
len_low_count: 84
list_to_replace: 84
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

In [113]:
!shuf /Users/sengopal/build/my-git/search_with_machine_learning_course/datasets/labeled_query_data_min10000.txt > min10000_shuffled_labeled_query_data.txt
!head -n 100000 min10000_shuffled_labeled_query_data.txt > min10000_labeled_query_training_data.txt
!tail -n 10000 min10000_shuffled_labeled_query_data.txt > min10000_labeled_query_test_data.txt

In [118]:
cat_classifier_v3 = fasttext.train_supervised(input="min10000_labeled_query_training_data.txt", lr=0.6, epoch=100, wordNgrams=3)
print(cat_classifier_v3.test("min10000_labeled_query_test_data.txt", k = 1))
print(cat_classifier_v3.test("min10000_labeled_query_test_data.txt", k = 2))
print(cat_classifier_v3.test("min10000_labeled_query_test_data.txt", k = 3))

Read 0M words
Number of words:  12796
Number of labels: 70
Progress: 100.0% words/sec/thread:   87506 lr:  0.000000 avg.loss:  2.123806 ETA:   0h 0m 0s


(10000, 0.5875, 0.5875)
(10000, 0.36195, 0.7239)
(10000, 0.26176666666666665, 0.7853)


#### To use more training data

In [120]:
!wc -l min10000_shuffled_labeled_query_data.txt

 1854998 min10000_shuffled_labeled_query_data.txt


In [121]:
!head -n 200000 min10000_shuffled_labeled_query_data.txt > min10000_labeled_query_training_data_large.txt

In [124]:
cat_classifier_v4 = fasttext.train_supervised(input="min10000_labeled_query_training_data_large.txt", lr=0.5, epoch=100, wordNgrams=3)
print(cat_classifier_v4.test("min10000_labeled_query_test_data.txt", k = 1))
print(cat_classifier_v4.test("min10000_labeled_query_test_data.txt", k = 2))
print(cat_classifier_v4.test("min10000_labeled_query_test_data.txt", k = 3))

Read 0M words
Number of words:  18860
Number of labels: 70
Progress: 100.0% words/sec/thread:   66581 lr:  0.000000 avg.loss:  1.740510 ETA:   0h 0m 0s 94.0% words/sec/thread:   66792 lr:  0.029768 avg.loss:  1.785331 ETA:   0h 0m 4s


(10000, 0.6093, 0.6093)
(10000, 0.372, 0.744)
(10000, 0.2687, 0.8061)


In [126]:
cat_classifier_v5 = fasttext.train_supervised(input="min10000_labeled_query_training_data_large.txt", lr=0.3, epoch=25, wordNgrams=3)
print(cat_classifier_v5.test("min10000_labeled_query_test_data.txt", k = 1))
print(cat_classifier_v5.test("min10000_labeled_query_test_data.txt", k = 2))
print(cat_classifier_v5.test("min10000_labeled_query_test_data.txt", k = 3))

Read 0M words
Number of words:  18860
Number of labels: 70
Progress: 100.0% words/sec/thread:   98973 lr: -0.000003 avg.loss:  1.530274 ETA:   0h 0m 0s 109370 lr:  0.176731 avg.loss:  1.910446 ETA:   0h 0m 7s

(10000, 0.609, 0.609)
(10000, 0.3728, 0.7456)
(10000, 0.26953333333333335, 0.8086)


Progress: 100.0% words/sec/thread:   98972 lr:  0.000000 avg.loss:  1.530274 ETA:   0h 0m 0s


In [129]:
cat_classifier_v5.save_model("cat_classifier_v5.bin")

#### Task 1: Add the query classifier to query processing

In [150]:
pred = cat_classifier_v5.predict("Apple iPhone", k=20)
pred

(('__label__cat02015',
  '__label__abcat0900000',
  '__label__cat02001',
  '__label__cat00000',
  '__label__abcat0101001',
  '__label__pcmcat247400050000',
  '__label__cat02009',
  '__label__cat09000',
  '__label__cat02010',
  '__label__abcat0302000',
  '__label__abcat0100000',
  '__label__pcmcat209400050001',
  '__label__pcmcat245100050028',
  '__label__abcat0700000',
  '__label__abcat0200000',
  '__label__pcmcat248700050021',
  '__label__abcat0500000',
  '__label__abcat0811002',
  '__label__pcmcat144700050004',
  '__label__abcat0207000'),
 array([0.26535127, 0.07556419, 0.05596664, 0.04775826, 0.03420277,
        0.03417056, 0.03412051, 0.03156948, 0.02971699, 0.02065383,
        0.01938056, 0.01817827, 0.01807952, 0.01767591, 0.01580442,
        0.01431181, 0.01201295, 0.01123516, 0.01103844, 0.01099545]))