In [1]:
from util import print_log, validate_model, sparse_validate_model

In [2]:
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import word_tokenize
%matplotlib inline
# Set the default figure size (width, height) used by every plot in this notebook.
plt.rcParams['figure.figsize'] = (15,30)

## Read raw data as lines

In [3]:
# Load the training file as one unparsed string per row (column 'line').
# The context manager guarantees the file handle is closed after reading.
with open('../data/classification_train.tsv', encoding='utf8') as train_file:
    raw_train = pd.DataFrame(list(train_file), columns=['line'])

In [4]:
# Load the blind/test file as one unparsed string per row (column 'line').
# The context manager guarantees the file handle is closed after reading.
with open('../data/classification_blind_set_corrected.tsv', encoding='utf8') as test_file:
    raw_test = pd.DataFrame(list(test_file), columns=['line'])

## Extract category and brand from raw data

In [5]:
# Split each raw line into "<title> TAB <brand_id> TAB <category_id>".
# Lines that do not end in two tab-separated integers yield NaN and are dropped.
train = raw_train.line.str.extract(r'(.*)\t(\d+)\t(\d+)$',expand=True)
train.columns = ['product_title', 'brand_id', 'category_id']
# astype with a column mapping returns a frame whose id columns really are
# integers; assigning through .loc[:, cols] may leave the original object dtype.
train = train.dropna().astype({'brand_id': int, 'category_id': int})

In [6]:
# Split each raw line into "<title> TAB <category_id>" (the id may be negative).
# Lines that do not end in a tab-separated integer yield NaN and are dropped.
test = raw_test.line.str.extract(r'(.*)\t(-?\d+)$',expand=True)
test.columns = ['product_title', 'category_id']
# astype with a column mapping returns a frame whose id column really is
# integer; assigning through .loc[:, cols] may leave the original object dtype.
test = test.dropna().astype({'category_id': int})

In [7]:
class Tokenizer2(object):
    """Turn a product title into a positional token dictionary.

    Calling an instance lower-cases the title, strips or normalises
    punctuation, replaces numbers and serial-like words with placeholder
    markers, drops stop words and one-character tokens, keeps at most the
    first three and last two remaining tokens, and returns them as
    ``{'word_0': ..., 'word_1': ...}``.

    Requires ``re`` and ``word_tokenize`` to be in scope when instantiated
    and called.
    """

    def __init__(self):
        # Function used to split the cleaned text into tokens.
        self.tokenizer = word_tokenize
        # Tokens that are never emitted, including the two placeholder markers.
        self.stop_words = set([
            'is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are',
            'this', 'that', 'be', 'in', 'an', 'or', 'any', 'all', 'am',
            'you', 'we', '__NUMBER__', '__SERIAL__',
        ])

    def __call__(self, text):
        """Return ``{'word_<i>': token}`` for the retained tokens of ``text``."""
        cleaned = text.lower()
        # (pattern, replacement, flags), applied strictly in this order.
        substitutions = (
            # drop every character that is not alphanumeric, whitespace or a separator
            (r'[^a-z0-9\s/\\_\t,\-]', '', re.IGNORECASE),
            # turn the separators into spaces
            (r'[/\\_\t,-]', ' ', re.IGNORECASE),
            # collapse standalone numbers to one marker to reduce the feature count
            (r'\b[0-9]+\b', ' __NUMBER__ ', 0),
            # collapse words containing digits (possible product/serial numbers)
            (r'\b\w*\d+\w*\d?\b', ' __SERIAL__ ', 0),
        )
        for pattern, replacement, flags in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

        kept = []
        for word in self.tokenizer(cleaned):
            if len(word) > 1 and word not in self.stop_words:
                kept.append(word)

        # With five or more tokens, keep only the first three and the last two.
        if len(kept) >= 5:
            kept = kept[:3] + kept[-2:]

        return {'word_' + str(position): word for position, word in enumerate(kept)}

In [8]:
# Count the lines of the raw training file, for comparison with the parsed row count.
! wc -l ../data/classification_train.tsv

1000000 ../data/classification_train.tsv


In [9]:
! wc -l ../data/classification_blind_set.tsv

wc: ../data/classification_blind_set.tsv: No such file or directory


In [10]:
# (rows, columns) of the parsed train and test frames.
train.shape, test.shape

((999996, 3), (619240, 2))

# missed rows

In [11]:
# Number of raw lines that failed to parse, for train and test respectively.
# Derived from the raw frames so the counts stay correct if the input files change.
len(raw_train) - len(train), len(raw_test) - len(test)

(4, 3)

In [12]:
# Show up to 900 characters per cell so long raw lines can be inspected.
pd.options.display.max_colwidth = 900

In [13]:
# Show the raw training lines that do not end in "TAB <int> TAB <int>".
# The pattern has no capture groups: str.contains only needs a yes/no match,
# and groups make pandas emit a "pattern has match groups" warning.
raw_train[~raw_train.line.str.contains(r'.*\t\d+\t\d+$')]

  if __name__ == '__main__':


Unnamed: 0,line
218278,title\tbid\tcid\n
246806,""" 2 Pack Panasonic Compatible KX-FA83 KXFA83 Laser Toner Cartridge, 2,500 Pa\tUnknown\tcomputers & accessories > cables & accessories > printer ink & toner > laser printer toner\n"
458263,"""Brocade VDX 6720 - switch - 16 ports - rack-mountable\tBrocade Communication Systems\tcomputers & accessories > networking products > switches\n"
575503,"""This hub is built strong and ideal for industrial environments. With the StarTech.com ST4200USBM 4-port hub you can wall-mount or install onto a DIN rail for convenient access to the ports. This four port industrial hub can be bus powered or self powered with a three wire terminal block connector 7-24V . Plug in your most demanding next-generation peripherals and still enjoy data transfer speed\tStarTech\tcomputers & accessories > networking products > hubs\n"


## exploring category and brand cardinalities

In [14]:
# Stack train and test rows; test has no brand_id column, so it is missing for those rows.
all_data = pd.concat([train, test])

In [15]:
# Number of distinct category ids across train and test combined.
all_data.category_id.value_counts().shape

(705,)

In [None]:
# Number of training rows per brand.
vc = train.brand_id.value_counts()
# How many brands have fewer than 20 training rows.
vc[vc<20].shape

# Model Learning

In [12]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk import word_tokenize

In [13]:
class Tokenizer(object):
    """Callable title tokenizer for use with a scikit-learn text vectorizer.

    Calling an instance lower-cases the title, strips or normalises
    punctuation, replaces numbers and serial-like words with placeholder
    markers, drops stop words and one-character tokens, and returns the
    remaining tokens. Titles with five or more tokens are reduced to their
    first two and last two tokens.
    """

    def __init__(self):
        # Function used to split the cleaned text into tokens.
        self.tokenizer = word_tokenize
        # Tokens that are never emitted, including the two placeholder markers.
        self.stop_words = set([
            'is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are',
            'this', 'that', 'be', 'in', 'an', 'or', 'any', 'all', 'am',
            'you', 'we', '__NUMBER__', '__SERIAL__',
        ])

    def __call__(self, text):
        """Return the list of retained tokens for ``text``."""
        cleaned = text.lower()
        # (pattern, replacement, flags), applied strictly in this order.
        substitutions = (
            # drop every character that is not alphanumeric, whitespace or a separator
            (r'[^a-z0-9\s/\\_\t,\-]', '', re.IGNORECASE),
            # turn the separators into spaces
            (r'[/\\_\t,-]', ' ', re.IGNORECASE),
            # collapse standalone numbers to one marker to reduce the feature count
            (r'\b[0-9]+\b', ' __NUMBER__ ', 0),
            # collapse words containing digits (possible product/serial numbers)
            (r'\b\w*\d+\w*\d?\b', ' __SERIAL__ ', 0),
        )
        for pattern, replacement, flags in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

        kept = []
        for word in self.tokenizer(cleaned):
            if len(word) > 1 and word not in self.stop_words:
                kept.append(word)

        # Short titles pass through; longer ones keep the first two and last two tokens.
        if len(kept) < 5:
            return kept
        return kept[:2] + kept[-2:]

In [None]:
# Fit TF-IDF on the training titles using the custom Tokenizer and keep the
# resulting sparse document-term matrix.
vectorizer = TfidfVectorizer(tokenizer=Tokenizer())
print_log("starting vectorizer fit_transform")
sparse_title = vectorizer.fit_transform(train['product_title'])
print_log("completed vectorizer fit_transform")

In [None]:
# Size of the vocabulary learned by the TF-IDF vectorizer.
print("distinct words found", len(vectorizer.vocabulary_))

Build a sparse feature matrix from the vectorized title tokens and the one-hot encoded `category_id`,  
then train the model on top of it.

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# One-hot encode category_id: every row becomes a single-entry dict
# {"<category_id>": 1}, which DictVectorizer turns into a sparse indicator matrix.
category_dict_vectorizer = DictVectorizer()
print_log("starting sparse category")
sparse_category = category_dict_vectorizer.fit_transform(train.category_id.astype(str).apply(lambda x: {x: 1}))
print_log("completed sparse category")

In [None]:
# Compare the encoded matrix shape with the row count and the number of distinct categories.
sparse_category.shape, (train.category_id.shape, train.category_id.nunique())

In [None]:
from scipy.sparse import hstack

In [None]:
# Concatenate the category indicators and the title TF-IDF features column-wise (CSR format).
joined_data = hstack([sparse_category, sparse_title], format='csr')

In [None]:
# (rows, total feature columns) of the combined training matrix.
joined_data.shape

Saving file

In [None]:
from scipy.io import mmwrite, mmread
# Cache the three training matrices on disk with scipy.io.mmwrite.
mmwrite('joined_data.mtx', joined_data)
mmwrite('sparse_category.mtx', sparse_category)
mmwrite('sparse_title.mtx', sparse_title)

In [None]:
# Uncomment to reload the cached training matrices instead of recomputing them.
# joined_data = mmread('joined_data.mtx')
# sparse_category = mmread('sparse_category.mtx')
# sparse_title = mmread('sparse_title.mtx')

Model validation
--

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.cross_validation import ShuffleSplit
from sklearn.naive_bayes import MultinomialNB

In [None]:
import importlib
import util

# Reload util and re-bind the imported helper names to the reloaded module's objects.
importlib.reload(util)
from util import print_log, validate_model, sparse_validate_model

In [None]:
# Validate the currently selected classifier (MultinomialNB) on shuffled
# hold-out splits: 3 iterations, 40% of the rows held out each time.
print_log("starting validation")
# Alternative classifiers tried previously:
# clf = LogisticRegression()
# clf = SGDClassifier(loss='log')
clf = MultinomialNB()
util.sparse_validate_model(
    X=joined_data,
    # np.ravel instead of the deprecated pd.np alias (removed in pandas 2.0).
    Y=np.ravel(train.brand_id.astype(int)),
    classifier=clf,
    # Fixed random_state so the splits, and therefore the scores, are reproducible.
    split_generator=lambda Y: ShuffleSplit(n=Y.shape[0], n_iter=3, test_size=0.4, random_state=42))
print_log("completed validation")

Apply Model
--

Data preparation

In [None]:
# Build the test feature matrix with the vectorizers fitted on the training data.
# The training matrices (sparse_title / sparse_category) are left untouched.
test_title_sparse = vectorizer.transform(test['product_title'])
test_category_sparse = category_dict_vectorizer.transform(test.category_id.astype(str).apply(lambda x: {x: 1}))
# Stack the TEST matrices, in the same column order as joined_data (category, then title).
joined_test_data = hstack([test_category_sparse, test_title_sparse], format='csr')

In [None]:
# Cache the test matrices on disk. Every file name carries the explicit .mtx
# extension so it matches the names used when the files are read back.
mmwrite('joined_test_data.mtx', joined_test_data)
mmwrite('test_title_sparse.mtx', test_title_sparse)
mmwrite('test_category_sparse.mtx', test_category_sparse)

In [None]:
# Reload the cached test matrices from disk.
joined_test_data = mmread('joined_test_data.mtx')
test_title_sparse = mmread('test_title_sparse.mtx')
test_category_sparse = mmread('test_category_sparse.mtx')

predict

In [None]:
# Predict a brand id for every test row.
# NOTE(review): assumes clf has already been fitted (e.g. inside sparse_validate_model) — confirm.
test_pred = clf.predict(joined_test_data)

# category wise model strategy

Second submission code

In [23]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from sklearn.externals import joblib

class Tokenizer(object):
    """Callable title tokenizer for use with a scikit-learn text vectorizer.

    Calling an instance lower-cases the title, strips or normalises
    punctuation, replaces numbers and serial-like words with placeholder
    markers, drops stop words and one-character tokens, and returns the
    remaining tokens. Titles with five or more tokens are reduced to their
    first three and last two tokens.
    """

    def __init__(self):
        # Function used to split the cleaned text into tokens.
        self.tokenizer = word_tokenize
        # Tokens that are never emitted, including the two placeholder markers.
        self.stop_words = set([
            'is', 'of', 'it', 'at', 'on', 'and', 'as', 'the', 'to', 'are',
            'this', 'that', 'be', 'in', 'an', 'or', 'any', 'all', 'am',
            'you', 'we', '__NUMBER__', '__SERIAL__',
        ])

    def __call__(self, text):
        """Return the list of retained tokens for ``text``."""
        cleaned = text.lower()
        # (pattern, replacement, flags), applied strictly in this order.
        substitutions = (
            # drop every character that is not alphanumeric, whitespace or a separator
            (r'[^a-z0-9\s/\\_\t,\-]', '', re.IGNORECASE),
            # turn the separators into spaces
            (r'[/\\_\t,-]', ' ', re.IGNORECASE),
            # collapse standalone numbers to one marker to reduce the feature count
            (r'\b[0-9]+\b', ' __NUMBER__ ', 0),
            # collapse words containing digits (possible product/serial numbers)
            (r'\b\w*\d+\w*\d?\b', ' __SERIAL__ ', 0),
        )
        for pattern, replacement, flags in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

        kept = []
        for word in self.tokenizer(cleaned):
            if len(word) > 1 and word not in self.stop_words:
                kept.append(word)

        # Short titles pass through; longer ones keep the first three and last two tokens.
        if len(kept) < 5:
            return kept
        return kept[:3] + kept[-2:]
    
def learn_model_for_category(train_df):
    """Fit and persist a title -> brand model for a single category.

    ``train_df`` is expected to hold the rows of one category (it is called
    once per group of ``groupby('category_id')``); the category id is read
    from its first row. A TF-IDF + MultinomialNB pipeline is fitted on
    ``product_title`` against ``brand_id`` and dumped with joblib to
    ``category_<id>_model.clf`` in the working directory.

    Returns True once the model has been written.
    """
    category = train_df.category_id.iloc[0]
    pipe_line = Pipeline([
        ('transform', TfidfVectorizer(tokenizer=Tokenizer())),
        ('learner', MultinomialNB()),
    ])
    titles = train_df['product_title'].values
    brands = train_df['brand_id'].astype(int)
    pipe_line.fit(titles, brands)
    model_path = 'category_' + str(category) + '_model.clf'
    joblib.dump(pipe_line, model_path)
    return True

In [24]:
# Number of training rows per category.
category_size = train.category_id.value_counts()
# NOTE(review): this stores the .shape tuple of the filtered Series (categories
# with more than 100 rows), not the categories themselves.
major_cats = category_size[category_size>100].shape

In [25]:
# Train and persist one model per category; the result holds one True flag per category.
%time cat_models = train.groupby('category_id').apply(learn_model_for_category)

CPU times: user 4min 23s, sys: 5.69 s, total: 4min 29s
Wall time: 4min 31s


In [26]:
# Each call returned True, so the sum is the number of per-category models trained.
cat_models.sum()

609

In [42]:
def apply_model_for_category(test_df):
    """Predict brand ids for the test rows of a single category.

    ``test_df`` is expected to hold the rows of one category (it is called
    once per group of ``groupby('category_id')``); the category id is read
    from its first row and used to load ``category_<id>_model.clf``.

    Returns a new frame equal to ``test_df`` plus a ``predicted_brand_id``
    column. If loading or predicting fails (for example, no model was trained
    for this category), the error and group shape are printed and the column
    is filled with the sentinel -1.

    The input frame is never modified: pandas does not support functions that
    mutate the group object passed by ``groupby().apply``.
    """
    category = test_df.category_id.iloc[0]
    try:
        learner = joblib.load('category_'+str(category)+'_model.clf')
        predicted = learner.predict(test_df['product_title'].values)
    except Exception as e:
        # Best-effort fallback: report the problem and mark the rows as unpredicted.
        print(e,test_df.shape)
        predicted = -1
    return test_df.assign(predicted_brand_id=predicted)

In [43]:
# Predict brands category by category; groups whose model cannot be loaded are reported below.
%time predictions = test.groupby('category_id').apply(apply_model_for_category)

[Errno 2] No such file or directory: 'category_9_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_12_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_14_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_18_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_24_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_25_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_27_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_35_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_48_model.clf' (2, 2)
[Errno 2] No such file or directory: 'category_64_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_69_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_70_model.clf' (4, 2)
[Errno 2] No such file or directory: 'category_73_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_89_model.clf' (1, 2)
[Errno 2] No such file or directory: 'category_96

In [48]:
# Rows that received the -1 fallback instead of a real prediction.
predictions[predictions.predicted_brand_id.astype(int)==-1]

Unnamed: 0,product_title,category_id,predicted_brand_id
8757,"""USG New Release! * H.265 Compression * 5MP 3744x1408 @ 30FPS * IP Bullet Security Camera: 2.8-12mm 5MP Lens, Power Over Ethernet, 72x IR LEDs For 200ft Night Vision, 1/1.8"""" Sony IMX178 Sensor + Hi3516A DSP, Weatherproof, ONVIF 2.4, Remote Viewing On Phones + Computers *** Ideal For Business & Industrial Applications""",674,-1.0
11412,"""70"""" LCD Public Display""",96,-1.0
22256,"GJY WS-980 Model Car Loudspeaker Box , Silver",151,-1.0
22475,"""Adjustable Rotating Footrest, Extra-Wide, 17 3/4 x 14 x 4"""", Black by KELLY COMPUTERS""",345,-1.0
23543,MCH-ATH-EQ500 3.5MM Mini In-Ear Earphone for T-388 Walkie Talkie with Microphone for Handheld Two Way Radio,417,-1.0
38693,"3dRose dpp_13301_1 Wall Clock, Computers Room, 10 by 10-Inch",64,-1.0
39573,"JJE LAN Leather Bobbin Winder for Cables/Earphone , Brown",446,-1.0
48532,ZCLATH-EQ500 3.5MM Mini In-Ear Earphone for T-388 Walkie Talkie with Microphone for Handheld Two Way Radio,417,-1.0
51672,Display Calibration Sensor Colorimeter for Md & Spectraview,200,-1.0
59215,YAN CP-3508 USB3.0 Front Panel 100% Copper,349,-1.0


In [49]:
# Sanity check: row count should match the parsed test frame.
predictions.shape

(619240, 3)

In [50]:
# Store predicted_brand_id as integers. Replacing the whole column guarantees
# the new dtype; assigning through .loc may keep the existing float dtype.
predictions['predicted_brand_id'] = predictions['predicted_brand_id'].astype(int)

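Query the rows whose predicted brand id is -1.  
Take their titles from the test set and fit TF-IDF on them.  
Take the entire training set and transform it with that TF-IDF.  
Train a classifier on it, then apply it to those test rows.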

In [82]:
# Rows that still carry the -1 fallback. An explicit copy makes this an
# independent frame, so writing predictions into it later does not raise
# SettingWithCopyWarning.
unpredicted = predictions.query('predicted_brand_id == -1').copy()

In [85]:
# Learn a TF-IDF vocabulary from the titles that are still unpredicted.
test_vectorizer = TfidfVectorizer(tokenizer=Tokenizer())
test_vectorizer.fit(unpredicted.product_title)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.Tokenizer object at 0x7f34bc92d7b8>,
        use_idf=True, vocabulary=None)

In [None]:
# A tokenizer instance plus the terms seen in the unpredicted titles
# (the keys of the fitted vocabulary mapping).
tokenize = Tokenizer()
vocab = test_vectorizer.vocabulary_.keys()

In [93]:
# True where a training title shares NO token with the unpredicted-title vocabulary
# (despite the name, True marks rows that are not relevant).
missing_relevant_train = train['product_title'].apply(lambda x:vocab.isdisjoint(tokenize(x)))

In [94]:
# Keep the training rows that share at least one token with the unpredicted titles.
missing_train = train[~missing_relevant_train]

In [120]:
# Rows per category among the relevant training rows (most frequent first).
mc = missing_train['category_id'].value_counts()
# Restrict the fallback training data to the five most frequent categories.
top_categories = mc.head(5).index
missing_train_df = missing_train.loc[missing_train['category_id'].isin(top_categories)]

In [121]:
def learn_model_for_missing_category(train_df, test_df):
    """Fit and persist the fallback model for rows without a category model.

    The TF-IDF vocabulary is learned from ``test_df.product_title`` only; a
    second vectorizer restricted to that vocabulary is then combined with
    MultinomialNB and fitted on ``train_df`` (``product_title`` against
    ``brand_id``). The pipeline is dumped with joblib to
    ``category_missing_model.clf`` in the working directory.

    Returns True once the model has been written.
    """
    # Vocabulary source: the titles we ultimately need to predict.
    vocab_source = TfidfVectorizer(tokenizer=Tokenizer())
    vocab_source.fit(test_df.product_title)

    model_tag = "missing"
    pipe_line = Pipeline([
        ('transform', TfidfVectorizer(tokenizer=Tokenizer(), vocabulary=vocab_source.vocabulary_)),
        ('learner', MultinomialNB()),
    ])
    titles = train_df['product_title'].values
    brands = train_df['brand_id'].astype(int)
    pipe_line.fit(titles, brands)
    joblib.dump(pipe_line, 'category_' + str(model_tag) + '_model.clf')
    return True

# Train the fallback model on the selected training rows, with the vocabulary
# taken from the unpredicted test titles.
learn_model_for_missing_category(missing_train_df, unpredicted)

True

In [123]:
def apply_model_for_missing_category(test_df):
    """Predict brand ids with the fallback ("missing") model.

    Loads ``category_missing_model.clf`` and predicts from
    ``test_df['product_title']``.

    Returns a new frame equal to ``test_df`` with ``predicted_brand_id`` set
    to the predictions. If loading or predicting fails, the error and frame
    shape are printed and the column is filled with the sentinel -1.

    The input frame is never modified, so passing a slice of another frame
    does not trigger pandas' SettingWithCopyWarning.
    """
    category = 'missing'
    try:
        learner = joblib.load('category_'+str(category)+'_model.clf')
        predicted = learner.predict(test_df['product_title'].values)
    except Exception as e:
        # Best-effort fallback: report the problem and mark the rows as unpredicted.
        print(e,test_df.shape)
        predicted = -1
    return test_df.assign(predicted_brand_id=predicted)
    
# Apply the fallback model to the rows that had no per-category model.
missing_predicted = apply_model_for_missing_category(unpredicted)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [125]:
# Number of rows handled by the fallback model.
missing_predicted.shape

(116, 3)

In [129]:
# Write the fallback predictions back into the main frame, aligned on the row index.
predictions.loc[missing_predicted.index,'predicted_brand_id'] = missing_predicted.predicted_brand_id

In [131]:
# Write one predicted brand id per line, without the index.
# header=False is explicit because the Series.to_csv default changed from
# False to True in pandas 0.24, which would otherwise add a header row.
predictions.predicted_brand_id.to_csv('category_wise_mnb.csv', index=False, header=False)