### Predicting Category Label from Item Name and its Description

** Steps: **
1. Load dataset and combine name and description into single full text feature
2. Convert text feature into numeric feature using label encoder for item category and TfIdf Vectorizer for name and description
3. Train a multiclass logistic regression model to establish a baseline performance for the training set
4. Train a neural network and compare performance of neural network with simple logistic regression model

In [123]:
import warnings
warnings.filterwarnings("ignore")

# core data analytics library
from IPython.display import display_html
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import random
import requests
import json
import os
# machine learning/feature extraction and related library
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use("seaborn-muted")

# Some basic configuration
train_fraction = 0.6
test_fraction = 1 - train_fraction
max_features = 10000
sample_frac = 0.7

### Data Preparation

In [124]:
# Load data
df = pd.read_csv("nlp-dataset/train.csv")

In [125]:
# View/inspect few rows in data
df.head(5).T

Unnamed: 0,0,1,2,3,4
id,1008537883,1009127328,1008982705,1006975280,1008950745
title,2010 Audi A3,1 tb internal harddrives,3× Mercedes benz Actros 26:40 for sale,Samsung S7 Edge (Rose Pink),2TB Desktop Hardrive
description,A white in colour audi a3 tfsi in good conditi...,Internal harddrives for sale 1tb 3.5 inch for ...,Make Mercedes benz Actros 26:40\nModel ...,Selling my Samsung S7 Edge Rose Pink colour ve...,2TB Desktop Hardrive for a bargain \n\nCall or...
category,Cars & Bakkies,Computer Hardware & Accessories,Trucks & Commercial Vehicles,Cell Phones,Computer Hardware & Accessories


In [126]:
# see all unique categories in data
df.category.unique()

array(['Cars & Bakkies', 'Computer Hardware & Accessories',
       'Trucks & Commercial Vehicles', 'Cell Phones',
       'Businesses for Sale', 'Furniture & Decor', 'Tools & DIY',
       'Houses & Flats for rent', 'Art, Collectibles & Rare Items',
       'Car Parts & Accessories', 'Land', 'Clothing & Shoes',
       'Rooms for rent & Shared', 'Gym & Fitness', 'TV, Audio & Visual',
       'Prams, Cots & Equipment', 'iPads & Tablets',
       'Feeds, Supplements & Seeds', 'Gaming & Consoles',
       'Musical Instruments', 'Homeware & Appliances',
       'Toys, Games & Remote Control', 'Motorcycles & Scooters',
       'Dogs & Cats', 'Construction & Home Improvement',
       'Outdoor & Sports Equipment', 'Business & Industrial Equipment',
       'CVs & Resumes', 'Health, Beauty & Cosmetics',
       'Jewellery & Accessories', 'Computers & Laptops',
       'Boats & Aviation', 'Garden & Braai',
       'Farming Equipment & Vehicles', 'Other Services', 'Bicycles',
       'Community Announcements'

In [127]:
# check frequency of each category
df.category.value_counts()

Cars & Bakkies                           164468
Cell Phones                               50316
Furniture & Decor                         45247
Car Parts & Accessories                   43378
Homeware & Appliances                     29019
Motorcycles & Scooters                    24225
Gaming & Consoles                         20988
TV, Audio & Visual                        20688
Computers & Laptops                       18207
Outdoor & Sports Equipment                16991
Tools & DIY                               14611
Houses & Flats for rent                   12324
Computer Hardware & Accessories           10634
Prams, Cots & Equipment                   10009
Dogs & Cats                                7643
Clothing & Shoes                           7456
Art, Collectibles & Rare Items             7263
Trucks & Commercial Vehicles               6963
Garden & Braai                             6354
Bicycles                                   6037
Business & Industrial Equipment         

In [128]:
# Load test set data
test = pd.read_csv("nlp-dataset/test.csv")

In [129]:
# Inspect few rows of test set data
test.head(5)

Unnamed: 0,id,title,description
0,1008387829,Massage Chair,Electronic Massage Chair. Very good condition....
1,1007542388,Acer Aspire AX3910 PC desktop with windows 10 Pro,This is the best PC for any student its in gre...
2,1009234002,Beach buggy,Hy is nie voledig nie en het nie papiere nie
3,1007888574,Rest Assured Double mattress for sale great co...,Very great condition firm
4,1007436437,2008 Volkswagen Polo 1.9 TDi Highline,Factory Features\r\n\r\n- ABS\r\n- Airbags\r\n...


** Preparing Title and Description **

In [130]:
# Check if training set have null values
df.isnull().sum()

id              0
title           0
description    14
category        0
dtype: int64

In [131]:
# Replace null with empty string
df.fillna("", inplace=True)

In [132]:
# derive full text feature by combining title and descrition of each item
df["full_text"] = df.title + " " + df.description

** Preparing/Encoding Category **

In [133]:
encoder = LabelEncoder().fit(df.category)
df["category_id"] = encoder.transform(df.category)

** Partition Data in Two Set: Training and Test **

In [134]:
df_full = df
df = df.sample(frac=sample_frac)

In [135]:
train_df = df.sample(frac=train_fraction)
test_df = df[~df.index.isin(train_df.index)].reset_index(drop=True)
train_df.reset_index(drop=True, inplace=True)

** Process Text Feature **

In [136]:
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer("english")
class StemmedCountVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

In [137]:
text_column = "full_text"
vec = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3), analyzer="word", 
                      stop_words="english", token_pattern=r"(?u)\b\w+\b").fit(train_df[text_column])
assert len(vec.vocabulary_) == max_features

In [138]:
train_dataset = vec.transform(train_df.loc[:,text_column])
#pd.DataFrame(vec.transform(train_df.loc[:,text_column]).todense(), columns=vec.vocabulary_)

In [139]:
test_dataset = vec.transform(test_df.loc[:,text_column])
# pd.DataFrame(vec.transform(test_df.loc[:,text_column]).todense(), columns=vec.vocabulary_)

In [71]:
# Utility function to find model score
def model_score(actual, predicted):
    return 1.0 * np.sum((predicted == actual)) / actual.size

In [72]:
oh_encoder = OneHotEncoder().fit(df.category_id.values.reshape(-1, 1))
def one_hot_encoding(df, category=55):
    return np.asarray(oh_encoder.transform(df.category_id.values.reshape([-1, 1])).todense())

** Multiclass Logistic Regression **

---

In [140]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [141]:
classifier = LogisticRegression(penalty='l2', multi_class='multinomial', solver='lbfgs', verbose=3)

In [142]:
classifier = classifier.fit(train_dataset, train_df.category_id)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min finished


In [121]:
pred = classifier.predict(test_dataset)

In [122]:
model_score(pred, test_df.category_id)

0.8113363039079728

** Bag of Word Model **

---

In [114]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalAveragePooling1D
from sklearn.preprocessing import MinMaxScaler
from keras.layers.advanced_activations import PReLU
from keras.optimizers import SGD

In [115]:
# losses: binary_crossentropy, categorical_crossentropy
def get_bow_model(input_shape, output_dim):
    model = Sequential()
    model.add(Dense(768, input_shape=input_shape))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(512))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(output_dim))
    model.add(Activation('sigmoid'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer="Adadelta",
        metrics=['accuracy'])
    return model

In [116]:
pred = None
label = one_hot_encoding(train_df)
model = get_bow_model((train_dataset.shape[1],), 55)
model.fit(train_dataset.values, 
          label,
          validation_split=0.1,
          batch_size=512, nb_epoch=10, verbose=1)
pred = model.predict(test_dataset.values) if pred is None else pred + model.predict(test_dataset.values)

Train on 237528 samples, validate on 26393 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [117]:
prediction = pred.argmax(axis=1)
model_score(test_df.category_id, prediction)

0.8161331757110055

In [None]:
test_df = test
test_df.fillna("", inplace=True)
test_df["full_text"] = test_df.title + " " + test_df.description

In [None]:
test_dataset = pd.DataFrame(vec.transform(test_df.loc[:,text_column]).todense(), columns=vec.vocabulary_)
pred = model.predict(test_dataset.values)
prediction = pred.argmax(axis=1)

In [None]:
test_df["category_id"] = prediction
test_df["category"] = encoder.inverse_transform(prediction)
test_df = test_df[["id", "category"]]
test_df.to_csv("output.csv", index=False)