In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

pd.options.display.max_rows = 4
pd.options.display.max_colwidth = 100

In [2]:
prods = pd.read_csv('product_descriptions.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
#TODO : Havent' used attributes.csv at all

In [3]:
#product_descriptions.csv doesn't have the "product_title" column. Augmenting it with the values from the full dataset.
train_uids = train.filter(items=['product_uid', 'product_title'])
test_uids = test.filter(items=['product_uid', 'product_title'])
data_uids = train_uids.merge(test_uids, how = "outer", on = 'product_uid')

data_uids['product_title'] = data_uids['product_title_x']
nas = data_uids['product_title'].isnull()
#The below statement throws a SettingWithCopyWarning because of the [a][b] indexing. Its safe in this case.
data_uids['product_title'][nas] = data_uids['product_title_y'][nas]
del data_uids['product_title_x'], data_uids['product_title_y']
data_uids.drop_duplicates(subset=['product_uid', 'product_title'], inplace=True)

prods = prods.merge(data_uids, on="product_uid")
prods['full_description'] = prods['product_title'] + ". " + prods['product_description']
print(map(len, [np.unique(df['product_uid'].values) for df in [prods, data_uids, train, test]]))

#We have title dupes
print(len(np.unique(data_uids['product_uid'].values)))
print(len(np.unique(data_uids['product_title'].values)))

[124428, 124428, 54667, 97460]
124428
120348


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
#This extracts the term counts per product description + title
tf_vectorizer = CountVectorizer(stop_words='english', strip_accents='unicode', decode_error='replace')
full_desc = prods['full_description']
tf = tf_vectorizer.fit_transform(full_desc)

In [6]:
#Should grid-search on max_iter, n_topics, learning_*, batch_size
#TODO : Do we have any meaningful topic priors? A uniform dirichlet with concentration param < 1 seems natural here.
#TODO : The default conc. param is 1/n_topics. What does this imply?
#TODO : Separate the brand out into another variable. Don't have it participate in the lda.
lda = LatentDirichletAllocation(n_topics=10, max_iter=5, verbose=5, n_jobs=8, batch_size=2500)
topics = lda.fit_transform(tf)

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.6s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]

LatentDirichletAllocation(batch_size=2500, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=8, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=5)

In [16]:
#From the sklearn topic extraction example
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [17]:
feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, feature_names, 10)
#TODO : The data is shitty, as someone pointed out on the forum. Might need to filter things like
#mmnumber, 2sink, screwdriverada, acrylicwater, bracketsoptional, awningtime...

Topic #0:
easy mmnumber 2sink finish design vanity door cardell kingsley white
Topic #1:
steel door easy ft use storage high design used installation
Topic #2:
whitewall efv jerk awningtime sensor100 acrylicwater light easy ft bracketsoptional
Topic #3:
ft easy steel use 10 water home trayshigh design provides
Topic #4:
mpi easy paint use finish ft water color door resistant
Topic #5:
efv easy ft use steel light high power installation provides
Topic #6:
presto plastpro use easy tayse door screwdriverada home wall steel
Topic #7:
mixet kaarskoker light door easy water use finish glass shower
Topic #8:
use light steel easy water metal hestra high paint ft
Topic #9:
efv plastpro packcheck easy packelectro water use framefpr7sold home steel


In [118]:
#topic_dists will is the distribution you get from transforming your product descriptions using your model.
#For example, if using lda, you will countVectorize the product descriptions and then fit this to an lda model.
#topic_dists will be this model's transform of the countVectorized data.
#NOTE : We are assuming that df['product_uid'] - 100001 corresponds to the row-index of all_topic_dists

#TODO : The normalized topic_dists fall off exponentially. So might make sense to work with their log when
#plugging them into a regression model.

def augment_with_topic_dist(df, topic_dists, inplace=False):
    
    frame = None
    if inplace:
        frame = df
    else:
        frame = df.copy()
    
    #Normalize the row-sums of topic_dists to 1
    dist_sums = topic_dists.sum(axis=1).reshape(len(topic_dists), 1)
    topic_dists_normed = topic_dists / dist_sums
    
    inds = frame['product_uid'] - 100001
    for feat_num in range(topic_dists_normed.shape[1]):
        frame['topic_dist_{}'.format(feat_num)] = topic_dists_normed[inds, feat_num] 
    
    return frame

In [119]:
aug_train = augment_with_topic_dist(train, topics)
aug_test = augment_with_topic_dist(test, topics)