In [214]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

pd.options.display.max_rows = 4
pd.options.display.max_colwidth = 100

In [2]:
prods = pd.read_csv('product_descriptions.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
#TODO : Havent' used attributes.csv at all

In [3]:
#product_descriptions.csv doesn't have the "product_title" column. Augmenting it with the values from the full dataset.
train_uids = train.filter(items=['product_uid', 'product_title'])
test_uids = test.filter(items=['product_uid', 'product_title'])
data_uids = train_uids.merge(test_uids, how = "outer", on = 'product_uid')

data_uids['product_title'] = data_uids['product_title_x']
nas = data_uids['product_title'].isnull()
#The below statement throws a SettingWithCopyWarning because of the [a][b] indexing. Its safe in this case.
data_uids['product_title'][nas] = data_uids['product_title_y'][nas]
del data_uids['product_title_x'], data_uids['product_title_y']
data_uids.drop_duplicates(subset=['product_uid', 'product_title'], inplace=True)

prods = prods.merge(data_uids, on="product_uid")
prods['full_description'] = prods['product_title'] + ". " + prods['product_description']
print(map(len, [np.unique(df['product_uid'].values) for df in [prods, data_uids, train, test]]))

#We have title dupes
print(len(np.unique(data_uids['product_uid'].values)))
print(len(np.unique(data_uids['product_title'].values)))

[124428, 124428, 54667, 97460]
124428
120348


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
#This extracts the term counts per product description + title
tf_vectorizer = CountVectorizer(stop_words='english', strip_accents='unicode', decode_error='replace')
full_desc = prods['full_description']
tf = tf_vectorizer.fit_transform(full_desc)

In [6]:
#Should grid-search on max_iter, n_topics, learning_*, batch_size
#TODO : Do we have any meaningful topic priors? A uniform dirichlet with concentration param < 1 seems natural here.
#TODO : The default conc. param is 1/n_topics. What does this imply?
#TODO : Separate the brand out into another variable. Don't have it participate in the lda.
lda = LatentDirichletAllocation(n_topics=10, max_iter=5, verbose=5, n_jobs=8, batch_size=2500)
topic_dists = lda.fit_transform(tf)

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.6s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]

LatentDirichletAllocation(batch_size=2500, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=8, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=5)

In [16]:
#From the sklearn topic extraction example
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [17]:
feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, feature_names, 10)
#TODO : The data is shitty, as someone pointed out on the forum. Might need to filter things like
#mmnumber, 2sink, screwdriverada, acrylicwater, bracketsoptional, awningtime...

Topic #0:
easy mmnumber 2sink finish design vanity door cardell kingsley white
Topic #1:
steel door easy ft use storage high design used installation
Topic #2:
whitewall efv jerk awningtime sensor100 acrylicwater light easy ft bracketsoptional
Topic #3:
ft easy steel use 10 water home trayshigh design provides
Topic #4:
mpi easy paint use finish ft water color door resistant
Topic #5:
efv easy ft use steel light high power installation provides
Topic #6:
presto plastpro use easy tayse door screwdriverada home wall steel
Topic #7:
mixet kaarskoker light door easy water use finish glass shower
Topic #8:
use light steel easy water metal hestra high paint ft
Topic #9:
efv plastpro packcheck easy packelectro water use framefpr7sold home steel


In [246]:
#topic_dists will is the distribution you get from transforming your product descriptions using your model.
#For example, if using lda, you will countVectorize the product descriptions and then fit this to an lda model.
#topic_dists will be this model's transform of the countVectorized data.

#TODO : We are assuming that df['product_uid'] - 100001 corresponds to the row-index of topic_dists

#TODO : The normalized topic_dists fall off exponentially. So might make sense to work with their log when
#plugging them into a regression model.

def augment_with_topic_dist(df, topic_dists, prefix, inplace=False, use_row_inds=False):
    
    frame = None
    if inplace:
        frame = df
    else:
        frame = df.copy()
    
    #Normalize the row-sums of topic_dists to 1
    #TODO : Does this make sense though? Aren't these the dirichlet alpha's?
    dist_sums = topic_dists.sum(axis=1).reshape(len(topic_dists), 1)
    topic_dists_normed = topic_dists / dist_sums
    
    assert not use_row_inds or (len(frame) == len(topic_dists))
    inds = np.arange(len(frame)) if use_row_inds else (frame['product_uid'] - 100001)
    
    for feat_num in range(topic_dists_normed.shape[1]):
        frame['{}_{}'.format(prefix, feat_num)] = topic_dists_normed[inds, feat_num] 
    
    return frame


#TODO : How do we extract the topic distributions for the search terms if they contain words not in the product_descs?
#data should be a iterable of strings to transform
def get_topic_dists(data, model, vectorizer):

    data_tf = vectorizer.transform(data)
    return model.transform(data_tf)


def get_reg_data(df, model, vectorizer, prod_desc_dists):
    
    search_dists = get_topic_dists(df['search_term'], model, vectorizer)
    aug_df = augment_with_topic_dist(df, prod_desc_dists, "prod_dist")
    aug_df = augment_with_topic_dist(aug_df, search_dists, "search_dist", use_row_inds=True)
    
    x = aug_df.filter(regex="_dist_[0-9]")
    y = None
    try:
        y = np.ravel(aug_df['relevance'])
    except:
        print "No targets"

    return x, y

def write_submission(test_data, pred, fname = "predictions.csv"):
    
    submit_df = pd.DataFrame(columns=["id", "relevance"])
    submit_df['id'] = test['id']

    clipped_pred = np.clip(pred, 1.0, 3.0)
    submit_df['relevance'] = clipped_pred
    
    submit_df.columns = ['"' + col + '"' for col in submit_df.columns]
    submit_df.to_csv(fname, index=False, quotechar='\\')
    
    return submit_df

In [196]:
#TODO : Right now, we have no test set. Need to hold out part of the train
train_x, train_y = get_reg_data(train, lda, tf_vectorizer, topics)
test_x, test_y = get_reg_data(test, lda, tf_vectorizer, topics)

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    3.6s finished


No targets


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    7.2s finished


In [247]:
relevance_reg = ensemble.GradientBoostingRegressor(n_estimators=2000, subsample=0.5, max_depth=5, verbose=1)

In [248]:
relevance_reg.fit(train_x, train_y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.2825           0.0015           13.56m
         2           0.2816           0.0013           14.20m
         3           0.2805           0.0011           14.03m
         4           0.2787           0.0011           13.86m
         5           0.2781           0.0007           13.76m
         6           0.2765           0.0006           13.69m
         7           0.2779           0.0007           13.72m
         8           0.2758           0.0006           13.68m
         9           0.2734           0.0005           13.64m
        10           0.2746           0.0005           13.63m
        20           0.2698           0.0001           13.40m
        30           0.2708           0.0001           13.14m
        40           0.2646           0.0000           12.93m
        50           0.2657           0.0000           12.76m
        60           0.2608          -0.0000           12.58m
       

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=5, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=2000,
             presort='auto', random_state=None, subsample=0.5, verbose=1,
             warm_start=False)

In [249]:
pred_y = relevance_reg.predict(test_x)
write_submission(test, pred_y)

Unnamed: 0,"""id""","""relevance"""
0,1,2.701755
1,4,2.638092
...,...,...
166691,240759,2.610292
166692,240760,1.788169
