In [565]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split

from fuzzywuzzy import fuzz

# Keep DataFrame reprs compact: show at most 4 rows and cut cell text at 100 characters.
pd.options.display.max_rows = 4
pd.options.display.max_colwidth = 100

import json
import git
# Handle to the local git checkout; it is passed to the prediction helpers below so that
# each new submission file can be committed together with the model parameters.
# NOTE(review): hard-coded absolute path -- only valid on the original author's machine.
repo = git.Repo(path="/Users/apple/projects/kaggle/homedepot/")

In [273]:
# Load the product attribute table and lower-case both text columns so that the
# substring filters below do not depend on the original casing.
attributes = pd.read_csv('attributes.csv')
for col in ['name', 'value']:
    attributes[col] = attributes[col].str.lower()

#TODO : Remove punctuation from the brand name

#NOTE: The word "brand" in the name usually is about the mfg brand.
#However, there are some entries about fit, compatibility and "pump brand", whatever that is.
# Rows whose attribute name mentions "brand"; a missing name counts as "no match".
brand_name_predicate = attributes['name'].str.contains("brand").fillna(False)
brand_names = attributes[brand_name_predicate].filter(
    items=['product_uid', 'value', 'name']).rename(columns={"value": "brand_name"})

# Blank out every column of the rows that carry a placeholder brand ('.n/a', or anything
# containing "unbranded"); the dropna that follows then removes those rows because their
# brand_name is now null. The order of these three statements matters.
brand_names[brand_names['brand_name'] == '.n/a'] = None
brand_names[brand_names['brand_name'].str.contains("unbranded", na=False)] = None
brand_names.dropna(subset=['brand_name'], inplace=True)

# One boolean mask per kind of "brand" attribute, selected by a substring of the attribute name.
brand_filters = [brand_names['name'].str.contains(val).fillna(False)
                 for val in ["mfg", "fit", "compat", "pump"]]
mfg, fit, compat, pump = brand_filters

# Sanity check: every remaining brand row is covered by at least one of the four masks.
other = brand_names[~(mfg | fit | compat | pump)]
assert other.shape[0] == 0

# Split into one frame per kind, dropping the attribute name; only mfg_brands is used by the cells below.
mfg_brands, fit_brands, compat_brands, pump_brands = [brand_names[filt].drop('name', axis=1) for filt in brand_filters]

In [369]:
def get_brand_match(data_row):
    """Return the fraction of brand-name words that occur in the search term.

    Parameters
    ----------
    data_row : mapping / pandas.Series
        Must provide 'brand_name' and 'search_term'.

    Returns
    -------
    0 when the brand name is missing or contains no words, otherwise a float
    in [0, 1]: (number of brand words found in the search term) / (number of
    brand words). Matching is a case-insensitive substring test.
    """
    brand_name = data_row['brand_name']
    if pd.isnull(brand_name):
        return 0

    # Both sides are lower-cased; previously only the brand was, so a search
    # term typed in capitals could never match.
    search_term = data_row['search_term'].lower()

    # split() without an argument collapses runs of whitespace. split(' ')
    # produced empty strings for double spaces, and '' is a substring of
    # every search term, which inflated the score.
    brand_terms = brand_name.lower().split()
    if not brand_terms:
        # Whitespace-only brand: nothing to match, and avoids dividing by zero.
        return 0

    num_matches = sum(term in search_term for term in brand_terms)
    return (1.0 * num_matches) / len(brand_terms)


def get_fuzzy_brand_match(data_row):
    """Score how well the row's brand name matches its search term.

    Returns 0 when 'brand_name' is null; otherwise returns whatever
    fuzz.token_set_ratio gives for (brand_name, search_term).
    """
    brand_is_missing = data_row.isnull()['brand_name']
    if not brand_is_missing:
        return fuzz.token_set_ratio(data_row['brand_name'], data_row['search_term'])
    return 0

In [264]:
#product_descriptions.csv doesn't have the "product_title" column. Augmenting it with the values from the full dataset.
# NOTE(review): `train`, `test` and `prods` are not created by any visible cell above; they
# must already exist in the kernel. This cell is also not idempotent: running it twice
# merges brand_name a second time.
train_uids = train.filter(items=['product_uid', 'product_title'])
test_uids = test.filter(items=['product_uid', 'product_title'])
data_uids = train_uids.merge(test_uids, how = "outer", on = 'product_uid')

# Take the train-side title and fall back to the test-side title where it is missing.
# fillna builds the whole column in one assignment; the earlier `df[col][mask] = ...`
# form was chained indexing, which pandas may apply to a temporary copy.
data_uids['product_title'] = data_uids['product_title_x'].fillna(data_uids['product_title_y'])
del data_uids['product_title_x'], data_uids['product_title_y']
data_uids.drop_duplicates(subset=['product_uid', 'product_title'], inplace=True)

prods = prods.merge(data_uids, how='left', on="product_uid")
prods['full_description'] = prods['product_title'] + ". " + prods['product_description']

#Add brand_name
prods = prods.merge(mfg_brands, how="left", on="product_uid")
train = train.merge(mfg_brands, how="left", on="product_uid")
test = test.merge(mfg_brands, how="left", on="product_uid")

#TODO: Modularize this entire section

#Add brand_match
for df in [train, test]:
    df['brand_match'] = df.apply(get_fuzzy_brand_match, axis=1)

# A list (not map) so that indexing works on Python 3 as well as Python 2.
uniq_lens = [len(np.unique(df['product_uid'].values)) for df in [prods, data_uids, train, test]]
# Every product in prods must have received a title row, and vice versa.
assert uniq_lens[0] == uniq_lens[1]

#We have title dupes
print(len(np.unique(data_uids['product_uid'].values)))
print(len(np.unique(data_uids['product_title'].values)))

124428
120348


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
#This extracts the term counts per product description + title
# Configured to drop English stop words, strip accents, and replace undecodable bytes.
tf_vectorizer = CountVectorizer(stop_words='english', strip_accents='unicode', decode_error='replace')
full_desc = prods['full_description']
# tf is the term-count matrix produced from full_desc; the LDA cell below is fitted on it.
tf = tf_vectorizer.fit_transform(full_desc)

In [6]:
#Should grid-search on max_iter, n_topics, learning_*, batch_size
#TODO : Do we have any meaningful topic priors? A uniform dirichlet with concentration param < 1 seems natural here.
#TODO : The default conc. param is 1/n_topics. What does this imply?
#TODO : Separate the brand out into another variable. Don't have it participate in the lda.
# NOTE(review): no random_state is passed, so the fitted topics can differ from run to run.
lda = LatentDirichletAllocation(n_topics=10, max_iter=5, verbose=5, n_jobs=8, batch_size=2500)
# topic_dists is the model's transform of the term-count matrix it was just fitted on.
# NOTE(review): later cells pass a variable named `topics` to the regression helpers; no
# visible cell defines it -- presumably it is meant to be this array. Confirm.
topic_dists = lda.fit_transform(tf)

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.6s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]

LatentDirichletAllocation(batch_size=2500, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=8, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=5)

In [395]:
#From the sklearn topic extraction example
def print_top_words(model, feature_names, n_top_words, thold=0.01):
    """Print the heaviest words of every topic with their share of the topic's weight.

    Parameters
    ----------
    model : object exposing `components_`, an iterable of per-topic weight arrays
    feature_names : sequence mapping a column index to its word
    n_top_words : maximum number of words printed per topic
    thold : words whose share of the topic's total weight is below this
        fraction are omitted (default 0.01)
    """
    for topic_idx, topic in enumerate(model.components_):
        # The topic total does not change inside the comprehension; computing it
        # once avoids re-summing the whole vocabulary twice per candidate word.
        topic_total = sum(topic)
        # Indices of the n_top_words largest weights, heaviest first.
        top_inds = topic.argsort()[:-n_top_words - 1:-1]

        print("Topic #{}:".format(topic_idx))
        print(" ".join([feature_names[i] + "(%.2lf)" % (topic[i] / topic_total)
                        for i in top_inds
                        if topic[i] >= thold * topic_total]))

In [396]:
# Words of the fitted count vectorizer, used to label the topic components.
feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, feature_names, 10)
#TODO : The data is noisy, as someone pointed out on the forum. Might need to filter run-together tokens like
#mmnumber, 2sink, screwdriverada, acrylicwater, bracketsoptional, awningtime...

Topic #0:
easy(0.84) mmnumber(0.06) 2sink(0.05) finish(0.03)
Topic #1:
steel(0.77) door(0.17) easy(0.05)
Topic #2:
whitewall(0.99)
Topic #3:
ft(0.91) easy(0.07) steel(0.01)
Topic #4:
mpi(0.97)
Topic #5:
efv(1.00)
Topic #6:
presto(0.65) plastpro(0.34)
Topic #7:
mixet(0.99)
Topic #8:
use(0.47) light(0.28) steel(0.18) easy(0.03) water(0.01)
Topic #9:
efv(0.50) plastpro(0.50)


In [626]:
#topic_dists is the distribution you get from transforming your product descriptions using your model.
#For example, if using lda, you will countVectorize the product descriptions and then fit this to an lda model.
#topic_dists will be this model's transform of the countVectorized data.

#TODO : We are assuming that df['product_uid'] - 100001 corresponds to the row-index of topic_dists

#TODO : The normalized topic_dists fall off exponentially. So might make sense to work with their log when
#plugging them into a regression model.

def augment_with_topic_dist(df, topic_dists, prefix, inplace=False, use_row_inds=False, uid_offset=100001):
    """Attach row-normalised topic weights to a frame as '<prefix>_<k>' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment. Needs a 'product_uid' column unless use_row_inds is True.
    topic_dists : 2-D array-like, one row per document, one column per topic
    prefix : str
        Column name prefix; column k is named '<prefix>_<k>'.
    inplace : bool
        When True the columns are added to `df` itself, otherwise to a copy.
    use_row_inds : bool
        When True, row i of `df` takes row i of `topic_dists` (lengths must match).
        When False, the row is looked up as product_uid - uid_offset.
    uid_offset : int
        The product_uid that corresponds to row 0 of `topic_dists`.

    Returns
    -------
    The augmented frame (`df` itself when inplace is True).

    Raises
    ------
    IndexError
        If a product_uid maps outside the rows of `topic_dists`. Without this
        check a uid below the offset produced a negative index, which numpy
        silently wraps to the end of the array.
    """
    frame = df if inplace else df.copy()

    # Float conversion keeps the division below a true division even for integer input.
    topic_dists = np.asarray(topic_dists, dtype=float)

    #Normalize the row-sums of topic_dists to 1
    #TODO : Does this make sense though? Aren't these the dirichlet alpha's?
    topic_dists_normed = topic_dists / topic_dists.sum(axis=1, keepdims=True)

    n_rows = len(topic_dists)
    assert not use_row_inds or (len(frame) == n_rows)

    if use_row_inds:
        inds = np.arange(len(frame))
    else:
        inds = np.asarray(frame['product_uid'] - uid_offset)
        if inds.size and (inds.min() < 0 or inds.max() >= n_rows):
            raise IndexError(
                "product_uid - {} falls outside the {} rows of topic_dists".format(uid_offset, n_rows))

    for feat_num in range(topic_dists_normed.shape[1]):
        frame['{}_{}'.format(prefix, feat_num)] = topic_dists_normed[inds, feat_num]

    return frame


#TODO : How do we extract the topic distributions for the search terms if they contain words not in the product_descs?
#data should be an iterable of strings to transform
def get_topic_dists(data, model, vectorizer):
    """Vectorize `data` with `vectorizer` and return `model`'s transform of the result."""
    return model.transform(vectorizer.transform(data))


def get_reg_data(df, model, vectorizer, prod_desc_dists):
    """Build the regression inputs (and targets, when present) for a train or test frame.

    Parameters
    ----------
    df : pandas.DataFrame with 'search_term' and 'product_uid' columns; a
        'relevance' column is used as the target when it exists
    model, vectorizer : passed to get_topic_dists to transform the search terms
    prod_desc_dists : 2-D array of per-product topic weights

    Returns
    -------
    (x, y) : x holds every '*_dist_<k>' column plus 'brand_match' (if present);
        y is the flattened 'relevance' column, or None when df has no such column.
    """
    search_dists = get_topic_dists(df['search_term'], model, vectorizer)
    aug_df = augment_with_topic_dist(df, prod_desc_dists, "prod_dist")
    aug_df = augment_with_topic_dist(aug_df, search_dists, "search_dist", use_row_inds=True)

    # Element-wise product of the product and search weights, one column per topic.
    for feat_num in range(prod_desc_dists.shape[1]):
        aug_df['interaction_dist_%d' % feat_num] = aug_df['prod_dist_%d'%feat_num] * aug_df['search_dist_%d'%feat_num]

    x = aug_df.filter(regex="_dist_[0-9]|brand_match")

    # Test for the column explicitly: the earlier bare `except:` also hid
    # unrelated failures (and KeyboardInterrupt) behind "No targets".
    if 'relevance' in aug_df.columns:
        y = np.ravel(aug_df['relevance'])
    else:
        y = None
        print("No targets")

    return x, y

def write_submission(test_data, pred, fname = "predictions.csv", git_repo = None, model_params = None):
    """Write clipped predictions to a CSV file and optionally commit the result.

    Parameters
    ----------
    test_data : frame providing the 'id' column
    pred : predictions, one per row of test_data; clipped to [1.0, 3.0]
    fname : output CSV path
    git_repo : optional object exposing `.git.commit(...)`; when truthy, a
        commit of all tracked changes is made with the model details as message
    model_params : JSON-dumped into the commit message

    Returns
    -------
    The written frame. Its column labels carry literal double quotes
    ('"id"', '"relevance"').
    """
    submit_df = pd.DataFrame(columns=["id", "relevance"])
    submit_df['id'] = test_data['id']
    submit_df['relevance'] = np.clip(pred, 1.0, 3.0)

    # The header must be double-quoted while the values are not. The quotes are
    # baked into the column labels, and the CSV quote character is switched to a
    # backslash so that the writer leaves those literal quotes alone.
    submit_df.columns = ['"' + col + '"' for col in submit_df.columns]
    submit_df.to_csv(fname, index=False, quotechar='\\')

    if git_repo:
        def _jsonable(value):
            # json cannot encode numpy scalars or arbitrary objects (for example
            # an estimator held in the parameters). Unwrap numpy scalars and fall
            # back to the string form, so the commit does not fail after the
            # file has already been written.
            if isinstance(value, np.generic):
                return value.item()
            return str(value)

        message = "Latest predictions.\n\nModel details:\n{}\n".format(
            json.dumps(model_params, indent=2, default=_jsonable))
        git_repo.git.commit(a=True, m=message)

    return submit_df


def predict_with_best_tree(tree_reg, X, bestind = None):
    """Predict with only the first `bestind + 1` boosting stages.

    Parameters
    ----------
    tree_reg : fitted regressor exposing `staged_predict(X)` and, when bestind
        is None, `oob_improvement_`
    X : features to predict on
    bestind : zero-based index of the last stage to use. When None, the stage
        with the largest cumulative out-of-bag improvement is chosen (the
        earliest such stage on ties) and reported on stdout.

    Returns
    -------
    (prediction, bestind) : prediction is None when bestind + 1 <= 0.
    """
    if bestind is None:
        oob_sums = np.cumsum(tree_reg.oob_improvement_)
        # argmax finds the peak directly instead of sorting every stage; int()
        # hands callers a plain Python integer rather than a numpy scalar.
        bestind = int(np.argmax(oob_sums))
        print("Best model has {} boosted trees with cumulative oob improvement {}.".format(
            bestind + 1, oob_sums[bestind]))

    best_pred = None
    stage_gen = tree_reg.staged_predict(X)
    # staged_predict yields one prediction per stage; step to stage `bestind`.
    # The builtin next() works on Python 2 and 3, unlike generator.next().
    for _ in range(bestind + 1):
        best_pred = next(stage_gen)

    return best_pred, bestind


def score_with_best_tree(tree_reg, X, y, bestind = None):
    """Return the RMSE between `y` and the staged prediction selected by `bestind`.

    `bestind` is forwarded unchanged to predict_with_best_tree.
    """
    predictions, _ = predict_with_best_tree(tree_reg, X, bestind=bestind)
    mse = mean_squared_error(y, predictions)
    return math.sqrt(mse)


def get_model_params(model):
    """Return `model.get_params()` with the model's class name added under 'model_class'."""
    described = model.get_params()
    described.update(model_class=model.__class__.__name__)
    return described


def fit_tree(tree_reg, train_df, model, vectorizer, prod_desc_dists):
    """Fit `tree_reg` on a random split of the training data and report RMSEs.

    Parameters
    ----------
    tree_reg : boosted regressor exposing fit, staged_predict and oob_improvement_
    train_df : training frame, passed to get_reg_data
    model, vectorizer, prod_desc_dists : passed through to get_reg_data

    Returns
    -------
    (tree_reg, bestind, model_params) : the fitted regressor, the zero-based
    index of the stage chosen from the out-of-bag improvements, and the model
    parameters extended with 'train_score', 'validation_score' and 'num_trees'.
    """
    given_x, given_y = get_reg_data(train_df, model, vectorizer, prod_desc_dists)
    # NOTE(review): the split is unseeded, so the reported scores vary between runs.
    train_x, heldout_x, train_y, heldout_y = train_test_split(given_x, given_y)
    tree_reg.fit(train_x, train_y)

    bestind = predict_with_best_tree(tree_reg, train_x)[1]
    train_score = score_with_best_tree(tree_reg, train_x, train_y, bestind=bestind)
    heldout_score = score_with_best_tree(tree_reg, heldout_x, heldout_y, bestind=bestind)

    # Score the complete ensemble (last stage) as the "overfitted" reference.
    # Passing no bestind here re-derived the very same out-of-bag stage, so this
    # number was always identical to heldout_score.
    last_stage = len(tree_reg.oob_improvement_) - 1
    overfitted_holdout_score = score_with_best_tree(tree_reg, heldout_x, heldout_y, bestind=last_stage)

    print("For the best stage {}:\n\tTrain RMSE : {}\n\tHeld-out RMSE : {}\nOverfitted Hold-out RMSE : {}\n".format(
        bestind, train_score, heldout_score, overfitted_holdout_score))

    model_params = get_model_params(tree_reg)
    model_params['train_score'] = train_score
    model_params['validation_score'] = heldout_score
    model_params['num_trees'] = bestind + 1

    return tree_reg, bestind, model_params


def predict_tree(tree_reg, test_df, model, vectorizer, prod_desc_dists, bestind=None, git_repo=None, model_params=None):
    """Predict for `test_df`, write the submission file, and return (pred, bestind).

    When `model_params` is empty or None, the parameters recorded with the
    submission are taken from get_model_params(tree_reg) instead.
    """
    features, _ = get_reg_data(test_df, model, vectorizer, prod_desc_dists)
    pred, bestind = predict_with_best_tree(tree_reg, features, bestind=bestind)

    params_to_record = model_params if model_params else get_model_params(tree_reg)
    write_submission(test_df, pred, git_repo=git_repo, model_params=params_to_record)

    return pred, bestind


def fit_tree_and_predict(tree_reg, train_df, test_df, model, vectorizer, prod_desc_dists, git_repo=None):
    """Fit on `train_df`, then predict and write a submission for `test_df`.

    Returns (tree_reg, prediction, bestind). Note that `prediction` is the full
    return value of predict_tree, i.e. itself a (pred, bestind) pair.
    """
    fitted_reg, best_stage, fit_params = fit_tree(
        tree_reg, train_df, model, vectorizer, prod_desc_dists)

    prediction = predict_tree(
        fitted_reg, test_df, model, vectorizer, prod_desc_dists,
        bestind=best_stage, git_repo=git_repo, model_params=fit_params)

    return fitted_reg, prediction, best_stage

In [616]:
# subsample < 1.0 is required here: the best stage is picked from the regressor's
# out-of-bag improvements, which only exist when each stage is fitted on a subsample.
# NOTE(review): the recorded output below reports min_samples_leaf=5 while this cell
# passes 10, so the saved output appears to come from an earlier version of the cell.
relevance_reg = ensemble.GradientBoostingRegressor(n_estimators=3500, subsample=0.8, max_depth=15, verbose=1,
                                                   learning_rate=0.001, min_samples_split=50, min_samples_leaf=10,
                                                   max_features='sqrt')
# NOTE(review): `topics` is not defined by any visible cell; presumably it is the
# `topic_dists` array returned by lda.fit_transform -- confirm before re-running.
fit_tree_and_predict(relevance_reg, train, test, lda, tf_vectorizer, topics, git_repo=repo)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.2845           0.0000           46.54m
         2           0.2849           0.0000           43.03m
         3           0.2828           0.0000           41.23m
         4           0.2812           0.0000           39.71m
         5           0.2851           0.0000           39.75m
         6           0.2837           0.0000           40.87m
         7           0.2814           0.0001           40.85m
         8           0.2834           0.0000           40.63m
         9           0.2833           0.0001           40.96m
        10           0.2839           0.0000           40.59m
        20           0.2836           0.0000           38.41m
        30           0.2800           0.0000           37.02m
        40           0.2830           0.0000           36.65m
        50           0.2800           0.0000           36.17m
        60           0.2789           0.0000           35.79m
       

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.0s finished


No targets


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    8.8s finished


(GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
              loss='ls', max_depth=15, max_features='sqrt',
              max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=50,
              min_weight_fraction_leaf=0.0, n_estimators=3500,
              presort='auto', random_state=None, subsample=0.8, verbose=1,
              warm_start=False),
 (array([ 2.36518443,  2.38650485,  2.36799677, ...,  2.35683093,
          2.51680939,  2.28076886]), 2510),
 2510)

In [627]:
# get_reg_data returns an (x, y) tuple, so despite its name train_x holds both the
# feature frame and the target array.
train_x = get_reg_data(train, lda, tf_vectorizer, topics)

[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    3.6s finished


In [628]:
# Display the whole (x, y) tuple.
train_x

(       brand_match  prod_dist_0  prod_dist_1  prod_dist_2  prod_dist_3  \
 0               26     0.001177     0.035222     0.189607     0.121561   
 1                7     0.001177     0.035222     0.189607     0.121561   
 ...            ...          ...          ...          ...          ...   
 74065           17     0.000982     0.131062     0.000981     0.135040   
 74066           31     0.001316     0.001317     0.001317     0.165733   
 
        prod_dist_4  prod_dist_5  prod_dist_6  prod_dist_7  prod_dist_8  \
 0         0.001177     0.001177     0.001177     0.001177     0.646548   
 1         0.001177     0.001177     0.001177     0.001177     0.646548   
 ...            ...          ...          ...          ...          ...   
 74065     0.129481     0.000981     0.018695     0.060091     0.521705   
 74066     0.001317     0.001317     0.080332     0.001317     0.672957   
 
               ...          interaction_dist_0  interaction_dist_1  \
 0             ...        

In [629]:
# Display only the feature frame (first element of the tuple).
train_x[0]

Unnamed: 0,brand_match,prod_dist_0,prod_dist_1,prod_dist_2,prod_dist_3,prod_dist_4,prod_dist_5,prod_dist_6,prod_dist_7,prod_dist_8,...,interaction_dist_0,interaction_dist_1,interaction_dist_2,interaction_dist_3,interaction_dist_4,interaction_dist_5,interaction_dist_6,interaction_dist_7,interaction_dist_8,interaction_dist_9
0,26,0.001177,0.035222,0.189607,0.121561,0.001177,0.001177,0.001177,0.001177,0.646548,...,0.000039,0.001174,0.069209,0.004052,0.000039,0.000039,0.000039,0.000039,0.238129,0.000039
1,7,0.001177,0.035222,0.189607,0.121561,0.001177,0.001177,0.001177,0.001177,0.646548,...,0.000059,0.001761,0.104281,0.006078,0.000059,0.000059,0.000059,0.000059,0.032328,0.000059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74065,17,0.000982,0.131062,0.000981,0.135040,0.129481,0.000981,0.018695,0.060091,0.521705,...,0.000502,0.003277,0.000025,0.038908,0.003238,0.000025,0.000468,0.001502,0.013044,0.000025
74066,31,0.001316,0.001317,0.001317,0.165733,0.001317,0.001317,0.080332,0.001317,0.672957,...,0.000022,0.000022,0.000022,0.002764,0.000022,0.000022,0.023339,0.000022,0.293571,0.011435
