In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

pd.options.display.max_rows = 4
pd.options.display.max_colwidth = 100

In [111]:
prods = pd.read_csv('product_descriptions.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [98]:
attributes = pd.read_csv('attributes.csv')
for col in ['name', 'value']:
    attributes[col] = attributes[col].str.lower()

#NOTE: The word "brand" in the name usually is about the mfg brand.
#However, there are some entries about fit, compatibility and "pump brand", whatever that is.
brand_name_predicate = attributes['name'].str.contains("brand").fillna(False)
brand_names = attributes[brand_name_predicate].filter(
    items=['product_uid', 'value', 'name']).rename(columns={"value": "brand_name"})

brand_names[brand_names['brand_name'] == '.n/a'] = None
brand_names.dropna(subset=['brand_name'], inplace=True) #Need to do this before the following .str ops on a Series. Better way?
brand_names[brand_names['brand_name'].str.contains("unbranded")] = None
brand_names.dropna(subset=['brand_name'], inplace=True)

brand_filters = [brand_names['name'].str.contains(val).fillna(False)
                 for val in ["mfg", "fit", "compat", "pump"]]
mfg, fit, compat, pump = brand_filters

other = brand_names[~(mfg | fit | compat | pump)]
assert other.shape[0] == 0

mfg_brands, fit_brands, compat_brands, pump_brands = [brand_names[filt].drop('name', axis=1) for filt in brand_filters]

In [112]:
#product_descriptions.csv doesn't have the "product_title" column. Augmenting it with the values from the full dataset.
train_uids = train.filter(items=['product_uid', 'product_title'])
test_uids = test.filter(items=['product_uid', 'product_title'])
data_uids = train_uids.merge(test_uids, how = "outer", on = 'product_uid')

data_uids['product_title'] = data_uids['product_title_x']
nas = data_uids['product_title'].isnull()
#The below statement throws a SettingWithCopyWarning because of the [a][b] indexing. Its safe in this case.
data_uids['product_title'][nas] = data_uids['product_title_y'][nas]
del data_uids['product_title_x'], data_uids['product_title_y']
data_uids.drop_duplicates(subset=['product_uid', 'product_title'], inplace=True)

prods = prods.merge(data_uids, how='left', on="product_uid")
prods['full_description'] = prods['product_title'] + ". " + prods['product_description']
prods = prods.merge(mfg_brands, how="left", on="product_uid")

uniq_lens = map(len, [np.unique(df['product_uid'].values) for df in [prods, data_uids, train, test]])
assert uniq_lens[0] == uniq_lens[1]

#We have title dupes
print(len(np.unique(data_uids['product_uid'].values)))
print(len(np.unique(data_uids['product_title'].values)))

124428
120348


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
