### resources
- [Deep Learning for NLP](http://nlp.stanford.edu/courses/NAACL2013/)
- [forum - similar competitions @ Kaggle](https://www.kaggle.com/c/crowdflower-search-relevance/forums/t/14169/any-similar-kaggle-competition)
- [Beat the bentchmark](https://www.kaggle.com/users/5309/abhishek/crowdflower-search-relevance/beating-the-benchmark)
- [BTB with pipeline](https://www.kaggle.com/users/993/ben-hamner/crowdflower-search-relevance/python-benchmark)

### Ideas
- map all entities in test set. That gives parameter - is query entity in list of all result entities. Alchemy api seems not to perform well. should use some manual 
- proportion of query words (2-grams) existing (not existing) in product title/desc-s
- proportion of query words not anywhere in test set
- word2vec library
- some words correlate with worse/better matches (eg. metal). Build boolean category for 'word_specific_class'

In [9]:
import numpy as np
import pandas as pd
import nltk

from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

In [2]:
df = pd.read_csv('./data/train.csv')
print df.columns

Index([u'id', u'query', u'product_title', u'product_description', u'median_relevance', u'relevance_variance'], dtype='object')


In [20]:
df.groupby(df.median_relevance).relevance_variance.aggregate(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
median_relevance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,774,0.382893,0.437872
2,1476,0.618362,0.335397
3,1737,0.619174,0.299838
4,6171,0.251786,0.357309


In [6]:
df['query'][:10]

0    bridal shower decorations
1         led christmas lights
2                    projector
3                    wine rack
4                   light bulb
5       oakley polarized radar
6              boyfriend jeans
7     screen protector samsung
8            pots and pans set
9                 waffle maker
Name: query, dtype: object

### Wordcount by category

In [4]:
def custom_wordcount(terms, c=None, as_dict=False):
    if c is None:
        c = Counter()
    for term in terms:
        c.update(term.lower().split())
    if as_dict:
        return c
    else:
        return sorted(c.items(), key=lambda x:x[1], reverse=True)

In [5]:
gr = df.groupby(df.median_relevance)
print gr['query'].aggregate(lambda x: custom_wordcount(x)[:6])

median_relevance
1                   [(dress, 42), (shoes, 31), (lathe, 28), (metal...
2                   [(coffee, 79), (shoes, 60), (for, 55), (dress,...
3                   [(dress, 71), (coffee, 56), (shoes, 46), (make...
4                   [(case, 205), (maker, 194), (dress, 173), (cof...
Name: query, dtype: object


### Unique words by category

In [78]:
#load words by category
words = dict()
for cat in df.median_relevance.unique():
    words[cat] = set(custom_wordcount( df[df.median_relevance == cat]['query'], as_dict=True ).keys())
#find unique words for each category
words[2].union(set(['kala']))
uniques = dict()
for cat in df.median_relevance.unique():
    uniques[cat] = words[cat] - set.union(*[wordset for tmpcat, wordset in words.iteritems() if tmpcat != cat])
    
print '\napparently no category has unique words:'
print uniques
print '\nwords only in categories 1 & 2:'
print words[1].union(words[2]) - words[3].union(words[4])
print '\nwordsonly in categories 3 & 4:'
print words[3].union(words[4]) - words[1].union(words[2])


apparently no category has unique words:
{1: set([]), 2: set([]), 3: set([]), 4: set([])}

words only in categories 1 & 2:
set(['pink', 'gown', 'lace', 'victoria', 'secret', 'victorias'])

wordsonly in categories 3 & 4:
set(['steel', 'heart', 'polar', 'short', 'speck', 'tote', 'cowboy', 'shox', 'deep', 'sports', 'volcom', 'rate', 'chandelier', 'ban', 'earbuds', 'fryers', 'bra'])


###Words that are more frequently in certain class (FEATURE)

In [74]:
'''
Compare variance by wordcounts:
Create dataframe of words and counts under each category.
describe by mean & variance
'''
aux = defaultdict(lambda : [0,0,0,0])
for i, row in df.iterrows():
    cat = row['median_relevance']
    ws = '{}'.format(
                row['query']#, row['product_title'], row['product_description']
            ).lower().replace('][@"', '')
    wc = Counter(
                filter(lambda x: len(x) > 2 and x.isalpha(), 
                       ws.split())
            )
    
    for word, count in wc.iteritems():
        aux[word][cat - 1] += count

tmp_df = pd.DataFrame(aux).transpose()

In [75]:
min_count = min( df.median_relevance.value_counts() )
normalizers = {}
for col, count in df.median_relevance.value_counts().iteritems():
    normalizers[col - 1] = count / float( min_count)
    
for col in tmp_df:
    tmp_df[col] /= normalizers[col]

In [77]:
# @todo - scale each row

tmp_df['std'] = tmp_df.apply(np.std, axis=1)
tmp_df[tmp_df['std'] > 2.0].sort(columns='std', ascending=False)

Unnamed: 0,0,1,2,3,std
dress,42,24.646341,31.637306,21.698590,11.294785
coffee,14,41.426829,24.953368,21.573165,10.890715
metal,28,2.621951,1.782383,0.000000,10.401289
lathe,28,2.621951,1.782383,0.000000,10.401289
shoes,31,31.463415,20.497409,17.183277,9.378135
lanyard,25,5.768293,0.445596,0.250851,9.112104
who,25,5.768293,0.445596,0.250851,9.112104
blue,25,14.158537,5.347150,0.501702,8.376861
jeans,27,12.060976,3.119171,5.644142,8.366640
case,2,11.536585,18.269430,25.712202,8.129257


In [7]:
aux = pd.DataFrame({'a':range(1,10), 'b':range(11,20), 'c':np.repeat('a', 9)})
#len(df['victoria' in df['query']]) 
tmp_index = aux.c.map(lambda x: 'b' in x)

df[df['query'].map(lambda x: 'shoe' in x.lower())][['query', 'product_title', 'median_relevance']][:5]

Unnamed: 0,query,product_title,median_relevance
21,aqua shoes,Nurse Mates Bryar (Women's) Aqua,4
35,dc shoes black,Women's DC Shoes Trase TX Black/White,4
55,dc shoes black,Boys' DC Shoes Pure Black/Grey/Yellow,3
62,skechers womens shoes,Danskin Now Girl's Bungee Tie Cross-trainer Shoe,2
125,ecco shoes,Ecco Enrico Sneakers Men's Shoes,4
