In [1]:
import numpy as np
import re
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import OrderedDict
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from pywsd.utils import lemmatize_sentence
stop = set(stopwords.words('english'))


def tokenize_sentences(sentences):
    words = []
    for sentence in sentences:
        w = extract_words(sentence)
        words.extend(w)
        
    words = sorted(list(set(words)))
    return words

def extract_words(sentence):
    ignore_words = ['a']
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    return words_cleaned    
    
def bagofwords(sentence, words):
    sentence_words = extract_words(sentence)
    # frequency word count
    bag = np.zeros(len(words))
    for sw in sentence_words:
        for i,word in enumerate(words):
            if word == sw: 
                bag[i] += 1
                
    return np.array(bag)


def apply_lemmatization(query):
    query =lemmatize_sentence(query)
    
    return TreebankWordDetokenizer().detokenize(query)

def filter_stopwords(query,stop_words=stop):
#     global stop
    query = [word for word in word_tokenize(query) if word not in stop_words]
    query =TreebankWordDetokenizer().detokenize(query)
    
    return query


def get_values_for_the_keys(words,queries):
    hm = defaultdict(list)
    for word in words:
        for query in queries:
            if word in query:
                hm[word].append(query)
    return hm
    

def pd_fill_diagonal(df_matrix, value=0): 
    mat = df_matrix.values
    n = mat.shape[0]
    mat[range(n), range(n)] = value
    return pd.DataFrame(mat)

Warming up PyWSD (takes ~10 secs)... took 4.873117685317993 secs.


In [2]:
queries = ['eye makeup','how to do eyemakeup','eye makeup tutorial','eyeshadow tutorial',
           'how to apply eye makeup','how to put on eyeshadow','how to apply eyeshadow','how to do a smokey eye',
           'smokey eye makeup','smokey eye','smokey eyes','smoky eye',
           'smokey eye tutorial','how to do smokey eyes',
           'contouring','contouring kit','contouring makeup kit','contour kit',
           'contouring makeup','contour makeup','makeup contouring','how to contour',
           'face contouring','how to contour face','highlight and contour',
           'best eyebrow pencil','brow','eyebrow pencil','eyebrow',
           'eyebrow makeup','eyebrows','how to do eyebrows','how to do your eyebrows',
           'how to shape eyebrows','how to pluck eyebrow','perfect eyebrows','eyebrow tutorial'
          ]


In [3]:
queries = [apply_lemmatization(query) for query in queries]
queries = [filter_stopwords(query) for query in queries]
# queries

In [4]:
# extract unique words
words = tokenize_sentences(queries)

In [5]:
# build word and set of queries where word found as hashmap
hash_map = get_values_for_the_keys(words,queries)

In [6]:

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(queries)
print(vectorizer.get_feature_names())

# bag of words
query_word_mat=pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
query_word_mat.index = ['Q{}'.format(i) for i in range(query_word_mat.shape[0])]
probabilities = {col:sum(query_word_mat[col])/len(queries) for col in query_word_mat.columns}

df_asint = query_word_mat.astype(int)
coocc = df_asint.T.dot(df_asint)

['apply', 'best', 'brow', 'contour', 'eye', 'eyebrow', 'eyemakeup', 'eyeshadow', 'face', 'highlight', 'kit', 'makeup', 'pencil', 'perfect', 'pluck', 'put', 'shape', 'smokey', 'smoky', 'tutorial']


In [7]:
aa = pd.DataFrame(columns=probabilities.keys(),index=probabilities.keys())
for key in probabilities:
    aa[key]=probabilities[key]

bb = pd.DataFrame(columns=probabilities.keys(),index=probabilities.keys())
for key in probabilities:
    bb.loc[key]=probabilities[key]

cc=pd.DataFrame(aa.values*bb.values, columns=aa.columns, index=aa.index)

# common probability
dd = coocc/len(queries)

ee=pd.DataFrame(dd.values/cc.values, columns=dd.columns, index=dd.index)



In [8]:
rp=coocc
rp[rp!=0]=1
lift_scores=ee*rp

lift_scores2=lift_scores[(lift_scores >= 3).any(1)]


query_topic_mat = query_word_mat.T
query_topic_mat = query_topic_mat.loc[lift_scores2.index.tolist(),:]
query_topic_mat.shape

(20, 37)

In [9]:

continue_cooccurence = True
iteration = 1
while continue_cooccurence:
    print('Gathering new topics at the iteration {}'.format(iteration))
    iteration += 1
    topics = query_topic_mat
    # having the backup topic-query matrix
    topics=topics.dot(query_word_mat)/len(queries)


    # probabilities
    for col in topics.columns:
        topics[col]=topics[col]/probabilities[col]

    topics_log_prob = np.log2(topics)


    # shanon_entropies
    shanon = pd.DataFrame(topics.values*topics_log_prob.values,columns=topics.columns,index=topics.index)
    shanon_entropies = shanon.sum()

    cooccurence_score = topics_log_prob
    for col in topics_log_prob.columns:
        cooccurence_score[col] = topics_log_prob[col].fillna(0)-shanon_entropies[col]

    for ind in cooccurence_score.index:
        cooccurence_score.loc[ind,ind]=-1

    cooccurence_score=cooccurence_score[cooccurence_score>=1.5]

    cooccurence_score=cooccurence_score.dropna(axis=1,how='all').dropna(how='all')


    if cooccurence_score.empty:
        print('setting continue_cooccurence')
        continue_cooccurence = False

    updated_topics = defaultdict(set)
    query_word_mat_trans = query_word_mat.T
    words_index = cooccurence_score.columns.tolist()
    for index,row in cooccurence_score.iterrows():
        for col in words_index:
            if row[col]:
                print(index,col)
                updated_topics[index].add(col)
                query_topic_mat.loc[index] += query_word_mat_trans.loc[col]
    query_topic_mat[query_topic_mat>0]=1

    for key,value in updated_topics.items():
        updated_index=key+','+','.join(value)
        print('updated topcis is {}'.format(updated_index))
        query_topic_mat.rename(index={key:updated_index},inplace=True)
        probabilities[updated_index]=sum(query_topic_mat.loc[updated_index])/len(queries)

Gathering new topics at the iteration 1
contour makeup
contour tutorial
eye makeup
eye tutorial
updated topcis is contour,tutorial,makeup
updated topcis is eye,tutorial,makeup
Gathering new topics at the iteration 2
contour,tutorial,makeup contour
contour,tutorial,makeup eye
contour,tutorial,makeup makeup
contour,tutorial,makeup tutorial
eye,tutorial,makeup contour
eye,tutorial,makeup eye
eye,tutorial,makeup makeup
eye,tutorial,makeup tutorial
smokey contour
smokey eye
smokey makeup
smokey tutorial
updated topcis is contour,tutorial,makeup,eye,tutorial,contour,makeup
updated topcis is eye,tutorial,makeup,eye,tutorial,contour,makeup
updated topcis is smokey,eye,tutorial,contour,makeup
Gathering new topics at the iteration 3
apply apply
apply contour
apply eye
apply eyeshadow
apply tutorial
contour,tutorial,makeup,eye,tutorial,contour,makeup apply
contour,tutorial,makeup,eye,tutorial,contour,makeup contour
contour,tutorial,makeup,eye,tutorial,contour,makeup eye
contour,tutorial,makeup,ey

  app.launch_new_instance()


 eyeshadow
contour,tutorial,makeup,eye,tutorial,contour,makeup tutorial
eye,tutorial,makeup,eye,tutorial,contour,makeup apply
eye,tutorial,makeup,eye,tutorial,contour,makeup contour
eye,tutorial,makeup,eye,tutorial,contour,makeup eye
eye,tutorial,makeup,eye,tutorial,contour,makeup eyeshadow
eye,tutorial,makeup,eye,tutorial,contour,makeup tutorial
eyeshadow apply
eyeshadow contour
eyeshadow eye
eyeshadow eyeshadow
eyeshadow tutorial
makeup apply
makeup contour
makeup eye
makeup eyeshadow
makeup tutorial
put apply
put contour
put eye
put eyeshadow
put tutorial
smokey,eye,tutorial,contour,makeup apply
smokey,eye,tutorial,contour,makeup contour
smokey,eye,tutorial,contour,makeup eye
smokey,eye,tutorial,contour,makeup eyeshadow
smokey,eye,tutorial,contour,makeup tutorial
tutorial apply
tutorial contour
tutorial eye
tutorial eyeshadow
tutorial tutorial
updated topcis is apply,apply,eye,eyeshadow,tutorial,contour
updated topcis is contour,tutorial,makeup,eye,tutorial,contour,makeup,apply,eye,

In [10]:
# query_topic_mat

In [11]:
# lift_scores

In [12]:
query_topic_mat

Unnamed: 0,Q0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,Q27,Q28,Q29,Q30,Q31,Q32,Q33,Q34,Q35,Q36
"apply,apply,eye,eyeshadow,tutorial,contour,eyebrow,contour",1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
best,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brow,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"contour,tutorial,makeup,eye,tutorial,contour,makeup,apply,eye,eyeshadow,tutorial,contour,eyebrow,eyebrow,makeup,eyebrow,contour",1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"eye,tutorial,makeup,eye,tutorial,contour,makeup,apply,eye,eyeshadow,tutorial,contour,eyebrow,eyebrow,makeup,eyebrow,contour",1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
eyebrow,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
eyemakeup,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"eyeshadow,apply,eye,eyeshadow,tutorial,contour,eyebrow,contour",1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
face,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
highlight,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
