## Requirements

In [1]:
#!pip install nltk



In [5]:
#!pip install -U spacy



In [6]:
#!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


### Call the fashion classifier model (We pushed it on HuggingFace)

In [1]:
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt


# Hugging Face Hub id of the fine-tuned fashion text-classification checkpoint.
model_id = "rasta/distilbert-base-uncased-finetuned-fashion"
# Text-classification pipeline built from that checkpoint; classify() below calls it.
# NOTE(review): presumably downloads the checkpoint on first use — confirm network access.
classifier = pipeline("text-classification", model=model_id)

def classify(text):
    """Label `text` as "Fashion" or "Not Fashion" using the fashion classifier.

    The pipeline is asked for the scores of every label. The text is reported
    as "Fashion" only when the first score is strictly greater than the second;
    a tie counts as "Not Fashion".
    """
    label_scores = classifier(text, return_all_scores=True)[0]
    first_score = label_scores[0]['score']
    second_score = label_scores[1]['score']
    return "Not Fashion" if first_score <= second_score else "Fashion"

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

## Attribute extraction

In [4]:
def _drop_contained(attributes):
    """Return `attributes` without the entries that are a proper substring of another entry."""
    return [a for a in attributes
            if not any(a != b and a in b for b in attributes)]


def _strip_verb_words(attribute):
    """Remove the whole words 'fit' and 'match' from `attribute`, except in first position."""
    words = attribute.split()
    if not words:
        return attribute
    kept = [words[0]] + [w for w in words[1:] if w not in ('fit', 'match')]
    return ' '.join(kept)


def attribute_extraction(txt):
    """Extract the fashion attributes (head word plus its descriptions) from `txt`.

    Method: every sentence is POS-tagged with NLTK. Each token carrying a head
    tag starts a candidate, which is extended leftwards for as long as the
    preceding tokens carry a descriptor tag. A candidate is kept only when
    `classify` labels it 'Fashion'.

    Parameters
    ----------
    txt : str
        The user query; it may contain several sentences or be empty.

    Returns
    -------
    list of str
        The retained attributes in order of appearance, e.g.
        ['red top', 'green boots']. Empty when nothing qualifies.
    """
    # Nouns start an attribute. 'RB' is accepted as well, as in the original
    # rule set. NOTE(review): confirm that adverbs are meant to be heads.
    head_tags = ('NN', 'NNS', 'RB')
    # Tags treated as descriptions of the head when they directly precede it.
    descriptor_tags = ('JJ', 'VBN', 'NN', 'RB', 'VBD', 'EX')

    # Loop-invariant, and safe on empty/blank text.
    first_words = txt.split()
    starts_with_will = bool(first_words) and first_words[0].lower() == 'will'

    attributes = []
    # Every sentence is processed (previously only the last one was tagged).
    for sentence in sent_tokenize(txt):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

        for i, (word, tag) in enumerate(tagged):
            if tag not in head_tags:
                continue

            ind = i
            attr = word
            # `ind > 0` stops at the start of the sentence; without it the
            # index -1 wraps around to the last token.
            while ind > 0 and tagged[ind - 1][1] in descriptor_tags:
                attr = tagged[ind - 1][0] + ' ' + attr
                ind -= 1

            # To avoid the classification of "will" as a noun: in a query that
            # starts with "will", a lone head word is classified together with
            # the token that precedes it.
            if ind > 0 and len(attr.split()) == 1 and starts_with_will:
                attr = tagged[ind - 1][0] + ' ' + attr

            if classify(attr) == 'Fashion':
                attributes.append(attr)

            # Keep only the longest form of each attribute. A new list is built
            # rather than removing items from the list being iterated.
            attributes = _drop_contained(attributes)

            # Avoid the classification of "fit" and "match" as nouns: drop them
            # as whole words so that e.g. "fitted" is left untouched.
            attributes = [_strip_verb_words(a) for a in attributes]

    return attributes

In [9]:
attribute_extraction('Should I wear red top with green boots?')

['red top', 'green boots']

### Simple module of the extraction pipeline: is_question()

In [5]:
# First 10,000 posts of NLTK's NPS Chat corpus; their text and `class` attribute
# are used below to train the dialogue-act classifier.
# NOTE(review): presumably requires the `nps_chat` corpus to be downloaded beforehand — confirm.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    """Build the bag-of-words feature dict of `post` for the dialogue-act classifier.

    Each token produced by `nltk.word_tokenize` yields one key of the form
    'contains(<lower-cased token>)' mapped to True.
    """
    return {
        'contains({})'.format(token.lower()): True
        for token in nltk.word_tokenize(post)
    }

# One (feature dict, dialogue-act label) pair per post; the label is the post's `class` attribute.
featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]

# Size of the held-out split: 10% of the total data
size = int(len(featuresets) * 0.1)

# The first 10% form test_set (to check the accuracy); the remaining 90% form train_set
train_set, test_set = featuresets[size:], featuresets[:size]

# Train the Naive Bayes dialogue-act classifier on the training set
classifiers = nltk.NaiveBayesClassifier.train(train_set)
# To check the accuracy (0.67 according to the author). Note that the NLTK model is
# `classifiers`; `classifier` is the Hugging Face fashion pipeline.
# print(nltk.classify.accuracy(classifiers, test_set))

# Dialogue-act labels that count as a question.
question_types = ["whQuestion", "ynQuestion"]


def is_ques_using_nltk(ques):
    """Return True when the trained dialogue-act classifier labels `ques` as a question."""
    features = dialogue_act_features(ques)
    predicted_act = classifiers.classify(features)
    return predicted_act in question_types


# Lower-case phrases whose presence anywhere in a sentence marks it as a question.
question_pattern = [
    "do i", "do you",
    "what", "who",
    "is it",
    "why",
    "would you",
    "how",
    "is there", "are there",
    "is it so", "is this true",
    "to know",
    "is that true",
    "are we", "am i",
    "question is",
    "tell me more",
    "can i", "can we",
    "tell me",
    "can you explain",
    "question", "answer", "questions", "answers",
    "ask",
]

# A sentence whose first token is one of these verbs is treated as a question.
helping_verbs = [
    "is",
    "am",
    "can",
    "are",
    "do",
    "does",
]
# check with custom pipeline if still this is a question mark it as a question

def is_question(question):
    """Decide whether `question` is a question.

    The text is lower-cased and stripped, then handed to the NLTK dialogue-act
    classifier. When that classifier does not see a question, two fallbacks
    apply: a phrase of `question_pattern` occurs in the text, or one of the
    '.'-separated sentences ends with '?' or starts with a helping verb.
    """
    question = question.lower().strip()

    # The trained classifier has the first say.
    if is_ques_using_nltk(question):
        return True

    # Fallback 1: any known question phrase appears in the text.
    is_ques = any(pattern in question for pattern in question_pattern)

    # Fallback 2: there may be several sentences, so inspect each of them.
    for sentence in question.split("."):
        if not sentence.strip():
            continue
        # word_tokenize ignores the surrounding whitespace by default
        first_word = nltk.word_tokenize(sentence)[0]
        if sentence.endswith("?") or first_word in helping_verbs:
            return True

    return is_ques

## Semantic check

Semantic check is composed of two modules: a hard-coded method and an automated model. In order to optimize the response time, we would like to avoid using the model for each query. With the hard-coded method we can verify 90% of usual queries (with match, suit, ...). Also, the semantic check has a 100% accuracy on detecting semantically true sentences. So here is the proposed pipeline: first we check with the hard-coded method; if it returns true, we return immediately. If not, we go through the automated model in order to clear all doubts.

In [6]:
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer


# Hugging Face Hub id of the checkpoint used as the automated semantic check.
model_semantick_id = "PriaPillai/distilbert-base-uncased-finetuned-query"
# Text-classification pipeline built from that checkpoint; semantic_check() calls it.
classifier_sem = pipeline("text-classification", model=model_semantick_id)


# Stemmer shared by the pattern list below and by semantic_check_hard_coded().
ps = PorterStemmer()
# Stems of the verbs accepted by the hard-coded check; stored as stems because
# the verbs of a query are stemmed before being compared with this list.
verb_pattern = [ps.stem(i) for i in ['match', 'suit', 'fit', 'wear', 'pair']]
# 'be', 'go', 'are' are not in the list (presumably considered and left out — confirm)
# 'be', 'go', 'are'

def semantic_check_hard_coded(txt):
    """Return True when `txt` contains a verb whose stem is in `verb_pattern`.

    Every sentence of `txt` is POS-tagged with NLTK; each token tagged as a
    verb is stemmed with `ps` and compared with the stems of `verb_pattern`.

    Parameters
    ----------
    txt : str
        The user query; it may contain several sentences or be empty.

    Returns
    -------
    bool
        True as soon as one matching verb is found in any sentence, False
        otherwise (including for empty text).
    """
    verb_tags = ('VB', 'VBD', 'VBN', 'VBG', 'VBP', 'VBZ')

    # Every sentence is inspected (previously only the last one was tagged, and
    # empty text left `tagged` undefined).
    for sentence in sent_tokenize(txt):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for word, tag in tagged:
            if tag in verb_tags and ps.stem(word) in verb_pattern:
                return True
    return False

def semantic_check(text):
    """Tell whether `text` passes the semantic check.

    The cheap hard-coded verb check runs first. Only when it fails is the
    classification model queried; the text is then accepted when the score at
    index 1 is at least as high as the score at index 0.
    """
    if semantic_check_hard_coded(text):
        return True

    label_scores = classifier_sem(text, return_all_scores=True)[0]
    return bool(label_scores[0]['score'] <= label_scores[1]['score'])

In [58]:
semantic_check("Can I eat blue jeans ?")

False

In [59]:
semantic_check("What matches my white shirt ?")

True

In [60]:
semantic_check("What can match my white shirt ?")

True

In [62]:
semantic_check('What can i wear with green pants?')

True

In [63]:
semantic_check("Will blue jeans and a black shirt match ?")

True

In [16]:
semantic_check('Do you want a blue necklace ?')

False

In [17]:
semantic_check("What can suit my blue jeans and dark shirt ?")

True

In [18]:
semantic_check("Can I actually have a blue pants paired with a white shirt ?")

True

In [22]:
semantic_check("Can blue pants be a good fit with white shirt ?")

True

In [20]:
semantic_check("Hey Aleksey, do you think it's a good work ?")

False

In [21]:
semantic_check("Maybe I should ask the model again... What can I wear with my skinny blue jeans ?")

True

In [23]:
semantic_check("I want to eat a blue pant")

False

## Whole Pipeline function 

In [8]:
def extraction_pipeline(query):
    """Run the whole pipeline on `query` and return its fashion attributes.

    The query must first be recognised as a question, then pass the semantic
    check. When either gate fails, a message is printed and an empty list is
    returned; otherwise the result of `attribute_extraction` is returned.
    """
    # Gate 1: the input has to be a question.
    if not is_question(query):
        print("I am not understanding you, please enter a question that is related to fashion")
        return []

    # Gate 2: the question has to make sense semantically.
    if not semantic_check(query):
        print("I am not sure to get your query can you please try again ?")
        return []

    return attribute_extraction(query)

In [10]:
extraction_pipeline("Will a blue jean and a black shirt fit ?")

['blue jean', 'black shirt']