In [4]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

In [5]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy

# Punctuation characters that the tokenizer filters out of its output
punctuations = string.punctuation

# Load a spaCy English model by its 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' are a spaCy 2.x feature; none of the
# visible cells use `nlp` afterwards -- confirm it is still needed.
nlp = spacy.load('en')

# spaCy's built-in English stop-word collection (imported at the top of this cell)
stop_words = STOP_WORDS

# Bare English language object (no model name is passed here); spacy_tokenizer
# calls it to split raw text into tokens
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Turn ``sentence`` into a list of normalized tokens for a vectorizer.

    Each token produced by ``parser`` is replaced by its lower-cased,
    whitespace-stripped lemma. The lower-cased surface form is used instead
    when the lemma is the ``"-PRON-"`` placeholder or when the lemma is empty
    (as can happen if the pipeline carries no lemmatizer component), so that
    tokens are not silently lost. Stop words, punctuation and empty tokens are
    then removed.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving normalized tokens, in their original order.
    """
    kept = []
    for token in parser(sentence):
        lemma = token.lemma_
        if lemma and lemma != "-PRON-":
            text = lemma.lower().strip()
        else:
            # No usable lemma: fall back to the lower-cased token text.
            text = token.lower_.strip()

        # Whitespace-only tokens collapse to "" after stripping; drop them explicitly.
        if not text:
            continue
        if text in stop_words:
            continue
        # `punctuations` is a str, so this is a substring test: any token that is
        # a contiguous slice of string.punctuation is discarded (kept as-is for
        # backward compatibility).
        if text in punctuations:
            continue
        kept.append(text)

    return kept

In [6]:
# Custom text-cleaning transformer (first step of the pipeline)
class predictors(TransformerMixin, BaseEstimator):
    """Stateless transformer that applies ``clean_text`` to every document.

    Inheriting from ``BaseEstimator`` supplies ``get_params`` / ``set_params``
    (and the rest of the estimator protocol) instead of a hand-written
    ``get_params`` stub. The class defines no constructor arguments, so
    ``get_params()`` still returns an empty dict.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for each ``text`` in ``X``."""
        return [clean_text(text) for text in X]

# Normalize one raw text string
def clean_text(text):
    """Return ``text`` with surrounding whitespace trimmed, then lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [7]:
# Bag-of-words vectorizer over unigrams, tokenized by spacy_tokenizer
bow_vector = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
)

In [8]:
# TF-IDF vectorizer that uses the same custom tokenizer
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
)

In [9]:
# Toy training utterances; position i here pairs with position i of y_train
X_train = [
    "i want to buy a jeans pent",
    "i want to purchase a pair of shoes",
    "are you selling laptops",
    "i need an apple jam",
    "can you please tell me the price of this product",
    "please give me some discount.",
    "i cannot afford such price",
    "could you negotiate",
    "i agree on your offer",
    "yes i accepcted your offer",
    "offer accepted",
    "agreed",
]

In [10]:
# Intent labels, one per training utterance: 5 purchase, 3 negotiation, 4 acceptance
y_train = (
    ["Buy_a_product"] * 5
    + ["negotition"] * 3
    + ["success"] * 4
)

In [19]:
# Multinomial logistic-regression classifier for the intent labels
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
)

# Pipeline: text cleaning -> bag-of-words counts -> classifier
pipe = Pipeline(steps=[
    ("cleaner", predictors()),
    ("vectorizer", bow_vector),
    ("classifier", classifier),
])

# Train on the toy data; fit() is the last expression, so the cell displays its result
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x0000017B2DA00B70>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [20]:
# Classify an utterance that combines a discount request with a price complaint
predicted = pipe.predict(
    ["Please give me some discount i cannot afford such price"]
)

In [21]:
# Print whatever label array is currently stored in `predicted`
print(predicted)

['negotition']


In [22]:
# Classify an utterance asking about an item's cost
predicted = pipe.predict(
    ["I like this shirt how much it costs?"]
)

In [23]:
# Print whatever label array is currently stored in `predicted`
print(predicted)

['Buy_a_product']


In [24]:
# Classify an utterance that accepts an offer, then print the result
predicted = pipe.predict(
    ["Your offer is awesome i accept it"]
)
print(predicted)

['success']


In [17]:
# # Multinomial Naive Bayes classifier (alternative experiment, currently disabled)
# from sklearn.naive_bayes import MultinomialNB
# classifier = MultinomialNB()

# # Create pipeline using Bag of Words
# pipe1 = Pipeline([("cleaner", predictors()),
#                  ('vectorizer', bow_vector),
#                  ('classifier', classifier)])

# # model generation
# pipe1.fit(X_train,y_train)