In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

# https://www.kaggle.com/sid321axn/amazon-alexa-reviews

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy
import en_core_web_sm

nlp = en_core_web_sm.load()

docs = nlp(u"All is well that ends well.")

<IPython.core.display.Javascript object>

In [3]:
#!pip install spacy

<IPython.core.display.Javascript object>

In [4]:
#!python -m spacy download en_core_web_sm

<IPython.core.display.Javascript object>

In [5]:
for word in docs:
    print(word.text, word.pos_)

# for visualization of Entity detection importing displacy from spacy:

nytimes = nlp(
    u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklynâ€™s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000."""
)

entities = [(i, i.label_, i.label) for i in nytimes.ents]
entities

displacy.render(nytimes, style="ent", jupyter=True)

docp = nlp(" In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text)

displacy.render(docp, style="dep", jupyter=True)

All DET
is AUX
well ADJ
that DET
ends VERB
well ADV
. PUNCT


pursuit pursuit 439 In
a wall wall 439 of
President Trump Trump 429 ran


<IPython.core.display.Javascript object>

In [6]:
nlp = en_core_web_sm.load()
mango = nlp(u"mango")
print(mango.vector.shape)
print(mango.vector)

# Loading TSV file
df_amazon = pd.read_csv("amazon_alexa.tsv", sep="\t")

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

(96,)
[-0.46230495 -0.970135   -0.3536407   0.2874034  -0.01573688 -0.2451393
 -1.2153258  -0.8796608  -0.33881992 -0.8536644   1.1009686  -0.40891314
  0.22952664  0.321049   -0.16520278  0.19346957  0.18104273 -0.25050712
 -0.8657057  -0.51587033  0.13842589 -1.1441298  -1.2371404  -0.3105632
 -0.7719851  -0.7328713   0.82144946  0.46671128  0.46151483 -0.3285224
  0.5737765   0.5633073   0.8174698  -0.16661754 -0.3198406   0.10492496
 -1.0577449   0.3584243   0.47973    -0.2904761  -0.07571021  1.1125598
 -0.2145705   1.0962679  -0.11507303  0.00683653  0.34713548  0.7762998
 -0.18421783 -0.40364277  0.4234588   0.2599886   0.43403542 -0.32599023
  1.1417987  -0.21782622  0.6406773   0.25259674 -0.17306802 -0.4783875
  0.89445317  0.16820912 -0.01807785  0.8160813  -0.42206082 -0.67849445
  0.26927525 -0.56313473  0.6785855   1.0213484   1.2156713   0.1348514
 -0.5303205  -0.13513315  0.05589189 -0.24881172  0.37840652 -0.9752467
 -0.5351175   0.12411766  0.12952301  0.01232283 -0.5

<IPython.core.display.Javascript object>

In [7]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in mytokens
    ]

    # Removing stop words
    mytokens = [
        word for word in mytokens if word not in stop_words and word not in punctuations
    ]

    # return preprocessed list of tokens
    return mytokens


class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}


# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

<IPython.core.display.Javascript object>

In [8]:
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))

tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

<IPython.core.display.Javascript object>

In [9]:
X = df_amazon["verified_reviews"]  # the features we want to analyze
ylabels = df_amazon["feedback"]  # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

classifier = LogisticRegression()

<IPython.core.display.Javascript object>

In [10]:
# Create pipeline using Bag of Words
pipe = Pipeline(
    [("cleaner", predictors()), ("vectorizer", bow_vector), ("classifier", classifier)]
)

<IPython.core.display.Javascript object>

In [16]:
for text in X_train[:5]:
    print(clean_text(text))

works great
this is my 2nd fire tv stick with alexa i have been very happy with my first one love that you can use so many apps with it. big 👍
convenient and fun.
love the alexa feature. firestick is working just fine.
small speaker tin sound. great with good bluetooth speaker.


<IPython.core.display.Javascript object>

In [12]:
# model generation
pipe.fit(X_train, y_train)

ValueError: empty vocabulary; perhaps the documents only contain stop words

<IPython.core.display.Javascript object>

In [14]:
predictors()

<__main__.predictors at 0x7ff339acf730>

<IPython.core.display.Javascript object>

In [None]:
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))