# Sentence Tags Recognition

In [1]:
%matplotlib inline

import pandas as pd

In [8]:
# Load the labelled sentence samples from a CSV file next to the notebook.
# NOTE(review): the file name is spelled "setence"; presumably it matches the
# file on disk — confirm before correcting the spelling.
raw_data_set = pd.read_csv("./setence-tags-training-data.csv")

In [16]:
# Peek at the first rows (head() shows five rows by default).
raw_data_set.head()

Unnamed: 0,Sentence type,Sample
0,YES-NO-QUESTION,Can you cancel my order?
1,YES-NO-QUESTION,Could you please create a return?
2,YES-NO-QUESTION,Are you going to cancel it?
3,YES-NO-QUESTION,Is she Dutch?
4,YES-NO-QUESTION,Shall we start?


In [17]:
# Per-column summary: non-null count, number of distinct values, most frequent
# value and its frequency. A column whose count is lower than the others
# contains missing values.
raw_data_set.describe()

Unnamed: 0,Sentence type,Sample
count,242,241
unique,22,235
top,RESPONSE ACKNOWLEDGMENT,"Yes, I do"
freq,13,2


In [20]:
# Replace the original column headers with short names used by the later cells.
# Assigning to .columns is positional: the first column becomes "tag" and the
# second becomes "text".
raw_data_set.columns = ["tag", "text"]
raw_data_set.head()

Unnamed: 0,tag,text
0,YES-NO-QUESTION,Can you cancel my order?
1,YES-NO-QUESTION,Could you please create a return?
2,YES-NO-QUESTION,Are you going to cancel it?
3,YES-NO-QUESTION,Is she Dutch?
4,YES-NO-QUESTION,Shall we start?


In [22]:
# Discard every row that has a missing value in any column, then re-check the
# summary to confirm that both columns now have the same count.
raw_data_set = raw_data_set.dropna(axis=0, how="any")
raw_data_set.describe()

Unnamed: 0,tag,text
count,241,241
unique,22,235
top,RESPONSE ACKNOWLEDGMENT,"Yes, I do"
freq,13,2


In [26]:
# Encode the labels: turn the tag column into a categorical dtype and keep each
# category's integer code in a separate column for use as the model target.
raw_data_set["tag"] = raw_data_set["tag"].astype("category")
raw_data_set["tag_code"] = raw_data_set["tag"].cat.codes
raw_data_set.head()

Unnamed: 0,tag,text,tag_code
0,YES-NO-QUESTION,Can you cancel my order?,21
1,YES-NO-QUESTION,Could you please create a return?,21
2,YES-NO-QUESTION,Are you going to cancel it?,21
3,YES-NO-QUESTION,Is she Dutch?,21
4,YES-NO-QUESTION,Shall we start?,21


In [27]:
# List the distinct tag codes in order of first appearance.
raw_data_set["tag_code"].unique()

array([21, 16, 18,  1, 12,  8,  0,  2, 20,  4, 19,  9, 14,  3, 17, 11, 13,
       15,  5, 10,  6,  7])

In [28]:
# Features are the raw sentences; the target is the integer-encoded tag.
X, y = raw_data_set["text"], raw_data_set["tag_code"]

In [29]:
# Summary statistics of the encoded target; min and max give the range of tag codes.
y.describe()

count    241.000000
mean      10.568465
std        6.357777
min        0.000000
25%        5.000000
50%       11.000000
75%       16.000000
max       21.000000
Name: tag_code, dtype: float64

In [40]:
## Importing spacy
import spacy
from sklearn.preprocessing import FunctionTransformer

# Load the spaCy pipeline named "en_core_web_sm" once at module level; it is
# reused by _lemmatize for every sentence. The model package must already be
# installed in the kernel's environment.
nlp = spacy.load("en_core_web_sm")

In [41]:
def _lemmatize(sentence):
    """Return the space-joined lemmas of the alphabetic tokens in ``sentence``.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``.
    Tokens whose ``is_alpha`` flag is false are dropped; the ``lemma_`` of
    every remaining token is kept in its original order.
    """
    lemmas = []
    for token in nlp(sentence):
        if token.is_alpha:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def lemmatize_transformer(X):
    """Lemmatize every element of ``X`` and return the result.

    ``X`` must provide an ``.apply`` method (the notebook passes a pandas
    Series of sentences); ``_lemmatize`` is applied to each element.
    """
    lemmatized = X.apply(_lemmatize)
    return lemmatized

# Wrap the lemmatizing function as a scikit-learn transformer so it can be used
# as a pipeline step. validate=False hands the input to the function as-is,
# without scikit-learn's array validation.
SpacyLemmatizer = FunctionTransformer(func=lemmatize_transformer, validate=False)

In [36]:
# Quick manual check of the lemmatizer on a short sentence.
_lemmatize("He go")

'-PRON- goo'

In [42]:
# Model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [57]:
# Pipeline: spaCy lemmatization -> bag-of-words counts -> logistic regression.
model = make_pipeline(
    SpacyLemmatizer,
    CountVectorizer(),
    LogisticRegression(max_iter=10_000),
)

# Split data: hold out 20% of the samples. The fixed random_state makes the
# split, and therefore the score below, reproducible across kernel restarts.
# The fourth name is y_test, so the held-out labels are available under the
# name that matches X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train
model.fit(X_train, y_train)

# Score
# NOTE: this is accuracy on the data the model was fitted on, so it overstates
# how well the model generalizes. Use model.score(X_test, y_test) for the
# held-out accuracy.
model.score(X_train, y_train)



0.9375

In [70]:
sentence = "When will I have my package at home?"
# Pass the raw sentence: the pipeline's first step (SpacyLemmatizer) already
# lemmatizes its input. Lemmatizing here as well would preprocess the text
# twice, which is not what the model saw during training.
model.predict(pd.Series([sentence]))

array([13], dtype=int8)

In [72]:
# Show the training rows whose tag code is 13 to see which tag that code stands for.
raw_data_set.query("tag_code == 13")

Unnamed: 0,tag,text,tag_code
101,REJECT,"Well, no",13
102,REJECT,Never,13
103,REJECT,Hell no!,13
104,REJECT,No way,13
105,REJECT,I won't do that,13
106,REJECT,I can't do this,13
107,REJECT,I would never say that,13
148,REJECT,No,13
149,REJECT,I can't,13
150,REJECT,Unfortunately,13
