In [1]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
# from __future__ import unicode_literals
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
# Load the spaCy English pipeline once; spacy_get_vec() reuses this object for
# every sentence it vectorises.
# NOTE(review): 'en' is a shortcut-link name. spaCy v3 no longer supports
# shortcut links, so an upgrade will need the full package name
# (e.g. 'en_core_web_sm') — confirm which model the link points to.
en_nlp = spacy.load('en')

In [3]:
def spacy_get_vec(sentence):
    """Return the element-wise sum of the spaCy token vectors of `sentence`.

    The text is run through the module-level `en_nlp` pipeline and the
    `.vector` of every resulting token is added into a 96-element float64
    accumulator. Stop-word filtering is currently disabled, so every token
    contributes. A sentence that yields no tokens gives the all-zero vector.

    Parameters
    ----------
    sentence : text to vectorise (passed unchanged to `en_nlp`).

    Returns
    -------
    numpy.ndarray of shape (96,).
    """
    total = np.zeros(96)
    for token in en_nlp(sentence):
        # In-place add keeps the float64 accumulator regardless of the
        # dtype of the individual token vectors.
        total += token.vector
    return total

In [4]:
# Read the labelled corpus. Every non-blank line is expected to look like
# "<sentence>,<intent>"; only the first two comma-separated fields are used.
# The context manager closes the file handle as soon as the lines are read.
with open('./is_question.txt') as corpus_file:
    lines = corpus_file.readlines()

vecs = []
intents = []
idfs = []  # never filled in this cell; kept so the name stays defined
for line_no, line in enumerate(lines, start=1):
    if not line.strip():
        # Tolerate blank lines (typically a trailing one at end of file).
        continue
    tokens = line.split(',')
    if len(tokens) < 2:
        raise ValueError(
            'is_question.txt line %d has no ",<intent>" field: %r' % (line_no, line))
    sentence = tokens[0]
    # strip() removes the newline as well as stray '\r' / spaces, so that
    # 'question' and 'question ' do not become two different categories.
    intent = tokens[1].strip()
    vecs.append(spacy_get_vec(sentence))
    intents.append(intent)

# One column per vector component; 96 must match the length of the vectors
# returned by spacy_get_vec. The label column is appended last.
df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(96)])
df['intents'] = intents
df['intents'] = df['intents'].astype('category')

In [5]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# produces the same row order every time.
df = shuffle(df, random_state=42)

In [6]:
# Peek at the first rows of the shuffled frame (vec_0..vec_95 plus 'intents').
df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_87,vec_88,vec_89,vec_90,vec_91,vec_92,vec_93,vec_94,vec_95,intents
144,7.693374,-4.42519,10.370076,-8.059266,-6.715349,5.353402,-2.543957,-3.710457,1.941995,12.817253,...,-13.698405,3.170844,6.462161,-5.732632,-2.501776,14.463405,-6.847696,3.601706,0.969412,question
71,3.278343,-1.675798,2.725453,-2.183201,2.442451,0.639979,-0.94462,-0.534458,-3.375728,1.780514,...,-0.362511,5.803208,-1.636521,-0.631588,3.263864,1.444387,0.279575,-2.825133,5.470883,sentiment
75,-0.091077,-1.676832,6.234412,-2.988006,-3.43732,5.952391,-0.241372,-1.364574,1.897492,4.432281,...,-3.514902,1.312641,4.867523,-0.994885,2.009481,0.795232,-2.636366,-0.286786,-3.103258,sentiment
11,5.669478,-5.065109,-0.989023,-0.689991,7.991487,6.222493,15.764335,-6.990608,-9.58633,5.690118,...,-0.133917,8.80873,1.23152,-0.333365,1.899144,-3.473238,-4.665623,-8.717071,9.846305,sentiment
65,-1.978794,-0.953607,-3.321292,-2.197667,3.180284,1.942628,1.725907,-0.902973,-0.098377,-0.716015,...,2.659779,2.615698,0.753646,3.109646,2.798889,0.515412,-2.441212,3.466424,-0.745764,sentiment


In [7]:
# Positional split of the frame: every column but the last is a feature,
# the last column ('intents') holds the label.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features].values
y = df.iloc[:, n_features:].values.ravel()  # flatten the (n, 1) label block to 1-D

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The seed is fixed so the split
# itself is deterministic for a given X / y.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with per-class weights: 'sentiment' samples weigh
# slightly more (1.05) than 'question' samples (0.95).
logit_model = LogisticRegression(
    class_weight=dict(question=0.95, sentiment=1.05),
)
# Left as the last expression so the fitted estimator's repr is displayed.
logit_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight={'sentiment': 1.05, 'question': 0.95},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='warn', tol=0.0001, verbose=0,
          warm_start=False)

In [10]:
# Mean accuracy on the training split first, then on the validation split.
for features, labels in [(X_train, y_train), (X_val, y_val)]:
    print(logit_model.score(features, labels))

1.0
0.9354838709677419


In [12]:
sent = 'i forgot your name'
# Vectorise once and reuse: the spaCy parse is the expensive step and both
# calls below need exactly the same single-row input.
sent_vec = spacy_get_vec(sent)
print(logit_model.predict_proba([sent_vec]))
print(logit_model.predict([sent_vec]))

[[0.01991578 0.98008422]]
['sentiment']


In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 500 estimators, individual trees up to depth 25.
gradboost = GradientBoostingClassifier(
    max_depth=25,
    n_estimators=500,
)

In [14]:
# Train on the training split, then report mean accuracy on the training
# split followed by the validation split.
gradboost.fit(X_train, y_train)
for features, labels in [(X_train, y_train), (X_val, y_val)]:
    print(gradboost.score(features, labels))

1.0
0.8064516129032258


In [15]:
# Spot check: classify one hand-written sentence with the boosted model.
sent = 'i am feeling very happy'
# predict() takes a batch, hence the single vector wrapped in a list.
gradboost.predict([spacy_get_vec(sent)])

array(['sentiment'], dtype=object)

In [17]:
# Spot check: classify one hand-written sentence with the boosted model.
sent = 'i think i forgot your name'
# predict() takes a batch, hence the single vector wrapped in a list.
gradboost.predict([spacy_get_vec(sent)])

array(['sentiment'], dtype=object)

In [22]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier with C=0.3.
svc = SVC(C=0.3, kernel='linear')
# Left as the last expression so the fitted estimator's repr is displayed.
svc.fit(X_train, y_train)

SVC(C=0.3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [23]:
# Mean accuracy on the training split first, then on the validation split.
for features, labels in [(X_train, y_train), (X_val, y_val)]:
    print(svc.score(features, labels))

1.0
0.8387096774193549


In [24]:
# Spot check: classify one hand-written sentence with the SVC.
sent = 'do you live in France'
# predict() takes a batch, hence the single vector wrapped in a list.
svc.predict([spacy_get_vec(sent)])

array(['question'], dtype=object)

In [25]:
# Spot check: classify one hand-written sentence with the SVC.
sent = 'my name is Batman'
# predict() takes a batch, hence the single vector wrapped in a list.
svc.predict([spacy_get_vec(sent)])

array(['question'], dtype=object)

In [26]:
# Spot check: classify one hand-written sentence with the SVC.
sent = 'i think i forgot your name'
# predict() takes a batch, hence the single vector wrapped in a list.
svc.predict([spacy_get_vec(sent)])

array(['sentiment'], dtype=object)

In [21]:
# Spot check: classify one hand-written sentence with the SVC.
sent = 'Hii'
# predict() expects a 2-D batch (n_samples, n_features); wrap the single
# 96-element vector in a list, as the other prediction cells do. Passing the
# bare 1-D vector is rejected by current scikit-learn releases.
svc.predict([spacy_get_vec(sent)])



array([u'sentiment'], dtype=object)

In [22]:
# Refit the SVC on the complete data set (X / y before the train/validation
# split) ahead of persisting it.
# NOTE(review): the recorded output below shows C=1.0 while `svc` is built
# with C=0.3 earlier in the notebook — the output looks stale; re-run to confirm.
svc.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel=u'linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package (installed as a scikit-learn
# dependency) and fall back to the vendored copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted classifier; dump() returns the list of written file names.
joblib.dump(svc, 'is_question.pkl')

[u'is_question.pkl']