In [1]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
# from __future__ import unicode_literals
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
# Load the spaCy English pipeline once; spacy_get_vec reuses this object
# for every sentence it embeds.
SPACY_MODEL_NAME = 'en'
en_nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
def spacy_get_vec(sentence, nlp=None, dim=96):
    """Return the element-wise sum of the token vectors of ``sentence``.

    Every token produced by the pipeline contributes to the sum (stop words
    are not filtered out).

    Parameters
    ----------
    sentence : str
        Text to embed.
    nlp : callable, optional
        Pipeline mapping text to an iterable of tokens that expose a
        ``.vector`` attribute. Defaults to the notebook-level ``en_nlp``.
    dim : int, optional
        Length of the token vectors produced by ``nlp``. Defaults to 96,
        the value that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``dim``; all zeros when the pipeline
        yields no tokens.
    """
    if nlp is None:
        # Looked up at call time so the default follows the notebook global.
        nlp = en_nlp
    vec = np.zeros(dim)
    for token in nlp(sentence):
        vec += token.vector
    return vec

In [5]:
# Build the training frame from the labelled corpus.
# Each non-blank line is expected to look like "<sentence>,<intent>".
VEC_DIM = 96  # length of the vectors returned by spacy_get_vec

# `with` closes the file handle even if parsing fails further down.
with open('./is_question.txt') as corpus_file:
    lines = corpus_file.readlines()

vecs = []
intents = []
idfs = []  # not populated in this cell; kept so the name stays defined
for line_no, line in enumerate(lines, start=1):
    record = line.rstrip('\r\n')
    if not record.strip():
        # Blank lines (e.g. a trailing newline at EOF) carry no record.
        continue
    # Split on the LAST comma so commas inside the sentence stay part of the
    # sentence. NOTE(review): assumes the intent label is the final field
    # and never contains a comma itself.
    sentence, separator, intent = record.rpartition(',')
    if not separator:
        raise ValueError(
            'line %d of is_question.txt has no ",<intent>" field: %r'
            % (line_no, record)
        )
    vecs.append(spacy_get_vec(sentence))
    intents.append(intent.strip())

df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(VEC_DIM)])
df['intents'] = intents
df['intents'] = df['intents'].astype('category')

In [6]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so a fresh "Restart & Run All"
# reproduces the same row order.
df = shuffle(df, random_state=42)

In [7]:
# Peek at the first five rows of the shuffled frame.
df.head(n=5)

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_87,vec_88,vec_89,vec_90,vec_91,vec_92,vec_93,vec_94,vec_95,intents
141,8.638584,-4.511331,9.105298,-7.873738,-7.094472,5.46047,-2.411022,-2.590003,0.577147,10.021751,...,-12.117597,2.807703,6.294908,-5.405303,-1.441424,12.072329,-6.007735,5.095221,2.366286,question
106,5.659891,-6.080133,7.118525,-4.139761,-9.1315,-1.62125,-6.766408,-7.214301,6.137925,15.765751,...,-0.745241,-3.616721,14.099648,-2.818982,5.383953,15.527542,-3.013386,-1.775828,1.177742,question
61,-5.571187,-4.136174,-5.608514,1.553669,2.0626,0.558095,2.835774,1.772828,4.804504,-2.707741,...,-3.443504,0.231024,0.563015,3.728613,3.088901,-1.503433,0.544523,0.821426,-1.572616,sentiment
59,-3.771748,-1.192746,4.249696,-1.058279,0.899832,5.556467,1.612815,4.628816,0.29266,3.088551,...,-3.371501,1.527476,3.814626,1.510213,3.573054,-2.389172,-2.857864,-1.929039,-1.581492,sentiment
137,1.623816,-5.007186,4.284825,-7.427752,-8.513345,-1.796197,12.413925,6.584578,-18.073582,12.328767,...,5.213313,-4.052181,24.446793,-3.277435,-1.788192,5.015595,2.265996,8.576342,-13.774484,question


In [8]:
# Everything except the last column is a feature; the last column
# ('intents') holds the label.
feature_columns = df.iloc[:, :-1]
label_column = df.iloc[:, -1:]

X = feature_columns.values
# ravel() flattens the (n, 1) label block into a 1-D array.
y = label_column.values.ravel()

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The fixed seed keeps the split,
# and therefore every score reported below, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [12]:
from sklearn.linear_model import LogisticRegression

# Per-class weights: 'sentiment' samples count slightly more than
# 'question' samples in the loss.
intent_weights = {'question': 0.95, 'sentiment': 1.05}
logit_model = LogisticRegression(class_weight=intent_weights)
logit_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight={'question': 0.95, 'sentiment': 1.05},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='warn', tol=0.0001, verbose=0,
          warm_start=False)

In [13]:
# Accuracy of the logistic model: training split first, then validation.
for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(logit_model.score(features, labels))

1.0
0.8709677419354839


In [11]:
sent = 'i forgot your name'
# scikit-learn estimators expect a 2-D (n_samples, n_features) array, so the
# single sentence vector is reshaped into one row. Embedding once also avoids
# running the spaCy pipeline twice on the same text.
sent_vec = spacy_get_vec(sent).reshape(1, -1)
print(logit_model.predict_proba(sent_vec))
print(logit_model.predict(sent_vec))

[[ 0.14874617  0.85125383]]
[u'sentiment']




In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters collected in one place so they are easy to spot and tune.
gradboost_params = dict(n_estimators=500, max_depth=25)
gradboost = GradientBoostingClassifier(**gradboost_params)

In [15]:
# Train the boosted trees, then report accuracy on the training split
# followed by the validation split.
gradboost.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(gradboost.score(features, labels))

1.0
0.8387096774193549


In [14]:
sent = 'i am feeling very happy'
# predict() needs a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row.
gradboost.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'sentiment'], dtype=object)

In [15]:
sent = 'i think i forgot your name'
# predict() needs a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row.
gradboost.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'question'], dtype=object)

In [16]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the training split.
svc = SVC(kernel='linear')
svc.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [17]:
# Accuracy of the SVC: training split first, then validation.
for features, labels in ((X_train, y_train), (X_val, y_val)):
    print(svc.score(features, labels))

1.0
0.8709677419354839


In [18]:
sent = 'do you live in France'
# predict() needs a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'question'], dtype=object)

In [19]:
sent = 'my name is Batman'
# predict() needs a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'question'], dtype=object)

In [20]:
sent = 'i think i forgot your name'
# predict() needs a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'sentiment'], dtype=object)

In [21]:
sent = 'Hii'
# predict() needs a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'sentiment'], dtype=object)

In [18]:
# Refit the same SVC object on every row (X and y are the full arrays, not
# just the training split). Scores computed on X_val after this point are
# no longer held-out estimates.
svc.fit(X=X, y=y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:
# `sklearn.externals.joblib` is deprecated and has been removed from newer
# scikit-learn releases; prefer the standalone `joblib` package (a
# scikit-learn dependency) and fall back to the vendored copy only on old
# installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted SVC to disk; the list of written file names is displayed.
joblib.dump(svc, 'is_question.pkl')

['is_question.pkl']