In [1]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
from __future__ import unicode_literals
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
# Load the spaCy model registered under the name 'en' once; spacy_get_vec()
# reuses this object to tokenize sentences and read each token's vector.
en_nlp = spacy.load('en')

In [3]:
def spacy_get_vec(sentence, dim=384, stop_words=None):
    """Return the element-wise sum of the spaCy vectors of the tokens in `sentence`.

    Parameters
    ----------
    sentence : text
        Sentence to embed; it is parsed with the module-level `en_nlp` model.
    dim : int, optional
        Length of the returned vector. Defaults to 384, the value that used to
        be hard-coded here. It is expected to equal the length of each token's
        `.vector` for the loaded model.
    stop_words : collection of str, optional
        Lower-cased token texts to leave out of the sum (for example
        ENGLISH_STOP_WORDS). The default, None, sums every token, which is the
        behaviour this function always had.

    Returns
    -------
    numpy.ndarray
        1-D array of length `dim`; all zeros when no token contributes.
    """
    vec = np.zeros(dim)
    for word in en_nlp(sentence):
        # Optional filter replacing the block that used to be commented out.
        if stop_words is not None and word.lower_ in stop_words:
            continue
        vec += word.vector
    return vec

In [4]:
# Read the labelled sentences. The context manager closes the file handle once
# the rows have been read.
with open('./is_question.txt') as data_file:
    lines = data_file.readlines()

vecs = []
intents = []
idfs = []  # never filled in the cells visible here; kept in case other cells reference it
for line_no, line in enumerate(lines, 1):
    line = line.rstrip('\r\n')
    if not line.strip():
        # Blank rows (e.g. an empty line at the end of the file) hold no example.
        continue
    # Each row is "<sentence>,<intent>". Splitting on the LAST comma keeps
    # sentences that contain commas themselves in one piece.
    sentence, comma, intent = line.rpartition(',')
    if not comma:
        raise ValueError(
            "line %d of ./is_question.txt has no ',<intent>' field: %r" % (line_no, line))
    vecs.append(spacy_get_vec(sentence))
    # Strip stray whitespace so ' question' and 'question' are the same class.
    intents.append(intent.strip())

n_dims = 384  # length of the vectors returned by spacy_get_vec
# One row per sentence: n_dims feature columns followed by the label column.
df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(n_dims)])
df['intents'] = intents
df['intents'] = df['intents'].astype('category')

In [5]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so the row order is the same on every
# fresh run of the notebook.
df = shuffle(df, random_state=42)

In [6]:
# Preview the first rows of the feature table (vec_* columns plus 'intents').
df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_375,vec_376,vec_377,vec_378,vec_379,vec_380,vec_381,vec_382,vec_383,intents
6,-3.850943,-6.639082,5.704214,6.938688,6.879732,2.601594,-4.512103,-4.214966,1.815665,-1.607367,...,-0.118715,-0.079425,-0.99921,0.808647,0.909412,0.454612,0.197847,1.291913,1.130899,sentiment
107,3.870726,-0.750165,8.419325,0.993593,4.111084,-1.089855,-1.959931,-3.264334,2.92544,2.240141,...,0.908486,0.034517,-0.306704,0.796868,1.662183,-1.061653,0.973119,0.577741,0.380746,question
30,-2.043465,0.682373,0.795384,6.596918,4.184609,3.28304,-6.837724,-2.488773,-3.688787,6.685079,...,0.088831,-1.01409,0.023326,0.878456,1.352227,0.00386,1.486881,1.820464,0.449756,sentiment
45,-3.208255,-3.126131,-1.758441,10.487405,12.028679,-6.479359,-1.753736,-0.827356,2.270455,-0.661625,...,-0.48018,-0.867633,-0.684285,0.675551,1.351758,0.035565,0.813848,0.907636,0.539921,sentiment
74,2.330897,-2.618627,-3.213691,10.215489,3.283876,-2.868323,2.903101,3.035145,5.665443,-0.959649,...,0.227861,0.272062,-0.450582,0.262057,0.354631,-0.914984,0.660381,0.309936,0.862658,sentiment


In [7]:
# Feature matrix: every column except the 'intents' label column.
X = df.drop('intents', axis=1).values
# Label vector: the 'intents' column as a flat object array.
y = np.asarray(df['intents'].astype(object))

In [8]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the splitter in this module.
    from sklearn.cross_validation import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split (and
# therefore the scores reported below) reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)



In [9]:
from sklearn.linear_model import LogisticRegression
# First candidate model. The class weights are slightly uneven: 'sentiment'
# examples are weighted 1.05 and 'question' examples 0.95.
logit_model = LogisticRegression(class_weight={'question':0.95,'sentiment':1.05})
logit_model.fit(X_train, y_train)

LogisticRegression(C=1.0,
          class_weight={u'question': 0.95, u'sentiment': 1.05}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [10]:
# Score the logistic regression on the training split, then on the held-out
# validation split.
print(logit_model.score(X_train, y_train))
print(logit_model.score(X_val, y_val))

1.0
0.935483870968


In [11]:
sent = 'i forgot your name'
# scikit-learn expects a 2-D (n_samples, n_features) array, so the single
# sentence vector is computed once and reshaped into a one-row matrix.
sent_vec = spacy_get_vec(sent).reshape(1, -1)
print(logit_model.predict_proba(sent_vec))
print(logit_model.predict(sent_vec))

[[ 0.14874617  0.85125383]]
[u'sentiment']




In [12]:
from sklearn.ensemble import GradientBoostingClassifier
# Second candidate model: 500 boosting stages with trees of depth up to 25.
# NOTE(review): trees this deep may overfit; in the recorded run the training
# score is 1.0 while the validation score is about 0.84 -- confirm the choice.
gradboost = GradientBoostingClassifier(n_estimators=500, max_depth=25)

In [13]:
# Fit the gradient-boosting model on the training split, then score it on the
# training split and on the held-out validation split.
gradboost.fit(X_train, y_train)
print(gradboost.score(X_train, y_train))
print(gradboost.score(X_val, y_val))

1.0
0.838709677419


In [14]:
sent = 'i am feeling very happy'
# predict() takes a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row instead of passing it 1-D.
gradboost.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'sentiment'], dtype=object)

In [15]:
sent = 'i think i forgot your name'
# predict() takes a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row instead of passing it 1-D.
gradboost.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'question'], dtype=object)

In [16]:
from sklearn.svm import SVC
# Third candidate model: a support vector classifier with a linear kernel,
# fitted on the training split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel=u'linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# Score the linear SVC on the training split, then on the held-out validation
# split.
print(svc.score(X_train, y_train))
print(svc.score(X_val, y_val))

1.0
0.935483870968


In [18]:
sent = 'do you live in France'
# predict() takes a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row instead of passing it 1-D.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'question'], dtype=object)

In [19]:
sent = 'my name is Batman'
# predict() takes a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row instead of passing it 1-D.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'question'], dtype=object)

In [20]:
sent = 'i think i forgot your name'
# predict() takes a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row instead of passing it 1-D.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'sentiment'], dtype=object)

In [21]:
sent = 'Hii'
# predict() takes a 2-D (n_samples, n_features) array; reshape the single
# sentence vector into one row instead of passing it 1-D.
svc.predict(spacy_get_vec(sent).reshape(1, -1))



array([u'sentiment'], dtype=object)

In [22]:
# Refit the linear SVC on all rows (training + validation). The scores printed
# earlier were measured before this refit and do not describe the refitted model.
svc.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel=u'linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
try:
    # Newer scikit-learn releases no longer bundle joblib; use the standalone
    # package when it is installed.
    import joblib
except ImportError:
    # Older scikit-learn releases ship their own copy of joblib.
    from sklearn.externals import joblib
# Persist the fitted SVC to disk.
joblib.dump(svc, 'is_question.pkl')

[u'is_question.pkl']