In [2]:
import spacy
import numpy as np
import pandas as pd
from stopwords import ENGLISH_STOP_WORDS
# from __future__ import unicode_literals
# import numba
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)


In [3]:
en_nlp = spacy.load('en')

In [4]:
def spacy_get_vec(sentence):
    vec = np.zeros(96)
    doc = en_nlp((sentence))
    for word in doc:
        if word.lower_ in ENGLISH_STOP_WORDS:
            continue
        vec += word.vector
    return vec

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
lines = open('./class.txt').readlines()
vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorizer.fit_transform([''.join(line.split(',')[0]) for line in lines])


<151x125 sparse matrix of type '<class 'numpy.float64'>'
	with 248 stored elements in Compressed Sparse Row format>

In [6]:
def get_idf(sentence):
    score = 1.0
    for word in sentence.split():
        if word[-1] == '\n' or word[-1] == ',' or word[-1] == '.' or word[-1] == ['!']:
            word = word[:-1]
        if word not in vectorizer.vocabulary_:
            continue
        index = vectorizer.vocabulary_[word]
        score = score / vectorizer.idf_[index]
    return score

In [9]:

vecs = []
intents = []
idfs = []
for line in lines:
    tokens = line.split(',')
    sentence = tokens[0]
    intent = tokens[1]
    if intent[-1] == '\n':
        intent = intent[:-1]
    vecs.append(spacy_get_vec(sentence))
    intents.append(intent)
    #idfs.append(get_idf(sentence))

df = pd.DataFrame(vecs, columns=['vec_%d' % i for i in range(96)])
#df['idf'] = idfs
df['intents'] = intents
df.intents = df.intents.astype('category')

In [10]:
from sklearn.utils import shuffle
df = shuffle(df)

In [11]:
df.head()

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_87,vec_88,vec_89,vec_90,vec_91,vec_92,vec_93,vec_94,vec_95,intents
143,-6.307271,0.06669,-2.986557,-5.895083,-4.020977,7.05879,6.794763,4.620492,8.125482,-3.79987,...,1.409871,1.07315,-0.526352,3.428928,-2.789751,-2.929514,-0.418538,1.376534,-0.985598,non_intent
125,-2.927856,2.094635,-2.404032,-4.694757,0.919,1.596371,10.053276,1.889114,4.804279,-3.741004,...,1.967579,0.396301,-0.783363,3.338813,-3.391479,-3.104036,0.769581,-0.551365,-1.06686,non_intent
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,non_intent
66,4.907976,-0.578859,3.453419,-5.34468,3.191152,0.040947,7.186928,-3.454149,2.563387,3.893679,...,2.460749,3.39212,-2.898923,-1.820098,-1.349198,-1.193615,0.248984,-3.875893,14.987288,non_intent
84,-1.801312,-0.607008,-2.832177,-3.968141,4.030876,5.043756,4.556532,1.08523,-2.143206,1.359876,...,3.218234,1.22996,-1.494867,2.468833,0.324141,-1.926399,-1.394034,-2.152557,-1.573624,non_intent


In [12]:
X = df.iloc[:, :-1].values
y = df.iloc[:,-1:].values.ravel()

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(X, y, test_size=0.20)

In [16]:
from sklearn.linear_model import LogisticRegression
logit_model = LogisticRegression(C=5.0, class_weight={'intent': 1.2, 'non_intent': 0.8})
logit_model.fit(X_train, y_train)



LogisticRegression(C=5.0, class_weight={'non_intent': 0.8, 'intent': 1.2},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='warn', tol=0.0001, verbose=0,
          warm_start=False)

In [17]:
print(logit_model.score(X_train, y_train))
print(logit_model.score(X_val, y_val))

0.9666666666666667
0.8064516129032258


In [19]:
sent = 'it looks cloudy'
#gradboost.predict_proba(np.append(spacy_get_vec(sent), get_idf(sent)))
logit_model.predict_proba([spacy_get_vec(sent)])

array([[5.05908415e-07, 9.99999494e-01]])

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
gradboost = GradientBoostingClassifier(n_estimators=500, max_depth=25, max_features='log2')

In [21]:
gradboost.fit(X_train, y_train)
print(gradboost.score(X_train, y_train))
print(gradboost.score(X_val, y_val))

0.9666666666666667
0.9032258064516129


In [59]:
sent = 'it looks cloudy'
#gradboost.predict_proba(np.append(spacy_get_vec(sent), get_idf(sent)))
gradboost.predict_proba(spacy_get_vec(sent))

array([[  1.54143365e-12,   1.00000000e+00]])

In [23]:
gradboost.classes_

array(['intent', 'non_intent'], dtype=object)

In [22]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', degree=2, probability=True, class_weight={'intent':0.8,'non_intent':1.2})
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight={'non_intent': 1.2, 'intent': 0.8},
  coef0=0.0, decision_function_shape='ovr', degree=2,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, probability=True,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [24]:
svc.score(X_val,y_val)

0.8064516129032258

In [25]:
gradboost.fit(X, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=25,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [26]:
from sklearn.externals import joblib
joblib.dump(gradboost, 'class.pkl')

['class.pkl']