In [1]:
from app.models.user import User
from app.models.tweet import Tweet
from app.app import create_app
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
import onnxruntime as rt
from app.api.basilica import BASILICA
import collections
import pprint as pp
from sklearn.model_selection import RandomizedSearchCV

In [2]:
app = create_app()
app.app_context().push()

In [3]:
tweets = Tweet.query.all()

In [4]:
data = np.array([np.hstack([tweet.embedding, tweet.user_id]) for tweet in tweets])

In [5]:
data.shape

(5916, 769)

In [6]:
np.random.shuffle(data)

In [7]:
X = data[:, :-1]
y = data[:, -1]

In [8]:
X.shape, y.shape

((5916, 768), (5916,))

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [10]:
clf = SGDClassifier(loss='hinge', penalty='l2',
                    alpha=1e-3, random_state=42,
                    max_iter=5, tol=None)

In [11]:
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [12]:
predicted = clf.predict(X_test)
np.mean(predicted == y_test)

0.9546991210277215

In [13]:
tweet = """No one says it better than @MichelleObama
 — the Obama Presidential Center coming to the South Side isn't for us, it's for the community and the next generation of leaders."""

In [14]:
with BASILICA as c:
    embedding = np.array(c.embed_sentence(tweet,model="twitter")).reshape(1,-1)

In [15]:
clf.predict(embedding)

array([813286.])

In [16]:
parameters = {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    'max_iter': [5, 10, 100, 500, 1000, 2000, 10000],
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], # logistic regression,
    'penalty': ['l2'],
    'n_jobs': [-1]}

In [17]:
rs_clf = RandomizedSearchCV(clf, parameters, verbose=20, cv=5, n_jobs=-1)

In [18]:
rs_clf = rs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   48.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

In [19]:
rs_clf.best_params_

{'penalty': 'l2',
 'n_jobs': -1,
 'max_iter': 2000,
 'loss': 'modified_huber',
 'alpha': 0.01}

In [20]:
rs_clf.predict(embedding)

array([813286.])

In [21]:
predicted = rs_clf.predict(X_test)
np.mean(predicted == y_test)

0.966869506423259

In [22]:
sess = rt.InferenceSession("app/ml_models/tweet.onnx")
input_name = sess.get_inputs()[0].name
prob_name = sess.get_outputs()[1].name
prob_rt = sess.run([prob_name],
                   {input_name: embedding.astype(np.float32)})[0][0]

In [23]:
users = User.query.all()
users_dict = {}
for user in users:
    users_dict[1 / (1 + np.exp(-prob_rt[user.id]))] = {
        'name':     user.name,
        'username': user.username,
        'prob':     1 / (1 + np.exp(-prob_rt[user.id]))
    }

users_dict = collections.OrderedDict(sorted(users_dict.items(),
                                            reverse=True))

pp.pprint(users_dict)

OrderedDict([(0.07384211848885748,
              {'name': 'Donald J. Trump',
               'prob': 0.07384211848885748,
               'username': 'realDonaldTrump'}),
             (0.01103391277822752,
              {'name': 'Barack Obama',
               'prob': 0.01103391277822752,
               'username': 'BarackObama'}),
             (0.0002279743874632505,
              {'name': 'Justin Bieber',
               'prob': 0.0002279743874632505,
               'username': 'justinbieber'})])


In [24]:
rs_clf.predict_proba(embedding)

array([[0.52060133, 0.47939867, 0.        ]])