In [1]:
from app.models.user import User
from app.app import create_app
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
import onnxruntime as rt
from sklearn.model_selection import GridSearchCV

In [2]:
# Build the application object, then push an application context onto the stack.
app = create_app()
app_context = app.app_context()
app_context.push()

In [3]:
def _require_user(username):
    """Return the stored User with the given username.

    Raises:
        LookupError: if no row matches, instead of letting a ``None`` result
            surface later as an AttributeError on ``.tweet_count``.
    """
    user = User.query.filter(User.username == username).first()
    if user is None:
        # .first() hands back None rather than raising when nothing matches.
        raise LookupError(f"No user named {username!r} found in the database")
    return user


# The two accounts the classifier will learn to tell apart.
user1 = _require_user('elonmusk')
user2 = _require_user('realDonaldTrump')

# Integer class labels as (tweet_count, 1) columns: 1 for user1, 2 for user2.
# NOTE(review): assumes tweet_count equals the number of rows in user.tweets,
# since these columns are later stacked next to the embeddings — confirm.
user1_encoding = np.full((user1.tweet_count, 1), 1)
user2_encoding = np.full((user2.tweet_count, 1), 2)

In [4]:
# Collect each user's tweet embeddings into an array (one entry per tweet),
# then append that user's label column on the right-hand side.
user1_vectors = np.array([tweet.embedding for tweet in user1.tweets])
user1_embeddings = np.hstack([user1_vectors, user1_encoding])

user2_vectors = np.array([tweet.embedding for tweet in user2.tweets])
user2_embeddings = np.hstack([user2_vectors, user2_encoding])

In [5]:
# Sanity check: (rows, columns) for user1; the last column is the appended label.
user1_embeddings.shape

(2852, 769)

In [6]:
# Combine both users' labelled rows into one dataset, user1 rows first.
embeddings = np.concatenate((user1_embeddings, user2_embeddings), axis=0)

In [7]:
# Sanity check: the row count should be the sum of both users' row counts.
embeddings.shape

(4629, 769)

In [8]:
# Shuffle the rows in place with a seeded generator so the row order, and with
# it the train/test membership and the reported accuracy, is the same on every
# run. An unseeded global shuffle yields a different order each time, which the
# fixed random_state values used further down cannot compensate for.
np.random.default_rng(42).shuffle(embeddings)

In [9]:
# Shuffling only reorders rows in place, so the shape must be unchanged.
embeddings.shape

(4629, 769)

In [10]:
# Spot-check one row's label, which is stored in the last column.
embeddings[5, -1]

1.0

In [11]:
# Target vector: the trailing label column, cast to integers.
y = embeddings[..., -1].astype(int)

In [12]:
# Feature matrix: every column except the trailing label column.
X = embeddings[..., :-1]

In [13]:
# Hold out 33% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [14]:
# Report the train and test feature-matrix shapes on a single line.
print(*(split.shape for split in (X_train, X_test)))

(3101, 768) (1528, 768)


In [15]:
# Linear classifier trained with SGD, using hinge loss and an L2 penalty.
# NOTE(review): tol=None with max_iter=5 presumably means a fixed five passes
# with no early stopping — confirm against the scikit-learn documentation.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)

In [16]:
# Fit on the training split; the returned estimator is shown as the cell output.
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [17]:
# Accuracy on the held-out split: the fraction of test rows predicted correctly.
predicted = clf.predict(X_test)
(predicted == y_test).mean()

0.9600785340314136