In [83]:
import re
import string
from random import shuffle

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm.notebook import tqdm

from nltk.corpus import stopwords
from nltk import FreqDist

from nltk import classify
from nltk import NaiveBayesClassifier

import pickle

import numpy as np

import tensorflow as tf

In [84]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [85]:
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())

    return cleaned_tokens


def get_tweets_for_model(cleaned_tokens):
    for tweet_tokens in cleaned_tokens:
        yield dict([token, True] for token in tweet_tokens)

In [86]:
data = pd.read_csv('training_data/data.csv', encoding='latin-1')

# Sentiments: 0 = Negative, 2 = Neutral, 4 = Positive
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Get rid of unnecessary columns
data = data.drop(['id', 'date', 'query', 'user'], axis=1)
data = data.sample(frac=.3)

from sklearn.preprocessing import OneHotEncoder

type_one_hot = OneHotEncoder(sparse=False).fit_transform(
    data['sentiment'].to_numpy().reshape(-1, 1)
)



In [87]:
from sklearn.model_selection import train_test_split

train_reviews, test_reviews, y_train, y_test = train_test_split(
    data.text,
    type_one_hot,
    test_size=.25
)

In [88]:
import tensorflow_hub as hub
from tqdm import tqdm
import tensorflow_text

use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

X_train = []
for r in tqdm(train_reviews):
    emb = use(r)
    review_emb = tf.reshape(emb, [-1]).numpy()
    X_train.append(review_emb)
X_train = np.array(X_train)

100%|██████████| 360000/360000 [5:13:07<00:00, 19.16it/s]  


In [89]:
X_test = []
for r in tqdm(test_reviews):
    emb = use(r)
    review_emb = tf.reshape(emb, [-1]).numpy()
    X_test.append(review_emb)
X_test = np.array(X_test)

100%|██████████| 120000/120000 [1:39:36<00:00, 20.08it/s]


In [90]:
from tensorflow import keras

model = keras.Sequential()
model.add(
    keras.layers.Dense(
        units=256,
        input_shape=(X_train.shape[1],),
        activation='relu'
    )
)
model.add(
    keras.layers.Dropout(rate=0.5)
)
model.add(
    keras.layers.Dense(
        units=128,
        activation='relu'
    )
)
model.add(
    keras.layers.Dropout(rate=0.5)
)
model.add(keras.layers.Dense(2, activation='softmax'))  #2
model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

In [91]:
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=16,
    validation_split=0.1,
    verbose=1,
    shuffle=True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [92]:
model.evaluate(X_test, y_test)



[0.42681849002838135, 0.8041166663169861]

In [93]:
print(test_reviews.iloc[2])
print(X_test[1])
model.predict(X_test[1:2])

man, my legs are killing me  Karina needs to learn how to conrol her shopping 
[ 2.44514327e-02 -7.66500607e-02 -2.71941349e-02 -2.00951751e-02
 -1.40204895e-02 -7.09348843e-02 -2.78535057e-02 -9.93064884e-03
 -3.78731750e-02  2.52701831e-03 -5.29936589e-02  5.58230020e-02
  3.33819725e-03  9.06927139e-02 -1.32107334e-02  9.86650437e-02
  3.50685157e-02  1.93762761e-02  5.51859513e-02 -1.30026946e-02
  2.55867578e-02  5.83649520e-03 -7.94267282e-03 -3.64703611e-02
 -2.08679028e-02  7.63593335e-03 -4.51187380e-02  4.25460562e-02
 -2.46053329e-03  2.26143803e-02  7.97883272e-02 -3.43916900e-02
  6.28950968e-02 -3.88446487e-02 -2.05555977e-03 -1.71087496e-02
 -4.13332134e-02 -3.60765532e-02 -2.53885966e-02 -3.51767987e-02
  2.96047772e-03 -1.67121403e-02 -7.73802921e-02  7.56391138e-02
  1.23799145e-02 -5.70056364e-02 -2.33443025e-02  2.54866295e-03
  8.92798067e-04 -1.03220232e-02 -3.31834122e-03  4.79071820e-03
 -1.86432805e-02  4.77810651e-02  6.61782846e-02 -2.29170527e-02
  1.3354172

array([[0.5948704 , 0.40512964]], dtype=float32)

In [94]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
from tqdm import tqdm
import tensorflow_text


def evaluate_multiple(strings: list):
    data_to_test = np.array(strings)

    i_want_this_tested = []
    for r in data_to_test:
        emb = use(r)
        review_emb = tf.reshape(emb, [-1]).numpy()
        i_want_this_tested.append(review_emb)
    i_want_this_tested = np.array(i_want_this_tested)

    return list(model.predict(i_want_this_tested))


def evaluate_single(single: str):
    return evaluate_multiple([single])


desired_strings = ["Twitter is horrible.", "I love twitter!", "I dunno :(", "bahaha", "Starlink now has more than 1,000,000 active subscribers – thank you to all customers and members of the Starlink team who contributed to this", "Those who want power are the ones who least deserve it"]

evaluated = evaluate_multiple(desired_strings)

output = {}
for i in range(len(desired_strings)):
    output[desired_strings[i]] = "POSITIVE" if evaluated[i].argmax() == 1 else "NEGATIVE"

output



{'Twitter is horrible.': 'NEGATIVE',
 'I love twitter!': 'POSITIVE',
 'I dunno :(': 'NEGATIVE',
 'bahaha': 'POSITIVE',
 'Starlink now has more than 1,000,000 active subscribers – thank you to all customers and members of the Starlink team who contributed to this': 'POSITIVE',
 'Those who want power are the ones who least deserve it': 'POSITIVE'}

In [97]:
import pickle

saved_model = open("models/new_model.pickle", "wb")
pickle.dump(model, saved_model)
saved_model.close()

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
variables.h5                                   2022-12-20 00:42:12      2001384
config.json                                    2022-12-20 00:42:12         2552
metadata.json                                  2022-12-20 00:42:12           64
