# Machine Learning models applied to the data collected from the twitter api

In [13]:
# !pip install gensim --upgrade
# !pip install keras --upgrade
# !pip install pandas --upgrade

In [16]:
# !pip install tensorflow

In [19]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting click
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting regex>=2021.8.3
  Downloading regex-2023.3.23-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.6/769.6 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.3 nltk-3.8.1 regex-2023.3.23 tqdm-4.65.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.1[0m
[1m[[0

In [20]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
# DATASET
DATASET_COLUMNS = ["text","favorite_count",	"date_creation","retweet_count"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC 
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [26]:
# Importing the dataset
df = pd.read_csv('./data/TWEET_INFO_2023-04-26.csv', header=0)
df

Unnamed: 0,text,favorite_count,date_creation,retweet_count
0,@ispace_inc Stream on Twitter too!,3014,2023-04-25,285
1,@zerohedge So bizarre that people and companie...,7200,2023-04-25,636
2,@stillgray Really? 🤔,11125,2023-04-25,977
3,@_CryMiaRiver @krassenstein @ZubyMusic I repea...,7635,2023-04-25,1244
4,@krassenstein @ZubyMusic Please correct if wro...,2240,2023-04-25,108
...,...,...,...,...
1171,@unusual_whales Nice. Just me here @elonmusk t...,10341,2023-04-25,484
1172,"@ErcXspace @SpaceX Gravity, gravity, \ntime to...",5793,2023-04-24,375
1173,Or maybe just X https://t.co/5nCtYbrPfN,61164,2023-04-24,6187
1174,@SawyerMerritt @SpaceX @Tesla Yay!,5042,2023-04-24,237


In [27]:
print("Dataset size:", len(df))

Dataset size: 1176


In [28]:
df.head(5)

Unnamed: 0,text,favorite_count,date_creation,retweet_count
0,@ispace_inc Stream on Twitter too!,3014,2023-04-25,285
1,@zerohedge So bizarre that people and companie...,7200,2023-04-25,636
2,@stillgray Really? 🤔,11125,2023-04-25,977
3,@_CryMiaRiver @krassenstein @ZubyMusic I repea...,7635,2023-04-25,1244
4,@krassenstein @ZubyMusic Please correct if wro...,2240,2023-04-25,108


### Preprocess dataset

In [30]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [31]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [34]:
%%time
df.text = df.text.apply(lambda x: preprocess(x))
df

CPU times: user 22.3 ms, sys: 0 ns, total: 22.3 ms
Wall time: 21.4 ms


Unnamed: 0,text,favorite_count,date_creation,retweet_count
0,stream twitter,3014,2023-04-25,285
1,bizarre people companies use money management ...,7200,2023-04-25,636
2,really,11125,2023-04-25,977
3,krassenstein zubymusic repeat statement parent...,7635,2023-04-25,1244
4,zubymusic please correct wrong communitynotes,2240,2023-04-25,108
...,...,...,...,...
1171,nice elonmusk something dumb said definitely,10341,2023-04-25,484
1172,spacex gravity gravity time escape,5793,2023-04-24,375
1173,maybe x,61164,2023-04-24,6187
1174,spacex tesla yay,5042,2023-04-24,237


### Split into train/test

In [35]:
df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 940
TEST size: 236


### Word2vec

In [39]:
%%time
documents = [_text.split() for _text in df_train.text] 

CPU times: user 750 µs, sys: 0 ns, total: 750 µs
Wall time: 761 µs


In [41]:
w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)

2023-04-26 09:43:20,101 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2023-04-26T09:43:20.101098', 'gensim': '4.3.1', 'python': '3.10.4 (main, Apr  3 2023, 22:35:52) [GCC 9.4.0]', 'platform': 'Linux-5.4.0-1105-azure-x86_64-with-glibc2.31', 'event': 'created'}


In [42]:
w2v_model.build_vocab(documents)

2023-04-26 09:43:37,404 : INFO : collecting all words and their counts
2023-04-26 09:43:37,408 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-26 09:43:37,411 : INFO : collected 224 word types from a corpus of 5104 raw words and 940 sentences
2023-04-26 09:43:37,412 : INFO : Creating a fresh vocabulary
2023-04-26 09:43:37,414 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 224 unique words (100.00% of original 224, drops 0)', 'datetime': '2023-04-26T09:43:37.414818', 'gensim': '4.3.1', 'python': '3.10.4 (main, Apr  3 2023, 22:35:52) [GCC 9.4.0]', 'platform': 'Linux-5.4.0-1105-azure-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2023-04-26 09:43:37,416 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 5104 word corpus (100.00% of original 5104, drops 0)', 'datetime': '2023-04-26T09:43:37.416043', 'gensim': '4.3.1', 'python': '3.10.4 (main, Apr  3 2023, 22:35:52) [GCC 9.4.0]', 'platform': 'Linux-5.4.0-

In [47]:
words = w2v_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 224


In [48]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

2023-04-26 09:45:43,262 : INFO : Word2Vec lifecycle event {'msg': 'training model with 8 workers on 224 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7 shrink_windows=True', 'datetime': '2023-04-26T09:45:43.262591', 'gensim': '4.3.1', 'python': '3.10.4 (main, Apr  3 2023, 22:35:52) [GCC 9.4.0]', 'platform': 'Linux-5.4.0-1105-azure-x86_64-with-glibc2.31', 'event': 'train'}
2023-04-26 09:45:43,370 : INFO : EPOCH 0: training on 5104 raw words (3440 effective words) took 0.1s, 56048 effective words/s
2023-04-26 09:45:43,385 : INFO : EPOCH 1: training on 5104 raw words (3586 effective words) took 0.0s, 365729 effective words/s
2023-04-26 09:45:43,402 : INFO : EPOCH 2: training on 5104 raw words (3515 effective words) took 0.0s, 439383 effective words/s
2023-04-26 09:45:43,434 : INFO : EPOCH 3: training on 5104 raw words (3551 effective words) took 0.0s, 265748 effective words/s
2023-04-26 09:45:43,516 : INFO : EPOCH 4: training on 5104 raw words (3560 effective

CPU times: user 365 ms, sys: 56.8 ms, total: 422 ms
Wall time: 631 ms


(112810, 163328)