# Data Vectorization

----

## PROJECT TERMINOLOGIES

#### IMPORTANT NOTE
Throughout the notebook, the term "tweet" refers specifically to the context of the search terms that were utilized to query the API for data extraction. The context for these search terms, which now represent the tweets, as their `topic` or Labels, can be found in the `"3. Raw Dataset"` section.


#### TWITTER


- **Anchor**: The term used to tag a particular person, entity or another Twitter handle. For example, "@CUBoulder" and "@BoulderPolice" are anchor terms that tag the University of Colorado Boulder Twitter handle and the official Boulder County Police handle, respectively.


- **Hashtag**: The term used to highlight a common word or phrase so that it is detectable by search engine crawlers and other Twitter users, and which can be associated with trending topics. For example, "BlackLivesMatter" or "BLM" is not an entity but a term that can be used by other Twitter account holders.

---

## Contents:

`Total Read Time ~ 20 mins.`

`Total Execution Time ~ 45 mins.`

#### EDA and Preprocessing
1. Vectorization

## 1. Imports

In [4]:
## Imports
'''Python 3.8.0'''

# Standard libs
import os
import sys
import json
import warnings
import re
import io
from io import StringIO
import inspect
import shutil
import ast
import string
import time
import pickle
import glob
import traceback
import multiprocessing
import requests
import logging
import math
from ast import literal_eval
import pytz
from itertools import chain
from string import Template
from datetime import datetime, timedelta
from dateutil import parser
import base64
from collections import defaultdict, Counter, OrderedDict
from contextlib import contextmanager
import unicodedata
from functools import reduce
import itertools
import tempfile
from typing import Any, Dict, List, Callable, Optional, Tuple, NamedTuple, Union
from functools import wraps
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

# graph
import networkx as nx

# Required pkgs
import numpy as np
from numpy import array, argmax
import pandas as pd
import ntpath
import tqdm

# General text correction - fit text for you (ftfy) and others
import ftfy
from fuzzywuzzy import fuzz
#from wordcloud import WordCloud
from spellchecker import SpellChecker

# imbalanced-learn
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN

# scikit-learn
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, jaccard_score, silhouette_score, homogeneity_score, calinski_harabasz_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.base import BaseEstimator, TransformerMixin

# scipy
from scipy import spatial, sparse
from scipy.sparse import coo_matrix, vstack, hstack
from scipy.spatial.distance import euclidean, jensenshannon, cosine, cdist
from scipy.io import mmwrite, mmread
from scipy.stats import entropy, kurtosis, skew
from scipy.cluster.hierarchy import dendrogram, ward, fcluster
import scipy.cluster.hierarchy as sch
from scipy.sparse.csr import csr_matrix
from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components

# sparse_dot_topn: matrix multiplier
from sparse_dot_topn import awesome_cossim_topn
import sparse_dot_topn.sparse_dot_topn as ct

# Gensim
import gensim
from gensim.models import Phrases, Word2Vec, KeyedVectors, FastText, LdaModel
from gensim import utils
from gensim.utils import simple_preprocess
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim import models, corpora, similarities

# NLTK
import nltk
#nltk_model_data_path = "/somepath/"
#nltk.data.path.append(nltk_model_data_path)
from nltk import FreqDist, tokenize, sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import *
from nltk.translate.bleu_score import sentence_bleu
print("NLTK loaded.")

# Spacy
import spacy
from spacy import displacy
from spacy.matcher import Matcher
#from spacy.lang.en import English
from spacy.language import Language
from spacy_language_detection import LanguageDetector
print("Spacy loaded.")

# Pytorch
import torch
from torch import optim, nn
import torch.nn.functional as Functional
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from transformers import pipeline
from transformers import AutoModel
print("PyTorch loaded.")

# Plots
from matplotlib import pyplot as plt, ticker as ticker
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly import offline
%matplotlib inline

# Theme settings
# Show up to 80 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 80)
# Configure seaborn in a single call. `sns.set` re-applies its own default
# context ("notebook"), so the previous `sns.set_context('talk')` issued
# *before* `sns.set(...)` was silently overridden; passing context and style
# here makes the 'talk' context actually take effect.
sns.set(context='talk', style='whitegrid', rc={'figure.figsize': (15, 10)})
# Suppress all warnings to keep the notebook output readable
# (note that this also hides deprecation notices from the libraries above).
warnings.filterwarnings('ignore')

0it [00:00, ?it/s]

NLTK loaded.
Spacy loaded.
PyTorch loaded.


## 2. Directory Setup

In [24]:
# Project root: the parent of the current working directory.
root_dir = os.path.abspath("../")

# Sub-folders that hang directly off the project root.
data_dir, models_dir, output_dir = (
    os.path.join(root_dir, sub_folder)
    for sub_folder in ("data", "models", "output")
)

# Folder of the sentence-transformer checkpoint, located under the models folder.
sbert_model_fp = os.path.join(models_dir, "transformer_models/all-distilroberta-v1/")

## 3. Load Preprocessed Data

In [25]:
# Read the preprocessed dataset from the data folder.
df = pd.read_csv(os.path.join(data_dir, "processed_cuboulder_TwitterData.csv"))

# The `hastags` column comes back from the CSV as text. Cells whose string
# form is one of the "missing" markers become None; every other cell is
# parsed back into a Python object with `literal_eval`
# (presumably a list of hashtags - verify against the preprocessing step).
df.hastags = df.hastags.apply(
    lambda raw: None if str(raw) in ['none', 'nan', 'np.nan', 'null', ''] else literal_eval(raw)
)

In [26]:
df.shape

(3394, 22)

## 4. Vectorization

In [27]:
class BertTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that maps raw text to sentence embeddings.

    The text is tokenized with ``tokenizer``, passed through ``model`` with
    gradient tracking disabled, and the resulting token embeddings are
    mean-pooled using the attention mask.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(text, padding=True, truncation=True,
        max_length=..., return_tensors='pt')``. Its result is unpacked into
        the model call and must contain an ``'attention_mask'`` entry.
    model : callable
        Called as ``model(**encoded_input)``; element ``[0]`` of its output is
        used as the token embeddings. It is switched to eval mode on
        construction.
    max_length : int, default 128
        Maximum sequence length handed to the tokenizer (longer inputs are
        truncated).
    embedding_func : callable, optional
        Stored on the instance; defaults to a function selecting the first
        token's embedding. NOTE(review): ``transform`` does not use it - the
        embeddings are always mean-pooled. Kept for interface compatibility.
    """

    def __init__(self, tokenizer, model, max_length=128, embedding_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,):
        self.tokenizer = tokenizer
        self.model = model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func
        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Average the token embeddings, ignoring positions masked out by ``attention_mask``."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a row with an all-zero mask cannot cause a division by zero
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _tokenize(self, text):
        """Tokenize ``text``, run the model and return the mean-pooled embeddings as a tensor."""
        # Use the tokenizer given to this instance. The previous code called a
        # module-level `tokenizer` global, which ignored the constructor
        # argument and failed whenever no such global existed.
        encoded_input = self.tokenizer(text, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')

        # Compute token embeddings without tracking gradients (inference only)
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform mean pooling - take attention mask into account for correct averaging
        return self._mean_pooling(model_output, encoded_input['attention_mask'])

    def transform(self, text: List[str], formatt='tensor'):
        """Embed ``text`` and return the embeddings in the requested container.

        Parameters
        ----------
        text : list of str or pandas.Series
            Texts to embed; a Series is converted to a list first.
        formatt : str or None, default 'tensor'
            Output format (case-insensitive, surrounding whitespace ignored):
            ``'tensor'`` returns the tensor produced by the model,
            ``'numpy'`` returns it converted with ``.numpy()``, and ``'csr'``
            returns the L2-normalised rows as a float64 ``csr_matrix``.
            A falsy value (``None``, ``''``) is treated as ``'tensor'``.

        Raises
        ------
        ValueError
            If ``formatt`` is not one of the supported values.
        """
        # Validate the requested format *before* running the model so an
        # invalid value fails fast instead of after the expensive forward pass.
        # A falsy value used to fall through and return None; it now falls
        # back to the default tensor output.
        formatt = str(formatt).strip().lower() if formatt else 'tensor'
        if formatt not in ('tensor', 'numpy', 'csr'):
            # ValueError is a subclass of Exception, so callers catching the
            # previously raised bare Exception keep working.
            raise ValueError("Invalid input for formatt! Expected 'tensor', 'numpy' or 'csr', got %r." % formatt)

        if isinstance(text, pd.Series):
            text = text.tolist()

        embeddings = self._tokenize(text)

        if formatt == 'numpy':
            return embeddings.numpy()
        if formatt == 'csr':
            # L2-normalise each row before converting to a sparse matrix
            embeddings_matrix = Functional.normalize(embeddings, p=2, dim=1)
            return csr_matrix(embeddings_matrix.numpy().astype(np.float64))
        return embeddings

    def fit(self, X, y=None):
        """No fitting required so we just return ourselves. For fine-tuning, refer to shared gpu-code!"""
        return self

In [28]:
# SENTENCE-BERT VECTORIZATION #

# Tokenizer and model are both loaded from the same checkpoint path
tokenizer = AutoTokenizer.from_pretrained(sbert_model_fp)
model_bert = AutoModel.from_pretrained(sbert_model_fp)

# Wrap tokenizer + model in the scikit-learn style vectorizer defined above
bert_vectorizer = BertTransformer(
    tokenizer=tokenizer,
    model=model_bert,
    embedding_func=lambda model_output: model_output[0][:, 0, :].squeeze(),
)
print("Bert Model '%s' loaded." % sbert_model_fp)

## SAMPLE FOR VECTORIZATION
# corpus = df['text_col']
# embeddings = bert_vectorizer.transform(corpus)
# embeddings = bert_vectorizer.transform(corpus, formatt='numpy')
# embeddings = bert_vectorizer.transform(corpus, formatt='csr')

Bert Model '/Volumes/Local Drive/WORK/Machine Learning/0. Portfolio - ML Algorithms By Hand (Revised)/GITHUB/pranzell.github.io/source/models/transformer_models/all-distilroberta-v1/' loaded.


In [21]:
# vectorizing

# `formatt='numpy'` makes transform() return the embeddings as a NumPy array.
embeddings = bert_vectorizer.transform(df['Processed_tweet'], formatt='numpy')

# A NumPy array has no `.to_pickle()` method (that is a pandas object method),
# so the previous `embeddings.to_pickle(...)` raised AttributeError. The
# module-level `pd.to_pickle` pickles an arbitrary object to the given path.
pd.to_pickle(embeddings, os.path.join(os.path.join(root_dir, "models"), "processed_text_embeddings.pkl"))

In [23]:
embeddings.shape

(10, 768)

----

---