## Vectorize the preprocessed data

#### Import libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, save_npz, csr_matrix
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import random
import os

In [2]:
def train_w2v_model(data, vector_size, window, min_count, seed=42, workers=1):
    """Train a gensim Word2Vec model on raw text documents.

    Args:
        data: Iterable of strings; each one is tokenized with
            ``word_tokenize`` and treated as one training sentence.
        vector_size: Dimensionality of the learned word vectors.
        window: Maximum distance between the current and predicted word.
        min_count: Words occurring fewer times than this are ignored.
        seed: Seed handed to gensim's random number generator.
        workers: Number of training threads. Defaults to 1 because gensim
            only produces reproducible vectors for a given ``seed`` when
            training is single-threaded; raise it to trade reproducibility
            for speed.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentences = [word_tokenize(doc) for doc in data]
    return Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        seed=seed,
        workers=workers,
    )

In [3]:
def get_document_vector(doc, model):
    """Average the word vectors of the tokens in ``doc``.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    lookup = model.wv
    known_vectors = [lookup[token] for token in doc if token in lookup]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No token is in the vocabulary: fall back to the zero vector.
    return np.zeros(model.vector_size)

In [4]:
def one_hot_encode(nominal_data):
    """One-hot encode a single categorical column.

    Args:
        nominal_data: Object exposing ``.values`` (e.g. a pandas Series)
            holding the category of every row.

    Returns:
        The sparse matrix produced by fitting a ``OneHotEncoder`` on the
        column and transforming that same column.
    """
    # The encoder expects 2-D input of shape (n_samples, n_features).
    as_column = nominal_data.values.reshape(-1, 1)
    one_hot = OneHotEncoder(sparse_output=True)
    return one_hot.fit_transform(as_column)

In [5]:
def keep(numerical_data):
    """Return numeric values unchanged, shaped as a single feature column.

    Args:
        numerical_data: Any array-like of numbers (NumPy array, pandas
            Series or extension array, list, ...).

    Returns:
        A NumPy array of shape ``(n, 1)`` holding the same values.
    """
    # np.asarray lets inputs without their own ``reshape`` (lists, Series)
    # through and guarantees a plain ndarray for downstream stacking.
    return np.asarray(numerical_data).reshape(-1, 1)

In [6]:
def w2v_vectorize(data, model, vector_size):
    """Embed every document as the mean of its Word2Vec word vectors.

    Args:
        data: Iterable of strings; each is tokenized with ``word_tokenize``.
        model: Trained model exposing ``wv`` (token -> vector lookup).
        vector_size: Length of the zero vector used for documents without
            any in-vocabulary token; should equal the model's vector size.

    Returns:
        A CSR matrix with one row per document and ``vector_size`` columns.
        Empty ``data`` yields a matrix of shape ``(0, vector_size)`` so the
        result can still be stacked with other feature blocks.
    """
    word_vectors = model.wv
    rows = []
    for text in data:
        # Collect the known vectors once instead of scanning the document
        # twice (membership test + lookup).
        known = [word_vectors[tok] for tok in word_tokenize(text) if tok in word_vectors]
        rows.append(np.mean(known, axis=0) if known else np.zeros(vector_size))

    if not rows:
        # np.array([]) would turn into a (1, 0) matrix; keep the column count.
        return csr_matrix((0, vector_size))
    return csr_matrix(np.array(rows))

In [7]:
def d2v_train_model(data, vector_size=20, min_count=2, epochs=50, seed=42, workers=1):
    """Train a gensim Doc2Vec model on a collection of texts.

    Args:
        data: pandas Series of documents. Missing values are replaced by the
            empty string and every entry is cast to ``str``.
        vector_size: Dimensionality of the document vectors.
        min_count: Words occurring fewer times than this are ignored.
        epochs: Number of training passes over the corpus.
        seed: Seed handed to gensim's random number generator.
        workers: Number of training threads. Defaults to 1 because gensim
            only produces reproducible vectors for a given ``seed`` when
            training is single-threaded; raise it to trade reproducibility
            for speed.

    Returns:
        The trained ``Doc2Vec`` model. Each document is tagged with its
        positional index as a string.
    """
    texts = data.fillna("").astype(str)
    tagged_docs = [
        TaggedDocument(words=word_tokenize(text.lower()), tags=[str(position)])
        for position, text in enumerate(texts)
    ]
    model = Doc2Vec(
        vector_size=vector_size,
        min_count=min_count,
        epochs=epochs,
        seed=seed,
        workers=workers,
    )
    model.build_vocab(tagged_docs)
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [8]:
def d2v_vectorize(data, model):
    """Infer a Doc2Vec vector for every document in ``data``.

    Args:
        data: pandas Series of documents. Missing values are replaced by the
            empty string and every entry is cast to ``str``; texts are
            lower-cased and tokenized with ``word_tokenize`` before inference.
        model: Trained model exposing ``infer_vector`` and ``vector_size``.

    Returns:
        A CSR matrix with one row per document. Empty ``data`` yields a
        matrix of shape ``(0, model.vector_size)`` so the result can still be
        stacked with other feature blocks.
    """
    texts = data.fillna("").astype(str)
    inferred = [model.infer_vector(word_tokenize(text.lower())) for text in texts]
    if not inferred:
        # csr_matrix([]) would have shape (1, 0); keep the column count.
        return csr_matrix((0, model.vector_size))
    return csr_matrix(inferred)

In [9]:
def encode_boolean(boolean_data):
    """Convert a boolean column into a 0/1 integer feature column.

    Args:
        boolean_data: Object supporting ``astype`` and ``.values`` (e.g. a
            pandas Series of booleans).

    Returns:
        A NumPy array of shape ``(n, 1)`` holding the values cast to ``int``.
    """
    as_integers = boolean_data.astype(int)
    column = as_integers.values.reshape(-1, 1)
    return column

In [14]:
#main function 

def vectorize(data):
    """Build all feature representations for ``data`` and save them to disk.

    Fits Word2Vec, Doc2Vec, count and TF-IDF models on the ``Cleaned_Text``
    column, encodes the metadata columns, stacks everything horizontally into
    one sparse matrix and writes three files to the current working directory:

    * ``vectorized_data.npz`` - the stacked CSR feature matrix
    * ``labels.npy`` - the ``Label`` column
    * ``filenames.npy`` - the ``Filename`` column

    Args:
        data: DataFrame with the columns ``Label``, ``Cleaned_Text``,
            ``Filename``, ``Dialogue/Monologue``, ``Register_low_level``,
            ``Register``, ``PoS_Tags``, ``PoS_Bigrams``, ``Greeting``,
            ``First_Word`` and ``Sentence_Length``.

    Returns:
        None; the results are only written to disk.
    """
    # Set random seeds globally
    random.seed(42)
    np.random.seed(42)
    # NOTE: hash randomization is fixed when the interpreter starts, so this
    # assignment only reaches child processes. Export PYTHONHASHSEED before
    # launching Python if the running process must be affected.
    os.environ['PYTHONHASHSEED'] = '42'

    # get labels
    y = data.Label

    # text data
    X_text = data.Cleaned_Text

    # 1. Fit Vectorizers and Models (all fitted on the same text column)

    # Word2Vec
    w2v_dim = 100
    w2v_model = train_w2v_model(X_text, vector_size=w2v_dim, window=5, min_count=25, seed=42)

    # Doc2Vec
    d2v_model = d2v_train_model(X_text, vector_size=20, min_count=2, epochs=50, seed=42)

    # CountVectorizer and TfidfVectorizer: 1- to 4-grams, at most 100 features each
    cv = CountVectorizer(ngram_range=(1, 4), max_features=100)
    cv.fit(X_text)

    tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 4))
    tfidf.fit(X_text)

    # 2. Different Vectorizations

    # Count Vector for turns
    X_bow = cv.transform(X_text)

    # TF-IDF Vectors for turns
    X_tfidf = tfidf.transform(X_text)

    # Word2Vec Vectors for turns
    X_word2vec = w2v_vectorize(X_text, w2v_model, vector_size=w2v_dim)

    # Doc2Vec Vectors for turns
    X_doc2vec = d2v_vectorize(X_text, d2v_model)

    # 3. Encode other features

    X_register1 = one_hot_encode(data["Dialogue/Monologue"])
    X_register2 = one_hot_encode(data["Register_low_level"])
    X_register3 = one_hot_encode(data["Register"])
    X_pos_tags = one_hot_encode(data["PoS_Tags"])
    X_pos_bigrams = one_hot_encode(data["PoS_Bigrams"])
    X_greeting = encode_boolean(data["Greeting"])
    X_first_word = one_hot_encode(data["First_Word"])
    # to_numpy() hands a plain ndarray (not a pandas extension array) to keep().
    X_sent_length = keep(data["Sentence_Length"].to_numpy())

    # 4. Combine Feature Vectors (column order defines the saved layout)

    feature_vector_all = hstack([
        X_bow,
        X_tfidf,
        X_word2vec,
        X_doc2vec,
        X_register1, X_register2, X_register3,
        X_sent_length, X_pos_tags, X_pos_bigrams, X_greeting, X_first_word,
    ]).tocsr()

    # 5. Save Feature Vectors and Labels

    save_npz("vectorized_data.npz", feature_vector_all)
    np.save("labels.npy", y.to_numpy())
    np.save("filenames.npy", data.Filename.to_numpy())

In [15]:
# Load the preprocessed corpus (adjust the path to wherever preprocessed_data.csv lives).
# keep_default_na=False stops pandas from turning empty cells and strings such as
# "NA" into NaN, so such values stay plain strings.
data = pd.read_csv("preprocessed_data.csv", keep_default_na=False) #path/to/preprocessed_data.csv

In [16]:
# Inspect which columns the loaded frame provides (the cell output follows).
data.columns

Index(['Unnamed: 0', 'Register', 'Dialogue/Monologue', 'Register_low_level',
       'Region', 'Filename', 'Original_Text', 'Cleaned_Text', 'Prev_Text',
       'Next_Text', 'Sentence_Length', 'Label', 'PoS_Tags', 'PoS_Bigrams',
       'Greeting', 'First_Word'],
      dtype='object')

In [17]:
# Run the full pipeline; writes vectorized_data.npz, labels.npy and filenames.npy
# to the current working directory.
vectorize(data)