In [1]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys

import nltk
from nltk.tokenize import word_tokenize

import gensim.downloader as api
import gensim
from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

import joblib

import os

from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Root folder of the labelled corpora, relative to the notebook's working directory.
DATA_DIR = os.path.join("literature", "datasets")

# Raw corpora keyed by short name. Paths are assembled with os.path.join so that
# no backslash inside a string literal can be read as an escape sequence and the
# notebook does not depend on Windows path separators.
datasets = {
    "davidson": pd.read_csv(
        os.path.join(DATA_DIR, "dt", "labeled_data_all_2classes_only.csv"),
        index_col=0,
    ),
    "hateval": pd.read_csv(
        os.path.join(DATA_DIR, "hateval", "train_en.tsv"),
        sep="\t",
        index_col=0,
    ),
    "ethos": pd.read_csv(
        os.path.join(DATA_DIR, "ethos", "Ethos_Dataset_Binary.csv"),
        sep=";",
    ),
}

In [3]:
# Davidson: the tweet column becomes "text"; labels are remapped 0 -> 1 and 2 -> 0.
datasets["davidson"] = (
    datasets["davidson"]
    .rename(columns={"tweet": "text"})
    .assign(**{"class": lambda frame: frame["class"].replace({0: 1, 2: 0})})
)

# HatEval: the "HS" column becomes the "class" label column.
datasets["hateval"] = datasets["hateval"].rename(columns={"HS": "class"})

# Ethos: rename the comment/label columns and cast the label to an integer dtype.
datasets["ethos"] = (
    datasets["ethos"]
    .rename(columns={"comment": "text", "isHate": "class"})
    .assign(**{"class": lambda frame: frame["class"].astype("int")})
)

In [4]:
def pre_process(data):
    """Return ``data`` converted to lower case."""
    return data.lower()
def sep_rem(data):
    """Delete every character that is not an ASCII letter, digit, ``#``, ``@`` or space.

    The remaining text is returned with leading/trailing whitespace trimmed.
    """
    kept = re.sub(r"[^a-zA-Z0-9#@ ]","",data)
    return kept.strip()
def remove_hashtag(data):
    """Strip hashtags (``#`` plus the run of non-space characters after it) and trim the ends."""
    return re.sub(r"#\S+", "", data).strip()
def remove_mentions(data):
    """Strip ``@`` mentions (``@`` plus the run of non-space characters after it) and trim the ends."""
    without_mentions = re.sub(r"@\S+", "", data)
    return without_mentions.strip()
def remove_NCR(data):
    """Remove HTML character references from ``data`` and trim the ends.

    Three forms are removed: decimal references (``&#8220;``), hexadecimal
    references (``&#x1f600;``) and named references (``&amp;``). For named
    references the terminating ``;`` is optional, so a bare ``&amp`` is still
    removed, but when the ``;`` is present it is consumed together with the
    name instead of being left behind as a stray character.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without the matched references, stripped of leading and
        trailing whitespace.
    """
    data = re.sub(r"&#[0-9]+;|&#x[0-9a-fA-F]+;|&[a-zA-Z]+;?", "", data)
    return data.strip()
def remove_RT(data):
    """Remove standalone lower-case ``rt`` tokens (retweet markers) and trim the ends.

    A token is removed when it sits at the start of the string or right after
    whitespace, and is followed by whitespace. Only the two letters are
    deleted: the surrounding whitespace is matched with look-around assertions
    and left in place, so the neighbouring words are never glued together and
    consecutive ``rt rt`` markers are all removed.

    The match is case-sensitive; callers are expected to lower-case first.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without the matched tokens, stripped of leading and trailing
        whitespace. Inner runs of spaces are not collapsed here.
    """
    data = re.sub(r"(?:^|(?<=\s))rt(?=\s)", "", data)
    return data.strip()
def remove_links(data):
    """Replace every ``http://`` / ``https://`` URL with a single space and trim the ends.

    A URL is taken to run from the scheme up to the next whitespace character,
    so query strings, hyphens, percent-escapes and similar characters are
    removed together with the rest of the link instead of being left behind
    as fragments.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` with each URL replaced by one space, stripped of leading and
        trailing whitespace.
    """
    data = re.sub(r"https?://\S+", " ", data)
    return data.strip()
def remove_spaces(data):
    """Collapse each run of space characters into one space and trim the ends."""
    return re.sub(r" +", " ", data).strip()
    

def process_data(df):
    """Clean ``df['text']`` and store the result in a new ``df['clean']`` column.

    Each text goes through the cleaning helpers in a fixed order: lower-casing,
    link removal, character-reference removal, hashtag removal, mention
    removal, character filtering, ``rt`` removal and space collapsing.

    The frame is modified in place and also returned.
    """
    cleaning_steps = (
        pre_process,
        remove_links,
        remove_NCR,
        remove_hashtag,
        remove_mentions,
        sep_rem,
        remove_RT,
        remove_spaces,
    )

    cleaned = df['text']
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)

    df["clean"] = cleaned
    return df


In [5]:
# Pretrained embeddings loaded through gensim's downloader API.
pretrained_model = api.load('word2vec-google-news-300')

# Every key of the model's vocabulary, in the order of its key_to_index mapping.
word_indices = list(pretrained_model.key_to_index)

# The "count" attribute stored in the model for each vocabulary entry.
word_freq = {
    word: pretrained_model.get_vecattr(word, "count")
    for word in word_indices
}

def text_to_vectors(text, max_length=100):
    """Turn ``text`` into a fixed-size matrix of pretrained word vectors.

    The text is split on whitespace. Tokens missing from ``pretrained_model``
    are skipped; the vectors of the first ``max_length`` known tokens fill the
    leading rows, and all remaining rows stay zero.

    Parameters
    ----------
    text : str
        Text to embed.
    max_length : int, default 100
        Number of rows in the returned matrix.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(max_length, pretrained_model.vector_size)``.
        Text without any known token yields an all-zero matrix.
    """
    # Take the width from the model so a different embedding can be swapped in.
    embedding_dim = pretrained_model.vector_size
    padded_vectors = np.zeros((max_length, embedding_dim), dtype=np.float32)

    filled_rows = 0
    for token in text.split():
        # Stop as soon as the matrix is full: later tokens would be discarded.
        if filled_rows >= max_length:
            break
        if token in pretrained_model:
            padded_vectors[filled_rows] = pretrained_model[token]
            filled_rows += 1

    return padded_vectors

# Feature Representation
def feature_rep(df, feature_type="TFIDF", resample=True, split=0.3):
    """Build the feature matrix, one-hot labels and a matching Keras input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``clean`` text column and a ``class`` label column.
        With ``feature_type="W2V"`` the columns ``tokenized_text`` and
        ``text_vector`` are added to ``df`` in place.
    feature_type : {"TFIDF", "BOW", "W2V"}, default "TFIDF"
        ``"TFIDF"`` gives a sparse TF-IDF matrix over 1- to 3-grams,
        ``"BOW"`` a dense term-count array, and ``"W2V"`` a 3-D array of
        stacked ``text_to_vectors`` outputs.
    resample : bool, default True
        Currently unused; accepted so existing calls keep working.
    split : float, default 0.3
        Currently unused; accepted so existing calls keep working.

    Returns
    -------
    input_layer
        Keras ``Input`` whose shape is ``X.shape[1:]``.
    X
        Feature matrix for the chosen ``feature_type``.
    y : numpy.ndarray
        One-hot encoding of ``df['class']`` as integers.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the supported options.
    """
    # Imported here because this function is defined before the notebook cell
    # that imports the Keras layers.
    from keras.layers import Input

    start_time = time.time()

    if feature_type == "TFIDF":  # TFIDF
        documents = df["clean"].tolist()
        tfidf_vectorizer = TfidfVectorizer(
            lowercase=False,
            ngram_range=(1, 3),
            max_features=10000,
            min_df=5,
            max_df=0.501,
        )
        X = tfidf_vectorizer.fit_transform(documents)

    elif feature_type == "BOW":  # Bag of words
        documents = df["clean"].tolist()
        X = CountVectorizer().fit_transform(documents).toarray()

    elif feature_type == "W2V":  # Word2Vec
        # Tokenize
        df['tokenized_text'] = df['clean'].apply(word_tokenize)

        # Apply text_to_vectors function to convert text to Word2Vec vectors
        df['text_vector'] = df['clean'].apply(text_to_vectors)

        # Convert document vectors to numpy array
        X = np.stack(df['text_vector'].values, axis=0)

    else:
        raise ValueError(
            f"Unknown feature_type {feature_type!r}; expected 'TFIDF', 'BOW' or 'W2V'"
        )

    # Labels and input layer are built for every feature type so that the
    # return statement never refers to an undefined name.
    y = pd.get_dummies(df['class']).values.astype(int)
    input_layer = Input(X.shape[1:])

    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print("Feature Type:", feature_type)
    print("Feature Rep Elapsed time:", round(elapsed_time,2), "seconds")
    print("")
    return input_layer, X, y

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential, Model
from keras.layers import LSTM, Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten, InputLayer, Input, Dropout, Concatenate
from sklearn.metrics import f1_score, accuracy_score

In [7]:
def _build_complex_cnn(input_shape):
    """Assemble the multi-kernel Conv1D -> LSTM -> dense softmax network (uncompiled)."""
    inputs = Input(input_shape)

    # Parallel convolutions with different kernel widths over the same input.
    conv_outputs = [
        Conv1D(filters=100, kernel_size=kernel_size, activation='relu')(inputs)
        for kernel_size in (2, 3, 4)
    ]

    merged = Concatenate(axis=1)(conv_outputs)
    sequence = LSTM(32, return_sequences=True)(merged)
    pooled = GlobalMaxPooling1D()(sequence)
    hidden = Dense(16, activation='relu')(pooled)
    outputs = Dense(2, activation='softmax')(hidden)
    return Model(inputs=inputs, outputs=outputs)


def train_eval_model(input_layer, X, y, model="complex CNN", batch_size=100, epochs=30, num_folds=5, target_class=1):
    """Train and evaluate a network with K-fold cross-validation.

    A new network is built, compiled and trained for every fold, so no
    weights learned on one fold are carried into the next. Per-fold precision
    and recall (for ``target_class``) and macro / weighted F1 are averaged
    over the folds, printed and returned.

    Parameters
    ----------
    input_layer
        Kept for backward compatibility. It is not wired into the networks:
        each fold creates its own ``Input`` from ``X.shape[1:]``.
    X : numpy.ndarray
        Feature array, indexed by sample along the first axis.
    y : numpy.ndarray
        One-hot labels with two columns, aligned with ``X``.
    model : str, default "complex CNN"
        Name of the architecture to build. Only ``"complex CNN"`` is supported.
    batch_size, epochs : int
        Passed to ``fit``.
    num_folds : int, default 5
        Number of cross-validation folds.
    target_class : int, default 1
        Label treated as the positive class for precision and recall.

    Returns
    -------
    dict
        Fold-averaged ``precision``, ``recall``, ``f1_macro`` and ``f1_weighted``.

    Raises
    ------
    ValueError
        If ``model`` is not a supported architecture name.
    """
    import gc

    if model != "complex CNN":
        raise ValueError(f"Unknown model {model!r}; expected 'complex CNN'")

    kf = KFold(n_splits=num_folds)

    all_precisions, all_recalls, all_f1_macro, all_f1_weighted = [], [], [], []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        tf.compat.v1.reset_default_graph()
        tf.keras.backend.clear_session()

        # Bind the network to its own name: overwriting the ``model`` argument
        # would make the architecture check fail on later folds and silently
        # reuse the network already trained on the previous fold.
        fold_model = _build_complex_cnn(X.shape[1:])

        # Estimate memory usage (assuming 32-bit float)
        num_params = np.sum([np.prod(w.shape) for w in fold_model.trainable_variables])
        memory_usage = num_params * 4
        print("Estimated model size (bytes):", memory_usage)

        # With a 2-unit softmax and one-hot targets this loss has the same
        # value as categorical cross-entropy.
        fold_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Train the model
        fold_model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=1)

        start_time = time.time()

        y_pred = np.argmax(fold_model.predict(X_val), axis=1)
        y_true = np.argmax(y_val, axis=1)

        end_time = time.time()
        elapsed_time = end_time - start_time

        print("Predict time:", round(elapsed_time,7), "seconds")

        # Precision and recall are reported for the requested target class.
        all_precisions.append(precision_score(y_true, y_pred, average='binary', pos_label=target_class))
        all_recalls.append(recall_score(y_true, y_pred, average='binary', pos_label=target_class))
        all_f1_macro.append(f1_score(y_true, y_pred, average='macro'))
        all_f1_weighted.append(f1_score(y_true, y_pred, average='weighted'))

        # Drop this fold's network and array copies before the next fold
        # allocates its own.
        # NOTE(review): the saved run failed with a CPU->GPU copy error on a
        # later fold; confirm this release is enough on the target GPU.
        del fold_model, X_train, X_val, y_train, y_val
        gc.collect()

    avg_precision = sum(all_precisions) / num_folds
    avg_recall = sum(all_recalls) / num_folds
    avg_f1_macro = sum(all_f1_macro) / num_folds
    avg_f1_weighted = sum(all_f1_weighted) / num_folds

    print("Average Precision for Class", target_class, ":", avg_precision)
    print("Average Recall for Class", target_class, ":", avg_recall)
    print("Average F1 Macro", avg_f1_macro)
    print("Average F1 Weighted", avg_f1_weighted)

    return {
        "precision": avg_precision,
        "recall": avg_recall,
        "f1_macro": avg_f1_macro,
        "f1_weighted": avg_f1_weighted,
    }

    

In [8]:
# Corpus used for this run. Options: davidson, hateval, ethos
dataset = "davidson"
print("Dataset: ", dataset, end="\n\n")

# Run the cleaning pipeline on the selected corpus.
df = process_data(datasets[dataset])

Dataset:  davidson



In [9]:
# Build Word2Vec features, one-hot labels and the Keras input for the cleaned corpus.
input_layer, X, y = feature_rep(
    df,
    feature_type="W2V",
    resample=True,
    split=0.3,
)

# Show the shapes of the feature and label arrays.
print(X.shape)
print(y.shape)

Feature Type: W2V
Feature Rep Elapsed time: 3.74 seconds

(24783, 100, 300)
(24783, 2)


In [10]:
# Cross-validate the CNN on the features built above.
train_eval_model(
    input_layer,
    X,
    y,
    model="complex CNN",
    batch_size=30,
    epochs=2,
    num_folds=4,
    target_class=1,
)

Estimated model size (bytes): 1151544
Epoch 1/2
Epoch 2/2
Predict time: 2.1379988 seconds
Estimated model size (bytes): 1151544
Epoch 1/2
Epoch 2/2
Predict time: 1.4399993 seconds
Estimated model size (bytes): 1151544
Epoch 1/2
Epoch 2/2
Predict time: 1.427001 seconds
Estimated model size (bytes): 1151544


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [11]:
# Manual cleanup cell: resets the default TF1-style graph and clears the Keras
# backend session (the same two calls train_eval_model makes at the start of each fold).
tf.compat.v1.reset_default_graph()
tf.keras.backend.clear_session()