In [10]:
import pandas as pd
import numpy as np
import random
import re
import time
import sys
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import gensim.downloader as api
import gensim
from gensim.models import KeyedVectors
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import enchant
import joblib
import os
from imblearn.over_sampling import RandomOverSampler

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
datasets = {
    "davidson" : pd.read_csv("literature\datasets\dt\labeled_data_all_2classes_only.csv", index_col=0),
    "hateval" : pd.read_csv("literature\\datasets\\hateval\\train_en.tsv", sep='\t', index_col=0),
    "ethos" : pd.read_csv("literature\\datasets\\ethos\\Ethos_Dataset_Binary.csv", sep=';')
}

In [4]:
# Dataset preprocess
datasets["davidson"] = datasets["davidson"].rename({"tweet":"text"}, axis=1)
datasets["davidson"]['class'] = datasets["davidson"]['class'].replace({0: 1, 2: 0})

datasets["hateval"] = datasets["hateval"].rename({"HS":"class"}, axis=1)

datasets["ethos"] = datasets["ethos"].rename({"comment":"text", "isHate":"class"}, axis=1)
datasets["ethos"]['class'] = datasets["ethos"]['class'].astype("int")

In [5]:
def pre_process(data):
    df_pm = data.lower()
    return df_pm
def sep_rem(data):
    df_pm = data
    # df_pm = re.sub(r"[-()\"/;:<>{}`+=~|.!?,]", "", df_pm)
    df_pm = re.sub(r"[^a-zA-Z0-9#@ ]","",df_pm)
    return df_pm.strip()
def remove_hashtag(data):
    df_pm = re.sub(r"#\S+", "", data)
    return df_pm.strip()
def remove_mentions(data):
    data = re.sub(r"@\S+", "", data)
    return data.strip()
def remove_NCR(data):
    data = re.sub(r"&#[0-9]+;|&#x[0-9a-fA-F]+;|&[a-zA-Z]+", "", data)
    return data.strip()
def remove_RT(data):
    data = re.sub(r"(^|\s)rt\s", "", data)
    return data.strip()
def remove_links(data):
    data = re.sub(r"https?://(?:[\w./])+", " ", data)
    data = re.sub(r"http?://(?:[\w./&#])+", " ", data)
    return data.strip()
def remove_spaces(data):
    data = re.sub(r" +", " ", data)
    return data.strip()
    
d = enchant.Dict("en_US")

def spell_check_and_replace(text):
    # Tokenize the text into words
    words = np.array(text.split())
    
    # Create an empty array to store the misspelled mask
    misspelled_mask = np.zeros(len(words), dtype=bool)
    
    # Iterate through each word
    for i, word in enumerate(words):
        # Check if the word is misspelled
        if not d.check(word):
            # Update the misspelled mask
            misspelled_mask[i] = True
    
    # Get suggestions for misspelled words
    misspelled_words = words[misspelled_mask]
    suggestions = [d.suggest(word) for word in misspelled_words]
    
    # Replace misspelled words with the first suggestion
    for i, suggestion_list in enumerate(suggestions):
        if suggestion_list:
            misspelled_word_index = np.where(words == misspelled_words[i])[0][0]
            words[misspelled_word_index] = suggestion_list[0]
    
    # Join the corrected words back into a single string
    corrected_text = ' '.join(words)
    
    return corrected_text
def process_data(df):
    processed_column = df['text']
    processed_column = processed_column.apply(lambda x:pre_process(x))
    processed_column = processed_column.apply(lambda x:remove_links(x))
    processed_column = processed_column.apply(lambda x:remove_NCR(x))
    processed_column = processed_column.apply(lambda x:remove_hashtag(x))
    processed_column = processed_column.apply(lambda x:remove_mentions(x))
    processed_column = processed_column.apply(lambda x:sep_rem(x))
    processed_column = processed_column.apply(lambda x:remove_RT(x))
    processed_column = processed_column.apply(lambda x:remove_spaces(x))
    # processed_column = processed_column.apply(lambda x:spell_check_and_replace(x))
    df["clean"] = processed_column
    return df



In [6]:

pretrained_model = api.load('word2vec-google-news-300')
word_indices = list(pretrained_model.key_to_index.keys())
word_freq = {word: pretrained_model.get_vecattr(word, "count") for word in word_indices}

def text_to_vectors(text):
    # Tokenize the text (assuming it's already tokenized)
    tokens = text.split()  # You may need a different tokenization method depending on your data
    
    # Get word vectors for each token
    word_vectors = [pretrained_model[token] for token in tokens if token in pretrained_model]

    if not word_vectors:
        # Return zero-filled array of the desired shape
        return np.zeros((100, 300), dtype=np.float32)
    
    # Pad sequence
    max_length = 100
    padded_vectors = np.zeros((max_length, 300), dtype=np.float32)
    num_vectors = min(len(word_vectors), max_length)
    padded_vectors[:num_vectors] = word_vectors[:max_length]
    
    return padded_vectors

# Feature Representation
def feature_rep(df, feature_type="TFIDF"):
    start_time = time.time()
    if feature_type == "TFIDF": # TFIDF
        documents = df["clean"].tolist()
        
        tfidf_vectorizer = TfidfVectorizer(lowercase=False,ngram_range=(1, 3),max_features=10000,min_df=5,max_df=0.501)
        X = tfidf_vectorizer.fit_transform(documents)
    
    elif feature_type == "BOW": # Bag of words
        vectorizer = CountVectorizer()
        documents = df["clean"].tolist()
    
        bow_matrix = vectorizer.fit_transform(documents)
        X = bow_matrix.toarray()
    
        vocabulary = vectorizer.get_feature_names_out()
    elif feature_type == "W2V": # Word2Vec
        df['tokenized_text'] = df['clean'].apply(lambda x: word_tokenize(x))
        
        # Initialize Word2Vec model with parameters
        w2v_model = Word2Vec(df['tokenized_text'], vector_size=300, window=5, min_count=1, workers=4)
        
        # Build vocabulary and update it with word frequency
        w2v_model.build_vocab_from_freq(word_freq, update=True)
        w2v_model.wv.vectors_lockf = np.ones(len(w2v_model.wv), dtype=np.dtype(float))
        
        # Load pretrained Word2Vec vectors and intersect with the current model
        pretrained_model_path = api.info('word2vec-google-news-300')['file_name']
        pretrained_model_path_full = "C:\\Users\\Portul\\gensim-data\\word2vec-google-news-300\\" + pretrained_model_path
        w2v_model.wv.intersect_word2vec_format(pretrained_model_path_full, binary=True, lockf=1.0)
        
        # Apply text_to_vectors function to convert text to Word2Vec vectors
        df['text_vector'] = df['clean'].apply(text_to_vectors)
        
        # Average the word vectors for each document
        document_vectors = df['text_vector'].apply(lambda x: np.mean(x, axis=0) if len(x) > 0 else np.zeros(300))
        
        # Convert document vectors to numpy array
        X = np.vstack(document_vectors)
        
    end_time = time.time()
        
    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print("Feature Type:", feature_type)
    print("Feature Rep Elapsed time:", round(elapsed_time,2), "seconds")
    print("")
    return X, df["class"].tolist()


In [7]:
def train_classifier(X, y, model_type="SVM"):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
    
    
    f1_scorer = make_scorer(f1_score, average='weighted')
    cv_scores = []
    elapsed_times = []
    
    start_time = time.time()
    
        
    if model_type == "SVM":
        print("Model Type:", model_type)
        model = LinearSVC(class_weight='balanced', C=0.01, penalty='l2', loss='squared_hinge', multi_class='ovr')
        cv_scores = cross_val_score(model, X, y, cv=5, scoring=f1_scorer )
    
        start_time = time.time()
        model.fit(X_train, y_train)
    
    elif model_type == "NB":
        print("Model Type:", model_type)
        model = MultinomialNB()
        cv_scores = cross_val_score(model, X, y, cv=5, scoring=f1_scorer )
        
        start_time = time.time()
        model.fit(X_train, y_train)
    
    elif model_type == "LR":
        print("Model Type:", model_type)
        model = LogisticRegression(class_weight='balanced', penalty="l2")
        cv_scores = cross_val_score(model, X, y, cv=5, scoring=f1_scorer )
        
        start_time = time.time()
        model.fit(X_train, y_train)
    
    end_time = time.time()
        
    # Save the model to a file
    joblib.dump(model, 'temp_model.pkl')
    
    # Get the size of the saved file
    model_file_size = os.path.getsize('temp_model.pkl')
    
    elapsed_time = end_time - start_time
    pred = model.predict(X_val)

    report = classification_report(y_val, pred, output_dict=True)

    
    print("Cross-Validation Scores:", cv_scores)
    print("Mean CV F1:", cv_scores.mean())
    print("Standard Deviation of CV F1:", cv_scores.std())
    
    print(classification_report(y_val, pred))
    print(f"{round(report['1']['precision'], 2)}/{round(report['1']['recall'], 2)}/{round(report['1']['f1-score'], 2)}/{round(report['macro avg']['f1-score'], 2)}/{round(cv_scores.mean(), 2)}")
    print("Elapsed time:", round(elapsed_time,3), "seconds")
    print("Model file size:", model_file_size, "bytes")

In [12]:
dataset = "davidson"
print("Dataset: ", dataset)
print("")

df = datasets[dataset]
df = process_data(df)
X, y = feature_rep(df, feature_type="TFIDF")
train_classifier(X, y, model_type="SVM")

Dataset:  davidson

Feature Type: TFIDF
Feature Rep Elapsed time: 1.19 seconds

Model Type: SVM
Cross-Validation Scores: [0.88857946 0.8905991  0.91636755 0.91723893 0.91323786]
Mean CV F1: 0.9052045779364535
Standard Deviation of CV F1: 0.012834997317196705
              precision    recall  f1-score   support

           0       0.97      0.90      0.94      7023
           1       0.27      0.60      0.37       412

    accuracy                           0.89      7435
   macro avg       0.62      0.75      0.66      7435
weighted avg       0.94      0.89      0.91      7435

0.27/0.6/0.37/0.66/0.91
Elapsed time: 0.017 seconds
Model file size: 80753 bytes


