In [1]:
%load_ext autoreload
%autoreload

In [2]:
import pandas as pd

In [3]:
# Load the raw dataset; the file uses ';' as its field separator.
data = pd.read_csv("Ethos_Dataset_Binary.csv", sep=";")

### Dataset preparation

In [4]:
# Directories holding the data at each processing stage (trailing slashes kept).
initial_dpath, intermediate_dpath, prepared_dpath = (
    "./data/initial_data/",
    "./data/intermediate_data/",
    "./data/prepared_data/",
)

In [5]:
import os
# Create the data directories if they are missing.
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for a directory and creating it.
for dpath in (initial_dpath, intermediate_dpath, prepared_dpath):
    os.makedirs(dpath, exist_ok=True)

In [6]:
from pathlib import PurePath

# Export the first column of `data` to comments.csv, one value per line.
# The column is iterated by value rather than by feeding index labels to the
# positional `.iloc` indexer, so the export does not rely on `data` having a
# default 0..n-1 index.
# NOTE(review): a value that itself contains a newline will span several lines
# in the output file — confirm the downstream reader tolerates that.
with open(PurePath(initial_dpath, "comments.csv"), "w") as f:
    for comment in data.iloc[:, 0]:
        f.write(str(comment) + "\n")

In [7]:
from pathlib import PurePath

# Export the second column of `data` to labels.csv, one value per line.
# The column is iterated by value rather than by feeding index labels to the
# positional `.iloc` indexer, so the export does not rely on `data` having a
# default 0..n-1 index.
with open(PurePath(initial_dpath, "labels.csv"), "w") as f:
    for label in data.iloc[:, 1]:
        f.write(str(label) + "\n")

In [8]:
# Run the project's `prepare_data` task through poethepoet (`poe`) inside the Poetry environment.
# Per the recorded output below, the task executes `poetry run python utils/preprocess_text.py`.

!poetry run poe prepare_data

Poe => poetry run python utils/preprocess_text.py


## Prepare Word2Vec model

In [9]:
# use parsers
from utils.parsers import wordtovec_parse_args

# Directory where models are saved; created on first run.
model_path = "./models"
# `os` has no `mkdirs`; `os.makedirs` is the recursive directory creator.
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs(model_path, exist_ok=True)
# Playbook file handed to the argument parser via `--playbook`.
playbook_path = "./playbooks/word2vec_params.json"
# Passed to the parser as a CLI-style string; set to "True" if training is needed.
train = "False"

In [10]:
# argv-style list for the Word2Vec argument parser: each flag is followed by its value.
parser_text = (
    ["--prepared_dpath", prepared_dpath]
    + ["--model_path", model_path]
    + ["--train", train]
    + ["--playbook", playbook_path]
)

In [11]:
# Build the project's Word2Vec argument parser, then parse the prepared argument list.
w2v_arg_parser = wordtovec_parse_args()
args = w2v_arg_parser.parse_args(parser_text)

In [12]:
# download word2vec model
from utils.wordtovec import WordToVec

In [13]:
# Construct the project's WordToVec wrapper from the parsed arguments and keep
# whatever `load_model()` returns as the object used for vectorization later on.
w2v_wrapper = WordToVec(args)
word2vec = w2v_wrapper.load_model()

## Prepare the model

In [16]:
# Random seed passed as `random_state` to both train/test splits below.
seed: int = 1

In [38]:
import ast

# Labels: a single unnamed column (header=None), cast to int.
labels = pd.read_csv("./data/initial_data/labels.csv", header=None)
labels[0] = labels[0].astype(int)

# Tokenized comments: the whole file is read and parsed as one Python literal.
# ast.literal_eval is used instead of eval so the file's contents can never be
# executed as code; it accepts only literals (lists, strings, numbers, ...).
# NOTE(review): assumes the preprocessing step writes a plain literal such as a
# list of token lists — confirm against utils/preprocess_text.py.
with open("./data/prepared_data/comments_tokinize.csv", "r") as f:
    comments = ast.literal_eval(f.read())

In [39]:
# Show the distinct label values found in the single label column.
labels.loc[:, 0].unique()

array([1, 0])

In [40]:
# Keep only the comments that are not equal to the empty list.
# NOTE(review): `labels` is not filtered alongside `comments`; if anything is
# dropped here the two no longer line up row-for-row — confirm.
comments = list(filter(lambda tokens: tokens != [], comments))

In [41]:
# Sanity check: the number of label rows must equal the number of comments.
assert labels.shape[0] == len(comments)

In [42]:
from sklearn.model_selection import train_test_split

# First split: hold out 10% of the data (validX / validY), stratified by label.
X, validX, Y, validY = train_test_split(
    comments,
    labels,
    test_size=0.1,
    random_state=seed,
    shuffle=True,
    stratify=labels,
)

In [43]:
# Second split: divide the remaining data 80/20 into train and test parts,
# again stratified by label and using the same seed.
trainX, testX, trainY, testY = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
    stratify=Y,
)

In [44]:
import numpy as np
def vectorize(sentence, w2v_model):
    """Return the mean word vector of a tokenized sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one sentence.
    w2v_model : object
        Model exposing a `wv` attribute that supports `word in wv` and
        `wv[word]` lookups.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in `wv`.
        If no token is found, a zero vector is returned whose length is
        `wv.vector_size` when that attribute exists, otherwise 300.
    """
    keyed_vectors = w2v_model.wv
    words_vecs = [keyed_vectors[word] for word in sentence if word in keyed_vectors]
    if len(words_vecs) == 0:
        # Match the model's embedding width instead of assuming 300 dimensions;
        # 300 is kept as the fallback for objects without `vector_size`.
        return np.zeros(getattr(keyed_vectors, "vector_size", 300))
    return np.array(words_vecs).mean(axis=0)

In [45]:
def _sentences_to_matrix(sentences):
    """Vectorize each sentence with `word2vec.vectorize_sentence` and stack the results."""
    return np.array([word2vec.vectorize_sentence(tokens) for tokens in sentences])


# Replace each split's token lists with its array of sentence vectors.
trainX = _sentences_to_matrix(trainX)
testX = _sentences_to_matrix(testX)
validX = _sentences_to_matrix(validX)

## Train the model and optimize hyperparameters

In [90]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import optuna
from dvclive.optuna import DVCLiveCallback

def objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on the test split.

    Samples `n_estimators`, `max_depth` and `learning_rate` from `trial`,
    fits on the notebook-level `trainX` / `trainY`, predicts `testX`, and
    compares the predictions with `testY`.

    Returns
    -------
    tuple
        (macro F1, accuracy, macro precision, macro recall), in that order.
    """
    hyperparams = {
        "n_estimators": trial.suggest_int('n_estimators', 1, 10),
        "max_depth": trial.suggest_int('max_depth', 1, 10),
        "learning_rate": trial.suggest_float('learning_rate', 1e-10, 1),
    }
    classifier = XGBClassifier(objective='binary:logistic', **hyperparams)

    classifier.fit(trainX, trainY)
    predicted = classifier.predict(testX)

    # Flatten the label frame once and reuse it for every metric.
    expected = testY.values.reshape(-1)
    f1 = f1_score(expected, predicted, average="macro")
    accuracy = accuracy_score(expected, predicted)
    precision = precision_score(expected, predicted, average="macro")
    recall = recall_score(expected, predicted, average="macro")
    return (f1, accuracy, precision, recall)

In [91]:
# Multi-objective study: all four values returned by `objective` are maximized.
study = optuna.create_study(directions=["maximize"] * 4)
# Run 10 trials, reporting each one through the DVCLive callback.
study.optimize(objective, callbacks=[DVCLiveCallback()], n_trials=10)

[I 2023-06-10 16:17:35,125] A new study created in memory with name: no-name-3ad34163-2aa2-4193-82f2-7454405d7fb6
[I 2023-06-10 16:17:35,778] Trial 0 finished with values: [0.48290241868223516, 0.8277777777777777, 0.5454545454545454, 0.5073076044759077] and parameters: {'n_estimators': 3, 'max_depth': 3, 'learning_rate': 0.7622519581258735}. 
	Ethos_Dataset_Binary.csv, playbooks/word2vec_params.json, utils/wordtovec.py
[I 2023-06-10 16:17:38,298] Trial 1 finished with values: [0.4966829321782992, 0.7166666666666667, 0.4970439189189189, 0.49680292304179036] and parameters: {'n_estimators': 2, 'max_depth': 7, 'learning_rate': 0.8824061578281969}. 
	Ethos_Dataset_Binary.csv, playbooks/word2vec_params.json, utils/wordtovec.py
[I 2023-06-10 16:17:42,940] Trial 2 finished with values: [0.5094505494505495, 0.8277777777777777, 0.5890804597701149, 0.5212377255081069] and parameters: {'n_estimators': 9, 'max_depth': 8, 'learning_rate': 0.5008891225948034}. 
	Ethos_Dataset_Binary.csv, playbooks/w