In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_set = pd.read_csv('dataset/train_processed.csv')
train_set.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_set = pd.read_csv('dataset/test_processed.csv')
test_set.head()

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer over 1- to 3-grams, with scikit-learn's built-in English
# stop-word list and at most 15,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_features=15000,
)

In [None]:
# Learn the vocabulary and IDF weights from the training text only.
tfidf_X_train = vectorizer.fit_transform(train_set['clean_text'])
# Project the test text onto the vocabulary fitted above. Calling
# fit_transform here would refit the vectorizer on the test set, so the test
# columns would no longer correspond to the features the model is trained on.
tfidf_X_test = vectorizer.transform(test_set['clean_text'])

### Word2Vec

In [None]:
!pip install gensim

In [None]:
import gensim.downloader as api

In [None]:
# Load the 'word2vec-google-news-300' model through gensim's downloader API.
# `wv` is indexed by token in text_vectorize below.
wv = api.load('word2vec-google-news-300')

In [None]:
def text_vectorize(ls, model=None, dim=300):
    """Average the word embeddings of the tokens in ``ls``.

    Parameters
    ----------
    ls : iterable of str
        Tokens of a single document.
    model : optional
        Any object that supports ``model[token]`` and raises ``KeyError`` for
        unknown tokens. Defaults to the notebook-level ``wv``.
    dim : int, optional
        Length of the vector returned when no token is found in ``model``.
        It should match the embedding size of ``model`` (300 by default).

    Returns
    -------
    numpy.ndarray
        Element-wise float64 mean of the vectors of all known tokens. When no
        token is known (or ``ls`` is empty) a vector of ``dim`` NaN values is
        returned, so that such rows can be imputed later.
    """
    if model is None:
        model = wv

    vector_values = []
    for token in ls:
        try:
            vector_values.append(model[token])
        except KeyError:
            # Out-of-vocabulary token: leave it out of the average.
            continue

    if not vector_values:
        # Return the NaN vector explicitly. Dividing the zero vector by
        # len(vector_values) == 0 gives the same values, but only by way of a
        # divide-by-zero RuntimeWarning.
        return np.full(dim, np.nan)

    return np.mean(vector_values, axis=0, dtype=np.float64)

In [None]:
def w2v_process(column):
    """Turn every text in ``column`` into one averaged embedding vector.

    Parameters
    ----------
    column : iterable
        Texts to embed, e.g. a pandas Series or a list. Values are visited in
        order, so the Series index does not have to be 0..n-1.

    Returns
    -------
    list
        One result of ``text_vectorize`` per element of ``column``, in the
        same order.
    """
    output = []
    # Iterate over the values themselves: ``column[i]`` is a label lookup on a
    # Series and fails when the index is not a contiguous 0..n-1 range.
    for text in column:
        # A non-string cell (e.g. NaN read from an empty CSV field) has no
        # tokens; it is passed on as an empty token list instead of crashing
        # on ``.split()``.
        tokens = text.split() if isinstance(text, str) else []
        output.append(text_vectorize(tokens))
    return output

In [None]:
# Build one averaged word vector per row of the 'clean_text' column,
# separately for the training and the test frame.
train_w2v = w2v_process(train_set['clean_text'])
test_w2v = w2v_process(test_set['clean_text'])

In [None]:
# Store the vectors as a new 'w2v' column; each cell holds one vector.
train_set['w2v'] = train_w2v
test_set['w2v'] = test_w2v

### XGBClassifier

In [None]:
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

#### TF-IDF

In [None]:
# XGBoost classifier that is fitted on the TF-IDF features below.
xgb_model_tfidf = XGBClassifier(
    # Histogram tree method on CUDA device 0.
    device='cuda:0',
    tree_method='hist',
    # Number of trees, tree depth and learning rate.
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    # Row and column subsampling ratios.
    colsample_bytree=0.8,
    subsample=0.8,
    # Fixed seed.
    random_state=42,
)

In [None]:
# Fit on the training TF-IDF matrix with the 'label_num' column as target.
xgb_model_tfidf.fit(tfidf_X_train, train_set['label_num'])

In [None]:
# Predict on the test TF-IDF matrix and print accuracy, F1, precision and
# recall against test_set['label_num'] (each metric called with its defaults).
y_pred = xgb_model_tfidf.predict(tfidf_X_test)
print(f"Accuracy: {metrics.accuracy_score(test_set['label_num'], y_pred)}")
print(f"F1 Score: {metrics.f1_score(test_set['label_num'], y_pred)}")
print(f"Precision: {metrics.precision_score(test_set['label_num'], y_pred)}")
print(f"Recall: {metrics.recall_score(test_set['label_num'], y_pred)}")

#### Word2Vec

In [None]:
# XGBoost classifier that is fitted on the averaged Word2Vec features below.
xgb_model_w2v = XGBClassifier(
    # Histogram tree method on CUDA device 0.
    device='cuda:0',
    tree_method='hist',
    # Number of trees, tree depth and learning rate.
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    # Row and column subsampling ratios.
    colsample_bytree=0.8,
    subsample=0.8,
    # Fixed seed.
    random_state=42,
)

In [None]:
# Stack the per-row vectors of the 'w2v' column into a single array
# (one row per document) and display its shape.
train_w2v_array = np.vstack(train_set['w2v'].to_numpy())
train_w2v_array.shape

In [None]:
# Replace NaN entries with the column mean. text_vectorize yields NaN for a
# document without any known token. The imputer is fitted on the training
# array here and re-used for the test array later.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_w2v_array_imputed = imputer.fit_transform(train_w2v_array)

In [None]:
# Fit on the imputed training vectors with the 'label_num' column as target.
xgb_model_w2v.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
# Stack the test vectors and impute them with the imputer fitted on the
# training array (transform only, no refit), then predict and print accuracy,
# F1, precision and recall against test_set['label_num'].
test_w2v_array = np.vstack(test_set['w2v'].to_numpy())
test_w2v_array_imputed = imputer.transform(test_w2v_array)
y_pred = xgb_model_w2v.predict(test_w2v_array_imputed)
print(f"Accuracy: {metrics.accuracy_score(test_set['label_num'], y_pred)}")
print(f"F1 Score: {metrics.f1_score(test_set['label_num'], y_pred)}")
print(f"Precision: {metrics.precision_score(test_set['label_num'], y_pred)}")
print(f"Recall: {metrics.recall_score(test_set['label_num'], y_pred)}")

### XGBClassifier - Tuned

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space shared by both grid searches below. 'tree_method' and 'device'
# each list a single value, so only the first three entries vary
# (3 x 3 x 2 = 18 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    subsample=[0.8, 1.0],
    tree_method=['hist'],
    device=['cuda:0'],
)

#### TF-IDF

In [None]:
# 5-fold grid search over param_grid for the TF-IDF model, scored by F1.
# NOTE(review): n_jobs=-1 lets the CV fits run in parallel while every
# candidate uses device 'cuda:0' (see param_grid) - confirm the GPU has enough
# memory for concurrent fits.
grid_search_tfidf = GridSearchCV(
    xgb_model_tfidf,
    param_grid,
    cv=5,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training TF-IDF matrix and the 'label_num' target.
grid_search_tfidf.fit(tfidf_X_train, train_set['label_num'])

In [None]:
# Show the parameter combination selected by the TF-IDF grid search.
print("Best parameters found: ", grid_search_tfidf.best_params_)

In [None]:
# "Tuned" XGBoost classifier for the TF-IDF features.
# NOTE(review): the values below are hard-coded; presumably n_estimators and
# max_depth were copied from grid_search_tfidf.best_params_ - confirm they
# still match (including subsample) whenever the search is re-run.
xgb_model_tfidf_tuned = XGBClassifier(
    # Histogram tree method on CUDA device 0.
    device='cuda:0',
    tree_method='hist',
    # Number of trees, tree depth and learning rate.
    n_estimators=50,
    max_depth=6,
    learning_rate=0.1,
    # Row and column subsampling ratios.
    colsample_bytree=0.8,
    subsample=0.8,
    # Fixed seed.
    random_state=42,
)

In [None]:
# Fit the tuned model on the training TF-IDF matrix and the 'label_num' target.
xgb_model_tfidf_tuned.fit(tfidf_X_train, train_set['label_num'])

In [None]:
# Predict on the test TF-IDF matrix with the tuned model and print accuracy,
# F1, precision and recall against test_set['label_num'].
y_pred_tuned = xgb_model_tfidf_tuned.predict(tfidf_X_test)
print(f"Accuracy Tuned: {metrics.accuracy_score(test_set['label_num'], y_pred_tuned)}")
print(f"F1 Score Tuned: {metrics.f1_score(test_set['label_num'], y_pred_tuned)}")
print(f"Precision Tuned: {metrics.precision_score(test_set['label_num'], y_pred_tuned)}")
print(f"Recall Tuned: {metrics.recall_score(test_set['label_num'], y_pred_tuned)}")

#### Word2Vec

In [None]:
# 5-fold grid search over param_grid for the Word2Vec model, scored by F1.
# NOTE(review): n_jobs=-1 lets the CV fits run in parallel while every
# candidate uses device 'cuda:0' (see param_grid) - confirm the GPU has enough
# memory for concurrent fits.
grid_search_w2v = GridSearchCV(
    xgb_model_w2v,
    param_grid,
    cv=5,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the imputed training vectors and the 'label_num' target.
grid_search_w2v.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
# Show the parameter combination selected by the Word2Vec grid search.
print("Best parameters found: ", grid_search_w2v.best_params_)

In [None]:
# "Tuned" XGBoost classifier for the averaged Word2Vec features.
# NOTE(review): the values below are hard-coded; presumably n_estimators and
# max_depth were copied from grid_search_w2v.best_params_ - confirm they
# still match (including subsample) whenever the search is re-run.
xgb_model_w2v_tuned = XGBClassifier(
    # Histogram tree method on CUDA device 0.
    device='cuda:0',
    tree_method='hist',
    # Number of trees, tree depth and learning rate.
    n_estimators=200,
    max_depth=9,
    learning_rate=0.1,
    # Row and column subsampling ratios.
    colsample_bytree=0.8,
    subsample=0.8,
    # Fixed seed.
    random_state=42,
)

In [None]:
# Fit the tuned model on the imputed training vectors and the 'label_num' target.
xgb_model_w2v_tuned.fit(train_w2v_array_imputed, train_set['label_num'])

In [None]:
# Predict on the imputed test vectors with the tuned model and print accuracy,
# F1, precision and recall against test_set['label_num'].
# Note: this rebinds y_pred_tuned, which previously held the TF-IDF predictions.
y_pred_tuned = xgb_model_w2v_tuned.predict(test_w2v_array_imputed)
print(f"Accuracy Tuned: {metrics.accuracy_score(test_set['label_num'], y_pred_tuned)}")
print(f"F1 Score Tuned: {metrics.f1_score(test_set['label_num'], y_pred_tuned)}")
print(f"Precision Tuned: {metrics.precision_score(test_set['label_num'], y_pred_tuned)}")
print(f"Recall Tuned: {metrics.recall_score(test_set['label_num'], y_pred_tuned)}")