In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import os

# Read data

In [2]:
def read_file(*args,file_type="csv")->pd.DataFrame:
    """:parameter input directories in sequence
        :return DataFrame"""
    
    path=os.path.join(*args)
    if file_type == "tsv":
        return pd.read_csv(path, sep="\t")
    return pd.read_csv(path)

In [3]:
df=read_file("..","data","processed","cleaned.csv").dropna(how="any").drop_duplicates()

In [4]:
X= df["comment"]
y= df["category"]+1

In [5]:
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Load spacy language model for POS tagging
nlp = spacy.load('en_core_web_sm')

In [7]:
# Function to extract custom features
def extract_custom_features(text):
    doc = nlp(text)
    word_list = [token.text for token in doc]

    # 1. Comment Length (number of characters)
    comment_length = len(text)

    # 2. Word Count
    word_count = len(word_list)

    # 3. Average Word Length
    avg_word_length = sum(len(word) for word in word_list) / word_count if word_count > 0 else 0

    # 4. Unique Word Count
    unique_word_count = len(set(word_list))

    # 5. Lexical Diversity
    lexical_diversity = unique_word_count / word_count if word_count > 0 else 0

    # 6. Count of POS Tags
    pos_count = len([token.pos_ for token in doc])

    # 7. Proportion of POS Tags
    pos_tags = [token.pos_ for token in doc]
    pos_proportion = {tag: pos_tags.count(tag) / word_count for tag in set(pos_tags)} if word_count > 0 else {}

    return {
        'comment_length': comment_length,
        'word_count': word_count,
        'avg_word_length': avg_word_length,
        'unique_word_count': unique_word_count,
        'lexical_diversity': lexical_diversity,
        'pos_count': pos_count,
        **pos_proportion  # Flattening the POS proportions
    }

In [8]:
# Apply the custom feature extraction
train_custom_features = pd.DataFrame([extract_custom_features(text) for text in X_train_cleaned])
test_custom_features = pd.DataFrame([extract_custom_features(text) for text in X_test_cleaned])

In [9]:
train_custom_features.head()

Unnamed: 0,comment_length,word_count,avg_word_length,unique_word_count,lexical_diversity,pos_count,ADJ,NOUN,PROPN,VERB,...,NUM,PART,ADV,AUX,SCONJ,CCONJ,PRON,DET,PUNCT,SYM
0,49,7,6.142857,7,1.0,7,0.285714,0.285714,0.285714,0.142857,...,,,,,,,,,,
1,938,140,5.707143,101,0.721429,140,0.135714,0.364286,0.042857,0.178571,...,0.007143,0.028571,0.114286,0.028571,0.014286,0.035714,0.014286,,,
2,15,2,7.0,2,1.0,2,0.5,,0.5,,...,,,,,,,,,,
3,86,15,4.8,13,0.866667,15,0.2,0.333333,0.266667,0.133333,...,,,,0.066667,,,,,,
4,9,1,9.0,1,1.0,1,,,1.0,,...,,,,,,,,,,


In [10]:
# Replace NaN values in POS tag proportions with 0
train_custom_features.fillna(0, inplace=True)
test_custom_features.fillna(0, inplace=True)

In [11]:
test_custom_features.isnull().sum()

comment_length       0
word_count           0
avg_word_length      0
unique_word_count    0
lexical_diversity    0
pos_count            0
ADJ                  0
VERB                 0
NOUN                 0
ADV                  0
PROPN                0
AUX                  0
ADP                  0
NUM                  0
PART                 0
DET                  0
SCONJ                0
CCONJ                0
PRON                 0
INTJ                 0
X                    0
PUNCT                0
SYM                  0
dtype: int64

In [12]:
# Apply TfidfVectorizer with trigram setting and max_features=1000
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train_cleaned)
X_test_tfidf = tfidf.transform(X_test_cleaned)

In [13]:
# Convert TF-IDF to DataFrame
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf.get_feature_names_out())

In [14]:
# Combine TF-IDF and custom features
X_train_combined = pd.concat([X_train_tfidf_df.reset_index(drop=True), train_custom_features.reset_index(drop=True)], axis=1)
X_test_combined = pd.concat([X_test_tfidf_df.reset_index(drop=True), test_custom_features.reset_index(drop=True)], axis=1)

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

In [16]:
clf = ExtraTreesClassifier(n_estimators=50)

In [17]:
clf.fit(X_train_combined, y_train_cleaned)

In [18]:
clf.feature_importances_

array([8.84990989e-05, 8.17035065e-06, 8.85514936e-06, ...,
       1.00876904e-03, 7.10614060e-05, 5.05696044e-06])

In [19]:
from sklearn.feature_selection import SelectFromModel

In [20]:
fea_selection_model = SelectFromModel(clf, prefit=True, threshold=0.000025)

X_new = fea_selection_model.transform(X_train_combined)

X_new.shape



(28992, 5007)

In [21]:
import lightgbm as lgb

In [22]:
model = lgb.LGBMClassifier(

    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance= True,
    class_weight= "balanced",
    reg_alpha= 0.1,  # L1 regularization
    reg_lambda= 0.1,  # L2 regularization
    learning_rate= 0.08081298097796712,
    n_estimators= 367,
    max_depth= 20
)

In [23]:
# Fit the model on the resampled training data
model.fit(X_new, y_train_cleaned)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.171817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129874
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 3831
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [24]:
# Predict on the test set
from sklearn.metrics import accuracy_score
y_pred = model.predict(fea_selection_model.transform(X_test_combined))
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy



0.8584437086092715

In [25]:
import optuna

In [26]:
def objective(trial):
    threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)
    fea_selection_model = SelectFromModel(clf, prefit=True, threshold=threshold)

    X_new = fea_selection_model.transform(X_train_combined)
    model = lgb.LGBMClassifier(

    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance= True,
    class_weight= "balanced",
    reg_alpha= 0.1,  # L1 regularization
    reg_lambda= 0.1,  # L2 regularization
    learning_rate= 0.08081298097796712,
    n_estimators= 367,
    max_depth= 20,
    )
    
    model.fit(X_new, y_train_cleaned)
    y_pred = model.predict(fea_selection_model.transform(X_test_combined))
    accuracy = accuracy_score(y_test_cleaned, y_pred)
    return accuracy
    

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-10-21 18:15:05,230] A new study created in memory with name: no-name-9e34e8ac-9880-4612-b096-3241a9ed06b6
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.103277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110140
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 2224
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:15:25,070] Trial 0 finished with value: 0.8584437086092715 and parameters: {'threshold': 6.384685675221005e-05}. Best is trial 0 with value: 0.8584437086092715.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.152798 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 124219
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 3307
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:15:48,665] Trial 1 finished with value: 0.8577538631346578 and parameters: {'threshold': 3.5372712004607385e-05}. Best is trial 0 with value: 0.8584437086092715.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 116108
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 2635
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:16:10,782] Trial 2 finished with value: 0.859271523178808 and parameters: {'threshold': 5.0617359080315246e-05}. Best is trial 2 with value: 0.859271523178808.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110574
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 2253
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:16:31,699] Trial 3 finished with value: 0.8585816777041942 and parameters: {'threshold': 6.274636601727467e-05}. Best is trial 2 with value: 0.859271523178808.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068851 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98150
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 1551
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:16:48,529] Trial 4 finished with value: 0.8533388520971302 and parameters: {'threshold': 9.542632102557464e-05}. Best is trial 2 with value: 0.859271523178808.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.193641 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133410
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 4185
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:17:15,917] Trial 5 finished with value: 0.8580298013245033 and parameters: {'threshold': 1.5620070885216963e-05}. Best is trial 2 with value: 0.859271523178808.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97212
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 1507
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:17:34,986] Trial 6 finished with value: 0.8522350993377483 and parameters: {'threshold': 9.977317973447437e-05}. Best is trial 2 with value: 0.859271523178808.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.151252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 104699
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 1887
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-10-21 18:17:56,645] Trial 7 finished with value: 0.8552704194260485 and parameters: {'threshold': 7.674776785983511e-05}. Best is trial 2 with value: 0.859271523178808.
  threshold = trial.suggest_uniform('threshold', 0.00001, 0.0001)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087885 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97524
[LightGBM] [Info] Number of data points in the train set: 28992, number of used features: 1521
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
