In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [2]:
# Load the dataset
df = pd.read_csv(
    '../../data/processed/reddit_clean_final.csv',
    keep_default_na=False,
    na_filter=False
)

In [3]:
# Separate features and target
X_cleaned = df['clean_comment']
y_cleaned = df['category']

# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)


In [5]:
# Load spacy language model for POS tagging
nlp = spacy.load('en_core_web_sm')

In [6]:
# Function to extract custom features
def extract_custom_features(text):
    doc = nlp(text)
    word_list = [token.text for token in doc]

    # 1. Comment Length (number of characters)
    comment_length = len(text)

    # 2. Word Count
    word_count = len(word_list)

    # 3. Average Word Length
    avg_word_length = sum(len(word) for word in word_list) / word_count if word_count > 0 else 0

    # 4. Unique Word Count
    unique_word_count = len(set(word_list))

    # 5. Lexical Diversity
    lexical_diversity = unique_word_count / word_count if word_count > 0 else 0

    # 6. Count of POS Tags
    pos_count = len([token.pos_ for token in doc])

    # 7. Proportion of POS Tags
    pos_tags = [token.pos_ for token in doc]
    pos_proportion = {tag: pos_tags.count(tag) / word_count for tag in set(pos_tags)} if word_count > 0 else {}

    return {
        'comment_length': comment_length,
        'word_count': word_count,
        'avg_word_length': avg_word_length,
        'unique_word_count': unique_word_count,
        'lexical_diversity': lexical_diversity,
        'pos_count': pos_count,
        **pos_proportion  # Flattening the POS proportions
    }


In [7]:
# Apply the custom feature extraction
train_custom_features = pd.DataFrame([extract_custom_features(text) for text in X_train_cleaned])
test_custom_features = pd.DataFrame([extract_custom_features(text) for text in X_test_cleaned])

In [8]:
train_custom_features.head()

Unnamed: 0,comment_length,word_count,avg_word_length,unique_word_count,lexical_diversity,pos_count,NOUN,VERB,CCONJ,INTJ,...,PRON,PART,PROPN,AUX,DET,NUM,ADP,X,PUNCT,SYM
0,570,85,5.717647,60,0.705882,85,0.305882,0.247059,0.047059,0.023529,...,0.011765,0.011765,0.176471,,,,,,,
1,9,2,4.0,2,1.0,2,0.5,,,,...,,,,,,,,,,
2,187,29,5.482759,25,0.862069,29,0.310345,0.241379,,,...,,,0.241379,0.034483,,,,,,
3,56,8,6.125,8,1.0,8,0.625,0.125,,,...,,,,,0.125,,,,,
4,235,31,6.612903,28,0.903226,31,0.354839,0.16129,,,...,,,0.129032,,,0.032258,,,,


In [9]:
# Replace NaN values in POS tag proportions with 0
train_custom_features.fillna(0, inplace=True)
test_custom_features.fillna(0, inplace=True)

In [10]:
test_custom_features.isnull().sum()

comment_length       0
word_count           0
avg_word_length      0
unique_word_count    0
lexical_diversity    0
pos_count            0
NOUN                 0
VERB                 0
ADP                  0
CCONJ                0
ADJ                  0
ADV                  0
PRON                 0
NUM                  0
PROPN                0
PART                 0
INTJ                 0
AUX                  0
SCONJ                0
DET                  0
X                    0
SYM                  0
PUNCT                0
dtype: int64

In [11]:
# Apply TfidfVectorizer with trigram setting and max_features=1000
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train_cleaned)
X_test_tfidf = tfidf.transform(X_test_cleaned)

In [12]:
# Convert TF-IDF to DataFrame
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf.get_feature_names_out())

In [13]:
# Combine TF-IDF and custom features
X_train_combined = pd.concat([X_train_tfidf_df.reset_index(drop=True), train_custom_features.reset_index(drop=True)], axis=1)
X_test_combined = pd.concat([X_test_tfidf_df.reset_index(drop=True), test_custom_features.reset_index(drop=True)], axis=1)

In [14]:
X_train_combined

Unnamed: 0,000,000 000,000 crore,000 rupee,100,100 crore,100 time,100 year,1000,1000 note,...,PRON,PART,PROPN,AUX,DET,NUM,ADP,X,PUNCT,SYM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011765,0.011765,0.176471,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.241379,0.034483,0.000,0.000000,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.125,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.129032,0.000000,0.000,0.032258,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.018182,0.163636,0.000000,0.000,0.018182,0.018182,0.0,0.0,0.0
29281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.066667,0.133333,0.000000,0.000,0.000000,0.066667,0.0,0.0,0.0
29282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0
29283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.009524,0.000000,0.047619,0.000,0.038095,0.000000,0.0,0.0,0.0


In [15]:
import lightgbm as lgb

In [16]:
model = lgb.LGBMClassifier(

    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance= True,
    class_weight= "balanced",
    reg_alpha= 0.1,  # L1 regularization
    reg_lambda= 0.1,  # L2 regularization
    learning_rate= 0.08081298097796712,
    n_estimators= 367,
    max_depth= 20
)

In [17]:
# Fit the model on the resampled training data
model.fit(X_train_combined, y_train_cleaned)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.059553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135917
[LightGBM] [Info] Number of data points in the train set: 29285, number of used features: 4433
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [18]:
# Predict on the test set
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test_combined)
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.856733133023764

In [19]:
from sklearn.metrics import classification_report
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.75      0.78      0.77      1660
           0       0.86      0.94      0.90      2517
           1       0.91      0.83      0.87      3145

    accuracy                           0.86      7322
   macro avg       0.84      0.85      0.85      7322
weighted avg       0.86      0.86      0.86      7322

