In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [None]:
# Read the Reddit comment dataset directly from the project's GitHub repository
# and preview the first rows.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'
)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# Clean the raw frame in one pass: discard rows containing missing values,
# exact duplicate rows, and rows whose comment is empty or whitespace-only.
df = (
    df.dropna()
      .drop_duplicates()
      .loc[lambda frame: frame['clean_comment'].str.strip().ne('')]
)

# Import necessary libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing step
# (the stop-word lists and the WordNet data behind the lemmatizer).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Define the preprocessing function
from functools import lru_cache

# Negation/contrast words that are kept even though NLTK lists them as
# stop words, because they can flip the sentiment of a comment.
_SENTIMENT_KEEP_WORDS = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Everything except letters, digits, whitespace and the punctuation marks ! ? .
_DISALLOWED_CHARS = re.compile(r"[^A-Za-z0-9\s!?.]")


@lru_cache(maxsize=None)
def _preprocessing_resources():
    """Build the stop-word set and the lemmatizer once and reuse them.

    Reading the stop-word corpus and constructing a lemmatizer do not depend
    on the comment being processed, so doing them per call only repeats the
    same work for every row. The result is created lazily on first use, i.e.
    after the NLTK downloads have run.

    Returns:
        tuple: ``(stop_words, lemmatizer)`` where ``stop_words`` is a
        frozenset of English stop words minus ``_SENTIMENT_KEEP_WORDS`` and
        ``lemmatizer`` is a ``WordNetLemmatizer`` instance.
    """
    stop_words = frozenset(stopwords.words('english')) - _SENTIMENT_KEEP_WORDS
    return stop_words, WordNetLemmatizer()


def preprocess_comment(comment):
    """Normalise a single comment string for sentiment modelling.

    Steps, in order: lowercase, strip surrounding whitespace, turn newlines
    into spaces, delete every character that is not a letter, digit,
    whitespace or one of ``! ? .``, drop stop words (keeping 'not', 'but',
    'however', 'no', 'yet'), and lemmatize each remaining word.

    Args:
        comment (str): Raw comment text.

    Returns:
        str: The cleaned comment, words separated by single spaces. This is
        the empty string when no word survives the filtering.
    """
    stop_words, lemmatizer = _preprocessing_resources()

    # Lowercase and trim first so the stop-word lookup below is case-insensitive.
    comment = comment.lower().strip()

    # Newlines become spaces so words on adjacent lines are not glued together.
    comment = re.sub(r'\n', ' ', comment)

    # Remove non-alphanumeric characters, except the punctuation ! ? .
    comment = _DISALLOWED_CHARS.sub('', comment)

    # Filter stop words and lemmatize in a single pass over the words; the
    # words contain no whitespace, so this equals filtering, re-joining,
    # re-splitting and then lemmatizing.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

# Run every comment through `preprocess_comment` and store the cleaned text
# back in the `clean_comment` column.
df['clean_comment'] = df['clean_comment'].map(preprocess_comment)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): `preprocess_comment` returns a string (possibly empty), never NaN,
# so comments emptied by preprocessing are not removed here.
cleaned_dataset = df.dropna(axis=0, how='any')

In [None]:
# Model input is the preprocessed comment text; the label is the `category` column.
X_cleaned, y_cleaned = cleaned_dataset['clean_comment'], cleaned_dataset['category']

# Hold out 20% of the rows for testing. The split is stratified on the label so
# both parts keep the same class proportions, and seeded so it is repeatable.
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(
    X_cleaned, y_cleaned, test_size=0.2, stratify=y_cleaned, random_state=42
)


In [None]:
# spaCy pipeline used by `extract_custom_features` for tokenisation and POS tags.
nlp = spacy.load(name='en_core_web_sm')

In [None]:
from collections import Counter


def extract_custom_features(text):
    """Compute length, vocabulary and part-of-speech features for one comment.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    token that pipeline produces counts as a "word" here.

    Args:
        text (str): Comment text to analyse.

    Returns:
        dict: Always contains

        * ``comment_length``   - number of characters in ``text``
        * ``word_count``       - number of tokens
        * ``avg_word_length``  - mean token length in characters (0 if no tokens)
        * ``unique_word_count``- number of distinct token strings
        * ``lexical_diversity``- ``unique_word_count / word_count`` (0 if no tokens)
        * ``pos_count``        - number of POS-tagged tokens (same as ``word_count``)

        followed by one ``<POS tag>: proportion`` entry for each tag that
        occurs in the text. Tags that do not occur are absent, so a DataFrame
        built from many results holds NaN in those cells.
    """
    doc = nlp(text)

    word_list = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]

    word_count = len(word_list)
    unique_word_count = len(set(word_list))

    # Guard the ratios against an empty document.
    if word_count > 0:
        avg_word_length = sum(len(word) for word in word_list) / word_count
        lexical_diversity = unique_word_count / word_count
    else:
        avg_word_length = 0
        lexical_diversity = 0

    # Counter tallies every tag in one pass and keeps first-seen order, so the
    # key order (and the column order of frames built from these dicts) no
    # longer depends on set iteration order, which varies between interpreter
    # runs. An empty document yields an empty Counter, so no division by zero.
    pos_proportion = {
        tag: count / word_count for tag, count in Counter(pos_tags).items()
    }

    return {
        'comment_length': len(text),
        'word_count': word_count,
        'avg_word_length': avg_word_length,
        'unique_word_count': unique_word_count,
        'lexical_diversity': lexical_diversity,
        'pos_count': len(pos_tags),
        **pos_proportion,  # Flattening the POS proportions
    }










In [None]:
# Build one feature row per comment for each split. POS tags that do not occur
# in a comment are missing from its dict and therefore show up as NaN.
train_custom_features = pd.DataFrame(list(map(extract_custom_features, X_train_cleaned)))
test_custom_features = pd.DataFrame(list(map(extract_custom_features, X_test_cleaned)))

In [None]:
# Preview the first rows of the training feature table; at this point POS
# columns are still NaN wherever a tag did not occur in the comment.
train_custom_features.head()

Unnamed: 0,comment_length,word_count,avg_word_length,unique_word_count,lexical_diversity,pos_count,NOUN,ADV,ADJ,VERB,...,AUX,CCONJ,DET,PRON,X,ADP,INTJ,SCONJ,PUNCT,SYM
0,22,3,6.666667,3,1.0,3,0.333333,0.333333,0.333333,,...,,,,,,,,,,
1,368,58,5.362069,39,0.672414,58,0.327586,0.068966,0.034483,0.224138,...,0.068966,0.034483,,,,,,,,
2,51,9,4.777778,9,1.0,9,0.222222,,0.333333,0.333333,...,,,,,,,,,,
3,32,6,4.5,4,0.666667,6,,,,,...,,,,,,,,,,
4,59,9,5.666667,9,1.0,9,0.444444,0.222222,0.111111,0.111111,...,,,,,,,,,,


In [None]:
# A POS tag that never occurs in a comment has a proportion of 0, so replace
# the NaN placeholders in both feature tables with 0.
train_custom_features = train_custom_features.fillna(0)
test_custom_features = test_custom_features.fillna(0)

In [None]:
# Sanity check: count the remaining missing values per column (all should be 0).
test_custom_features.isna().sum()

Unnamed: 0,0
comment_length,0
word_count,0
avg_word_length,0
unique_word_count,0
lexical_diversity,0
pos_count,0
NOUN,0
ADV,0
ADJ,0
VERB,0


In [None]:
# TF-IDF over unigrams, bigrams and trigrams, with the vocabulary capped at
# max_features=10000. The vectorizer is fitted on the training text only and
# then reused unchanged to transform the test text.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train_cleaned)
X_test_tfidf = tfidf.transform(X_test_cleaned)

In [None]:
# Wrap the TF-IDF matrices in DataFrames whose columns are the n-gram terms.
# `.toarray()` materialises the full dense matrix for each split.
X_train_tfidf_df = pd.DataFrame(
    data=X_train_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)
X_test_tfidf_df = pd.DataFrame(
    data=X_test_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# 1️⃣ Get the union of all custom feature columns in a deterministic order:
#    training columns first, then any column that only occurs in the test set.
#    (A plain set union would order the columns differently on each kernel
#    restart, because string hashing is randomised per interpreter process.)
all_custom_columns = list(
    dict.fromkeys([*train_custom_features.columns, *test_custom_features.columns])
)

# 2️⃣ Reindex both tables to that shared column list; a column missing from one
#    split is created there and filled with 0.
train_custom_features = train_custom_features.reindex(columns=all_custom_columns, fill_value=0)
test_custom_features = test_custom_features.reindex(columns=all_custom_columns, fill_value=0)

# ✅ Now concat the TF-IDF columns and the custom features side by side. Both
#    parts get a fresh 0..n-1 index so rows are paired by position.
X_train_combined = pd.concat(
    [X_train_tfidf_df.reset_index(drop=True), train_custom_features.reset_index(drop=True)],
    axis=1,
)
X_test_combined = pd.concat(
    [X_test_tfidf_df.reset_index(drop=True), test_custom_features.reset_index(drop=True)],
    axis=1,
)


In [None]:
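# Combine TF-IDF and custom features
# (superseded: the previous cell builds X_train_combined / X_test_combined after
#  aligning the custom feature columns; the original version is kept for reference)
# X_train_combined = pd.concat([X_train_tfidf_df.reset_index(drop=True), train_custom_features.reset_index(drop=True)], axis=1)
# X_test_combined = pd.concat([X_test_tfidf_df.reset_index(drop=True), test_custom_features.reset_index(drop=True)], axis=1)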

In [None]:
# Display the combined training matrix: TF-IDF term columns followed by the
# custom length / POS features.
X_train_combined

Unnamed: 0,000,000 000,000 crore,000 rupee,100,100 crore,100 year,1000,1000 note,101,...,NOUN,DET,X,word_count,NUM,comment_length,VERB,PROPN,ADP,SCONJ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,3,0.000000,22,0.000000,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.327586,0.0,0.0,58,0.034483,368,0.224138,0.189655,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.222222,0.0,0.0,9,0.000000,51,0.333333,0.111111,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,6,0.000000,32,0.000000,1.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.444444,0.0,0.0,9,0.000000,59,0.111111,0.111111,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,1.0,1,0.000000,11,0.000000,0.000000,0.0,0.0
29430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.437500,0.0,0.0,16,0.000000,111,0.187500,0.125000,0.0,0.0
29431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.250000,0.0,0.0,4,0.000000,23,0.250000,0.000000,0.0,0.0
29432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.382353,0.0,0.0,34,0.000000,204,0.235294,0.029412,0.0,0.0


In [None]:
import lightgbm as lgb

In [None]:
# LightGBM classifier for the three sentiment classes (-1, 0, 1).
# Class imbalance is handled by class_weight='balanced' alone. The former
# `is_unbalance=True` was dropped: LightGBM documents it as used only for
# binary / multiclassova objectives, so it was redundant next to class_weight
# with objective='multiclass'.
# NOTE(review): the learning rate, tree count, depth and regularisation values
# look like the result of a hyperparameter search - origin not shown here.
model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    class_weight='balanced',
    learning_rate=0.09,
    n_estimators=360,
    max_depth=32,
    reg_alpha=3.053178589412354e-07,
    reg_lambda=7.198246850506435e-08,
)

In [None]:
# Train on the combined TF-IDF + custom feature matrix. No resampling is
# applied in this notebook; class balance comes from the model's class_weight.
model.fit(X=X_train_combined, y=y_train_cleaned)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.413408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136760
[LightGBM] [Info] Number of data points in the train set: 29434, number of used features: 4449
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [None]:
# Score the held-out split: predict a label for every test row and report the
# share of predictions that match the true labels.
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test_combined)
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred)
accuracy

0.8598994428590842

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test split, printed as plain text
# so the column alignment of the report is preserved.
report = classification_report(y_true=y_test_cleaned, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.79      0.76      0.77      1650
           0       0.86      0.95      0.90      2555
           1       0.90      0.84      0.87      3154

    accuracy                           0.86      7359
   macro avg       0.85      0.85      0.85      7359
weighted avg       0.86      0.86      0.86      7359

