In [142]:
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import zero_one_loss as J01
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier

In [143]:
train_path = "aclImdb/train/labeledBow.feat"
test_path = "aclImdb/test/labeledBow.feat"

Xtr_full, ytr_full = load_svmlight_file(train_path)
X_test, y_test = load_svmlight_file(test_path, n_features=Xtr_full.shape[1])

In [144]:
print("Train Feature Shape:", Xtr_full.shape, ytr_full.shape)
print("Test Feature Shape:", X_test.shape, y_test.shape)

print("Unique labels:", np.unique(ytr_full))
print("Label Counts:")
print("Number of Positives:", np.sum(ytr_full == 1))
print("Number of Negatives:", np.sum(ytr_full == -1))


Train Feature Shape: (25000, 89527) (25000,)
Test Feature Shape: (25000, 89527) (25000,)
Unique labels: [ 1.  2.  3.  4.  7.  8.  9. 10.]
Label Counts:
Number of Positives: 5100
Number of Negatives: 0


In [145]:
train_sparcity = 1.0 - (Xtr_full.nnz / (Xtr_full.shape[0] * Xtr_full.shape[1]))
test_sparcity = 1.0 - (X_test.nnz / (X_test.shape[0] * X_test.shape[1]))

print("Train Sparcity:", train_sparcity)
print("Test Sparcity:", test_sparcity)

Train Sparcity: 0.9984555787639483
Test Sparcity: 0.9985076354619277


In [146]:
doc_lengths = np.diff(Xtr_full.indptr)

print("Average words per review:", np.mean(doc_lengths))
print("Median words per review:", np.median(doc_lengths))
print("Max words per review:", np.max(doc_lengths))
print("Min words per review:", np.min(doc_lengths))

Average words per review: 138.2674
Median words per review: 113.0
Max words per review: 672
Min words per review: 10


In [147]:
y_bin = np.where(ytr_full >= 7, 1, -1)

print("Positives:", np.sum(y_bin == 1))
print("Negatives:", np.sum(y_bin == -1))
print("Unique Values:", np.unique(y_bin))

Positives: 12500
Negatives: 12500
Unique Values: [-1  1]


In [148]:
Xtr, Xva, ytr, yva = train_test_split(Xtr_full, y_bin, test_size = 0.2, random_state=seed, stratify=y_bin)

In [149]:
rf = RandomForestClassifier(
    n_estimators=200, 
    criterion='entropy', 
    max_depth=None,  
    max_features='sqrt',
    min_samples_leaf=1,
    random_state=seed,
    n_jobs=-1
)

rf.fit(Xtr, ytr)

0,1,2
,n_estimators,200
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [150]:
pred_tr = rf.predict(Xtr)
pred_va = rf.predict(Xva)

print("Train Confusion matrix:")
print(confusion_matrix(yva, pred_va))
print()

print("Train Classification report:")
print(classification_report(yva, pred_va))

Train Confusion matrix:
[[2106  394]
 [ 366 2134]]

Train Classification report:
              precision    recall  f1-score   support

          -1       0.85      0.84      0.85      2500
           1       0.84      0.85      0.85      2500

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



In [151]:
K = 5000

word_counts = np.array(Xtr_full.sum(axis=0)).reshape(-1)
top_k_idx = word_counts.argsort()[::-1][:K]

print("Top K indices shape:", top_k_idx.shape)

Xtr_k = Xtr[:, top_k_idx]
Xva_k = Xva[:, top_k_idx]

print("Original shape:", Xtr.shape)
print("Reduced shape:", Xtr_k.shape)

Xtr_k = Xtr_k.toarray()
Xva_k = Xva_k.toarray()

Top K indices shape: (5000,)
Original shape: (20000, 89527)
Reduced shape: (20000, 5000)


In [152]:
gb = HistGradientBoostingClassifier(
    learning_rate = 0.1, 
    max_depth=None,
    max_leaf_nodes=31,
    random_state=seed
)

gb.fit(Xtr_k, ytr)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,max_iter,100
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [154]:
pred_tr_gb = gb.predict(Xtr_k)
pred_va_gb = gb.predict(Xva_k)

print("Confusion Matrix Val:")
print(confusion_matrix(yva, pred_va_gb))

print("Classification report (val):")
print(classification_report(yva, pred_va_gb))

Confusion Matrix Val:
[[2119  381]
 [ 339 2161]]
Classification report (val):
              precision    recall  f1-score   support

          -1       0.86      0.85      0.85      2500
           1       0.85      0.86      0.86      2500

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [155]:
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("y_test unique values:", np.unique(y_test))

X_test shape: (25000, 89527)
y_test shape: (25000,)
y_test unique values: [ 1.  2.  3.  4.  7.  8.  9. 10.]


In [None]:
yte_pred_rf = rf.predict(X_test)

y_test_bin = np.where(y_test >= 7, 1, -1)
print("Positives:", np.sum(y_test_bin == 1))
print("Negatives:", np.sum(y_test_bin == -1))
print("Unique Values:", np.unique(y_test_bin))

X_test_k = X_test[:, top_k_idx]
X_test_k1 = X_test_k.toarray()
yte_pred_gb = gb.predict(X_test_k1)

In [137]:
print("RF Test accuracy:", accuracy_score(y_test_bin, yte_pred_rf))

print("RF Confusion matrix (test):")
print(confusion_matrix(y_test_bin, yte_pred_rf))

print("RF Classification report (test):")
print(classification_report(y_test_bin, yte_pred_rf))

RF Test accuracy: 0.8534
RF Confusion matrix (test):
[[10585  1915]
 [ 1750 10750]]
RF Classification report (test):
              precision    recall  f1-score   support

          -1       0.86      0.85      0.85     12500
           1       0.85      0.86      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [138]:
print("GB Test accuracy:", accuracy_score(y_test_bin, yte_pred_gb))

print("\nGB Confusion matrix (test):")
print(confusion_matrix(y_test_bin, yte_pred_gb))

print("\nGB Classification report (test):")
print(classification_report(y_test_bin, yte_pred_gb))

GB Test accuracy: 0.85772

GB Confusion matrix (test):
[[10616  1884]
 [ 1673 10827]]

GB Classification report (test):
              precision    recall  f1-score   support

          -1       0.86      0.85      0.86     12500
           1       0.85      0.87      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



In [136]:
rf_01 = J01(y_test_bin, yte_pred_rf)
gb_01 = J01(y_test_bin, yte_pred_gb)

print("Random Forest J01:", rf_01)
print("Gradient Boosting J01:", gb_01)

Random Forest J01: 0.14659999999999995
Gradient Boosting J01: 0.14227999999999996
