In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier, early_stopping

**Load Data**

In [None]:
start_time = time.time()

# Read the learning set, discarding any row whose target label is missing.
df = pd.read_csv("amazon_review_ID.shuf.lrn.csv").dropna(subset=["Class"])

# Target column, and the feature matrix with the identifier and target removed
# (errors="ignore" means a listed column that is absent is skipped silently).
y = df["Class"]
X = df.drop(columns=["ID", "Class"], errors="ignore")

# Map the class labels to integer codes; the fitted encoder stays in `le`.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# smaller dev subset: a stratified 25% sample of the rows
df_small, _ = train_test_split(
    df,
    stratify=df["Class"],
    train_size=0.25,
    random_state=42,
)

**Reduce Features**


In [None]:
# Variance Threshold
# Fit a low-variance filter (threshold 0.01) on the full feature matrix.
vt = VarianceThreshold(threshold=0.01)
X_var = vt.fit_transform(X)

# Pick the surviving columns out of X with the filter's boolean support mask,
# so the result remains a DataFrame carrying the original column names.
X_var_df = X.loc[:, vt.get_support()]
print(f"→ Remaining features: {X_var_df.shape[1]}")

# LightGBM feature importance
# Hold out a stratified 20% of the rows as the validation set that drives
# early stopping below.
X_train_fs, X_val_fs, y_train_fs, y_val_fs = train_test_split(
    X_var_df,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,
)

lgb_fs = LGBMClassifier(
    # boosting schedule
    n_estimators=5000,
    learning_rate=0.01,
    # tree shape
    num_leaves=63,
    max_depth=-1,
    min_child_samples=20,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=0.5,
    reg_lambda=1.0,
    # runtime / reproducibility
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split, monitoring multi-class log loss on the held-out
# split with an early-stopping patience of 200 rounds.
lgb_fs.fit(
    X_train_fs,
    y_train_fs,
    eval_set=[(X_val_fs, y_val_fs)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(stopping_rounds=200)],
)

# Rank the features by the fitted model's importance scores, largest first.
importances = pd.Series(
    lgb_fs.feature_importances_, index=X_var_df.columns
).sort_values(ascending=False)

# Keep the shortest prefix of that ranking whose cumulative share of the total
# importance reaches the cutoff.
cutoff = 0.85
cum_importance = importances.cumsum() / importances.sum()
# argmax on a boolean array gives the position of the first True.
n_keep = np.argmax((cum_importance >= cutoff).to_numpy()) + 1
top_features = importances.index[:n_keep]
X_reduced = X_var_df.loc[:, top_features]

# Correlation filter
# Absolute pairwise correlations between the retained features.
corr_matrix = X_reduced.corr().abs()

# Keep only the strict upper triangle (everything else becomes NaN) so each
# pair of features is examined exactly once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when its absolute correlation with any column to its
# left exceeds 0.90; the left-hand column of the pair is the one kept.
to_drop = [col for col, hit in (upper > 0.90).any(axis=0).items() if hit]
X_final = X_reduced.drop(columns=to_drop)
print(f"→ Dropped {len(to_drop)} highly correlated features. Final shape: {X_final.shape}")

**Save Reduced Data**

In [None]:
# Give the reduced feature table a fresh 0..n-1 index before writing it out.
X_final = X_final.reset_index(drop=True)

# Reduced feature matrix, gzip-compressed CSV without the index column.
X_final.to_csv(
    "amazon_features_reduced.csv.gz",
    compression="gzip",
    index=False,
)

# Integer-encoded labels, one per row, without the index column.
pd.Series(y_enc).to_csv("amazon_labels.csv", index=False)

# Fitted label encoder, so the integer codes can be mapped back to class names.
joblib.dump(le, "label_encoder.joblib")