In [None]:
!pip install -U kaggle optuna xgboost 'ray[default]' imblearn scikit-learn scipy matplotlib seaborn modin
!mkdir ~/.kaggle

In [None]:
# Fetch the rapidsai-csp-utils helper repository and run its Colab pip installer.
# Presumably this is what provides the `cuml` package imported below — confirm.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

## Imports

In [1]:
import modin.pandas as pd
import pathlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import ray
from scipy.special import softmax
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from cuml.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from cuml import RandomForestClassifier
from cuml.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from cuml.neighbors import NearestNeighbors

import optuna
import pickle

In [2]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

## Downloading Dataset

In [4]:
# NLTK resources needed by the text-cleaning step: the stop-word list, the
# tokenizer data used by word_tokenize, and WordNet for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/adrish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/adrish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/adrish/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/adrish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
!kaggle competitions download -c cs-506-midterm-fall-2024
!unzip cs-506-midterm-fall-2024.zip -d data/
!rm cs-506-midterm-fall-2024.zip

## Loading Data

In [6]:
# Start (or attach to) the local Ray runtime used by modin and the tuning tasks.
# ignore_reinit_error=True lets this cell be re-run in a live kernel instead of
# raising because Ray is already initialised.
# NOTE(review): dashboard_host="0.0.0.0" exposes the Ray dashboard on every
# network interface; restrict it to "127.0.0.1" unless remote access is needed.
ray.init(dashboard_host="0.0.0.0", ignore_reinit_error=True)

2024-10-28 11:51:29,475	INFO worker.py:1807 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://10.210.1.81:8265 [39m[22m


0,1
Python version:,3.11.10
Ray version:,2.38.0
Dashboard:,http://10.210.1.81:8265


In [7]:
# train.csv holds every review (Score is missing for the rows to predict);
# test.csv lists the Ids that need a prediction.
df = pd.read_csv('data/train.csv')
submission = pd.read_csv('data/test.csv')
# Independent copy of the Id column, used later to select the rows to predict.
test_ids = submission['Id'].copy()

### EDA

In [8]:
# Peek at the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score
0,914403,B0009W5KHM,AV6QDP8Q0ONK4,2,2,1341014400,GOOD FUN FILM,While most straight to DVD films are not worth...,5.0
1,354887,6303079709,A2I8RXJN80A2D2,0,0,1168819200,Movie Review,"I have wanted this one for sometime, also. I ...",5.0
2,1407653,B004H0M2XC,A3FHV3RV8Z12E6,0,0,1386201600,When is it a good time to Consent?,Actually this was a pretty darn good indie fil...,4.0
3,1377458,B003ZJ9536,A12VLTA3ZHVPUY,1,1,1348704000,TRUTH,Episodes 37 to 72 of the series press on in a ...,5.0
4,475323,630574453X,A13NM1PES9OXVN,2,3,970012800,Intelligent and bittersweet -- stays with you,"I was really impressed with this movie, but wa...",3.0


In [9]:
# Summary statistics of the numeric columns. The Score count is lower than the
# other counts because Score is missing for part of the rows.
df.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Time,Score
count,1697533.0,1697533.0,1697533.0,1697533.0,1485341.0
mean,848766.0,3.569048,5.301422,1262422000.0,4.110517
std,490035.7,17.27883,20.24445,128927700.0,1.197651
min,0.0,0.0,0.0,879379200.0,1.0
25%,424383.0,0.0,0.0,1164413000.0,4.0
50%,848766.0,1.0,1.0,1307491000.0,5.0
75%,1273149.0,3.0,5.0,1373242000.0,5.0
max,1697532.0,6084.0,6510.0,1406074000.0,5.0


#### Testing for Class Imbalance

In [None]:
# Only the Score column is plotted, so convert just that column to plain pandas
# instead of materialising the whole frame.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df[['Score']]._to_pandas(), x='Score', ax=ax)
ax.set_title("Score Distribution")
plt.show()

### Data Preprocessing

In [8]:
# English stop-word set and WordNet lemmatizer, built once and read by clean_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one review text into a space-separated string of lemmas.

    The text is lower-cased, URLs are dropped, every character that is not an
    ASCII letter or whitespace is removed, the remainder is tokenised, stop
    words are filtered out and each surviving token is lemmatised.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            missing values (``None`` or float NaN) yield an empty string.

    Returns:
        The cleaned text as a single string (possibly empty).

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and end up in the vocabulary as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Remove URLs first, while their punctuation is still there to match on.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Keep only letters and whitespace; this removes digits as well as
    # punctuation, so no separate number-stripping pass is needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, drop stop words and lemmatize what remains.
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    # Rejoin tokens into a single string
    return ' '.join(cleaned_tokens)

# Replaces the raw Text column in place for every row of df (both the rows with
# a Score and the rows to predict); reload df to get the original text back.
df['Text'] = df['Text'].apply(clean_text)

#### Preparing Training Data

In [9]:
# Training pool: rows with a known Score, truncated to the first 100,000.
train_df = df.loc[df['Score'].notna()].iloc[:100_000]
# Rows to predict: Score is missing and the Id is listed in test.csv.
test_df = df.loc[df['Score'].isna() & df['Id'].isin(test_ids)]

Vectorizing Text

In [10]:
def prepare_data(dataframe):
    """Fit the text vectoriser and term-frequency transformer on ``dataframe['Text']``.

    Missing texts are treated as empty strings. The transformer is created
    with ``use_idf=False``.

    Returns:
        A ``(count_vec, tf_transformer, text_features)`` tuple: the two fitted
        objects (to be reused on the test data) and the transformed training
        features after ``.get()`` — presumably a host-side copy of the GPU
        matrix; confirm against the cuML documentation.
    """
    documents = dataframe['Text'].fillna("")._to_pandas()

    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(documents)

    tf = TfidfTransformer(use_idf=False).fit(term_counts)
    features = tf.transform(term_counts).get()

    return vectorizer, tf, features

In [11]:
count_vec, tf_transformer, train_X_structured = prepare_data(train_df)
# Select the column as a Series (not a one-column frame) so the labels are a
# 1-D vector, which is the shape the resampler and LabelEncoder expect for y.
train_y_structured = train_df['Score'].astype(int)._to_pandas().to_numpy()

Oversampling data to reduce class imbalance

In [14]:
# Neighbour search used by SMOTE to pick the samples it interpolates between.
nn = NearestNeighbors(n_neighbors=10)
# Fixed seed (same value as the shuffle/split seeds elsewhere in the notebook)
# so the synthetic samples — and everything trained on them — are reproducible.
train_X_resampled, train_y_resampled = SMOTE(k_neighbors=nn, random_state=42).fit_resample(train_X_structured, train_y_structured)

[I] [11:53:10.179355] Unused keyword parameter: n_jobs during cuML estimator initialization


In [None]:
import collections

# Count the samples per class after oversampling; np.ravel guards against a
# column-shaped label array, whose rows would not be hashable.
score_counts = collections.Counter(np.ravel(train_y_resampled).tolist())
score_labels = sorted(score_counts)

# Pass explicit x/y values instead of the Counter itself, and keep the Axes
# object (Axes.set() does not return the Axes).
ax = sns.barplot(x=score_labels, y=[score_counts[label] for label in score_labels])
ax.set(xlabel='Scores', ylabel='Count', title='Score Distribution');

In [15]:
# Encode the score labels with a LabelEncoder; `le` is kept so predictions can
# be mapped back to the original scores with le.inverse_transform.
# NOTE(review): this overwrites train_y_resampled, so re-running the cell
# re-fits `le` on already-encoded labels and inverse_transform would no longer
# return the original scores — re-run the resampling cell first.
le = LabelEncoder()
train_y_resampled = le.fit_transform(train_y_resampled)
# Distinct encoded labels; read as a global by fit() for partial_fit(classes=...).
classes = np.unique(train_y_resampled)

In [16]:
# Database URL (a local SQLite file, storage.db) handed to Optuna's RDBStorage.
storage_name = "sqlite:///storage.db"

In [17]:
# Driver-side handle on the study database. connect_args forwards timeout=100
# to the database driver — presumably to wait on locks while several workers
# write concurrently; confirm.
storage = optuna.storages.RDBStorage(url=storage_name, engine_kwargs={"connect_args": {"timeout": 100}})

### Training

In [18]:
def fit(X_train, y_train, model, model_name, batch_size):
    """Train ``model`` on a shuffled copy of the training data and return it.

    "knn", "random_forest" and "xgboost" are fitted in a single ``fit`` call.
    "logistic_regression", "svc", "multinomialnb" and "gaussiannb" are trained
    incrementally with ``partial_fit`` on consecutive mini-batches; for
    "gaussiannb" each batch is densified first when it offers ``toarray``.

    Args:
        X_train: Training features (must support ``.shape`` and row slicing).
        y_train: Training labels aligned with ``X_train``.
        model: The estimator instance to train.
        model_name: One of the names listed above.
        batch_size: Rows per mini-batch; only used by the incremental models.

    Returns:
        The trained ``model`` (the same object that was passed in).

    Raises:
        ValueError: If ``model_name`` is not supported, or if an incremental
            model is requested with a non-positive ``batch_size``.

    Reads the module-level ``classes`` array for ``partial_fit(classes=...)``.
    """
    full_batch_models = {"knn", "random_forest", "xgboost"}
    incremental_models = {'logistic_regression', 'svc', 'multinomialnb', 'gaussiannb'}

    # Validate up front: an unknown name used to return the model untrained and
    # a zero batch size surfaced as a ZeroDivisionError.
    if model_name not in full_batch_models and model_name not in incremental_models:
        raise ValueError("Model not supported")
    if model_name in incremental_models and batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer for %s, got %r" % (model_name, batch_size)
        )

    X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train, random_state=42)

    if model_name in full_batch_models:
        model.fit(X_train_shuffled, y_train_shuffled)
        return model

    n_samples = X_train_shuffled.shape[0]
    # Ceiling division: the last batch may be smaller than batch_size.
    total_batches = (n_samples + batch_size - 1) // batch_size
    for batch, start in enumerate(range(0, n_samples, batch_size)):
        end = min(start + batch_size, n_samples)
        X_batch, y_batch = X_train_shuffled[start:end], y_train_shuffled[start:end]

        if batch % 1000 == 0:
            print("[%s] batch :%d/%d" % (model_name, batch, total_batches))
        if model_name == "gaussiannb" and hasattr(X_batch, "toarray"):
            X_batch = X_batch.toarray()
        # Every call passes the full label set because a single batch may not
        # contain all classes.
        model.partial_fit(X_batch, y_batch, classes=classes)
    return model

In [19]:
def fit_predict(model_name, params, X_train, y_train, X_val, y_val, trial_num=None):
    """Fit the requested model and return its accuracy on the validation set.

    Args:
        model_name: One of "random_forest", "logistic_regression", "svc",
            "knn", "xgboost", "multinomialnb" or "gaussiannb".
        params: Keyword arguments for the estimator constructor. An optional
            "batch_size" entry is taken out and forwarded to ``fit`` instead.
            The dict passed in is left unmodified.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        trial_num: When given, the fitted model is pickled to
            ``"<model_name>-<trial_num>.pkl"`` in the working directory.

    Returns:
        The accuracy of the fitted model on ``(X_val, y_val)``.

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    # Work on a copy so the caller's dict keeps its "batch_size" entry.
    params = dict(params)
    batch_size = params.pop("batch_size", 0)

    if model_name == 'random_forest':
        model = RandomForestClassifier(**params)
        print(model)
    elif model_name == 'logistic_regression':
        model = SGDClassifier(loss="log_loss", **params)
    elif model_name == 'svc':
        model = SGDClassifier(loss="hinge", **params)
    elif model_name == 'knn':
        model = KNeighborsClassifier(**params)
    elif model_name == 'xgboost':
        model = xgb.XGBClassifier(**params, use_label_encoder=False, device="gpu", eval_metric='mlogloss')
    elif model_name == "multinomialnb":
        model = MultinomialNB(**params)
    elif model_name == "gaussiannb":
        model = GaussianNB(**params)
    else:
        raise ValueError("Model not supported")

    model = fit(X_train, y_train, model, model_name, batch_size)
    if trial_num is not None:
        # Persist the fitted model so the best trial can be reloaded later.
        with open("%s-%d.pkl" % (model_name, trial_num), "wb") as f:
            pickle.dump(model, f)
    # Predict on validation set
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    return score

In [20]:
def get_objective(model_name, X_train, y_train, X_val, y_val):
    """Build the Optuna objective for one model family.

    The returned ``objective(trial)`` samples the hyper-parameters of
    ``model_name`` from ``trial``, trains and scores the model through
    ``fit_predict`` (passing ``trial.number`` as the trial number) and returns
    the resulting score. An unsupported ``model_name`` makes the objective
    raise ``ValueError`` when it is called.
    """

    def _random_forest_space(trial):
        return {
            "n_estimators": trial.suggest_int("rf/n_estimators", 50, 300),
            "max_depth": trial.suggest_int("rf/max_depth", 5, 20),
        }

    def _logistic_regression_space(trial):
        return {
            # Regularization parameter for SGD
            "alpha": trial.suggest_float("lr/alpha", 1e-4, 1e-1, log=True),
            "l1_ratio": trial.suggest_float("lr/l1_ratio", 1e-10, 1, log=True),
            "learning_rate": trial.suggest_categorical("lr/learning_rate", ["optimal", "invscaling", "adaptive"]),
            "penalty": trial.suggest_categorical("lr/penalty", ["l2", "l1", "elasticnet"]),
            "eta0": trial.suggest_float("lr/eta0", 5e-5, 5e-1, log=True),
            "batch_size": trial.suggest_categorical("lr/batch_size", [128, 256, 512, 2048]),
        }

    def _svc_space(trial):
        return {
            # Regularization parameter for SGD
            "alpha": trial.suggest_float("svc/alpha", 1e-4, 1e-1, log=True),
            "l1_ratio": trial.suggest_float("svc/l1_ratio", 1e-10, 1, log=True),
            "learning_rate": trial.suggest_categorical("svc/learning_rate", ["optimal", "invscaling", "adaptive"]),
            "penalty": trial.suggest_categorical("svc/penalty", ["l2", "l1", "elasticnet"]),
            "eta0": trial.suggest_float("svc/eta0", 5e-5, 5e-1, log=True),
            "batch_size": trial.suggest_categorical("svc/batch_size", [128, 256, 512, 2048]),
        }

    def _knn_space(trial):
        return {
            "n_neighbors": trial.suggest_int("knn/n_neighbors", 10, 500),
            "p": trial.suggest_categorical("knn/p", [1, 2]),
            "leaf_size": trial.suggest_int("knn/leaf_size", 30, 100),
        }

    def _xgboost_space(trial):
        return {
            "n_estimators": trial.suggest_int("xgb/n_estimators", 50, 300),
            "max_depth": trial.suggest_int("xgb/max_depth", 5, 20),
            "learning_rate": trial.suggest_float("xgb/learning_rate", 0.01, 0.3),
            "colsample_bytree": trial.suggest_float("xgb/colsample_bytree", 0.5, 1.0),
            # Fixed, not tuned.
            "subsample": 0.1,
        }

    def _multinomialnb_space(trial):
        return {
            "alpha": trial.suggest_float("multinomial/alpha", 0.1, 5.0),
            "batch_size": trial.suggest_categorical("multinomial/batch_size", [2**6, 2**8, 2**10, 2**12, 2**14]),
        }

    def _gaussiannb_space(trial):
        return {
            "var_smoothing": trial.suggest_float("gaussiannb/var_smoothing", 1e-10, 1e-5),
            "batch_size": trial.suggest_categorical("gaussiannb/batch_size", [2**6, 2**8, 2**10, 2**12, 2**14]),
        }

    # Model name -> function that samples that model's hyper-parameters.
    search_spaces = {
        "random_forest": _random_forest_space,
        "logistic_regression": _logistic_regression_space,
        "svc": _svc_space,
        "knn": _knn_space,
        "xgboost": _xgboost_space,
        "multinomialnb": _multinomialnb_space,
        "gaussiannb": _gaussiannb_space,
    }

    def objective(trial):
        sample_params = search_spaces.get(model_name)
        if sample_params is None:
            raise ValueError("Model not supported")
        return fit_predict(model_name, sample_params(trial), X_train, y_train, X_val, y_val, trial.number)

    return objective

In [21]:
@ray.remote(num_cpus=2, num_gpus=1)
def optimize(model_name, study_name, storage_name, X_train, y_train, X_val, y_val):
    """Ray task that runs exactly one Optuna trial of an existing study.

    Args:
        model_name: Model family to tune (see ``get_objective``).
        study_name: Name of the study to load; it must already exist.
        storage_name: Database URL of the Optuna storage holding the study.
        X_train, y_train, X_val, y_val: Training and validation data handed to
            the objective.

    Each task reserves 2 CPUs and 1 GPU from Ray and opens its own storage
    connection from the URL rather than receiving a storage object.
    """
    objective_fn = get_objective(model_name, X_train, y_train, X_val, y_val)
    storage = optuna.storages.RDBStorage(url=storage_name, engine_kwargs={"connect_args": {"timeout": 100}})
    # Keyword arguments: load_study's parameters are keyword-only in current
    # Optuna (positional use is deprecated), and this matches how the study is
    # loaded in the inference section.
    study = optuna.load_study(study_name=study_name, storage=storage)
    study.optimize(objective_fn, n_trials=1)

def launch_studies_ray(model_name, X_train_id, y_train_id, X_val_id, y_val_id, n_trials=50):
    """Submit ``n_trials`` single-trial ``optimize`` tasks to Ray and wait for all of them.

    The study name passed to each task is ``model_name`` itself; the storage
    URL comes from the module-level ``storage_name``. The data arguments are
    forwarded to the tasks unchanged.
    """
    pending = [
        optimize.remote(model_name, model_name, storage_name, X_train_id, y_train_id, X_val_id, y_val_id)
        for _ in range(n_trials)
    ]
    # Block until every task has finished (re-raises the first task failure).
    ray.get(pending)

In [22]:
# Hold out 20% of the resampled data for validation (fixed seed for a repeatable split).
# NOTE(review): the split is made after SMOTE, so synthetic validation samples
# may have been interpolated from rows that end up in the training part; the
# validation accuracy is likely optimistic. Consider splitting before oversampling.
X_train, X_val, y_train, y_val = train_test_split(train_X_resampled, train_y_resampled, test_size=0.2, random_state=42)

In [22]:
# Model families to tune; one Optuna study per family, named after the model.
models = ["xgboost", "svc", "knn", "logistic_regression", "multinomialnb"]
for k in models:
    # The storage is a SQLite file that outlives the kernel, so reuse a study
    # that already exists instead of failing with a duplicated-study error.
    optuna.create_study(direction="maximize", study_name=k, storage=storage, load_if_exists=True)

In [46]:
# Put the splits into Ray's object store only once per kernel session; later
# runs of this cell reuse the existing references.
if "X_train_id" not in globals():
    X_train_id = ray.put(X_train)
    y_train_id = ray.put(y_train)
    X_val_id = ray.put(X_val)
    y_val_id = ray.put(y_val)

# Tune the model families one after another, 50 trials each.
for model in models:
    print("Now Optimizing:", model)
    launch_studies_ray(model, X_train_id, y_train_id, X_val_id, y_val_id, n_trials=50)

Now Optimizing: xgboost


[2024-10-28 11:49:55,242 E 66753 67286] core_worker.cc:662: :info_message: Attempting to recover 308 lost objects by resubmitting their tasks. To disable object reconstruction, set @ray.remote(max_retries=0).


LocalRayletDiedError: The task's local raylet died. Check raylet.out for more information.

### Inference

#### Ensemble of the best model from each study

In [23]:
# For every study: remember the best trial's score and reload the model that
# trial pickled during tuning.
# NOTE: pickle.load executes code from the file; only load checkpoints written
# by this notebook.
acc, best_models = [], []
for model in models:
    best_trial = optuna.load_study(study_name=model, storage=storage).best_trial
    acc.append(best_trial.value)
    with open("%s-%d.pkl" % (model, best_trial.number), "rb") as f:
        print(model)
        best_models.append(pickle.load(f))

xgboost
svc
knn
logistic_regression
multinomialnb


In [24]:
# Turn the best validation scores into ensemble weights that sum to 1.
# NOTE(review): accuracies lie in [0, 1], so the softmax weights end up close
# to uniform — confirm that this is the intended weighting.
probabilities = softmax(np.asarray(acc))

In [25]:
# Reuse the vectoriser and transformer fitted on the training text (no
# re-fitting) so the test features share the training vocabulary and columns.
test_X_structured = tf_transformer.transform(
    count_vec.transform(test_df['Text'].fillna("")._to_pandas())
).get()

In [26]:
class MajorityVoting:
    """Weighted hard-voting ensemble over already-fitted classifiers.

    Each estimator casts one vote per sample for the class it predicts, and
    the vote counts for the estimator's weight. The class with the largest
    total weight wins and is mapped back through ``label_encoder``.

    Args:
        estimators: Fitted models exposing ``predict(X)`` that return encoded
            class indices in ``range(len(classes))``.
        probabilities: One weight per estimator, in the same order.
        classes: The encoded class labels; only their count is used.
        label_encoder: Object whose ``inverse_transform`` maps encoded class
            indices back to the original labels.
    """

    def __init__(self, estimators, probabilities, classes, label_encoder):
        self._estimators = estimators
        self._probabilities = probabilities
        self._label_encoder = label_encoder
        self._classes = classes

    def one_hot(self, labels):
        """Return an ``(n_samples, n_classes)`` float matrix with a 1 at each sample's label."""
        # Some estimators return float-typed or column-shaped predictions;
        # NumPy only accepts flat integer arrays as indices.
        labels = np.asarray(labels).ravel().astype(np.intp)
        onehot = np.zeros((len(labels), len(self._classes)))
        onehot[np.arange(len(labels)), labels] = 1.
        return onehot

    def predict(self, X):
        """Predict the original (decoded) label of every row of ``X``.

        Raises:
            ValueError: If the ensemble contains no estimators.
        """
        votes = None
        for prob, estimator in zip(self._probabilities, self._estimators):
            labels = estimator.predict(X)
            # GPU estimators may return device arrays; .get() copies to host.
            if hasattr(labels, "get"):
                labels = labels.get()
            weighted = prob * self.one_hot(labels)
            # Keep a running total instead of holding one matrix per estimator.
            if votes is None:
                votes = weighted
            else:
                votes += weighted
        if votes is None:
            raise ValueError("MajorityVoting needs at least one estimator")
        return self._label_encoder.inverse_transform(np.argmax(votes, axis=1))

In [27]:
ensembled_model = MajorityVoting(best_models, probabilities, classes, le)
# Sanity check: despite its name, test_predictions holds the predictions for
# train_X_resampled here; it is overwritten with the real test predictions
# further down.
test_predictions = ensembled_model.predict(train_X_resampled)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [28]:
# Accuracy on the resampled training data, which includes the rows the models
# were fitted on — this is not a held-out estimate.
print(accuracy_score(le.inverse_transform(train_y_resampled), test_predictions))

0.834643808380538


In [29]:
# Predict the scores for the rows to be submitted (same row order as test_df).
test_predictions = ensembled_model.predict(test_X_structured)

In [30]:
# test_predictions follows the row order of test_df (the features were built
# from test_df['Text']), which is the row order of train.csv and not
# necessarily that of test.csv. Pair each prediction with the Id of the row it
# was computed for, rather than with the Ids in test.csv order.
submission = pd.DataFrame({"Id": test_df['Id'].to_numpy(), "Score": test_predictions})
submission.to_csv("submission.csv", index=False)
print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.
